#### **Import Libraries**

In [1]:
# Standard library imports
import os

# Deep learning libraries
import torch
from torch.utils.data import DataLoader
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding

# Text processing libraries
import tiktoken
import csv
import pandas as pd

# Utility libraries
import numpy as np
import random
import math
from tqdm import tqdm
from itertools import chain
from IPython.display import display, Markdown
import textwrap

# Custom libraries  
from llmft.train import EncoderTrainer, EarlyStopping
from llmft.metrics import compute_recall, compute_f1_score
from llmft.losses import FocalLoss
from llmft.utils import predict

# Visualization libraries
import seaborn as sns  # Assuming seaborn is installed

# NLP utility (assuming trics is a library/module)
from trics.nlp.utils import to_markdown

# Configure GPU usage and tokenizer parallelism
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Dataset libraries (can be grouped together)
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, concatenate_datasets


import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from sklearn.model_selection import train_test_split

#### **Parameters**

In [2]:
seed = 1                        # Seed
prompt = True                  # Whether to include a prompt about who is likely to take up the treatment if offered
noise = False                    # Whether covariates can predict the takeup of treatment
sample_size = 10_000              # Sample Size
val_set_fraction = 0.25         # Fraction of sample used for validation set
data_version = 1                # version of synthetic data         
class_weight_type = 'standard'  
lr = 2e-4                       # Optimizer learning rate
warmup_ratio = 0.25             # Fraction of training epochs used for learning rate warm up
batch_size = 32                 # Number of observations in each mini-batch
epochs = 50                     # Number of training epochs
patience = 30                    
gamma = 0.0


#### **Seed**

In [3]:
np.random.seed(seed)

#### **Set Up Paths**

In [4]:
data_csv = f'./../../../toy-data/exp2/data_{data_version}.csv'

#### **Set Up Plotting**

In [5]:
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import rcParams
rcParams['image.interpolation'] = 'nearest'
rcParams['image.cmap'] = 'viridis'
rcParams['axes.grid'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('seaborn-v0_8-dark-palette')

from matplotlib import font_manager 
locations = './../../../styles/Newsreader'
font_files = font_manager.findSystemFonts(fontpaths=locations)
print(locations)
print(font_files[0])
for f in font_files: 
    font_manager.fontManager.addfont(f)
plt.rcParams["font.family"] = "Newsreader"

./../../../styles/Newsreader
/home/ubuntu/llmft/styles/Newsreader/static/Newsreader_24pt/Newsreader_24pt-MediumItalic.ttf


#### **First Stage Function**

In [6]:
conditions_not_covered = [
    "Short-term illnesses",
    "Temporary exhaustion",
    "Behavioral Traits and Conditions",
    "Having a quick temper without any underlying medical condition",
    "Certain Addictions",
    "Substance abuse disorders",
    "good health"
]

conditions_covered = [
    "Mobility impairments",
    "Visual impairments",
    "Hearing impairments",
    "Chronic illnesses",
    "Respiratory disorders",
    "Cardiovascular conditions",
    "Intellectual disabilities",
    "Learning disabilities",
    "Autism spectrum disorders",
    "Psychiatric disorders",
    "Traumatic brain injuries",
    "Alzheimer's disease and other dementias"
]

disabilities = conditions_covered + conditions_not_covered

# Create a list of (0,1)s
severity_indicator = [1 if disability in conditions_covered else 0 for disability in disabilities]

def fstage(var1, var2, var3, var4, var5, var6, var7, var8, var9):
    return 1*var9

In [7]:
import re

def extract_information(text):
    # Define simple search terms
    has_lawyer_access = 'tenant has access to a free lawyer' in text
    has_voucher = 'housing voucher' in text
    
    # Define conditions
    conditions = [
        "Short-term illnesses",
        "Temporary exhaustion",
        "Behavioral Traits and Conditions",
        "Having a quick temper without any underlying medical condition",
        "Certain Addictions",
        "Substance abuse disorders",
        "good health",
        "Mobility impairments",
        "Visual impairments",
        "Hearing impairments",
        "Chronic illnesses",
        "Respiratory disorders",
        "Cardiovascular conditions",
        "Intellectual disabilities",
        "Learning disabilities",
        "Autism spectrum disorders",
        "Psychiatric disorders",
        "Traumatic brain injuries",
        "Alzheimer's disease and other dementias"
    ]
    
    # Check for conditions
    condition_status = [condition for condition in conditions if condition.lower() in text.lower()]
    condition_status = f"covered conditions: {', '.join(condition_status)}"
    lawyer_status = "tenant has access to legal representation" if has_lawyer_access else "the right to counsel is not in effect"
    voucher_status = "tenant has a housing voucher" if has_voucher else "tenant does not have a housing voucher"
    result = f"{lawyer_status}, {voucher_status}, {condition_status}"
    return result

#### **Read in Data**

In [8]:
import re 

def remove_newlines(text):
    return re.sub(r'\n+', '', text)

string_b = 'Based on the following information, predict (Yes/No) whether the tenant will be represented in court by a lawyer.\n\n'
string1 = "Note: Some tenants have access to a free lawyer others don't. This tenant **has access** to a free lawyer.\n\n"
string0 = "Note: Some tenants have access to a free lawyer others don't. This tenant **does not have access** to a free lawyer.\n\n"
if prompt:

    context = "Also Note: Access to a lawyer does not mean the tenant will be represented in court by a lawyer." \
        " If the tenant is given access to a lawyer, they must apply for representation." \
        " Because more tenants apply than can be represented, legal aid providers prioritize tenants with **vouchers** and **disabilities** when reviewing applications." \
        " Providers can differ over what they consider to be a dissability."\
        " If a tenant's application is selected, they must follow-up with the provider to arange for legal representation." \
        " Therefor it's possible that tenants with characteristics which sugges they they should be prioritized remain without representation."
else:
    context = ''

# Read in Data Set
df = pd.read_csv(data_csv)
df['Description'] = df['Description'].apply(lambda x: remove_newlines(x))


# Subsample Observations
indices = np.random.choice(df.index, size=sample_size, replace=False)
df = df.loc[indices].reset_index(drop=True)

# Apply First Stage Function
df['FStage_Value'] = df.apply(lambda row: fstage(
                                                 row['Var1'], 
                                                 row['Var2'], 
                                                 row['Var3'], 
                                                 row['Var4'], 
                                                 row['Var5'],
                                                 row['Var6'],
                                                 row['Var7'],
                                                 row['Var8'],
                                                 row['Var9']), axis=1)

# If Noise: Shuffle the first stage values
if noise: 
    df['FStage_Value'] = df['FStage_Value'].sample(frac=1).reset_index(drop=True)

# Sample Instrumental Values  
df['Instrument'] = np.random.binomial(n=1, p=0.5, size=sample_size)

# Text + Instrument
df['FullDescription'] = np.where(df['Instrument'] == 1,
                             string_b + string1 + 'Description:' + df['Description'].replace("\n", "") + '\n\n' + context,
                             string_b + string0 + 'Description:' + df['Description'].replace("\n", "") + '\n\n'+ context)

# Text + Instrument == 1
df['Treated_FullDescription'] = df.apply(lambda row: string_b + string1 + 'Description:' + row['Description'].replace("\n", "") + '\n\n' + context, axis=1)

# Text + Instrument == 0
df['Control_FullDescription'] = df.apply(lambda row: string_b + string0 + 'Description:' + row['Description'].replace("\n", "") + '\n\n' + context, axis=1)

# Sample Treatment Values
df['Treatment'] = np.random.binomial(n=1, p= df['FStage_Value'] * df['Instrument'], size=sample_size)

# Sample Outcome Values
df['Outcome'] =  (1. +  2.*(df['FStage_Value'] > 0.))*df['Treatment'] + 0.1*np.random.normal(size=sample_size)

In [9]:
# Building the vocabulary
def build_vocab(sentences):
    vocab = Counter()
    for sentence in sentences:
        for word in sentence.split():
            vocab[word] += 1
    return {word: i for i, (word, _) in enumerate(vocab.items())}

vocab = build_vocab(df.FullDescription.apply(extract_information))
vocab_size = len(vocab)
print(vocab_size)

61


In [19]:
idx = (df.FullDescription.apply(extract_information).str.contains('the right to counsel', case=False, na=False)) & 
df[ ]


0.0766

In [11]:
# Encoding sentences as BoW vectors
def encode_sentence(sentence, vocab):
    vector = torch.zeros(len(vocab))
    for word in sentence.split():
        if word in vocab:
            vector[vocab[word]] += 1
    return vector

# Encode all sentences
X = torch.stack([encode_sentence(sentence, vocab) for sentence in df.FullDescription.apply(extract_information)])
y = torch.tensor(df.Treatment, dtype=torch.float32)

In [12]:
class BoWModel(nn.Module):
    def __init__(self, vocab_size):
        super(BoWModel, self).__init__()
        self.fc1 = nn.Linear(vocab_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

In [14]:
# Model initialization
model = BoWModel(vocab_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training the model
epochs = 500
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    
    outputs = model(X).squeeze()
    loss = criterion(outputs, y)
    
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')
        sns.kdeplot(outputs.detach().numpy(), color='blue', fill=False, bw_adjust=0.25, label='Train')
        plt.show()

AttributeError: module 'torch.optim' has no attribute 'sgd'

In [None]:
outputs = model(X).squeeze()
outputs.detach()

In [None]:
df['contains_housing_voucher']

In [None]:
# Filter for rows with 'housing voucher'
housing_voucher_df = df[(df['contains_housing_voucher']) & (df['Instrument'])]

# Compute the fraction of those with 'housing voucher' who receive treatment
fraction_with_treatment = housing_voucher_df['Treatment'].mean()

print(f"Fraction of those with a housing voucher who receive the treatment: {fraction_with_treatment:.2f}")