#### **Import Libraries**

In [None]:
import os 
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import numpy as np 
import random 
import math
from itertools import chain
from IPython.display import display, Markdown
import textwrap
import tiktoken
import csv
import time 
import pandas as pd 
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
import transformers
from transformers import DataCollatorWithPadding
from llmft.train import EncoderTrainer, EarlyStopping
from llmft.metrics import compute_recall
from llmft.losses import FocalLoss
from llmft.utils import predict
import seaborn as sns 

In [None]:
# Run configuration for this notebook.
# seed: handed to np.random.seed in the next cell.
seed: int = 0 
# noise: when True, the FStage_Value column is shuffled across rows after it
# is computed; the flag is also embedded in the results CSV filename.
noise: bool = True 
# verbose: not read by any of the cells visible in this notebook.
verbose: bool = False 

In [None]:
# Seed NumPy's global RNG so the np.random.binomial draws later in the
# notebook are repeatable from run to run.
# NOTE(review): the pandas .sample(frac=1) shuffle passes no random_state and
# presumably draws from this same global state -- confirm.
np.random.seed(seed)

#### **Set Up Paths**

In [None]:
# Input CSV for this experiment, given relative to the working directory.
# The cells below read its Race, Gender, Rent, Health and Fault columns.
data_csv = './../../../toy-data/exp2/data_1.csv'

#### **Set Up Plotting**

In [None]:
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import rcParams
rcParams['image.interpolation'] = 'nearest'
rcParams['image.cmap'] = 'viridis'
rcParams['axes.grid'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('seaborn-v0_8-dark-palette')

from matplotlib import font_manager 
# Register the project's Newsreader font files with matplotlib and make that
# family the default for figure text.
locations = './../../../styles/Newsreader'
font_files = font_manager.findSystemFonts(fontpaths=locations)
print(locations)
if font_files:
    # Show one discovered file as a quick sanity check of the search path.
    print(font_files[0])
    for f in font_files: 
        font_manager.fontManager.addfont(f)
else:
    # Indexing font_files[0] on an empty result used to abort the notebook
    # with an IndexError; report the problem and carry on instead.
    print(f'No font files found under {locations}')
# Still request the family: it resolves if Newsreader is available to
# matplotlib some other way (presumably falls back to a default otherwise).
plt.rcParams["font.family"] = "Newsreader"

In [None]:
def fstage(race, gender, rent, health, fault):
    """Return the first-stage probability of treatment given the instrument.

    The value is the sum of a 0.05 baseline, 0.2 when ``rent`` is below 800,
    0.5 times ``fault``, and 0.1 times the product of ``race`` and ``gender``.
    ``health`` is accepted for interface symmetry but does not enter the
    formula. Only arithmetic and a comparison are used, so scalar and
    array-like inputs are both handled elementwise.
    """
    baseline = 0.05
    low_rent_term = 0.2 * (rent < 800.0)
    fault_term = 0.5 * fault
    interaction_term = 0.1 * race * gender
    return baseline + low_rent_term + fault_term + interaction_term

#### **Read in Data**

In [None]:
# Load the toy dataset.
df = pd.read_csv(data_csv)

# fstage uses only arithmetic and a comparison, so it is evaluated on whole
# columns in one call rather than row by row through DataFrame.apply; the
# per-row values are the same and it stays well-defined on an empty frame.
df['FStage_Value'] = fstage(df['Race'], df['Gender'], df['Rent'], df['Health'], df['Fault'])
if noise:
    # Shuffle the first-stage values across rows. reset_index(drop=True)
    # discards the shuffled index so the assignment is positional; without it
    # pandas would realign on the index and undo the shuffle.
    df['FStage_Value'] = df['FStage_Value'].sample(frac=1).reset_index(drop=True)

# Histogram of the first-stage treatment probabilities.
fig = plt.figure(figsize=(7, 4.5), dpi=300, tight_layout=True)
ax = plt.axes(facecolor=(.95, .96, .97))

# Axes styling: hide every spine except the bottom one, draw white horizontal
# grid lines behind the bars, drop the y tick marks, and label the y axis with
# a text placed just above the top-left corner of the axes.
for side in ('left', 'right', 'top'):
    ax.spines[side].set_visible(False)
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='white', linewidth=2)
ax.yaxis.set_tick_params(length=0)
ax.text(0., 1.02, s='Count', transform=ax.transAxes, size=14)

# Draw on the axes object directly; it is the current axes, so this is what
# the pyplot-level calls would target anyway.
ax.hist(df['FStage_Value'], color='#36454F')
ax.set_xlim(0, 1)
ax.set_xlabel('Probability of Treatment Given Instrument', size=14)
plt.show()

#### **Generate Outcome Data**

In [None]:
# Requires df to already hold the 'FStage_Value' column computed above.
n = len(df)  # one draw per row

# Instrument: a fair coin flip for every row.
df['Instrument'] = np.random.binomial(n=1, p=0.5, size=n)

# Kept from the original cell; not used by any expression below.
base_prob = 0.5  # This is an example base probability

# Treatment: probability 0 when the instrument is off, and the row's
# 'FStage_Value' when it is on.
assigned = df['Instrument']
treatment_prob = 0. * (1 - assigned) + df['FStage_Value'] * assigned
df['Treatment'] = np.random.binomial(n=1, p=treatment_prob, size=n)

# Outcome: probability 0.25 for untreated rows and 0.75 for treated rows.
outcome_prob = df['Treatment'] * 0.5 + 0.25
df['Outcome'] = np.random.binomial(n=1, p=outcome_prob, size=n)

In [None]:
import jax 
import jax.numpy as jnp 
import optax 
from trics.regression.data import Data 
from trics.regression.est import iv 

In [None]:
# Convert the estimation inputs to jax arrays. Selecting with a one-element
# column list keeps each single variable two-dimensional, i.e. (n, 1).
covariate_cols = ['Race', 'Gender', 'Rent', 'Health']
X = jnp.array(df[covariate_cols].to_numpy())   # covariates
D = jnp.array(df[['Treatment']].to_numpy())    # treatment
Z = jnp.array(df[['Instrument']].to_numpy())   # instrument
Y = jnp.array(df[['Outcome']].to_numpy())      # outcome

In [None]:
# Bundle the arrays and run the IV estimator imported from trics.
# NOTE(review): the argument order expected by Data and the shape of what iv
# returns are not visible in this notebook -- confirm against trics.
iv_inputs = Data(X, D, Y, Z)
est = iv(iv_inputs)

# NOTE: this rebinds `df`, replacing the simulated dataset with a one-column
# results table; the CSV-writing cell reads it under this name.
df = pd.DataFrame(data={'Estimate': est})

In [None]:
# Results file for this configuration; the `noise` flag is part of the name,
# so runs with and without shuffling accumulate in separate files.
file_path = f'./../../../toy-data/exp2/results/linear_{noise}.csv'

# Create the results directory on first use so the write below cannot fail
# just because the folder has not been made yet.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# First write creates the file with a header row; every later run appends its
# estimate underneath without repeating the header.
file_exists = os.path.exists(file_path)
df[['Estimate']].to_csv(
    file_path,
    mode='a' if file_exists else 'w',
    header=not file_exists,
    index=False,
)