#### **Import Libraries**

In [None]:
import os 
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from tqdm import tqdm
import numpy as np 
import random 
import math
from itertools import chain
from IPython.display import display, Markdown
import textwrap
import tiktoken
import csv
import time 
import pandas as pd 
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
import transformers
from transformers import DataCollatorWithPadding
from llmft.train import EncoderTrainer, EarlyStopping
from llmft.metrics import compute_recall
from llmft.losses import FocalLoss
from llmft.utils import predict
import seaborn as sns 
import jax 
import jax.numpy as jnp 
import optax 
from trics.regression.data import Data 
from trics.regression.est import iv 
%config InlineBackend.figure_format = 'retina'  # For better quality figures
import warnings
warnings.filterwarnings('ignore', message=".*not a recognized feature for this target.*", category=UserWarning)


In [None]:
# Experiment configuration shared by the cells below.
seed = 3  # passed to np.random.seed so sampling and simulated draws are reproducible
noise = False  # when True, FStage_Value is shuffled across rows before treatment is drawn
verbose = True  # not referenced in the visible cells -- TODO confirm it is still needed
sample_size = 1000  # number of rows drawn without replacement from the input CSV
version = 4  # selects data_{version}.csv and is embedded in the results file name

In [None]:
# Seed NumPy's global RNG; the np.random.* draws in the data cell depend on it.
np.random.seed(seed)

#### **Set Up Paths**

In [None]:
# Input CSV for the chosen data version, given relative to the current working directory.
data_csv = f'./../../../toy-data/exp2/data_{version}.csv'

#### **Set Up Plotting**

In [None]:
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import rcParams
rcParams['image.interpolation'] = 'nearest'
rcParams['image.cmap'] = 'viridis'
rcParams['axes.grid'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('seaborn-v0_8-dark-palette')

from matplotlib import font_manager

# Register every font file found under `locations` with matplotlib and make
# "Newsreader" the default font family.
locations = './../../../styles/Newsreader'
font_files = font_manager.findSystemFonts(fontpaths=locations)
print(locations)
if font_files:
    print(font_files[0])
    for font_path in font_files:
        font_manager.fontManager.addfont(font_path)
    plt.rcParams["font.family"] = "Newsreader"
else:
    # Indexing font_files[0] on an empty list raised IndexError and aborted the
    # plotting setup. Fall back to matplotlib's default family instead.
    print(f'No font files found under {locations}; keeping the default font family.')

In [None]:
def fstage(var0, var1, var2, var3, var4):
    """Return the first-stage value ``1 - var4``.

    Only ``var4`` enters the result; ``var0`` through ``var3`` are accepted
    but not used.
    """
    complement = 1.0 - var4
    return complement

#### **Read in Data**

In [None]:
# Load the data set and keep a random subsample of `sample_size` rows
# (drawn without replacement from NumPy's global RNG).
df = pd.read_csv(data_csv)
indices = np.random.choice(df.index, size=sample_size, replace=False)
df = df.loc[indices].reset_index(drop=True)
n = len(df)  # number of sampled rows

# First-stage value per row. fstage is called once on whole columns rather
# than row by row through df.apply(axis=1). This relies on fstage using only
# elementwise arithmetic, which holds for its current definition.
df['FStage_Value'] = fstage(df['Var0'], df['Var1'], df['Var2'], df['Var3'], df['Var4'])
if noise:
    # Shuffle the first-stage values across rows; reset_index realigns the
    # shuffled values with the 0..n-1 index of df.
    df['FStage_Value'] = df['FStage_Value'].sample(frac=1).reset_index(drop=True)

# Simulate a fair-coin instrument, a treatment that can only be 1 when the
# instrument is 1 (with probability FStage_Value), and an outcome equal to
# the treatment plus Gaussian noise with standard deviation 0.1.
df['Instrument'] = np.random.binomial(n=1, p=0.5, size=n)
df['Treatment'] = np.random.binomial(n=1, p=df['FStage_Value'] * df['Instrument'], size=n)
df['Outcome'] = df['Treatment'] + 0.1 * np.random.normal(size=n)

In [None]:
# Histogram of the first-stage values on a light-grey panel.
fig = plt.figure(dpi=300, tight_layout=True, figsize=(7, 4.5))
ax = plt.axes(facecolor=(.95, .96, .97))

# Keep only the bottom spine and draw white horizontal guides behind the bars.
for side in ('left', 'right', 'top'):
    ax.spines[side].set_visible(False)
# The y-axis label is drawn as text just above the top-left corner of the axes.
ax.text(0., 1.02, s='Count', transform=ax.transAxes, size=14)
ax.yaxis.set_tick_params(length=0)
ax.yaxis.grid(True, color='white', linewidth=2)
ax.set_axisbelow(True)

ax.hist(df['FStage_Value'], color='#36454F')
ax.set_xlim(0, 1)
ax.set_xlabel('Probability of Treatment Given Instrument==1', size=14)
plt.show()

#### **Estimate the Treatment Effect**

In [None]:
# Column vectors of shape (n, 1): treatment D, an all-ones regressor X with the
# same shape as D, instrument Z and outcome Y.
D = jnp.array(df['Treatment'].values.reshape(-1,1))
X = jnp.ones_like(D)

Z = jnp.array(df['Instrument'].values.reshape(-1,1))
Y = jnp.array(df['Outcome'].values.reshape(-1,1))


# Fitted treatment from a least-squares regression of D on [X, Z].
regs = jnp.hstack((X, Z))
Dhat_xz = regs @ np.linalg.lstsq(regs, D)[0]
 
# Fitted treatment from a least-squares regression of D on the constant X alone.
regs = X
Dhat_x = np.array(regs @ np.linalg.lstsq(regs, D)[0])

# Difference between the two fitted treatments, i.e. the part of the fit that
# comes from adding Z to the constant.
residuals = Dhat_xz - Dhat_x
# Least-squares regression of Y on [residuals, 1]. The result is not assigned;
# the first row of the solution corresponds to the `residuals` column.
# NOTE(review): this looks like a hand-rolled two-stage check of the iv()
# estimate computed in the next cell -- confirm.
np.linalg.lstsq(np.hstack((residuals, np.ones_like(residuals))), Y)

In [None]:
# Wrap X, D, Y and Z in a Data container, run the iv estimator on it and keep
# the result both raw (`est`) and as a one-column frame for saving.
iv_inputs = Data(X, D, Y, Z)
est = iv(iv_inputs)
df_estimate = pd.DataFrame(data={'Estimate': est})
est

In [None]:
# Results file for this data version / noise setting; estimates are appended
# to it on every execution of this cell.
file_path = f'./../../../toy-data/exp2/results/linear_{version}_{noise}.csv'

# Create the results directory if it is missing so the write below does not
# fail with an OSError on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Write the header only when starting a fresh file. An existing but empty file
# is treated as fresh so it does not end up with data rows and no header.
needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

# Append mode creates the file when it does not exist, so one call covers
# both the "new file" and the "existing file" case.
df_estimate[['Estimate']].to_csv(file_path, mode='a', header=needs_header, index=False)