### **Import Libraries**

In [1]:
import os

# Set the CUDA_LAUNCH_BLOCKING environment variable
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = 'True'

# Verify that the environment variables are set
print("CUDA_LAUNCH_BLOCKING:", os.environ.get('CUDA_LAUNCH_BLOCKING'))
print("TORCH_USE_CUDA_DSA:", os.environ.get('TORCH_USE_CUDA_DSA'))


CUDA_LAUNCH_BLOCKING: 1
TORCH_USE_CUDA_DSA: True


In [2]:
from datasets import load_dataset 
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
import torch 
import matplotlib.pyplot as plt 
from transformers import DataCollatorWithPadding
import os 
from pathlib import Path
import random 
from datasets import Dataset
import warnings
from functools import partial
from datasets import concatenate_datasets, DatasetDict
from functools import partial 
from IPython.display import display
from IPython.display import Markdown
import textwrap
from transformers import pipeline
from trl import SFTTrainer
# Filter out the specific warning
warnings.filterwarnings('ignore', message='Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.')

#### **Plotting**

In [3]:
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import rcParams
rcParams['image.interpolation'] = 'nearest'
rcParams['image.cmap'] = 'viridis'
rcParams['axes.grid'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('seaborn-v0_8-dark-palette')
from matplotlib import font_manager 
locations =  './../../styles/Newsreader'
font_files = font_manager.findSystemFonts(fontpaths=locations)
print(locations)
print(font_files[0])
for f in font_files: 
    font_manager.fontManager.addfont(f)
plt.rcParams["font.family"] = "Newsreader"

./../../styles/Newsreader
/home/ubuntu/llmft/styles/Newsreader/static/Newsreader_9pt/Newsreader_9pt-Medium.ttf


#### **Key Parameters**

In [4]:
# This cell is tagged with `parameters`
model_name = 'google-bert/bert-base-uncased' # 'distilbert-base-cased'
data_link = 'ppower1/covariates'
casusal_variable = False 
column = 'text'
num_epochs = 3
seed = 2 
test_size = 0.5


#### **Helper Functions**

In [5]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

### ---         Memory Check
def Memory():
    print("Current memory usage:")
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
### ---



### ---         Print Markdown
def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
### ---

#### **Model**

In [6]:
Memory()
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)
Memory()

Current memory usage:
Allocated: 0.0 GB
Cached:    0.0 GB


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Current memory usage:
Allocated: 0.0 GB
Cached:    0.0 GB


#### **Load Data**

In [11]:
# Load Dataset 
original_dataset = load_dataset("yelp_review_full") # load_dataset(data_link)['train']

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = original_dataset.map(tokenize_function, batched=True)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

In [12]:
small_eval_dataset

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

#### **Hyperparameters**

In [13]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

#### **Define Trainer**

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [15]:
# Train
trainer.train()

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
steps, train_loss = [0] + [i['step'] for i in trainer.state.log_history if 'loss' in i], [init_train_loss] + [i['loss'] for i in trainer.state.log_history if 'loss' in i]
_, eval_loss = [i['step'] for i in trainer.state.log_history if 'eval_loss' in i], [init_eval_loss] + [i['eval_loss'] for i in trainer.state.log_history if 'eval_loss' in i]
eval_accuracy = [init_eval_accuracy] + [i['eval_accuracy'] for i in trainer.state.log_history if 'eval_accuracy' in i]
lr = [i['learning_rate'] for i in trainer.state.log_history if 'learning_rate' in i]

In [None]:
plt.plot(lr)
plt.show()

In [None]:
fig = plt.figure(dpi=300, tight_layout=True, figsize=(7, 4.5))
ax = plt.axes(facecolor=(.95, .96, .97))

# Plot customizations
for key in 'left', 'right', 'top':
    ax.spines[key].set_visible(False)
ax.text(0., 1.02, s='Count', transform=ax.transAxes, size=14)
ax.yaxis.set_tick_params(length=0)
ax.yaxis.grid(True, color='white', linewidth=2)
ax.set_axisbelow(True)
ax.set_title('Tokens in Summary of Landlord Complaint', size=14, loc='center', pad=20)

plt.plot(train_loss, label='Training Loss')
plt.plot(eval_loss, label='Validation loss')
plt.plot(eval_accuracy, label='Accuracy')

# Annotating each line at its last data point
x_positions = len(train_loss) -2  # Assuming all lists have the same length
y_pos_shift = 0.05

ax.annotate(f'Accuracy',
            xy=(x_positions, eval_accuracy[-1] + y_pos_shift),
            xytext=(8, 0),  # Shift text to right slightly
            textcoords='offset points',
            va="center", 
            ha='left',
            size=12)

ax.annotate(f'Train Loss',
            xy=(x_positions, train_loss[-1] + y_pos_shift),
            xytext=(8, 0),
            textcoords='offset points',
            va="center", 
            ha='left',
            size=12)

ax.annotate(f'Validation Loss',
            xy=(x_positions, eval_loss[-1] + y_pos_shift),
            xytext=(8, 0),
            textcoords='offset points',
            va="center", 
            ha='left',
            size=12)
plt.xlim(0, num_epochs+1)
plt.ylim(0, 1)
plt.show()

In [None]:
with torch.no_grad():
    original_predictions = trainer.predict(tokenized_original_dataset).predictions
    original_predictions = torch.nn.functional.softmax(torch.tensor(original_predictions), dim=1)[:,1].numpy()


try: 
    with torch.no_grad():
        treated_predictions = trainer.predict(tokenized_treated_dataset).predictions
        treated_predictions = torch.nn.functional.softmax(torch.tensor(treated_predictions), dim=1)[:,1].numpy()

    with torch.no_grad():
        control_predictions = trainer.predict(tokenized_control_dataset).predictions
        control_predictions = torch.nn.functional.softmax(torch.tensor(control_predictions), dim=1)[:,1].numpy()
except NameError as e:
    pass 

In [None]:
text = 'I went to the beach'
trainer.predict(**tokenizer(text, truncation=True, padding=True, max_length=max_tokens))

In [None]:
tokenized_original_dataset

In [None]:
trainer.predict(tokenized_original_dataset)

In [None]:
control_predictions

In [None]:
results = np.hstack((np.array(tokenized_original_dataset['is_train']).reshape(-1,1), original_predictions.reshape(-1,1), treated_predictions.reshape(-1,1), control_predictions.reshape(-1,1) ))

In [None]:
fig = plt.figure(dpi=300, tight_layout=True, figsize=(4, 4.5))
ax = plt.axes(facecolor=(.95, .96, .97))
ax.xaxis.set_tick_params(length=0, labeltop=False, labelbottom=True)

for key in 'left', 'right', 'top':
    ax.spines[key].set_visible(False)

ax.text(0., 1.02, s='Effect', transform=ax.transAxes, size=14)
ax.yaxis.set_tick_params(length=0)
ax.yaxis.grid(True, color='white', linewidth=2)
ax.set_axisbelow(True)
plt.hist((treated_predictions - control_predictions).reshape(-1,), color='#36454F', density=True, bins=30)
plt.show()

In [None]:
fig = plt.figure(dpi=300, tight_layout=True, figsize=(4, 4.5))
ax = plt.axes(facecolor=(.95, .96, .97))
ax.xaxis.set_tick_params(length=0, labeltop=False, labelbottom=True)

for key in 'left', 'right', 'top':
    ax.spines[key].set_visible(False)

ax.text(0., 1.02, s='Effect', transform=ax.transAxes, size=14)
ax.yaxis.set_tick_params(length=0)
ax.yaxis.grid(True, color='white', linewidth=2)
ax.set_axisbelow(True)
plt.scatter(control_predictions.reshape(-1,), (treated_predictions - control_predictions).reshape(-1,), color='#36454F')
plt.xlim(0, .15)
plt.xlabel("Probability of Outcome Without Instrment", size=12)
plt.show()

In [None]:
os.getcwd()

#### **Clean Up**

In [None]:
!rm -rf ./output_dir

In [None]:
!rm -rf ./synth_evict/