<div class="alert alert-block alert-success">

# **1.** **Setup**

</div>

In [1]:
from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Set random seeds for reproducibility.
# Each library keeps its own RNG state, so each one is seeded exactly once
# (the TensorFlow seed was previously set twice, which was redundant).
tf.random.set_seed(221)
random.seed(221)
np.random.seed(221)

In [None]:
# Read the pickled train/validation split that was stored without preprocessing.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted files.
with open('train_val_split_no_preproc.pkl', 'rb') as f:
    data_no_preproc = pickle.load(f)

# Turn the text columns into plain Python lists
train_texts, val_texts = (
    data_no_preproc[split_key].tolist() for split_key in ('x_train', 'x_val')
)

# Turn the label columns into plain Python lists
train_labels, val_labels = (
    data_no_preproc[split_key].tolist() for split_key in ('y_train', 'y_val')
)

In [None]:
# NOTE(review): this whole client setup is commented out, yet a `client` object
# is passed to cached_classification_run in the GPT-4o section further down.
# Unless `client` is provided by `from utils import *`, that call raises a
# NameError on a fresh kernel — confirm where `client` is expected to come from.

# # Load variables from .env into environment
# load_dotenv()

# # Print environment variable
# print("AZURE_OPENAI_ENDPOINT:", os.getenv("AZURE_OPENAI_ENDPOINT"))

# # Initialize Azure OpenAI client
# client = AzureOpenAI(
#     api_key=os.getenv("AZURE_OPENAI_KEY"),
#     api_version="2024-02-01",
#     azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
# )

# # Define embedding model
# model = "text-embedding-3-small"

<div class="alert alert-block alert-success">

# **2.** **Models**

</div>

## **2.1** DistilBERT (Encoder)

In [None]:
# Prepare Hugging Face Datasets
# Wrap the raw texts and labels in Hugging Face Dataset objects, one per split
train_ds = Dataset.from_dict(dict(text=train_texts, label=train_labels))
val_ds = Dataset.from_dict(dict(text=val_texts, label=val_labels))

# Bundle both splits so they can be mapped/tokenized together
dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
})

In [None]:
# Checkpoint used for both the tokenizer and the classifier in this section
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(example):
    """Tokenize the "text" field of a (batched) example.

    Sequences are truncated to the tokenizer's maximum length; no padding is
    applied here.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)


# Apply the tokenizer to every split, processing examples in batches
tokenized_datasets = dataset.map(tokenize, batched=True)

In [None]:
# Load the checkpoint with a classification head sized to the labels found in
# the training data. The SST-2 checkpoint ships a 2-way head, so whenever the
# label count differs the head has to be re-initialised;
# ignore_mismatched_sizes=True lets from_pretrained do that instead of raising
# a size-mismatch error (it has no effect when the sizes already match).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(set(train_labels)),
    ignore_mismatched_sizes=True,
)

# Training configuration. Every keyword appears exactly once: the previous
# version repeated `learning_rate` and `save_strategy` (a SyntaxError) and
# passed both `evaluation_strategy` and `eval_strategy`; `eval_strategy` is
# kept to match the RoBERTa section of this notebook.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="epoch",            # evaluate at the end of every epoch
    save_strategy="epoch",            # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="f1",       # key returned by compute_metrics below
    greater_is_better=True,
    report_to=[],
)

# Data collator for padding (pads each batch dynamically to its longest sequence)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer evaluation.

    `pred.predictions` holds the logits (arg-maxed over axis 1 to obtain class
    ids) and `pred.label_ids` holds the reference labels.

    Returns a dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Fine-tune the model using the Trainer configured in the previous cell
trainer.train()

# Save model and tokenizer
# NOTE(review): the target directory is named like the base Hub model id
# "distilbert-base-uncased" although it receives the fine-tuned weights —
# consider a more specific directory name to avoid confusion.
trainer.save_model("distilbert-base-uncased")
tokenizer.save_pretrained("distilbert-base-uncased")

In [None]:
# Evaluate on training set.
# The tokenized split must be used here: in this section `dataset` still holds
# the raw "text"/"label" columns, and only `tokenized_datasets` carries the
# model inputs produced by the tokenizer.
train_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["train"])
print("\nTraining Set Evaluation Metrics:")
for key, value in train_metrics.items():
    print(f"{key}: {value:.4f}")

# Evaluate on validation set (tokenized split, same reason as above)
val_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print("\nValidation Set Evaluation Metrics:")
for key, value in val_metrics.items():
    print(f"{key}: {value:.4f}")

In [None]:
# Load test data
test_df = pd.read_csv("test.csv")  
test_texts = test_df["text"].tolist()

# Tokenize
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
outputs = model(**test_encodings)
preds = outputs.logits.argmax(dim=1).numpy()

# Add predictions to the DataFrame
test_df["predicted_label"] = preds
test_df.head()

In [None]:
# Get true labels and predictions from validation set
predictions = trainer.predict(tokenized_datasets["validation"])
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=1)

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
labels = list(set(train_labels))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap='Blues', xticks_rotation=45)
plt.title("Confusion Matrix")
plt.show()

## **2.2** Extra

### **2.2.1** RoBERTa (Encoder)

#### **2.2.1.1** Setup

In [3]:
# Prepare Hugging Face Datasets
# Rebuild the raw (untokenized) Hugging Face datasets for the RoBERTa run
train_ds = Dataset.from_dict(dict(text=train_texts, label=train_labels))
val_ds = Dataset.from_dict(dict(text=val_texts, label=val_labels))

# Group the two splits under their conventional names
dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
})

In [4]:
# Load RoBERTa tokenizer and model (no sentiment head)
# Load the RoBERTa tokenizer and a sequence-classification model from the same checkpoint.
# The classifier head is requested with 3 output labels (hard-coded here, whereas the
# DistilBERT section derives the count from train_labels).
# These assignments rebind the `tokenizer` and `model` names used by the DistilBERT section.
model_checkpoint = "cardiffnlp/twitter-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Tokenize the dataset
def tokenize(example):
    """Tokenize the "text" field of a (batched) example for RoBERTa.

    Sequences are truncated to at most 128 tokens. No padding is applied here:
    the DataCollatorWithPadding used by the Trainer pads each training batch to
    its own longest sequence, which padding inside a batched map would defeat
    (and which matches how the DistilBERT section tokenizes).
    """
    return tokenizer(example["text"], truncation=True, max_length=128)

dataset = dataset.map(tokenize, batched=True)

Map: 100%|██████████| 7630/7630 [00:00<00:00, 15291.78 examples/s]
Map: 100%|██████████| 1909/1909 [00:00<00:00, 25963.64 examples/s]


#### **2.2.1.2** Train and Validate

In [None]:
# Define metrics
def trainer_compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer evaluation.

    `pred.predictions` is arg-maxed over its last axis to obtain predicted class
    ids, which are compared against `pred.label_ids`.

    Returns a dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    preds = pred.predictions.argmax(-1)
    labels = pred.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",                   # Checkpoint directory (same path as the DistilBERT run)
    learning_rate=2e-5,                       # Learning rate for fine-tuning
    per_device_train_batch_size=16,           # Training batch size per device
    per_device_eval_batch_size=32,            # Evaluation batch size per device
    num_train_epochs=2,                       # Train for 2 epochs
    weight_decay=0.01,                        # Weight decay (regularization)
    logging_dir="./logs",                     # Logs directory
    logging_steps=100,                        # Log every 100 steps
    eval_strategy="epoch",                    # Evaluate at the end of every epoch
    save_strategy="epoch",                    # Save model at the end of every epoch
    load_best_model_at_end=True,              # Reload the best checkpoint when training ends
    metric_for_best_model="f1",               # Use the "f1" key from trainer_compute_metrics
    greater_is_better=True,                   # Because higher F1 is better
    report_to=[]                              # No external reporting integrations
)

# Data collator for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer setup (here `dataset` is already tokenized by the preceding map call)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=trainer_compute_metrics
)

# Train the model
trainer.train()

# Save model and tokenizer
trainer.save_model("roberta-market-sentiment")
tokenizer.save_pretrained("roberta-market-sentiment")

In [None]:
# Evaluate the fine-tuned model on each split in turn and print every metric
# with four decimal places, training split first.
split_metrics = {}
for split, heading in (
    ("train", "\nTraining Set Evaluation Metrics:"),
    ("validation", "\nValidation Set Evaluation Metrics:"),
):
    split_metrics[split] = trainer.evaluate(eval_dataset=dataset[split])
    print(heading)
    for key, value in split_metrics[split].items():
        print(f"{key}: {value:.4f}")

# Keep the per-split results available under their usual names
train_metrics = split_metrics["train"]
val_metrics = split_metrics["validation"]

### **2.2.2** GPT-4o (Decoder)

In [None]:
# Convert Series to list and ensure labels are strings
# Re-read the labels from the loaded split and cast every one to str.
# This rebinds train_labels / val_labels, which held the original label values until now.
train_labels = list(map(str, data_no_preproc['y_train'].tolist()))
val_labels = list(map(str, data_no_preproc['y_val'].tolist()))

In [101]:
# For reproducibility
# Fix the RNG so the same examples are sampled on every run
random.seed(42)

# Distinct labels, in sorted order
label_options = sorted(set(train_labels))

# Draw one random (text, label) pair per label to serve as a few-shot example
few_shot_examples = []
for label in label_options:
    candidates = [pair for pair in zip(train_texts, train_labels) if pair[1] == label]
    picked_text, picked_label = random.choice(candidates)
    few_shot_examples.append({'text': picked_text, 'label': picked_label})

# Show the sampled examples
print("Few-shot examples:")
for example in few_shot_examples:
    print(f"Text: {example['text']}, Label: {example['label']}")

Few-shot examples:
Text: employee furlough retail industry well million one america biggest clothing store, Label: 0
Text: coca cola break down record quarter, Label: 1
Text: would td bank benefit schwab ameritrade deal, Label: 2


In [None]:
# Run (or load cached) classification
# Classify the training texts, reusing the result cached on disk when one exists
train_pred_gpt4o = cached_classification_run(
    "gpt4o_train_preds.pkl",
    train_texts,
    label_options,
    few_shot_examples,
    client=client,
    force_reload=False,
    batch_size=16,
)

# Same for the validation texts, with its own cache file
val_pred_gpt4o = cached_classification_run(
    "gpt4o_val_preds.pkl",
    val_texts,
    label_options,
    few_shot_examples,
    client=client,
    force_reload=False,
    batch_size=16,
)

No cache found. Running classification and saving to gpt4o_train_preds.pkl


Classifying with GPT-4o: 100%|██████████| 1/1 [00:17<00:00, 17.08s/it]


No cache found. Running classification and saving to gpt4o_val_preds.pkl


Classifying with GPT-4o: 100%|██████████| 1/1 [00:17<00:00, 17.12s/it]


In [None]:
# # Filter out unknown labels from both predictions and labels before computing metrics
# clean_preds = []
# clean_labels = []

# for pred, label in zip(train_pred_gpt4o, train_labels):
#     if pred != "unknown":
#         clean_preds.append(pred)
#         clean_labels.append(label)

In [None]:
# Convert to np.array
# Convert the GPT-4o predictions to NumPy arrays.
# NOTE(review): these names are spelled "gpt40" (digit zero) while the prediction
# variables are "gpt4o" (letter o) — confirm which spelling later cells expect.
X_train_gpt40 = np.array(train_pred_gpt4o)
X_val_gpt40 = np.array(val_pred_gpt4o)

# Convert the (string) reference labels to NumPy arrays
y_train = np.array(train_labels)
y_val = np.array(val_labels)

In [None]:
# Define the title for the metrics and plots
# Title passed to get_metrics_df, plot_metrics and plot_confusion_matrix in the cells below
title = "GPT-4o with Few-shot Examples"

Unnamed: 0,Model,Train F1 (Macro),Val F1 (Macro),Train Precision,Val Precision,Train Recall,Val Recall,Train Accuracy,Val Accuracy
0,GPT-4o with with few-shot examples,0.3926,0.3056,0.5111,0.2222,0.5952,0.5,0.4,0.3


In [None]:
# Get metrics DataFrame
# Get metrics DataFrame
metrics_df = get_metrics_df(title, train_labels, train_pred_gpt4o, val_labels, val_pred_gpt4o)

# A bare `metrics_df` expression is only rendered when it is the last statement
# of a cell; because a plot call follows, the table is displayed explicitly.
display(metrics_df)

# Plot metrics
plot_metrics(train_labels, train_pred_gpt4o, val_labels, val_pred_gpt4o, title=title)

In [None]:
# Plot confusion matrix
# Plot the validation confusion matrix for the GPT-4o predictions.
# NOTE(review): `labels` is given as integers [0, 1, 2], but val_labels were converted
# to strings earlier in this section — confirm that plot_confusion_matrix handles the
# mismatch (or pass label_options instead).
plot_confusion_matrix(val_labels, val_pred_gpt4o, title=title, labels=[0, 1, 2], cmap="Blues")