# ML final project

## Download dataset and libraries

In [None]:
!pip install scikit-learn
!pip install transformers
!pip install xgboost
!pip install torch
!pip install numpy
!pip install groq

!wget -O dataset.zip "https://www.kaggle.com/api/v1/datasets/download/sbhatti/financial-sentiment-analysis"
!unzip -p dataset.zip > dataset.csv

## Load dataset

In [None]:
import pandas as pd

# Read the CSV written by the download cell and preview its first five rows.
df = pd.read_csv("dataset.csv")
df.head(5)

## Prepare dataset


### Download already created embeddings
If you want to create the embeddings yourself, skip this cell and continue with the next cells. They produce the embeddings with the BERT-large-cased model (`google-bert/bert-large-cased`), running on a GPU when one is available.

In [None]:
!wget https://github.com/misosvec/ml-course-project/raw/refs/heads/main/cls_embeddings.npy
!wget https://github.com/misosvec/ml-course-project/raw/refs/heads/main/sum_embeddings.npy
!wget https://github.com/misosvec/ml-course-project/raw/refs/heads/main/mean_embeddings.npy
!wget https://github.com/misosvec/ml-course-project/raw/refs/heads/main/max_embeddings.npy

import numpy as np

sum_embeddings = np.load('sum_embeddings.npy')
max_embeddings = np.load('max_embeddings.npy')
cls_embeddings = np.load('cls_embeddings.npy')
mean_embeddings = np.load('mean_embeddings.npy')



### Create embeddings

In [None]:
from transformers import AutoModel, AutoTokenizer

# The tokenizer and the encoder are loaded from the same Hugging Face checkpoint.
checkpoint = "google-bert/bert-large-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

#### Tokenize X

In [None]:
# Tokenize all sentences in one call; padding=True lets the tokenizer return
# rectangular PyTorch tensors ("pt").
X = tokenizer(df["Sentence"].to_list(), return_tensors="pt", padding=True)

#### Generate X embeddings using BERT (on a GPU when available)

In [None]:
from torch.utils.data import DataLoader, TensorDataset
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

# Keep the tokenized dataset on the CPU. Only the batch currently being
# encoded is moved to the device, which is the point of batching.
input_ids = X['input_ids']
attention_mask = X['attention_mask']
dataset = TensorDataset(input_ids, attention_mask)

# we are using batches due to memory capacity
batch_size = 1024
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

all_sentences_embeddings = []

for input_ids_batch, attention_mask_batch in dataloader:
  input_ids_batch = input_ids_batch.to(device)
  attention_mask_batch = attention_mask_batch.to(device)
  # no_grad: inference only, so no autograd graph has to be kept in memory
  with torch.no_grad():
    outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
  # Move each result off the device right away so device memory holds only
  # one batch of hidden states at a time.
  all_sentences_embeddings.append(outputs.last_hidden_state.cpu())

# Combine all batches (shuffle=False above keeps them in dataset order)
all_sentences_embeddings = torch.cat(all_sentences_embeddings, dim=0)

In [None]:
import numpy as np

# Reduce the per-token hidden states to one vector per sentence in four ways
# (sum / max / mean over dim=1, and the vector at position 0), then convert
# each result to a NumPy array on the CPU.
# NOTE(review): the reductions run over every position of dim=1 and do not use
# the attention mask, so padded positions appear to be included - confirm this
# is intended.
sum_embeddings = all_sentences_embeddings.sum(dim=1).cpu().numpy()
max_embeddings = all_sentences_embeddings.max(dim=1).values.cpu().numpy()
cls_embeddings = all_sentences_embeddings[:, 0].cpu().numpy()
mean_embeddings = all_sentences_embeddings.mean(dim=1).cpu().numpy()

# np.save("cls_embeddings.npy", cls_embeddings)
# np.save("sum_embeddings.npy", sum_embeddings)
# np.save("max_embeddings.npy", max_embeddings)
# np.save("mean_embeddings.npy", mean_embeddings)

### Encode target variable y

In [32]:
# Pick which pooled representation is used as the feature matrix.
# X = cls_embeddings
# X = sum_embeddings
# X = max_embeddings
X = mean_embeddings

# Encode the sentiment labels as integers through an explicit mapping.
# Series.map builds a new Series, so `df` itself is left untouched (the
# previous in-place writes into `to_numpy()`'s result could modify the
# DataFrame or fail on a read-only view).
SENTIMENT_TO_ID = {'positive': 0, 'negative': 1, 'neutral': 2}
y = df["Sentiment"].map(SENTIMENT_TO_ID)
if y.isna().any():
  # Fail loudly on labels outside the mapping instead of producing NaNs.
  unknown = df.loc[y.isna(), "Sentiment"].unique().tolist()
  raise ValueError(f"Unexpected sentiment labels: {unknown}")
y = y.to_numpy(dtype=int)

### Dataset train, valid, test split

In [None]:
from sklearn.model_selection import train_test_split

# ~70% train (4090)
# ~15% validation (876)
# ~15% test (876)
# Hold out the test rows first, then carve the validation rows out of what is left.
X_rest, X_test, y_rest, y_test = train_test_split(X, y, random_state=42, test_size=876)
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, random_state=42, test_size=876)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

## Train and validation evaluate function

In [42]:
from sklearn.metrics import accuracy_score

def train_valid_eval(model, X_train=None, y_train=None, X_val=None, y_val=None):
  """Print the training and validation accuracy of a fitted classifier.

  Args:
    model: fitted estimator exposing ``predict``.
    X_train, y_train: training features / labels.
    X_val, y_val: validation features / labels.

  Any argument left as ``None`` is looked up in the notebook namespace under
  the same name *when the function is called*. Defaults written as
  ``X_train=X_train`` are frozen when the ``def`` cell runs, so re-running the
  split cell (e.g. after switching embeddings) would silently keep evaluating
  the old data.

  Raises:
    KeyError: if an omitted argument has no global of the same name.
  """
  namespace = globals()
  X_train = namespace["X_train"] if X_train is None else X_train
  y_train = namespace["y_train"] if y_train is None else y_train
  X_val = namespace["X_val"] if X_val is None else X_val
  y_val = namespace["y_val"] if y_val is None else y_val

  print(f"{type(model).__name__} training acccuracy: {accuracy_score(y_train, model.predict(X_train))}")
  print(f"{type(model).__name__} validation accuracy: {accuracy_score(y_val, model.predict(X_val))}")

## Baseline model

In [None]:
from sklearn.dummy import DummyClassifier

# Reference point: a classifier using the "most_frequent" strategy.
baseline = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
train_valid_eval(baseline)

## XGBoost Gradient Boosted Classifier
Gradient boosted trees.

In [None]:
from xgboost import XGBClassifier

# Hyper-parameters gathered in one place so they are easy to tweak.
xgbcls_params = dict(
    n_estimators=1000,  # number of boosting rounds
    max_depth=3,
    num_class=3,
    colsample_bytree=0.8,  # subsampling 80% of columns for each tree
    learning_rate=0.002,
    eval_metric=["mlogloss"],
    early_stopping_rounds=12,
)
xgbcls = XGBClassifier(**xgbcls_params)

# Both the training and the validation split are passed as evaluation sets
# so their mlogloss is recorded during fitting.
xgbcls.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=False,
)
train_valid_eval(xgbcls)

## XGBoost Random Forests

In [None]:
from xgboost import XGBRFClassifier

# Random-forest flavour of XGBoost; hyper-parameters kept together for tuning.
xgbrfcls_params = dict(
    n_estimators=6,
    max_depth=5,
    num_class=3,
    learning_rate=0.003,
    eval_metric=["mlogloss"],
)
xgbrfcls = XGBRFClassifier(**xgbrfcls_params)

# Record mlogloss on the training and validation splits while fitting.
evaluation_sets = [(X_train, y_train), (X_val, y_val)]
xgbrfcls.fit(X_train, y_train, eval_set=evaluation_sets, verbose=1)
train_valid_eval(xgbrfcls)

## Scikit-learn Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and feature subsets at random.
# Fixing random_state (the same seed as the data split) makes the reported
# accuracies reproducible across runs.
rfcls = RandomForestClassifier(
    n_estimators=40,
    max_depth=6,
    criterion="gini",
    n_jobs=-1, # parallelization
    random_state=42
)
rfcls.fit(X_train, y_train)
train_valid_eval(rfcls)

## Scikit-learn AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over depth-4 decision trees.
weak_learner = DecisionTreeClassifier(max_depth=4)
abcls = AdaBoostClassifier(
    estimator=weak_learner,
    learning_rate=0.001,
    n_estimators=120,
).fit(X_train, y_train)
train_valid_eval(abcls)

## HistGradientBoostingClassifier

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting: shallow trees and a small learning rate.
hgbcls = HistGradientBoostingClassifier(
    learning_rate=0.005,
    max_depth=2,
    max_iter=800,
).fit(X_train, y_train)
train_valid_eval(hgbcls)

## SVM attempt


In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Fit the scaler on the training split only and reuse its statistics for the
# validation and test splits.
scaler = StandardScaler()
X_train_svm = scaler.fit_transform(X_train)
X_val_svm = scaler.transform(X_val)
X_test_svm = scaler.transform(X_test)

svc = SVC(kernel='rbf')
svc.fit(X_train_svm, y_train)

# The SVM was trained on scaled features, so it must be scored on scaled
# features too; the helper's defaults are the unscaled splits.
train_valid_eval(svc, X_train=X_train_svm, X_val=X_val_svm)

# Test accuracy is computed inline because the `models_evaluation` helper is
# only defined in a later cell and is not available here on a fresh run.
print(f"{type(svc).__name__} accuracy: {accuracy_score(y_test, svc.predict(X_test_svm))}")

## Groq Llama

In [None]:
import os
from getpass import getpass

from groq import Groq

# Never store the API key in the notebook: read it from the GROQ_API_KEY
# environment variable, or prompt for it (hidden input) when it is not set.
client = Groq(
  api_key=os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")
)

def generate_prompt(user_content):
  """Build the chat messages asking the model to classify one text.

  Args:
    user_content: the text to classify (str).

  Returns:
    A two-element list of ``{"role", "content"}`` dicts: a system message
    followed by a user message whose content is the instruction with
    ``user_content`` appended directly after "Text:".
  """
  system_message = {
    "role": "system",
    "content": "Act as a classifier for financial sentiment analysis.",
  }
  instruction = "Classify the following text as a 0=positive, 1=negative, or 2=neutral. Output only the single number.\n\nText:"
  user_message = {
    "role": "user",
    "content": instruction + user_content,
  }
  return [system_message, user_message]

## Running classification using Llama model on test dataset

In [None]:
from sklearn.model_selection import train_test_split
import time

# Work on dedicated `llama_*` variables. Reusing df / X / y / X_test / X_val
# here would replace the embedding-based splits with raw sentences and break
# the later cells that call `.predict(X_test)` / `.predict(X_val)`.
llama_df = pd.read_csv('dataset.csv')
llama_sentences = llama_df['Sentence']
llama_labels = (
  llama_df["Sentiment"]
  .map({'positive': 0, 'negative': 1, 'neutral': 2})
  .astype(int)  # raises if a label outside the mapping produced a NaN
  .to_numpy()
)

# ~70% train (4090)
# ~15% validation (876)
# ~15% test (876)
# Same test_size and random_state as the embedding split, so the same rows are
# selected as the test set. Only the test part is needed here.
llama_X_train, llama_X_test, llama_y_train, llama_y_test = train_test_split(
  llama_sentences, llama_labels, test_size=876, random_state=42
)

y_pred = []
y_true = []
index = 0
unparsed = 0  # responses that could not be read as an integer

for i, (sentence, label) in enumerate(zip(llama_X_test, llama_y_test)):
  index = i
  chat_completion = client.chat.completions.create(
    messages=generate_prompt(sentence),
    model="llama-3.3-70b-versatile",
  )
  pred = chat_completion.choices[0].message.content

  try:
    pred = int(pred)
  except (TypeError, ValueError):
    # Skip this sample instead of aborting the whole run; the number of
    # skipped samples is reported below so a partial result is never silent.
    unparsed += 1
    print(f"Failed to convert to int, Index={i} Sentence={sentence[:10]} Sentiment={label} classified as {pred}")
  else:
    y_pred.append(pred)
    y_true.append(label)
    print(f"Index={i} Sentence={sentence[:10]} Sentiment={label} classified as {pred}")
  # pause between requests (also after a failed parse)
  time.sleep(2)

print(f"Parsed {len(y_pred)} responses, skipped {unparsed} unparsable responses")

### Evaluate Llama accuracy on testing dataset

In [None]:
from sklearn.metrics import accuracy_score

# Share of the collected Llama predictions that equal the true label.
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
print("Llama Accuracy:", accuracy)

# y_true_llama = np.array(y_true, dtype=np.uint8)
# y_pred_llama = np.array(y_pred, dtype=np.uint8)
# np.save("y_true_llama.npy", y_true_llama)
# np.save("y_pred_llama.npy", y_pred_llama)

## Eval models on test dataset

In [None]:
from sklearn.metrics import accuracy_score

def models_evaluation(models, X=None, y=None):
  """Print the accuracy of each fitted model on one dataset.

  Args:
    models: iterable of fitted estimators exposing ``predict``.
    X: feature matrix; when ``None`` the global ``X_test`` is used.
    y: true labels; when ``None`` the global ``y_test`` is used.

  The fallbacks are resolved when the function is called. Defaults written as
  ``X=X_test`` are frozen when the ``def`` cell runs and would keep pointing
  at an old split after the data cells are re-run.

  Raises:
    KeyError: if an omitted argument has no matching global.
  """
  namespace = globals()
  X = namespace["X_test"] if X is None else X
  y = namespace["y_test"] if y is None else y
  for model in models:
    print(f"{type(model).__name__} accuracy: {accuracy_score(y, model.predict(X))}")

# Every fitted model that consumes the raw (unscaled) embedding features.
models = [
    baseline,
    xgbcls,
    xgbrfcls,
    rfcls,
    abcls,
    hgbcls,
]
models_evaluation(models, X_test, y_test)

## Plotting Results

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.ticker as mtick
import matplotlib.lines as mlines

groups = ['Llama-3.3-70b-versatile', 'Baseline (majority)', 'XGBClassifier', 'XGBRFClassifier', 'Random Forest (sklearn)',
         'AdaBoost with Decision Tree', 'HistGradientBoostingClassifier']
sub_groups = ['CLS embeddings', 'Mean embeddings', 'Sum embeddings', 'Max embeddings']

# Test accuracy in percent: one row per entry of `groups`, one column per
# entry of `sub_groups`. The Llama and baseline rows repeat a single value.
accuracy = [[75.11, 75.11, 75.11, 75.11],
            [51.94, 51.94, 51.94, 51.94],
            [66.78, 67.23, 67.23, 65.75],
            [65.29, 65.75, 65.75, 62.32],
            [65.63, 65.41, 66.09, 62.10],
            [60.27, 60.84, 60.84, 60.61],
            [69.86, 68.26, 68.26, 66.89]
            ]

accuracy = np.array(accuracy) / 100
x = np.arange(len(groups))
width = 0.22

fig, ax = plt.subplots(figsize=(9, 8), dpi=300)
colors = ['darkorange', 'royalblue', 'gold', 'fuchsia']

# Groups whose four bars share one colour and one horizontal label
# (row index in `groups` -> colour).
single_color_groups = {0: 'cyan', 1: 'springgreen'}

# Draw every bar exactly once, choosing the colour per group instead of
# drawing all bars and then painting over the first two groups.
for i in range(len(sub_groups)):
    for j in range(len(groups)):
        bar_color = single_color_groups.get(j, colors[i])
        ax.bar(x[j] + i * width, accuracy[j, i], width, color=bar_color)
        # Add vertical percentage label inside the bar
        if j not in single_color_groups:
          ax.text(x[j] + i * width, accuracy[j, i] / 2, f'{accuracy[j, i] * 100:.2f}%', ha='center', va='bottom', rotation=90, fontsize=11, color='black')

# One label for the Llama group, centred over its four bars. Scalar
# coordinates are passed: 1-element arrays rely on deprecated implicit
# array-to-scalar conversion.
ax.text(x[0] + 1.5 * width, accuracy[0, 0] / 2, f'{accuracy[0, 0] * 100:.2f}%', ha='center', va='bottom', fontsize=11, color='black')

# One label for the baseline group.
# NOTE(review): the y anchor reuses the Llama value (accuracy[0, 0]) while the
# text shows accuracy[1, 0]; position kept as in the original figure - confirm
# whether accuracy[1, 0] / 2 was intended.
ax.text(x[1] + 1.5 * width, accuracy[0, 0] / 2, f'{accuracy[1, 0] * 100:.2f}%', ha='center', va='top', fontsize=11, color='black')

ax.set_xlabel('Models', fontsize=11)
ax.set_ylabel('Accuracy', fontsize=11)
ax.set_title('Models Accuracy on Test Dataset')

ax.set_xticks(np.arange(len(groups)) + width * 3.5)
ax.set_xticklabels(groups, rotation=60, ha='right', fontsize=11)
ax.tick_params(axis='y', labelsize=11)

# The legend is built from explicit marker handles, one per embedding type.
handles = []
for color, label in zip(colors, sub_groups):
  handle = mlines.Line2D([], [], marker='o', color='w', label=label, markersize=10, markerfacecolor=color)
  handles.append(handle)

ax.legend(title='Training Data Type', labels=sub_groups, handles=handles)

ax.set_ylim(0, 1)
ax.yaxis.set_major_locator(mtick.MultipleLocator(0.05))
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

ax.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.7)

plt.tight_layout()
plt.show()


## Code for plotting train and validation loss

In [None]:
# Fixed: the fitted XGBoost random forest is named `xgbrfcls`; `xgbrfc` was
# never defined and raised a NameError. It was fitted with
# eval_set=[(train), (validation)], reported by XGBoost as 'validation_0' and
# 'validation_1'. (`xgbcls` was fitted with the same eval_set and can be
# substituted here to plot the gradient-boosted model instead.)
results = xgbrfcls.evals_result()
train_results = results['validation_0']
valid_results = results['validation_1']

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
loss_curves = (
    (train_results['mlogloss'], 'Training Loss (mlogloss)'),
    (valid_results['mlogloss'], 'Validation (mlogloss)'),
)
for losses, curve_label in loss_curves:
    # marker='o' keeps a curve visible even if it has only a single point
    plt.plot(range(len(losses)), losses, label=curve_label, marker='o')

plt.xlabel('Boosting rounds')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Boosting Rounds')
plt.legend()
plt.grid(True)
plt.show()

## Code for plotting confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Fixed: the fitted XGBClassifier is named `xgbcls`; `xgbclf` was never
# defined and raised a NameError.
y_pred = xgbcls.predict(X_val)

# Confusion matrix of the validation split: true labels vs. predictions.
cm = confusion_matrix(y_val, y_pred)

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix from XGBClassifier')
plt.show()

## Code for plotting XGBoost feature importance

In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Create the axes explicitly and hand them to plot_importance so the
# requested figure size is actually used; without `ax` the function draws on
# axes of its own and the separately created figure stays empty.
fig, ax = plt.subplots(figsize=(10, 30))

# Fixed: the fitted XGBClassifier is named `xgbcls` (`xgbclf` was undefined).
plot_importance(xgbcls,
                ax=ax,
                importance_type='weight',
                max_num_features=20,
                height=0.5)
ax.set_title("Top 20 Feature Importance")
plt.show()
