# ML final project

## Download dataset and libraries

In [29]:
!pip install scikit-learn
!pip install transformers
!pip install xgboost

!wget -O dataset.zip "https://www.kaggle.com/api/v1/datasets/download/sbhatti/financial-sentiment-analysis"
!unzip -p dataset.zip > dataset.csv

NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

## Load dataset

In [27]:
import pandas as pd

df = pd.read_csv('dataset.csv')
df.iloc[200:300]



Unnamed: 0,Sentence,Sentiment
200,A survey conducted by Taloustutkimus for Sampo...,neutral
201,"Antti Orkola , president of Kemira GrowHow 's ...",neutral
202,The pretax profit of the group 's life insuran...,positive
203,We make available the following brand-new mark...,neutral
204,"The Company serves approximately 3,000 custome...",neutral
...,...,...
295,"In addition , MIDs are always online and enabl...",neutral
296,Fiskars has a strong portfolio of internationa...,positive
297,#Tesla: Model X Recall Adds To Reliability Iss...,negative
298,The name of the buyer was not disclosed .,neutral


In [7]:
df.describe()

Unnamed: 0,Sentence,Sentiment
count,5842,5842
unique,5322,3
top,Managing Director 's comments : `` Net sales f...,neutral
freq,2,3130


## Prepare BERT tokenizer

In [8]:
!pip install transformers
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased")



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [9]:
!pip install torch
import torch



## Prepare dataset
### Create X and y

In [10]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

all_word_embeddings = []

X = tokenizer(
    df["Sentence"].tolist(),
    padding=True,
    return_tensors="pt"
)

# Prepare the inputs and labels (if available)
input_ids = X['input_ids'].to(device)
attention_mask = X['attention_mask'].to(device)

# Create a TensorDataset
dataset = TensorDataset(input_ids, attention_mask)

# Create a DataLoader to load the data in batches of 1024
batch_size = 1024
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Iterate over the batches
for batch in dataloader:
  with torch.no_grad():
    input_ids_batch, attention_mask_batch = batch
    input_ids_batch = input_ids_batch.to(device)
    attention_mask_batch = attention_mask_batch.to(device)

    outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
    word_embeddings = outputs.last_hidden_state  # Shape: (batch_size, seq_len, hidden_size)
    all_word_embeddings.append(word_embeddings)

# Combine all batches
all_word_embeddings = torch.cat(all_word_embeddings, dim=0)

print(all_word_embeddings.shape)

Using device: cuda
torch.Size([5842, 150, 768])


In [11]:
# all_word_embeddings
torch.save(all_word_embeddings, 'embeddings.pt')

In [None]:
# loaded_embeddings = torch.load('embeddings.pt')

In [11]:
summed_embeddings = all_word_embeddings.sum(dim=1)  # Shape: [5842, 768]


In [None]:
# X = tokenizer(
#     df["Sentence"].tolist(),
#     padding=True,
#     return_tensors="pt"
# )

# with torch.no_grad():
#     input_ids, attention_mask = X['input_ids'], X['attention_mask']

#     outputs = model(input_ids=input_ids,
#                     attention_mask=attention_mask)

#     word_embeddings = outputs.last_hidden_state
# print(word_embeddings.shape)

In [18]:
# X = tokenizer(
#     df["Sentence"].tolist(),
#     padding=True,
#     return_tensors="np"
# ).input_ids
y = df["Sentiment"].to_numpy()
y[y == 'positive'] = 0
y[y == 'negative'] = 1
y[y == 'neutral'] = 2
y = y.astype(int)

### Datset train, valid, test split

In [20]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(summed_embeddings, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

## Evaluate function

In [24]:
def evaluate_models(models, X_test, y_test):
  for model in models:
    print(model.score(X_test, y_test))

def train_valid_eval(model, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val):
  print(f"{type(model).__name__} training acccuracy: {accuracy_score(y_train, clf.predict(X_train))}")
  print(f"{type(model).__name__} validation accuracy: {accuracy_score(y_val, clf.predict(X_val))}")

## Baseline model

In [50]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
evaluate_models([dummy_clf], X_test, y_test)

0.5286569717707442


In [7]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(

)
clf = clf.fit(X_train, y_train)
evaluate_models([clf], X_test, y_test)

0.42429426860564584


In [45]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
evaluate_models([rfc], X_test, y_test)

0.5739948674080411


In [46]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=1.0, max_depth=1, random_state=0)
gbc.fit(X_train, y_train)
evaluate_models([gbc], X_test, y_test)

0.5799828913601369


In [25]:
# import xgboost as xgb
# from sklearn.metrics import accuracy_score, classification_report


# clf = xgb.XGBClassifier(
#     n_estimators=13,
#     max_depth=4,
#     num_class=3,               # Number of classes
#     learning_rate= 1,
# )
# clf.fit(X_train, y_train)
train_valid_eval(clf)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




XGBClassifier training acccuracy: 0.9261637239165329
XGBClassifier validation accuracy: 0.6417112299465241


In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

bc = BaggingClassifier(estimator=SVC(), n_estimators=50, random_state=0)
bc.fit(X_train, y_train)
evaluate_models([bc], X_test, y_test)

0.5269461077844312
