# ML final project

## Download dataset and libraries

In [2]:
!pip install scikit-learn
!pip install transformers
!pip install xgboost
!pip install torch

!wget -O dataset.zip "https://www.kaggle.com/api/v1/datasets/download/sbhatti/financial-sentiment-analysis"
!unzip -p dataset.zip > dataset.csv

--2024-12-27 16:05:52--  https://www.kaggle.com/api/v1/datasets/download/sbhatti/financial-sentiment-analysis
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://storage.googleapis.com:443/kaggle-data-sets/1918992/3205803/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241227%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241227T160553Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=94b8dcb1941fcc22ac87ff405928fd568770f940d1b5faa24d895406c3c2969d86dfd81b4d766e23da7615e75f17162c447b387460a5e29976ce74992d9d68f9d6773964733c845233bd0ab2449f764dff93ad465ba675b46cc88add8c86cba329f61f77093a151663d9e247dda1806763691b035841cfbe4ab3d3813c84953fb9521052fc91e096192d3b606e9a3888b973abfe40097d339a7ea61537d6fbbbda351d0010484d5815f8da6bd5e0e07e98b07c58

## Load dataset

In [3]:
import pandas as pd

df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


## Prepare BERT tokenizer

In [4]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-large-cased")
model = AutoModel.from_pretrained("google-bert/bert-large-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

## Prepare dataset
### Create X and y

In [5]:
# Tokenizing the sentence
sentence = df["Sentence"][0]
tokenized_output = tokenizer(sentence)

# Extracting input_ids
token_ids = tokenized_output['input_ids']

# Decoding the tokens back to the original sentence
decoded_sentence = tokenizer.decode(token_ids, skip_special_tokens=True)

print("Original Sentence:", sentence)
print("Decoded Sentence:", decoded_sentence)



Original Sentence: The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model .
Decoded Sentence: The GeoSolutions technology will leverage Benefon ' s GPS solutions by providing Location Based Search Technology, a Communities Platform, location relevant multimedia content and a new and powerful commercial model.


In [11]:
# Example sentence
sentence = df["Sentence"][0]

# Tokenizing the sentence
tokenized_output = tokenizer(sentence)

# Using the tokenize method to get individual tokens
tokens = tokenizer.tokenize(sentence)

# Printing each token
print("Individual Tokens:")
for token in tokens:
    print(token)



Individual Tokens:
the
geo
##sol
##ution
##s
technology
will
leverage
ben
##ef
##on
'
s
gps
solutions
by
providing
location
based
search
technology
,
a
communities
platform
,
location
relevant
multimedia
content
and
a
new
and
powerful
commercial
model
.


In [6]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")
all_word_embeddings = []

X = tokenizer(
    df["Sentence"].tolist(),
    padding=True,
    return_tensors="pt"
)

# Prepare the inputs and labels (if available)
input_ids = X['input_ids'].to(device)
attention_mask = X['attention_mask'].to(device)

# Create a TensorDataset
dataset = TensorDataset(input_ids, attention_mask)

# Create a DataLoader to load the data in batches of 1024
batch_size = 1024
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Iterate over the batches
for batch in dataloader:
  input_ids_batch, attention_mask_batch = batch
  input_ids_batch = input_ids_batch.to(device)
  attention_mask_batch = attention_mask_batch.to(device)

  outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
  word_embeddings = outputs.last_hidden_state  # Shape: (batch_size, seq_len, hidden_size)
  all_word_embeddings.append(word_embeddings)

# Combine all batches
all_word_embeddings = torch.cat(all_word_embeddings, dim=0)

print(all_word_embeddings.shape)

Using device: cpu


KeyboardInterrupt: 

In [None]:
# all_word_embeddings
torch.save(all_word_embeddings, 'embeddings.pt')

In [None]:
# loaded_embeddings = torch.load('embeddings.pt')

In [36]:
summed_embeddings = all_word_embeddings.sum(dim=1)  # Shape: [5842, 768]
maxed_embeddings, _ = all_word_embeddings.max(dim=1)  # Shape: [5842, 768]
print(type(summed_embeddings))
print(type(maxed_embeddings))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [37]:
summed_embeddings = summed_embeddings.cpu().numpy()
maxed_embeddings = maxed_embeddings.cpu().numpy()

In [None]:
# X = tokenizer(
#     df["Sentence"].tolist(),
#     padding=True,
#     return_tensors="pt"
# )

# with torch.no_grad():
#     input_ids, attention_mask = X['input_ids'], X['attention_mask']

#     outputs = model(input_ids=input_ids,
#                     attention_mask=attention_mask)

#     word_embeddings = outputs.last_hidden_state
# print(word_embeddings.shape)

In [26]:
# X = tokenizer(
#     df["Sentence"].tolist(),
#     padding=True,
#     return_tensors="np"
# ).input_ids
y = df["Sentiment"].to_numpy()
y[y == 'positive'] = 0
y[y == 'negative'] = 1
y[y == 'neutral'] = 2
y = y.astype(int)

### Datset train, valid, test split

In [38]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(maxed_embeddings, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

## Evaluate function

In [17]:
def evaluate_models(models, X_test, y_test):
  for model in models:
    print(model.score(X_test, y_test))

def train_valid_eval(model, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val):
  print(f"{type(model).__name__} training acccuracy: {accuracy_score(y_train, clf.predict(X_train))}")
  print(f"{type(model).__name__} validation accuracy: {accuracy_score(y_val, clf.predict(X_val))}")

## Baseline model

In [39]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
evaluate_models([dummy_clf], X_test, y_test)

0.5286569717707442


In [40]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(

)
clf = clf.fit(X_train, y_train)
evaluate_models([clf], X_test, y_test)

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [41]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
evaluate_models([rfc], X_test, y_test)

0.6090675791274593


In [30]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=1.0, max_depth=1, random_state=0)
gbc.fit(X_train, y_train)
evaluate_models([gbc], X_test, y_test)

0.7125748502994012


In [21]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report


clf = xgb.XGBClassifier(
    n_estimators=5,
    max_depth=3,
    num_class=3,               # Number of classes
    learning_rate= 1,
)
clf.fit(X_train, y_train)
train_valid_eval(clf)

XGBClassifier training acccuracy: 0.7696629213483146
XGBClassifier validation accuracy: 0.6641711229946524


In [22]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

bc = BaggingClassifier(estimator=SVC(), n_estimators=50, random_state=0)
bc.fit(X_train, y_train)
evaluate_models([bc], X_test, y_test)

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.