# ML final project

## Download dataset and libraries

In [41]:
!pip install scikit-learn
!pip install transformers
!pip install xgboost

!wget -O dataset.zip "https://www.kaggle.com/api/v1/datasets/download/sbhatti/financial-sentiment-analysis"
!unzip -p dataset.zip > dataset.csv

--2024-12-24 19:24:38--  https://www.kaggle.com/api/v1/datasets/download/sbhatti/financial-sentiment-analysis
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://storage.googleapis.com:443/kaggle-data-sets/1918992/3205803/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20241224%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20241224T192438Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=8839860b1f834c007a8bef89d3a1970084703fef5db2440fb074fd608bb0232087943f16a9773e81f47a235745b07d634acd96ab75980b8d6a9308fa109bfc4319160e07c0e156178e5d4333e67530c1110464a2851383406bbe8a5c8591173ecacb5955084144eda8a8b5c55053b6ccb56f74ac188b395fa3f9f50dbc2171c21d97b0bad7b9ecec74183b7ee0fa5d5c21bc353e9a9cad792f941606385b71525d536f45bbdf22a9d377eb317a4a5f25e4a5c944

## Load dataset

In [64]:
import pandas as pd

# Read the CSV extracted by the download cell, then preview rows 200-299.
DATASET_CSV = "dataset.csv"
df = pd.read_csv(DATASET_CSV)
df.iloc[200:300]

Unnamed: 0,Sentence,Sentiment
200,A survey conducted by Taloustutkimus for Sampo...,neutral
201,"Antti Orkola , president of Kemira GrowHow 's ...",neutral
202,The pretax profit of the group 's life insuran...,positive
203,We make available the following brand-new mark...,neutral
204,"The Company serves approximately 3,000 custome...",neutral
...,...,...
295,"In addition , MIDs are always online and enabl...",neutral
296,Fiskars has a strong portfolio of internationa...,positive
297,#Tesla: Model X Recall Adds To Reliability Iss...,negative
298,The name of the buyer was not disclosed .,neutral


## Prepare BERT tokenizer

In [48]:
from transformers import AutoTokenizer

# Hugging Face checkpoint whose tokenizer turns sentences into token ids.
BERT_CHECKPOINT = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)

## Prepare dataset
### Create X and y

In [49]:
# Features: one row of token ids per sentence; padding=True pads every row to the
# longest sentence so the result is a rectangular NumPy array.
# NOTE(review): input_ids are vocabulary indices (nominal codes, not ordered
# quantities), so threshold splits on them carry little signal — consider TF-IDF
# or BERT embeddings as features instead.
X = tokenizer(
    df["Sentence"].tolist(),
    padding=True,
    return_tensors="np"
).input_ids

# Targets: map the string labels to integer class ids.
# Series.map builds a new Series, so `df` is left untouched. Assigning into the array
# returned by df["Sentiment"].to_numpy() can write through to the frame and fails on
# the read-only arrays returned under pandas Copy-on-Write.
LABEL_TO_ID = {"positive": 0, "negative": 1, "neutral": 2}
y_labels = df["Sentiment"].map(LABEL_TO_ID)
if y_labels.isna().any():
    # Fail loudly on missing or unexpected labels rather than training on bad targets.
    unknown = sorted(set(df.loc[y_labels.isna(), "Sentiment"].astype(str)))
    raise ValueError(f"Unmapped Sentiment labels: {unknown}")
y = y_labels.to_numpy(dtype=int)


### Dataset train, validation, test split

In [50]:
from sklearn.model_selection import train_test_split

# First hold out 20% of all rows as the test set ...
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# ... then split 20% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=1
)

## Evaluation function

In [51]:
def evaluate_models(models, X_test, y_test):
  """Print each model's `score(X_test, y_test)` on its own line, in order.

  Args:
    models: iterable of fitted estimators exposing a `score(X, y)` method.
    X_test: feature matrix passed to every `score` call.
    y_test: target values passed to every `score` call.

  Returns:
    None; the scores are only printed.
  """
  # Lazy generator: each model is scored right before its result is printed.
  scores = (estimator.score(X_test, y_test) for estimator in models)
  for value in scores:
    print(value)

In [52]:
from sklearn import tree

# Baseline: a single decision tree with default hyperparameters.
# The seed is pinned because the splitter permutes candidate features at each split,
# so an unseeded fit can give a different score on every Restart & Run All.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)
evaluate_models([clf], X_test, y_test)

0.43370402053036783


In [53]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees. Tree construction is randomized, so the seed is pinned to
# make the reported score reproducible on a fresh kernel.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train)
evaluate_models([rfc], X_test, y_test)

0.5782720273738238


In [54]:
from sklearn.ensemble import GradientBoostingClassifier

# 50 boosting stages of depth-1 trees with learning_rate=1.0, seeded for repeatability.
gbc = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
gbc.fit(X_train, y_train)
evaluate_models([gbc], X_test, y_test)

0.5799828913601369


In [60]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Hyperparameters for the three-class XGBoost model.
xgb_params = {
    "objective": 'multi:softmax',  # multi-class classification
    "num_class": 3,                # number of classes
    "learning_rate": 0.3,
}
clf = xgb.XGBClassifier(**xgb_params)
clf.fit(X_train, y_train)

# Accuracy on the held-out test split (displayed as the cell output).
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.5543199315654406

In [57]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 50 SVC base estimators (default SVC parameters), seeded.
svc_base = SVC()
bc = BaggingClassifier(
    estimator=svc_base,
    n_estimators=50,
    random_state=0,
)
bc.fit(X_train, y_train)
evaluate_models([bc], X_test, y_test)

0.5269461077844312
