In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

import ml_collections
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from utils import clean_text, preprocess_text
# Download required NLTK data
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')  # Open Multilingual Wordnet (needed for lemmatizer)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/s38976581_gmail_com/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/s38976581_gmail_com/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/s38976581_gmail_com/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Setting up the model hyperparameters

def model_config():
    cfg_dictionary = {
        "data_path": "../data/data.csv",
        "test_split_size": 0.2,
        "split_random_state": 42,
    }
    cfg = ml_collections.FrozenConfigDict(cfg_dictionary)

    return cfg
cfg = model_config()

# Preprocessing

In [3]:
def preprocess_csv(csv_file: str) -> pd.DataFrame:
    df = pd.read_csv(csv_file)

    labelencoder = LabelEncoder()
    df["label"] = labelencoder.fit_transform(df["Sentiment"])
    df.drop_duplicates(subset=['Sentence'],keep='first',inplace=True)

    cleaned_df = clean_text(df, "Sentence")
    df.rename(columns={"Sentiment": "sentiment"}, inplace=True)
    df.rename(columns={"Sentence": "sentence"}, inplace=True)

    cleaned_df["sentence"] = cleaned_df["sentence"].apply(preprocess_text)
    return cleaned_df

In [4]:
df = preprocess_csv(cfg.data_path)

In [5]:
pd.set_option('display.max_colwidth', None)
df.head(n = 20)

Unnamed: 0,sentence,sentiment,label
0,geosolut technolog leverag benefon gps solut provid locat base search technolog communiti platform locat relev multimedia content new power commerci model,positive,2
1,esi low bk real possibl,negative,0
2,last quarter componenta net sale doubl eur eur period year earlier move zero pre tax profit pre tax loss eur,positive,2
3,accord finnish russian chamber commerc major construct compani finland oper russia,neutral,1
4,swedish buyout firm sold remain percent stake almost eighteen month take compani public finland,neutral,1
5,spi surpris see green close,positive,2
6,shell billion bg deal meet sharehold skeptic,negative,0
7,ssh communic secur corp stock exchang releas octob pm compani updat full year outlook estim result remain loss full year,negative,0
8,kone net sale rose year year first nine month,positive,2
9,stockmann depart store total floor space squar metr stockmann invest project price tag eur million,neutral,1


Split the dataset into train and test with respect to sentiment labels:

In [19]:
train_df, test_df = train_test_split(
        df,
        test_size=cfg.test_split_size,
        random_state=cfg.split_random_state,
        stratify=df.label.values, ## stratify to maintain label distribution
    )

In [20]:
X_train, y_train = np.array(train_df['sentence']), np.array(train_df['label'])
X_test, y_test = np.array(test_df['sentence']), np.array(test_df['label'])

In [21]:
X_train.shape, X_test.shape

((4257,), (1065,))

In [22]:
tfidf = TfidfVectorizer(use_idf=True, tokenizer=word_tokenize, token_pattern=None, min_df=0.00002, max_df=0.70)

X_train_tf = tfidf.fit_transform(X_train.astype('U'))
X_test_tf = tfidf.transform(X_test.astype('U'))

print(f"TF_IDF Model: Train features shape:{X_train_tf.shape} and Test features shape:{X_test_tf.shape}")

TF_IDF Model: Train features shape:(4257, 6913) and Test features shape:(1065, 6913)


# Fit ML models

In [23]:
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
ada = AdaBoostClassifier(random_state=42)
lgb = LGBMClassifier(random_state=42)
xgb = XGBClassifier(eval_metric="mlogloss",random_state=42)
dt = DecisionTreeClassifier(random_state=42)
svc = SVC(random_state=42)
nb = MultinomialNB()
mlp = MLPClassifier(random_state=42)

clfs = {
    "Random Forest": rf,
    "Gradient Boosting":gb,
    "AdaBoost": ada,
    "LightGBM": lgb,
    "XGBoost": xgb,
    "Decision Tree":dt,
    "Support Vector Machine":svc,
    "Naive Bayes": nb,
    "Multilayer Perceptron":mlp
}

def fit_model(clf, x_train, y_train, x_test, y_test):
    clf.fit(x_train,y_train)
    y_pred = clf.predict(x_test)
    return y_pred

y_pred_all = {}

for name,clf in tqdm(clfs.items()):
    curr_pred = fit_model(clf,X_train_tf,y_train,X_test_tf,y_test)
    y_pred_all[name] = curr_pred

 33%|███▎      | 3/9 [00:10<00:18,  3.10s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010893 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9785
[LightGBM] [Info] Number of data points in the train set: 4257, number of used features: 496
[LightGBM] [Info] Start training from score -2.195113
[LightGBM] [Info] Start training from score -0.614786
[LightGBM] [Info] Start training from score -1.055847


100%|██████████| 9/9 [01:08<00:00,  7.56s/it]


In [27]:
## save predictions for evaluation later
y_pred_all['y_true'] = y_test

import pickle
with open('../results/predictions/ml_baseline_predictions.pkl', 'wb') as f:
    pickle.dump(y_pred_all, f)

In [28]:
## A coarse evaluation of all models
from sklearn.metrics import accuracy_score

# Compute accuracy for each model
accuracies = {}
for name, y_pred in y_pred_all.items():
    acc = accuracy_score(y_test, y_pred)
    accuracies[name] = acc

# Display results sorted by accuracy
print("Model Accuracies:")
print("-" * 40)
for name, acc in sorted(accuracies.items(), key=lambda x: x[1], reverse=True):
    print(f"{name:25s}: {acc:.4f}")

Model Accuracies:
----------------------------------------
y_true                   : 1.0000
Support Vector Machine   : 0.7343
XGBoost                  : 0.7305
Random Forest            : 0.7268
LightGBM                 : 0.7249
Gradient Boosting        : 0.7099
Naive Bayes              : 0.6892
Multilayer Perceptron    : 0.6892
Decision Tree            : 0.6695
AdaBoost                 : 0.5962
