In [36]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from sklearn.utils import resample
import nltk
from nltk.corpus import stopwords

## Resampling data to balance out labels

In [None]:
def resample_train_data(df):
    # Step 1: Separate the dataset by labels
    
    anger_df = df[df['label'] == 'anger']
    fear_df = df[df['label'] == 'fear']
    joy_df = df[df['label'] == 'joy']
    sadness_df = df[df['label'] == 'sadness']
    love_df = df[df['label'] == 'love']
    surprise_df = df[df['label'] == 'surprise']
    
    # Step 2: Randomly select 2000 entries for joy, sadness and anger
    
    joy_sampled = joy_df.sample(n=2000, random_state=42)
    sadness_sampled = sadness_df.sample(n=2000, random_state=42)
    anger_sampled = anger_df.sample(n=2000, random_state=42)
    
    # Step 3: Oversample love and surprise to 2000 entries
    
    love_oversampled = resample(love_df, 
                                replace=True,     # Sample with replacement
                                n_samples=2000,   # To match 2000
                                random_state=42)  
    
    surprise_oversampled = resample(surprise_df, 
                                    replace=True,     
                                    n_samples=2000,   
                                    random_state=42)  
    
    fear_oversampled = resample(fear_df, 
                                    replace=True,     
                                    n_samples=2000,   
                                    random_state=42)  
    
    
    # Step 5: Combine all the balanced samples into a new DataFrame
    balanced_df = pd.concat([anger_sampled, fear_oversampled, joy_sampled, sadness_sampled, love_oversampled, surprise_oversampled])
    
    # Step 6: Shuffle the final balanced dataset to mix the samples
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
  
    return balanced_df  


# Loading data

In [21]:

# Loading data
def load_data(file_path):
    data = pd.read_csv(file_path, sep=';', names=['text', 'label'])
    return data

train_data = load_data('data/train.txt')
val_data = load_data('data/val.txt')
test_data = load_data('data/test.txt')

# resampling
train_data = resample_train_data(train_data)


In [48]:
## checking if resampling worked
print(train_data["label"].value_counts())

label
anger       2000
sadness     2000
love        2000
joy         2000
fear        2000
surprise    2000
Name: count, dtype: int64


## Tokenisation 
Tokenisation using nltk

In [24]:

# apply lowercasing and tokenise the data
def preprocess_text(text):
    return word_tokenize(text.lower())
   
train_data['processed_text'] = train_data['text'].apply(preprocess_text)
val_data['processed_text'] = val_data['text'].apply(preprocess_text)
test_data['processed_text'] = test_data['text'].apply(preprocess_text)

# Encode the labels
le = LabelEncoder()
train_data['label_enc'] = le.fit_transform(train_data['label'])
val_data['label_enc'] = le.transform(val_data['label'])
test_data['label_enc'] = le.transform(test_data['label'])


In [25]:
train_data.head()

Unnamed: 0,text,label,processed_text,label_enc
0,i suppose i m feeling a little sarcastic about...,anger,"[i, suppose, i, m, feeling, a, little, sarcast...",0
1,i love to sew cook and also dabble in mixed me...,sadness,"[i, love, to, sew, cook, and, also, dabble, in...",4
2,i didn t want to tell him because arun has the...,anger,"[i, didn, t, want, to, tell, him, because, aru...",0
3,i was feeling really horny all afternoon with ...,love,"[i, was, feeling, really, horny, all, afternoo...",3
4,i was feeling grumpy not women problems grumpy...,anger,"[i, was, feeling, grumpy, not, women, problems...",0


# Word2Vec Model
Applying word2vec to "raw" data to capture relationships better

In [26]:
# Train Word2Vec model
w2v_model = Word2Vec(sentences=train_data['processed_text'], vector_size=100, window=5, min_count=1, sg=1, workers=4)

# Get average word vectors for each sentence
def get_avg_word2vec(sentences, model, vector_size):
    vectors = []
    for sentence in sentences:
        vector = sum([model.wv[word] for word in sentence if word in model.wv] or [np.zeros(vector_size)]) / len(sentence)
        vectors.append(vector)
    return vectors

#Tried different paramaters, but model doesnt capture meaningful relationships
X_train_w2v = get_avg_word2vec(train_data['processed_text'], w2v_model, 100)
X_val_w2v = get_avg_word2vec(val_data['processed_text'], w2v_model, 100)
X_test_w2v = get_avg_word2vec(test_data['processed_text'], w2v_model, 100)

y_train = train_data['label_enc']
y_val = val_data['label_enc']
y_test = test_data['label_enc']


## Base modeling - using Word2Vec features

In [33]:
import numpy as np
np.random.seed(42)


In [34]:
# basic models to try out
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(kernel='linear')
}

# Train and evaluate models here
for model_name, model in models.items():
    model.fit(X_train_w2v, y_train)
    y_pred = model.predict(X_val_w2v)
    acc = accuracy_score(y_val, y_pred)
    print(f'{model_name} Validation Accuracy: {acc}')


Logistic Regression Validation Accuracy: 0.3235
Random Forest Validation Accuracy: 0.3385
SVM Validation Accuracy: 0.3135


## Preprocessing + TF-IDF
Preprocessing to remove non-alphabetic tokens, stopwords, and tokenize

In [37]:
# Tokenise, remove stopwords and lowercase
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())  
    tokens = [word for word in tokens if word.isalpha()]  # Remove non-alphabetic tokens
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return " ".join(tokens)

# overwriting the previous preprocess method
train_data['processed_text'] = train_data['text'].apply(preprocess_text)
val_data['processed_text'] = val_data['text'].apply(preprocess_text)
test_data['processed_text'] = test_data['text'].apply(preprocess_text)


 ## Applying TF-IDF


In [38]:

# Using TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features for larger vocab
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['processed_text']).toarray()
X_val_tfidf = tfidf_vectorizer.transform(val_data['processed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_data['processed_text']).toarray()


In [98]:
# import joblib

# joblib.dump(tfidf_vectorizer, 'tfidf_model.pkl')

['tfidf_model.pkl']

## Base Modelling 

In [39]:
#checking VALIDATION accuracy 
# trying base models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(kernel='linear')
}

# train and evaluate models
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_val_pred = model.predict(X_val_tfidf)
    acc = accuracy_score(y_val, y_val_pred)
    print(f'{model_name} Validation Accuracy: {acc}')


Logistic Regression Validation Accuracy: 0.886
Random Forest Validation Accuracy: 0.879
SVM Validation Accuracy: 0.8875


In [85]:
# checking TEST accuracy

# Train and evaluate models 
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_test_pred = model.predict(X_test_tfidf)
    acc = accuracy_score(y_test, y_test_pred) #Test Accuracy
    print(f'{model_name} Test Accuracy: {acc}')

Logistic Regression Validation Accuracy: 0.8715
Random Forest Validation Accuracy: 0.866
SVM Validation Accuracy: 0.873


In [88]:

results_data = {
    'Model based on TF-IDF': ['Logistic Regression', 'Random Forest', 'SVM'],
    'Test Accuracy': [0.8715, 0.866, 0.873]
}

results_df = pd.DataFrame(results_data)

results_df

Unnamed: 0,Model based on TF-IDF,Test Accuracy
0,Logistic Regression,0.8715
1,Random Forest,0.866
2,SVM,0.873


# CatBoost model

In [60]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report 
from sklearn.model_selection import GridSearchCV

In [91]:

# create CatBoost Pool
train_pool = Pool(data=X_train_tfidf, label=y_train)
val_pool = Pool(data=X_val_tfidf, label=y_val)
test_pool = Pool(data=X_test_tfidf, label=y_test)

# initialize and train the CatBoost model
catboost_model = CatBoostClassifier()

catboost_model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)


Learning rate set to 0.113478
0:	learn: 1.7426718	test: 1.7723341	best: 1.7723341 (0)	total: 211ms	remaining: 3m 31s
1:	learn: 1.7123860	test: 1.7579925	best: 1.7579925 (1)	total: 370ms	remaining: 3m 4s
2:	learn: 1.6830551	test: 1.7429468	best: 1.7429468 (2)	total: 499ms	remaining: 2m 45s
3:	learn: 1.6606284	test: 1.7308799	best: 1.7308799 (3)	total: 622ms	remaining: 2m 34s
4:	learn: 1.6429627	test: 1.7204423	best: 1.7204423 (4)	total: 747ms	remaining: 2m 28s
5:	learn: 1.6258950	test: 1.7110144	best: 1.7110144 (5)	total: 880ms	remaining: 2m 25s
6:	learn: 1.6117338	test: 1.7028765	best: 1.7028765 (6)	total: 1.01s	remaining: 2m 23s
7:	learn: 1.5987107	test: 1.6950917	best: 1.6950917 (7)	total: 1.23s	remaining: 2m 32s
8:	learn: 1.5850881	test: 1.6852986	best: 1.6852986 (8)	total: 1.37s	remaining: 2m 30s
9:	learn: 1.5729417	test: 1.6785198	best: 1.6785198 (9)	total: 1.49s	remaining: 2m 27s
10:	learn: 1.5622155	test: 1.6707224	best: 1.6707224 (10)	total: 1.65s	remaining: 2m 28s
11:	learn: 1

<catboost.core.CatBoostClassifier at 0x7faef237f5e0>

In [92]:
# make predictions
y_val_pred = catboost_model.predict(val_pool)
y_test_pred = catboost_model.predict(test_pool)

# extract accuracy score
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# classification report
print("\nClassification Report (Validation):")
print(classification_report(y_test, y_test_pred, target_names=le.classes_))


Validation Accuracy: 0.8895
Test Accuracy: 0.8855

Classification Report (Validation):
              precision    recall  f1-score   support

       anger       0.87      0.92      0.90       275
        fear       0.88      0.83      0.85       224
         joy       0.95      0.87      0.91       695
        love       0.68      0.95      0.79       159
     sadness       0.97      0.88      0.92       581
    surprise       0.57      0.97      0.72        66

    accuracy                           0.89      2000
   macro avg       0.82      0.90      0.85      2000
weighted avg       0.90      0.89      0.89      2000



### Grid Search for CatBoost

In [61]:

# hyperparameter optimization 
param_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.5],
    'iterations': [100, 500, 1000]
}


grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid,
                           scoring='accuracy', cv=3, verbose=1)
grid_search.fit(X_train_tfidf, y_train)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters found:  {'depth': 8, 'iterations': 1000, 'learning_rate': 0.5}
Best cross-validation score: 0.9258


In [66]:
best_params = grid_search.best_params_

best_grid_model = CatBoostClassifier(**best_params, verbose = 1)

# fit on the train data
best_grid_model.fit(train_pool)

# make predictions on the test set
y_test_pred = best_grid_model.predict(test_pool)

# evaluate the model
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy with Best Hyperparameters: {test_accuracy:.4f}")

0:	learn: 1.6249478	total: 537ms	remaining: 8m 56s
1:	learn: 1.5125538	total: 1.06s	remaining: 8m 51s
2:	learn: 1.4564390	total: 1.61s	remaining: 8m 53s
3:	learn: 1.4111847	total: 2.12s	remaining: 8m 46s
4:	learn: 1.3624562	total: 2.56s	remaining: 8m 29s
5:	learn: 1.3189519	total: 3.03s	remaining: 8m 22s
6:	learn: 1.2804443	total: 3.5s	remaining: 8m 16s
7:	learn: 1.2543380	total: 3.95s	remaining: 8m 9s
8:	learn: 1.2196954	total: 4.41s	remaining: 8m 5s
9:	learn: 1.1877932	total: 4.91s	remaining: 8m 5s
10:	learn: 1.1576097	total: 5.44s	remaining: 8m 9s
11:	learn: 1.1304784	total: 5.97s	remaining: 8m 11s
12:	learn: 1.1011732	total: 6.55s	remaining: 8m 17s
13:	learn: 1.0750587	total: 7.04s	remaining: 8m 16s
14:	learn: 1.0506697	total: 7.5s	remaining: 8m 12s
15:	learn: 1.0312168	total: 7.96s	remaining: 8m 9s
16:	learn: 1.0151528	total: 8.47s	remaining: 8m 10s
17:	learn: 0.9972404	total: 8.93s	remaining: 8m 7s
18:	learn: 0.9774506	total: 9.4s	remaining: 8m 5s
19:	learn: 0.9614013	total: 9.85

In [97]:
best_params

{'depth': 8, 'iterations': 1000, 'learning_rate': 0.5}

In [None]:
#Test Accuracy with Best Hyperparameters: 0.8860

## CatBoost on raw text

In [94]:
# extracting the text features
X_train = train_data['text']
X_val = val_data['text']
X_test = test_data['text']

# creating CatBoost Pools with the text features 
train_pool = Pool(data=X_train, label=y_train, text_features=[0])
val_pool = Pool(data=X_val, label=y_val, text_features=[0])
test_pool = Pool(data=X_test, label=y_test, text_features=[0])


In [95]:
# CatBoost model with raw text processing
model = CatBoostClassifier(

    text_processing='default',   # Built-in text processing
    verbose=1                    # Verbose output every 100 iterations
)

model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)
y_test_pred = model.predict(test_pool)

# evaluating model's accuracy on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.4f}")

Learning rate set to 0.113478
0:	learn: 1.5172326	test: 1.5330279	best: 1.5330279 (0)	total: 855ms	remaining: 14m 14s
1:	learn: 1.3631533	test: 1.3763655	best: 1.3763655 (1)	total: 1.59s	remaining: 13m 13s
2:	learn: 1.2442462	test: 1.2495958	best: 1.2495958 (2)	total: 2.25s	remaining: 12m 26s
3:	learn: 1.1532248	test: 1.1481395	best: 1.1481395 (3)	total: 2.91s	remaining: 12m 5s
4:	learn: 1.0853813	test: 1.0748269	best: 1.0748269 (4)	total: 3.63s	remaining: 12m 3s
5:	learn: 1.0310968	test: 1.0135712	best: 1.0135712 (5)	total: 4.32s	remaining: 11m 54s
6:	learn: 0.9870461	test: 0.9633017	best: 0.9633017 (6)	total: 4.9s	remaining: 11m 34s
7:	learn: 0.9500809	test: 0.9223877	best: 0.9223877 (7)	total: 5.49s	remaining: 11m 20s
8:	learn: 0.9177979	test: 0.8870252	best: 0.8870252 (8)	total: 6.08s	remaining: 11m 9s
9:	learn: 0.8927545	test: 0.8580754	best: 0.8580754 (9)	total: 6.72s	remaining: 11m 5s
10:	learn: 0.8703503	test: 0.8321827	best: 0.8321827 (10)	total: 7.42s	remaining: 11m 6s
11:	le

In [96]:
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.8330


### Naive Bayes model

In [68]:
from sklearn.naive_bayes import MultinomialNB

In [69]:
# base Naive Bayes model with no tuning
nb_base_model = MultinomialNB()
nb_base_model.fit(X_train_tfidf, y_train)

# evaluate on the test set
y_test_pred_base = nb_base_model.predict(X_test_tfidf)
base_test_accuracy = accuracy_score(y_test, y_test_pred_base)

# test accuracy
print(f"Base Naive Bayes Test Accuracy (No Tuning): {base_test_accuracy:.4f}")

Base Naive Bayes Test Accuracy (No Tuning): 0.8270


### Grid Searh 

In [71]:
# Hyperparameter grid for tuning
param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1, 5, 10],             # Range of smoothing values
    'fit_prior': [True, False],                      # Whether to learn class priors
    'class_prior': [None]                    # Used to binarize the input features
}

# set up Grid Search with 3-fold cross-validation
grid_search = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid, 
                           scoring='accuracy', cv=3, verbose=2)

grid_search.fit(X_train_tfidf, y_train)

# best parameters found by GridSearch
print(f"Best hyperparameters: {grid_search.best_params_}")

# extract best parameters
best_nb_model = grid_search.best_estimator_


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END .......alpha=0.01, class_prior=None, fit_prior=True; total time=   1.0s
[CV] END .......alpha=0.01, class_prior=None, fit_prior=True; total time=   0.5s
[CV] END .......alpha=0.01, class_prior=None, fit_prior=True; total time=   0.4s
[CV] END ......alpha=0.01, class_prior=None, fit_prior=False; total time=   0.4s
[CV] END ......alpha=0.01, class_prior=None, fit_prior=False; total time=   0.5s
[CV] END ......alpha=0.01, class_prior=None, fit_prior=False; total time=   0.5s
[CV] END ........alpha=0.1, class_prior=None, fit_prior=True; total time=   0.4s
[CV] END ........alpha=0.1, class_prior=None, fit_prior=True; total time=   0.4s
[CV] END ........alpha=0.1, class_prior=None, fit_prior=True; total time=   0.4s
[CV] END .......alpha=0.1, class_prior=None, fit_prior=False; total time=   0.4s
[CV] END .......alpha=0.1, class_prior=None, fit_prior=False; total time=   0.4s
[CV] END .......alpha=0.1, class_prior=None, fit

In [72]:

# testing on the test set
y_test_pred_best = best_nb_model.predict(X_test_tfidf)
best_test_accuracy = accuracy_score(y_test, y_test_pred_best)

# print best results
print(f"Best Naive Bayes Test Accuracy after Tuning: {best_test_accuracy:.4f}")

Best Naive Bayes Test Accuracy after Tuning: 0.8270


## Random forest model

### RandomSearch

In [78]:
from sklearn.model_selection import RandomizedSearchCV

In [79]:
rf = RandomForestClassifier(random_state=42)

# randomised search
param_distributions = {
    'n_estimators': np.arange(100, 1001, 100),          # Number of trees in the forest
    'max_depth': [None] + list(np.arange(10, 31, 5)),   # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],                    # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],                      # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]                          # Whether bootstrap samples are used when building trees
}

# setting up RandomizedSearchCV to search over the parameter grid
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions, 
                                   scoring='accuracy', n_iter=100, cv=3, verbose=2, n_jobs=-1, random_state=42)

# model with random search
random_search.fit(X_train_tfidf, y_train)

# best parameters found by Randomized Search
print(f"Best hyperparameters: {random_search.best_params_}")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best hyperparameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}
[CV] END bootstrap=True, max_depth=25, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time= 2.1min
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=900; total time=32.4min
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=900; total time=32.4min
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time= 3.6min
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time= 1.8min
[CV] END bootstrap=False, max_depth=25, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time= 3.1min
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimator

In [80]:

# best Random Forest model based on RandomizedSearchCV
best_rf_model = random_search.best_estimator_

# evaluate on the test set
y_test_pred = best_rf_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)

# best model accuracy
print(f"Test Accuracy with Best Random Forest Model (Randomized Search): {test_accuracy:.4f}")

Test Accuracy with Best Random Forest Model (Randomized Search): 0.8695


In [82]:
# test on test set
y_val_pred = best_rf_model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)

# best model accuracy
print(f"Test Accuracy with Best Random Forest Model (Randomized Search): {val_accuracy:.4f}")

Validation Accuracy with Best Random Forest Model (Randomized Search): 0.8790
