# **SENTIMENT ANALYSIS - SUPERVISED**

In [1]:
import nltk
import numpy as np
import json
from sklearn.metrics import accuracy_score

In [2]:
# Load datasets
def _load_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    # Decode explicitly as UTF-8 (the standard JSON encoding) so the result
    # does not depend on the platform's default text encoding.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


X_train = _load_json('./data/X_train.json')
X_test = _load_json('./data/X_test.json')
y_train = _load_json('./data/y_train.json')
y_test = _load_json('./data/y_test.json')

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: keep only terms present in at least 10% of the
# documents, drop English stop words and fold accented characters to ASCII.
vectorizer = CountVectorizer(
    strip_accents='ascii',
    stop_words='english',
    min_df=0.1,
)

In [4]:
# Learn the vocabulary on the training documents only, then encode both splits
# with it. NOTE: this rebinds X_train / X_test from raw documents to count
# matrices, so re-run the loading cell before running this cell again.
X_train, X_test = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees baseline; the cell value is the test-set accuracy.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.2,
    max_depth=5,
    random_state=0,
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.746

In [23]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# AdaBoost baseline with 200 boosting rounds.
clf = AdaBoostClassifier(n_estimators=200, algorithm="SAMME", random_state=0)
clf.fit(X_train, y_train)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
accuracy_score(y_test, clf.predict(X_test))

0.74

In [22]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes baseline; it is fed dense arrays, hence the
# .toarray() conversions of the sparse count matrices.
clf = GaussianNB()
clf.fit(X_train.toarray(), y_train)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
accuracy_score(y_test, clf.predict(X_test.toarray()))

0.68

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline: 100 trees with no depth limit.
clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0)
clf.fit(X_train, y_train)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
accuracy_score(y_test, clf.predict(X_test))

0.718

In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline with default settings.
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
accuracy_score(y_test, clf.predict(X_test))

0.726

In [19]:
from sklearn.svm import SVC

# RBF-kernel SVM baseline. `degree` is omitted because it only applies to the
# 'poly' kernel and is ignored by 'rbf'.
clf = SVC(C=1, kernel='rbf')
clf.fit(X_train, y_train)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
accuracy_score(y_test, clf.predict(X_test))

0.734

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline with k = 5.
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
accuracy_score(y_test, clf.predict(X_test))

In [35]:
from sklearn.neural_network import MLPClassifier

# Small multi-layer perceptron baseline: two hidden layers (5 and 2 units),
# trained with the L-BFGS solver.
clf = MLPClassifier(solver='lbfgs', max_iter=10000, alpha=1e-2, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
accuracy_score(y_test, clf.predict(X_test))

0.728

In [38]:
# Cross-validation

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# List of models and their hyperparameters.
# Each entry is (estimator prototype, parameter grid); a grid is either a dict
# or a list of dicts, both of which GridSearchCV accepts.
random_state = 42
models_params = [
    (GradientBoostingClassifier(), {'n_estimators': [100, 200], 'learning_rate': [0.1, 0.2, 0.5], 'max_depth': [3, 5, 7], 'random_state': [random_state]}),
    # NOTE(review): "SAMME.R" is deprecated/removed in recent scikit-learn
    # releases; confirm against the installed version before re-running.
    (AdaBoostClassifier(), {'n_estimators': [50, 100, 200], 'algorithm': ["SAMME", "SAMME.R"], 'learning_rate': [0.5, 1], 'random_state': [random_state]}),
    (GaussianNB(), {}),  # No GaussianNB hyperparameters are tuned here
    (RandomForestClassifier(), {'n_estimators': [100, 200], 'max_depth': [2, 5, 10, None], 'random_state': [random_state]}),
    (LogisticRegression(), {'C': [0.5, 1, 5], 'max_iter': [1000] , 'random_state': [random_state]}),
    # `degree` only affects the 'poly' kernel, so it is searched for that kernel
    # alone; crossing it with 'linear'/'rbf' would refit identical models 3x.
    (SVC(), [
        {'C': [0.5, 1, 5], 'kernel': ['linear', 'rbf'], 'random_state': [random_state]},
        {'C': [0.5, 1, 5], 'kernel': ['poly'], 'degree': [1, 2, 3], 'random_state': [random_state]},
    ]),
    (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
    # The `learning_rate` schedule is only used by solver='sgd', which is not
    # searched here, so it is left out instead of tripling the number of fits.
    (MLPClassifier(), {'solver': ['lbfgs', 'adam'], 'hidden_layer_sizes': [(5, 2), (10, 5), (20, 10)], 'max_iter': [10000], 'alpha': [1e-2, 1e-3, 1e-5], 'random_state': [random_state]})
]
results = []

def run_grid_search(model, params):
    """
    Runs a grid search for the given model and parameters.

    The search uses 5-fold cross-validation scored by accuracy on the
    notebook-level ``X_train`` / ``y_train``.

    Args:
        model: Unfitted scikit-learn estimator to tune.
        params: Parameter grid for ``model`` (a dict, or a list of dicts),
            keyed by the estimator's own parameter names.

    Returns:
        Tuple ``(best_params, best_score)``: the winning parameter combination
        (keyed by the estimator's own parameter names) and its mean
        cross-validated accuracy.
    """
    estimator, grid, prefix = model, params, ''

    if isinstance(model, GaussianNB):
        # GaussianNB is given dense input, so densify inside a pipeline; that
        # way the conversion is applied to every cross-validation fold.
        estimator = make_pipeline(FunctionTransformer(lambda x: x.toarray(), accept_sparse=True), model)
        # Pipeline parameters are addressed as "<step name>__<param>"; without
        # this prefix any non-empty grid would be rejected as invalid.
        prefix = f"{estimator.steps[-1][0]}__"

        def _prefixed(single_grid):
            return {prefix + name: values for name, values in single_grid.items()}

        grid = _prefixed(params) if isinstance(params, dict) else [_prefixed(g) for g in params]

    grid_search = GridSearchCV(estimator, grid, scoring='accuracy', cv=5)
    grid_search.fit(X_train, y_train)

    # Strip the pipeline prefix again so callers always get plain parameter names.
    best_params = {name[len(prefix):]: value for name, value in grid_search.best_params_.items()}
    return best_params, grid_search.best_score_

# Run grid search for each model.
# enumerate() supplies the progress counter, so no variable named `iter` is
# needed (that name would shadow the built-in iter()).
for position, (model, params) in enumerate(models_params, start=1):
    # "\r" with end="" rewrites the same output line on every iteration.
    print(f"Running grid search on model: {position}/{len(models_params)}\r", end="")
    best_params, best_score = run_grid_search(model, params)
    # Keep the estimator prototype alongside its name, best params and CV score.
    results.append((model.__class__.__name__, best_params, best_score, model))

# Show results: one summary line per model, then the overall winner.
for name, found_params, cv_accuracy, _estimator in results:
    summary = f"{name}: Best params = {found_params} (Accuracy = {cv_accuracy:.4f})"
    print(summary)

# Each entry is (name, best params, CV accuracy, estimator); rank by accuracy.
best_model = max(results, key=lambda entry: entry[2])
winner_name, winner_params, winner_accuracy = best_model[:3]
print(f"BEST MODEL: {winner_name} with params = {winner_params} (Accuracy = {winner_accuracy:.4f})")

Running grid search on model: 2/8



GradientBoostingClassifier: Best params = {'learning_rate': 0.5, 'max_depth': 7, 'n_estimators': 200, 'random_state': 42} (Accuracy = 0.7453)
AdaBoostClassifier: Best params = {'algorithm': 'SAMME.R', 'learning_rate': 0.5, 'n_estimators': 200, 'random_state': 42} (Accuracy = 0.7373)
GaussianNB: Best params = {} (Accuracy = 0.6860)
RandomForestClassifier: Best params = {'max_depth': 10, 'n_estimators': 200, 'random_state': 42} (Accuracy = 0.7633)
LogisticRegression: Best params = {'C': 0.5, 'max_iter': 1000, 'random_state': 42} (Accuracy = 0.7447)
SVC: Best params = {'C': 1, 'degree': 1, 'kernel': 'poly', 'random_state': 42} (Accuracy = 0.7660)
KNeighborsClassifier: Best params = {'n_neighbors': 3} (Accuracy = 0.5853)
MLPClassifier: Best params = {'alpha': 1e-05, 'hidden_layer_sizes': (10, 5), 'learning_rate': 'constant', 'max_iter': 10000, 'random_state': 42, 'solver': 'adam'} (Accuracy = 0.7520)
BEST MODEL: SVC with params = {'C': 1, 'degree': 1, 'kernel': 'poly', 'random_state': 42} 

In [39]:
# Run the best model on the test set
from sklearn.base import clone

# `best_model` is the (name, best params, CV accuracy, estimator) tuple chosen
# by the grid-search cell. Rebuild the winner from it instead of retyping its
# hyperparameters, so this cell stays correct if the search outcome changes.
winner_name, winner_params, winner_cv_accuracy, winner_prototype = best_model
clf = clone(winner_prototype).set_params(**winner_params)

# GaussianNB is the one candidate that needs dense input (the grid search
# densifies for it as well); every other model takes the sparse matrices as-is.
needs_dense = winner_name == 'GaussianNB'
X_fit = X_train.toarray() if needs_dense else X_train
X_eval = X_test.toarray() if needs_dense else X_test

clf.fit(X_fit, y_train)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
accuracy_score(y_test, clf.predict(X_eval))

0.746