# **SENTIMENT ANALYSIS - SUPERVISED**

In [1]:
import nltk
import numpy as np
import json
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer

In [2]:
# Load datasets
def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``.

    The encoding is passed explicitly so the result does not depend on the
    platform's default locale encoding.
    NOTE(review): assumes the data files are UTF-8 (the standard JSON
    interchange encoding) — confirm if they were written differently.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


# Pre-split features and labels, one JSON file each.
X_train = _load_json('./data/X_train.json')
X_test = _load_json('./data/X_test.json')
y_train = _load_json('./data/y_train.json')
y_test = _load_json('./data/y_test.json')

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: English stop words are dropped, accents are
# stripped to ASCII, and min_df=0.0 applies no minimum document-frequency cut.
vectorizer = CountVectorizer(
    stop_words='english',
    strip_accents='ascii',
    min_df=0.0,
)

In [5]:
import re

lemmatizer = WordNetLemmatizer()


def _lemmatize_document(document):
    """Return ``document`` with each word replaced by its WordNet lemma.

    ``WordNetLemmatizer.lemmatize`` works on a single word, so handing it a
    whole document (as the earlier code did) looks the entire text up as one
    "word" and returns it essentially unchanged. Here every run of word
    characters is lemmatized on its own; punctuation and whitespace between
    words are left in place.

    Assumes ``document`` is one text string, as the later
    ``vectorizer.fit_transform`` call requires.
    """
    # Words are lower-cased before lookup so capitalised forms can match
    # WordNet entries (presumably stored lower-case — TODO confirm). The
    # CountVectorizer above keeps its default lower-casing, so this should not
    # alter the final features.
    # NOTE(review): lemmatize() is called with its default part of speech;
    # add POS tagging if verbs/adjectives should be reduced as well.
    return re.sub(
        r"\w+",
        lambda match: lemmatizer.lemmatize(match.group(0).lower()),
        document,
    )


X_train_lem = [_lemmatize_document(document) for document in X_train]
X_test_lem = [_lemmatize_document(document) for document in X_test]

In [8]:
# Keep handles on the raw documents, then rebind X_train / X_test to their
# bag-of-words count matrices. The vocabulary is learned from the training
# documents only and reused unchanged for the test documents.
train_docs, test_docs = X_train, X_test
X_train = vectorizer.fit_transform(train_docs)
X_test = vectorizer.transform(test_docs)

In [9]:
# Vectorize the lemmatized corpus with its own vectorizer. Re-fitting the
# shared `vectorizer` here would replace the vocabulary that X_train / X_test
# were encoded with, leaving that object out of step with those matrices.
from sklearn.base import clone

lem_vectorizer = clone(vectorizer)  # unfitted copy with the same settings
X_train_lem = lem_vectorizer.fit_transform(X_train_lem)
X_test_lem = lem_vectorizer.transform(X_test_lem)

In [45]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted trees: fit on the training counts, then report the mean
# accuracy on the held-out test set.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.5,
    max_depth=3,
    random_state=0,
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.774

In [108]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
# AdaBoost with 300 estimators (discrete SAMME variant): train, predict on the
# test set, and report accuracy.
clf = AdaBoostClassifier(
    n_estimators=300,
    algorithm="SAMME",
    random_state=0,
).fit(X_train, y_train)

y_pred = clf.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric.
accuracy_score(y_test, y_pred)

0.816

In [84]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes: the count matrices are converted to dense arrays
# before being passed to the estimator.
dense_train = X_train.toarray()
clf = GaussianNB().fit(dense_train, y_train)

dense_test = X_test.toarray()
# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric.
accuracy_score(y_test, clf.predict(dense_test))

0.678

In [17]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 1500 trees and no depth limit: train and report
# test-set accuracy.
clf = RandomForestClassifier(
    n_estimators=1500,
    max_depth=None,
    random_state=0,
).fit(X_train, y_train)

y_pred = clf.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric.
accuracy_score(y_test, y_pred)

0.844

In [21]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with C=0.5: train and report test-set accuracy.
clf = LogisticRegression(C=0.5, random_state=0).fit(X_train, y_train)

y_pred = clf.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric.
accuracy_score(y_test, y_pred)

0.812

In [90]:
from sklearn.svm import SVC
# Support-vector classifier with an RBF kernel: train and report test-set
# accuracy.
clf = SVC(
    C=1,
    kernel='rbf',
    degree=3,
).fit(X_train, y_train)

y_pred = clf.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric.
accuracy_score(y_test, y_pred)

0.778

In [94]:
from sklearn.neighbors import KNeighborsClassifier
# 1-nearest-neighbour classifier: train and report test-set accuracy.
clf = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

y_pred = clf.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric.
accuracy_score(y_test, y_pred)

0.556

In [38]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (20 and 10 units), trained
# with the L-BFGS solver; report test-set accuracy.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-3,
    hidden_layer_sizes=(20, 10),
    random_state=42,
).fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric.
accuracy_score(y_test, y_pred)

0.81

In [47]:
# Cross-validation

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# List of models and their hyperparameters.
# Each entry is (estimator, param_grid); param_grid is a dict, or a list of
# dicts when some parameters only make sense together. One seed is shared by
# every estimator.
random_state = 42
models_params = [
    (
        GradientBoostingClassifier(),
        {
            'learning_rate': [0.1, 0.25, 0.5],
            'max_depth': [3, 6],
            'random_state': [random_state],
        },
    ),
    (
        AdaBoostClassifier(),
        # NOTE(review): "SAMME.R" is deprecated/removed in recent scikit-learn
        # releases — confirm against the installed version.
        {
            'n_estimators': [50, 100, 200],
            'algorithm': ["SAMME", "SAMME.R"],
            'learning_rate': [0.5, 1],
            'random_state': [random_state],
        },
    ),
    (
        RandomForestClassifier(),
        {
            'n_estimators': [100, 500, 1000, 1500],
            'max_depth': [2, 5, 10, 12, 13, 14, None],
            'random_state': [random_state],
        },
    ),
    (
        LogisticRegression(),
        {
            'max_iter': [1000],
            'random_state': [random_state],
        },
    ),
    (
        SVC(),
        # `degree` only affects the polynomial kernel, so it is varied for
        # 'poly' alone; crossing it with 'linear'/'rbf' would refit identical
        # models three times each.
        [
            {
                'C': [0.5, 1, 5],
                'kernel': ['linear', 'rbf'],
                'random_state': [random_state],
            },
            {
                'C': [0.5, 1, 5],
                'kernel': ['poly'],
                'degree': [1, 2, 3],
                'random_state': [random_state],
            },
        ],
    ),
    (
        MLPClassifier(),
        # The `learning_rate` schedule is only used by solver='sgd', which is
        # not searched here, so it is left out rather than tripling the number
        # of fits with no effect on the models.
        {
            'solver': ['lbfgs', 'adam'],
            'hidden_layer_sizes': [(5, 2), (10, 5), (20, 10)],
            'alpha': [1e-2, 1e-3, 1e-5],
            'random_state': [random_state],
        },
    ),
]
results = []

def run_grid_search(model, params, X=None, y=None, *, cv=5, scoring='accuracy', n_jobs=None):
    """
    Runs a cross-validated grid search for the given model and parameters.

    Args:
        model: Estimator instance to tune.
        params: Parameter grid handed to GridSearchCV (a dict, or a list of
            dicts).
        X: Training features. Defaults to the module-level ``X_train``.
        y: Training labels. Defaults to the module-level ``y_train``.
        cv: Cross-validation setting forwarded to GridSearchCV (default 5).
        scoring: Metric forwarded to GridSearchCV (default 'accuracy').
        n_jobs: Parallelism forwarded to GridSearchCV (default None, i.e. the
            same as before; pass -1 to use all cores).

    Returns:
        A ``(best_params_, best_score_)`` tuple taken from the fitted search.
    """
    # Fall back to the notebook's global training split so existing
    # two-argument calls behave exactly as before.
    features = X_train if X is None else X
    labels = y_train if y is None else y

    grid_search = GridSearchCV(model, params, scoring=scoring, cv=cv, n_jobs=n_jobs)
    grid_search.fit(features, labels)

    return grid_search.best_params_, grid_search.best_score_

# Run grid search for each model.
# enumerate() supplies the 1-based progress counter; the previous hand-rolled
# counter was named `iter`, which shadowed the builtin for the rest of the
# session.
for position, (model, params) in enumerate(models_params, start=1):
    # "\r" with end="" rewrites the same console line on each iteration.
    print(f"Running grid search on model: {position}/{len(models_params)}\r", end="")
    best_params, best_score = run_grid_search(model, params)
    results.append((model.__class__.__name__, best_params, best_score))

# Print one summary line per searched model.
for name, params_found, cv_accuracy in results:
    print(f"{name}: Best params = {params_found} (Accuracy = {cv_accuracy:.4f})")

# The winner is the entry with the highest cross-validated accuracy
# (third field of each result tuple).
best_model = max(results, key=lambda entry: entry[2])
top_name, top_params, top_accuracy = best_model
print(f"BEST MODEL: {top_name} with params = {top_params} (Accuracy = {top_accuracy:.4f})")

Running grid search on model: 1/6

KeyboardInterrupt: 

In [67]:
# Evaluate the chosen model on the held-out test set.
# NOTE(review): this builds a RandomForestClassifier with default
# hyperparameters (only the seed is set) rather than reading parameters from
# the grid-search results — confirm that is intended.
best_clf = RandomForestClassifier(random_state=random_state).fit(X_train, y_train)
test_predictions = best_clf.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric.
accuracy_score(y_test, test_predictions)

0.812