# Bag of Words - Models - Sentiment Analysis - Big Richard Club

#### Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import SGDClassifier

pd.options.mode.chained_assignment = None  # default='warn'

# I. Import Data

In [3]:
x_train_bow = np.loadtxt("data/lesser_x_train_bow.txt.gz")
y_train = np.loadtxt("data/y_train.txt.gz")

print(f"x_train shape : {x_train_bow.shape} \ny_train shape : {y_train.shape}")

x_train shape : (41157, 1000) 
y_train shape : (41157,)


In [4]:
x_val_bow = np.loadtxt("data/lesser_x_val_bow.txt.gz")
y_val = np.loadtxt("data/y_val.txt.gz")

print(f"x_val shape : {x_val_bow.shape} \ny_val shape : {y_val.shape}")

x_val shape : (3798, 1000) 
y_val shape : (3798,)


# II. Models 

## A) Linear Support Vector Classifier 

In [5]:
linear_svm = svm.LinearSVC(max_iter=10000).fit(x_train_bow, y_train)

In [6]:
svm_training_accuracy = round(
    linear_svm.score(x_train_bow, y_train),
    5
)

print(f"Linear SVM Training Accuracy : {svm_training_accuracy * 100} %")

Linear SVM Training Accuracy : 75.518 %


In [7]:
svm_validation_accuracy = round(
    linear_svm.score(x_val_bow, y_val),
    5
)

print(f"Linear SVM validation Accuracy : {svm_validation_accuracy * 100} %")

Linear SVM validation Accuracy : 18.668000000000003 %


## B) Stochastic Gradient Descent on Linear Models 

In [8]:
sgd = SGDClassifier(
    early_stopping=False,
    penalty="l2",
    loss="hinge",
    max_iter=10000
).fit(x_train_bow, y_train)

In [9]:
sgd_training_accuracy = round(
    sgd.score(x_train_bow, y_train),
    5
)

print(f"SGD Training Accuracy : {sgd_training_accuracy * 100} %")

SGD Training Accuracy : 75.292 %


In [10]:
sgd_validation_accuracy = round(
    sgd.score(x_val_bow, y_val),
    5
)

print(f"SGD validation Accuracy : {sgd_validation_accuracy * 100} %")

SGD validation Accuracy : 18.246000000000002 %


## C) Support Vector Classifier

In [11]:
nu_svc = svm.NuSVC(
    max_iter=1000
).fit(x_train_bow[:10000], y_train[:10000])



In [12]:
nu_svc_training_accuracy = round(
    nu_svc.score(x_train_bow[:10000], y_train[:10000]),
    5
)

print(f"Nu SVC Training Accuracy : {nu_svc_training_accuracy * 100} %")

Nu SVC Training Accuracy : 89.99000000000001 %


In [13]:
nu_svc_validation_accuracy = round(
    nu_svc.score(x_val_bow[:10000], y_val[:10000]),
    5
)

print(f"Nu SVC validation Accuracy : {nu_svc_validation_accuracy * 100} %")

Nu SVC validation Accuracy : 39.757999999999996 %


## D) Logistic Regression

In [14]:
log_reg = LogisticRegression(
    penalty="l2",
    solver="lbfgs"
).fit(x_train_bow, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
log_reg_training_accuracy = round(
    log_reg.score(x_train_bow, y_train),
    5
)

print(f"Logistic Regression Training Accuracy : {log_reg_training_accuracy * 100} %")

Logistic Regression Training Accuracy : 75.58399999999999 %


In [16]:
log_reg_validation_accuracy = round(
    log_reg.score(x_val_bow, y_val),
    5
)

print(f"Logistic Regression validation Accuracy : {log_reg_validation_accuracy * 100} %")

Logistic Regression validation Accuracy : 36.414 %


## E) Multinomial Naive Bayes 

In [14]:
multinom_nb = MultinomialNB(
    alpha=1,
    fit_prior=False
).fit(x_train_bow, y_train)

In [15]:
multinom_nb_training_accuracy = round(
    multinom_nb.score(x_train_bow, y_train),
    5
)

print(f"Multinomial Naive Bayes Training Accuracy : {multinom_nb_training_accuracy * 100} %")

Multinomial Naive Bayes Training Accuracy : 65.994 %


In [16]:
multinom_nb_validation_accuracy = round(
    multinom_nb.score(x_val_bow, y_val),
    5
)

print(f"Multinomial Naive Bayes validation Accuracy : {multinom_nb_validation_accuracy * 100} %")

Multinomial Naive Bayes validation Accuracy : 34.887 %
