# Experimenting with different numbers of sentiment classes

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive so the dataset CSVs are reachable from the Colab runtime.
# If not using Colab, skip the mount and change the two file paths below accordingly.
from google.colab import drive
drive.mount('/content/drive')

# index_col=0 makes the first CSV column the row index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('/content/drive/MyDrive/Datasets/ML_train.csv',
                     '/content/drive/MyDrive/Datasets/ML_test.csv')
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Training inputs are the Text column; the target is the Sentiment column.
X_train, y_train = train['Text'], train['Sentiment']

In [None]:
# Same column selection for the held-out test frame.
X_test, y_test = test['Text'], test['Sentiment']

In [None]:
# Learn the bag-of-words vocabulary from the training text only, then encode
# both splits with that same fitted vocabulary.
bow_vectorizer = CountVectorizer().fit(X_train)

bow_X_train, bow_X_test = (
    bow_vectorizer.transform(texts) for texts in (X_train, X_test)
)

In [None]:
# TF-IDF settings:
#   ngram_range=(1, 3) -> consider unigrams, bigrams and trigrams
#   min_df=2           -> an n-gram must appear in at least two documents (drops very rare n-grams)
#   max_df=0.85        -> an n-gram appearing in more than 85% of documents is ignored (too common)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=2, max_df=0.85).fit(X_train)

# Encode both splits with the vocabulary/IDF weights learned from the training text.
tfidf_X_train, tfidf_X_test = (
    tfidf_vectorizer.transform(texts) for texts in (X_train, X_test)
)

# Function to help us test the models

In [None]:
def train_and_eval(model, trainX, trainY, testX, testY):
    """Fit ``model`` on the training split and report train/test performance.

    Prints the model repr, the train and test accuracy, and a per-class
    classification report for the test split, followed by a separator line.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        The estimator to train; ``model.fit`` is called on it directly.
    trainX, trainY
        Training features and labels.
    testX, testY
        Held-out features and labels, used only for evaluation.

    Returns
    -------
    dict
        ``{'model': <fitted model>, 'train_accuracy': ..., 'test_accuracy': ...}``
        so that sweeps can be compared programmatically instead of only by
        reading the printed output. Callers that ignore the return value see
        exactly the same printed report as before.
    """

    # training the model
    fitted_model = model.fit(trainX, trainY)

    # getting predictions
    y_preds_train = fitted_model.predict(trainX)
    y_preds_test = fitted_model.predict(testX)

    # computing each score once so the printed and returned values always agree
    train_accuracy = accuracy_score(trainY, y_preds_train)
    test_accuracy = accuracy_score(testY, y_preds_test)

    # evaluating the model
    print()
    print(model)
    print(f"Train accuracy score : {train_accuracy}")
    print(f"Test accuracy score : {test_accuracy}")
    print(classification_report(testY, y_preds_test))
    print('\n',40*'-')

    return {
        'model': fitted_model,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
    }

# Random Forest Classifier

Fitting random forest with bag of words

In [None]:
# Sweep the number of trees on the bag-of-words features; the seed is fixed at 42.
trees = [50, 100, 150, 200]

for n_trees in trees:
  clf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
  train_and_eval(model=clf,
                 trainX=bow_X_train, trainY=y_train,
                 testX=bow_X_test, testY=y_test)


RandomForestClassifier(n_estimators=50, random_state=42)
Train accuracy score : 0.999938256359595
Test accuracy score : 0.27445652173913043
              precision    recall  f1-score   support

           1       0.27      0.83      0.40      2024
           2       0.37      0.07      0.12      2024
           3       0.34      0.09      0.15      2024
           4       0.33      0.12      0.18      2024
           5       0.36      0.12      0.18      2024
           6       0.28      0.13      0.18      2024
           7       0.35      0.07      0.11      2024
           8       0.25      0.76      0.38      2024

    accuracy                           0.27     16192
   macro avg       0.32      0.27      0.21     16192
weighted avg       0.32      0.27      0.21     16192


 ----------------------------------------

RandomForestClassifier(random_state=42)
Train accuracy score : 0.999938256359595
Test accuracy score : 0.2799530632411067
              precision    recall  f1-scor

Fitting random forest with TF-IDF

In [None]:
# Same tree-count sweep, this time on the TF-IDF features; the seed is fixed at 42.
trees = [50, 100, 150, 200]

for n_trees in trees:
  clf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
  train_and_eval(model=clf,
                 trainX=tfidf_X_train, trainY=y_train,
                 testX=tfidf_X_test, testY=y_test)


RandomForestClassifier(n_estimators=50, random_state=42)
Train accuracy score : 0.999938256359595
Test accuracy score : 0.2813735177865613
              precision    recall  f1-score   support

           1       0.27      0.85      0.41      2024
           2       0.31      0.05      0.09      2024
           3       0.34      0.08      0.13      2024
           4       0.34      0.15      0.21      2024
           5       0.39      0.14      0.21      2024
           6       0.30      0.15      0.20      2024
           7       0.40      0.07      0.13      2024
           8       0.26      0.76      0.38      2024

    accuracy                           0.28     16192
   macro avg       0.33      0.28      0.22     16192
weighted avg       0.33      0.28      0.22     16192


 ----------------------------------------

RandomForestClassifier(random_state=42)
Train accuracy score : 0.999938256359595
Test accuracy score : 0.28649950592885376
              precision    recall  f1-scor

The Random Forest Classifier appears to be overfitting: it reaches 99.9% accuracy on the training data, but only 28.6% accuracy on the test data.

Due to the high similarity between the different classes, it is too hard for the model to distinguish between one class and another that is one higher or one lower.




# Multinomial Naive Bayes

Fitting Naive Bayes with Bag of Words

In [None]:
# Smoothing strengths to sweep for Multinomial Naive Bayes on bag-of-words features.
# alpha=0 is not a portable "no smoothing" value: scikit-learn < 1.4 warns and
# silently substitutes 1e-10, while >= 1.4 (force_alpha=True by default) uses 0
# as-is, which can cause numerical errors. Spelling out the 1e-10 floor makes
# every version train the same model.
alphas = [1e-10, 0.2, 0.6, 0.8, 1]

for a in alphas:
    # Define model
    nb_model = MultinomialNB(alpha=a)

    # Train and evaluate model
    train_and_eval(model = nb_model,
                   trainX = bow_X_train,
                   trainY = y_train,
                   testX = bow_X_test,
                   testY = y_test)




MultinomialNB(alpha=0)
Train accuracy score : 0.9337336379352926
Test accuracy score : 0.2741477272727273
              precision    recall  f1-score   support

           1       0.31      0.57      0.40      2024
           2       0.26      0.13      0.18      2024
           3       0.25      0.17      0.20      2024
           4       0.28      0.24      0.26      2024
           5       0.24      0.19      0.21      2024
           6       0.23      0.24      0.24      2024
           7       0.27      0.14      0.18      2024
           8       0.29      0.51      0.37      2024

    accuracy                           0.27     16192
   macro avg       0.27      0.27      0.25     16192
weighted avg       0.27      0.27      0.25     16192


 ----------------------------------------

MultinomialNB(alpha=0.2)
Train accuracy score : 0.8694430723635466
Test accuracy score : 0.32806324110671936
              precision    recall  f1-score   support

           1       0.38      0.70 

Fitting Naive Bayes with TF-IDF

In [None]:
# Smoothing strengths to sweep for Multinomial Naive Bayes on TF-IDF features.
# alpha=0 is not a portable "no smoothing" value: scikit-learn < 1.4 warns and
# silently substitutes 1e-10, while >= 1.4 (force_alpha=True by default) uses 0
# as-is, which can cause numerical errors. Spelling out the 1e-10 floor makes
# every version train the same model.
alphas = [1e-10, 0.2, 0.6, 0.8, 1]

for a in alphas:
    # Define model
    nb_model = MultinomialNB(alpha=a)

    # Train and evaluate model
    train_and_eval(model = nb_model,
                   trainX = tfidf_X_train,
                   trainY = y_train,
                   testX = tfidf_X_test,
                   testY = y_test)




MultinomialNB(alpha=0)
Train accuracy score : 0.9632779698691035
Test accuracy score : 0.2625370553359684
              precision    recall  f1-score   support

           1       0.31      0.52      0.39      2024
           2       0.24      0.13      0.17      2024
           3       0.24      0.17      0.20      2024
           4       0.28      0.25      0.26      2024
           5       0.23      0.19      0.21      2024
           6       0.22      0.23      0.22      2024
           7       0.27      0.15      0.19      2024
           8       0.27      0.46      0.34      2024

    accuracy                           0.26     16192
   macro avg       0.26      0.26      0.25     16192
weighted avg       0.26      0.26      0.25     16192


 ----------------------------------------

MultinomialNB(alpha=0.2)
Train accuracy score : 0.9011638676216349
Test accuracy score : 0.33547430830039526
              precision    recall  f1-score   support

           1       0.43      0.63 

In [None]:
# Work on an independent copy so the binary-label column never touches `train`.
trial_train = train.copy(deep=True)


In [None]:
def good_bad(row):
  """Map a sentiment score to a binary label: 1 if it is above 4, otherwise 0."""
  return 1 if row > 4 else 0

In [None]:
# Add the binary Good target derived from the Sentiment score.
trial_train = trial_train.assign(Good=trial_train['Sentiment'].apply(good_bad))

In [None]:
# Binary task: Text is the input, the derived Good column is the target.
trial_X_train, trial_y_train = trial_train['Text'], trial_train['Good']

In [None]:
# Independent copy of the test frame for the binary-label experiment.
trial_test = test.copy(deep=True)

In [None]:
# Add the binary Good target to the test copy, using the same rule as for training.
trial_test = trial_test.assign(Good=trial_test['Sentiment'].apply(good_bad))

In [None]:
# Held-out inputs and binary targets.
trial_X_test, trial_y_test = trial_test['Text'], trial_test['Good']

# Logistic Regression with 2 classes

BoW

In [None]:
# Rebinds bow_vectorizer: vocabulary is learned from the binary-task training text,
# then both splits are encoded with it.
bow_vectorizer = CountVectorizer().fit(trial_X_train)

trial_bow_X_train, trial_bow_X_test = (
    bow_vectorizer.transform(texts) for texts in (trial_X_train, trial_X_test)
)

TF-IDF

In [None]:
# Rebinds tfidf_vectorizer: 1- to 3-grams, kept only if they occur in at least
# two documents and in no more than 85% of documents.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=2, max_df=0.85).fit(trial_X_train)

trial_tfidf_X_train, trial_tfidf_X_test = (
    tfidf_vectorizer.transform(texts) for texts in (trial_X_train, trial_X_test)
)

Logistic Regression with BOW

In [None]:
# Inverse regularisation strengths to sweep (smaller C = stronger L2 penalty).
C = [0.001, 0.01, 0.1, 1, 10]

for inv_reg in C:
    # Build the classifier for this setting
    log_model = LogisticRegression(penalty='l2', C=inv_reg, max_iter=500)

    # Fit on bag-of-words features and print the metrics
    train_and_eval(model=log_model,
                   trainX=trial_bow_X_train, trainY=trial_y_train,
                   testX=trial_bow_X_test, testY=trial_y_test)


LogisticRegression(C=0.001, max_iter=500)
Train accuracy score : 0.8763120523586071
Test accuracy score : 0.8617835968379447
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      8096
           1       0.85      0.87      0.86      8096

    accuracy                           0.86     16192
   macro avg       0.86      0.86      0.86     16192
weighted avg       0.86      0.86      0.86     16192


 ----------------------------------------

LogisticRegression(C=0.01, max_iter=500)
Train accuracy score : 0.9313410718695975
Test accuracy score : 0.8857460474308301
              precision    recall  f1-score   support

           0       0.89      0.88      0.89      8096
           1       0.88      0.89      0.89      8096

    accuracy                           0.89     16192
   macro avg       0.89      0.89      0.89     16192
weighted avg       0.89      0.89      0.89     16192


 ----------------------------------------

Logistic

Logistic Regression with TF-IDF

In [None]:
# Inverse regularisation strengths to sweep (smaller C = stronger L2 penalty).
C = [0.001, 0.01, 0.1, 1, 10]

for inv_reg in C:
    # Build the classifier for this setting
    log_model = LogisticRegression(penalty='l2', C=inv_reg, max_iter=500)

    # Fit on TF-IDF features and print the metrics
    train_and_eval(model=log_model,
                   trainX=trial_tfidf_X_train, trainY=trial_y_train,
                   testX=trial_tfidf_X_test, testY=trial_y_test)


LogisticRegression(C=0.001, max_iter=500)
Train accuracy score : 0.7909360335885404
Test accuracy score : 0.786808300395257
              precision    recall  f1-score   support

           0       0.82      0.74      0.78      8096
           1       0.76      0.84      0.80      8096

    accuracy                           0.79     16192
   macro avg       0.79      0.79      0.79     16192
weighted avg       0.79      0.79      0.79     16192


 ----------------------------------------

LogisticRegression(C=0.01, max_iter=500)
Train accuracy score : 0.829510372931588
Test accuracy score : 0.8221961462450593
              precision    recall  f1-score   support

           0       0.85      0.78      0.81      8096
           1       0.80      0.86      0.83      8096

    accuracy                           0.82     16192
   macro avg       0.82      0.82      0.82     16192
weighted avg       0.82      0.82      0.82     16192


 ----------------------------------------

LogisticRe

It might be better to classify each text simply as good or bad, rather than predicting 8 different classes.

For binary classification, best models so far would be:

```
LogisticRegression(C=1, max_iter=500)
LogisticRegression(C=10, max_iter=500)
```



# Multinomial Naive Bayes for 2 classes

Multinomial Naive Bayes with BOW

In [None]:
# Smoothing strengths to sweep for binary Multinomial Naive Bayes on bag-of-words features.
# alpha=0 is not a portable "no smoothing" value: scikit-learn < 1.4 warns and
# silently substitutes 1e-10, while >= 1.4 (force_alpha=True by default) uses 0
# as-is, which can cause numerical errors. Spelling out the 1e-10 floor makes
# every version train the same model.
alphas = [1e-10, 0.2, 0.6, 0.8, 1]

for a in alphas:
    # Define model
    nb_model = MultinomialNB(alpha=a)

    # Train and evaluate model
    train_and_eval(model = nb_model,
                   trainX = trial_bow_X_train,
                   trainY = trial_y_train,
                   testX = trial_bow_X_test,
                   testY = trial_y_test)




MultinomialNB(alpha=0)
Train accuracy score : 0.9722771054581378
Test accuracy score : 0.7861907114624506
              precision    recall  f1-score   support

           0       0.79      0.78      0.79      8096
           1       0.78      0.79      0.79      8096

    accuracy                           0.79     16192
   macro avg       0.79      0.79      0.79     16192
weighted avg       0.79      0.79      0.79     16192


 ----------------------------------------

MultinomialNB(alpha=0.2)
Train accuracy score : 0.944399851815263
Test accuracy score : 0.8541254940711462
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      8096
           1       0.87      0.84      0.85      8096

    accuracy                           0.85     16192
   macro avg       0.85      0.85      0.85     16192
weighted avg       0.85      0.85      0.85     16192


 ----------------------------------------

MultinomialNB(alpha=0.6)
Train accuracy scor

Multinomial Naive Bayes with TF-IDF

In [None]:
# Smoothing strengths to sweep for binary Multinomial Naive Bayes on TF-IDF features.
# alpha=0 is not a portable "no smoothing" value: scikit-learn < 1.4 warns and
# silently substitutes 1e-10, while >= 1.4 (force_alpha=True by default) uses 0
# as-is, which can cause numerical errors. Spelling out the 1e-10 floor makes
# every version train the same model.
alphas = [1e-10, 0.2, 0.6, 0.8, 1]

for a in alphas:
    # Define model
    nb_model = MultinomialNB(alpha=a)

    # Train and evaluate model
    train_and_eval(model = nb_model,
                   trainX = trial_tfidf_X_train,
                   trainY = trial_y_train,
                   testX = trial_tfidf_X_test,
                   testY = trial_y_test)




MultinomialNB(alpha=0)
Train accuracy score : 0.9817856260805137
Test accuracy score : 0.7742712450592886
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      8096
           1       0.77      0.79      0.78      8096

    accuracy                           0.77     16192
   macro avg       0.77      0.77      0.77     16192
weighted avg       0.77      0.77      0.77     16192


 ----------------------------------------

MultinomialNB(alpha=0.2)
Train accuracy score : 0.9556680661891825
Test accuracy score : 0.8571516798418972
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      8096
           1       0.86      0.85      0.86      8096

    accuracy                           0.86     16192
   macro avg       0.86      0.86      0.86     16192
weighted avg       0.86      0.86      0.86     16192


 ----------------------------------------

MultinomialNB(alpha=0.6)
Train accuracy sco

# Random Forest for 2 classes

Random Forest with BOW

In [None]:
# Tree-count sweep for the binary task on bag-of-words features; the seed is fixed at 42.
trees = [50, 100, 150, 200]

for n_trees in trees:
  clf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
  train_and_eval(model=clf,
                 trainX=trial_bow_X_train, trainY=trial_y_train,
                 testX=trial_bow_X_test, testY=trial_y_test)


RandomForestClassifier(n_estimators=50, random_state=42)
Train accuracy score : 1.0
Test accuracy score : 0.8350419960474308
              precision    recall  f1-score   support

           0       0.83      0.84      0.84      8096
           1       0.84      0.83      0.83      8096

    accuracy                           0.84     16192
   macro avg       0.84      0.84      0.84     16192
weighted avg       0.84      0.84      0.84     16192


 ----------------------------------------

RandomForestClassifier(random_state=42)
Train accuracy score : 1.0
Test accuracy score : 0.848443675889328
              precision    recall  f1-score   support

           0       0.85      0.84      0.85      8096
           1       0.85      0.85      0.85      8096

    accuracy                           0.85     16192
   macro avg       0.85      0.85      0.85     16192
weighted avg       0.85      0.85      0.85     16192


 ----------------------------------------

RandomForestClassifier(n_

Random Forest with TF-IDF

In [None]:
# Tree-count sweep for the binary task on TF-IDF features; the seed is fixed at 42.
trees = [50, 100, 150, 200]

for n_trees in trees:
  clf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
  train_and_eval(model=clf,
                 trainX=trial_tfidf_X_train, trainY=trial_y_train,
                 testX=trial_tfidf_X_test, testY=trial_y_test)


RandomForestClassifier(n_estimators=50, random_state=42)
Train accuracy score : 1.0
Test accuracy score : 0.8277544466403162
              precision    recall  f1-score   support

           0       0.82      0.84      0.83      8096
           1       0.83      0.82      0.83      8096

    accuracy                           0.83     16192
   macro avg       0.83      0.83      0.83     16192
weighted avg       0.83      0.83      0.83     16192


 ----------------------------------------

RandomForestClassifier(random_state=42)
Train accuracy score : 1.0
Test accuracy score : 0.850481719367589
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      8096
           1       0.85      0.85      0.85      8096

    accuracy                           0.85     16192
   macro avg       0.85      0.85      0.85     16192
weighted avg       0.85      0.85      0.85     16192


 ----------------------------------------

RandomForestClassifier(n_

# Splitting sentiment into 3 classes (good, average, poor)

In [None]:
# Independent copy of the training frame for the 3-class experiment; preview the first rows.
class_3_train = train.copy(deep=True)
class_3_train.head(5)

Unnamed: 0,Text,Sentiment
0,movie is funny suitable age is definitely fami...,6
1,old commercial blank audio cassette tag line w...,7
2,cinemascope color cinematography leon shamroy ...,4
3,get film possible will find really good perfor...,6
4,soundtrack is bit dated story is relevant yous...,6


In [None]:
# Independent copy of the test frame for the 3-class experiment; preview the first rows.
class_3_test = test.copy(deep=True)
class_3_test.head(5)

Unnamed: 0,Text,Sentiment
0,would highly recommend seeing movie viewing wi...,1
1,see everyone love film much true doe have good...,1
2,damn thought would seen bad western cannot top...,1
3,well certainly stunned believe someone made an...,1
4,jefferey dahmer wa one sick guy is much say al...,1


In [None]:
def good_avg_bad(row):
  """Bucket a sentiment score into 3 classes: 0 for <= 3, 1 for 4-5, 2 for anything higher."""
  # Guard clauses, checked from the lowest band upward.
  if row <= 3:
    return 0
  if row <= 5:
    return 1
  return 2

In [None]:
# Derive the 3-way Split target from the Sentiment score and preview the result.
class_3_train = class_3_train.assign(Split=class_3_train['Sentiment'].apply(good_avg_bad))
class_3_train.head(5)

Unnamed: 0,Text,Sentiment,Split
0,movie is funny suitable age is definitely fami...,6,2
1,old commercial blank audio cassette tag line w...,7,2
2,cinemascope color cinematography leon shamroy ...,4,1
3,get film possible will find really good perfor...,6,2
4,soundtrack is bit dated story is relevant yous...,6,2


In [None]:
# 3-class task: Text is the input, Split is the target.
class_3_X_train, class_3_y_train = class_3_train['Text'], class_3_train['Split']

In [None]:
# Apply the same 3-way bucketing to the test copy and preview the result.
class_3_test = class_3_test.assign(Split=class_3_test['Sentiment'].apply(good_avg_bad))
class_3_test.head(5)

Unnamed: 0,Text,Sentiment,Split
0,would highly recommend seeing movie viewing wi...,1,0
1,see everyone love film much true doe have good...,1,0
2,damn thought would seen bad western cannot top...,1,0
3,well certainly stunned believe someone made an...,1,0
4,jefferey dahmer wa one sick guy is much say al...,1,0


In [None]:
# Held-out inputs and 3-class targets.
class_3_X_test, class_3_y_test = class_3_test['Text'], class_3_test['Split']

# Vectorization

Bag of Words vectorizer

In [None]:
# Rebinds bow_vectorizer: vocabulary is learned from the 3-class training text,
# then both splits are encoded with it.
bow_vectorizer = CountVectorizer().fit(class_3_X_train)

class_3_bow_X_train, class_3_bow_X_test = (
    bow_vectorizer.transform(texts) for texts in (class_3_X_train, class_3_X_test)
)

TF-IDF

In [None]:
# Rebinds tfidf_vectorizer: 1- to 3-grams, kept only if they occur in at least
# two documents and in no more than 85% of documents.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=2, max_df=0.85).fit(class_3_X_train)

class_3_tfidf_X_train, class_3_tfidf_X_test = (
    tfidf_vectorizer.transform(texts) for texts in (class_3_X_train, class_3_X_test)
)

# Multinomial Naive Bayes for 3 classes

Multinomial Naive Bayes with BOW

In [None]:
# Smoothing strengths to sweep for 3-class Multinomial Naive Bayes on bag-of-words features.
# alpha=0 is not a portable "no smoothing" value: scikit-learn < 1.4 warns and
# silently substitutes 1e-10, while >= 1.4 (force_alpha=True by default) uses 0
# as-is, which can cause numerical errors. Spelling out the 1e-10 floor makes
# every version train the same model.
alphas = [1e-10, 0.2, 0.6, 0.8, 1]

for a in alphas:
    # Define model
    nb_model = MultinomialNB(alpha=a)

    # Train and evaluate model
    train_and_eval(model = nb_model,
                   trainX = class_3_bow_X_train,
                   trainY = class_3_y_train,
                   testX = class_3_bow_X_test,
                   testY = class_3_y_test)




MultinomialNB(alpha=0)
Train accuracy score : 0.9507903185971844
Test accuracy score : 0.6129570158102767
              precision    recall  f1-score   support

           0       0.66      0.73      0.69      6072
           1       0.41      0.28      0.33      4048
           2       0.65      0.72      0.68      6072

    accuracy                           0.61     16192
   macro avg       0.57      0.58      0.57     16192
weighted avg       0.59      0.61      0.60     16192


 ----------------------------------------

MultinomialNB(alpha=0.2)
Train accuracy score : 0.9008397135095085
Test accuracy score : 0.673233695652174
              precision    recall  f1-score   support

           0       0.71      0.81      0.75      6072
           1       0.46      0.35      0.40      4048
           2       0.74      0.75      0.75      6072

    accuracy                           0.67     16192
   macro avg       0.64      0.64      0.63     16192
weighted avg       0.66      0.67  

Multinomial Naive Bayes with TF-IDF

In [None]:
# Smoothing strengths to sweep for 3-class Multinomial Naive Bayes on TF-IDF features.
# alpha=0 is not a portable "no smoothing" value: scikit-learn < 1.4 warns and
# silently substitutes 1e-10, while >= 1.4 (force_alpha=True by default) uses 0
# as-is, which can cause numerical errors. Spelling out the 1e-10 floor makes
# every version train the same model.
alphas = [1e-10, 0.2, 0.6, 0.8, 1]

for a in alphas:
    # Define model
    nb_model = MultinomialNB(alpha=a)

    # Train and evaluate model
    train_and_eval(model = nb_model,
                   trainX = class_3_tfidf_X_train,
                   trainY = class_3_y_train,
                   testX = class_3_tfidf_X_test,
                   testY = class_3_y_test)


MultinomialNB(alpha=0)




Train accuracy score : 0.9626605334650531
Test accuracy score : 0.5965909090909091
              precision    recall  f1-score   support

           0       0.65      0.70      0.67      6072
           1       0.39      0.23      0.29      4048
           2       0.62      0.74      0.67      6072

    accuracy                           0.60     16192
   macro avg       0.55      0.56      0.54     16192
weighted avg       0.57      0.60      0.58     16192


 ----------------------------------------

MultinomialNB(alpha=0.2)
Train accuracy score : 0.8865306248456409
Test accuracy score : 0.6756422924901185
              precision    recall  f1-score   support

           0       0.69      0.86      0.76      6072
           1       0.50      0.15      0.23      4048
           2       0.69      0.85      0.76      6072

    accuracy                           0.68     16192
   macro avg       0.63      0.62      0.58     16192
weighted avg       0.64      0.68      0.63     16192


 -

# Random Forest for 3 classes

Random Forest with BOW

In [None]:
# Tree-count sweep for the 3-class task on bag-of-words features; the seed is fixed at 42.
trees = [50, 100, 150, 200]

for n_trees in trees:
  clf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
  train_and_eval(model=clf,
                 trainX=class_3_bow_X_train, trainY=class_3_y_train,
                 testX=class_3_bow_X_test, testY=class_3_y_test)


RandomForestClassifier(n_estimators=50, random_state=42)
Train accuracy score : 0.9999536922696962
Test accuracy score : 0.6487771739130435
              precision    recall  f1-score   support

           0       0.65      0.84      0.74      6072
           1       0.63      0.04      0.08      4048
           2       0.64      0.86      0.74      6072

    accuracy                           0.65     16192
   macro avg       0.64      0.58      0.52     16192
weighted avg       0.64      0.65      0.57     16192


 ----------------------------------------

RandomForestClassifier(random_state=42)
Train accuracy score : 0.9999536922696962
Test accuracy score : 0.6641551383399209
              precision    recall  f1-score   support

           0       0.68      0.86      0.76      6072
           1       0.80      0.03      0.06      4048
           2       0.65      0.89      0.75      6072

    accuracy                           0.66     16192
   macro avg       0.71      0.59      

Random Forest with TF-IDF

In [None]:
# Tree-count sweep for the 3-class task on TF-IDF features; the seed is fixed at 42.
trees = [50, 100, 150, 200]

for n_trees in trees:
  clf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
  train_and_eval(model=clf,
                 trainX=class_3_tfidf_X_train, trainY=class_3_y_train,
                 testX=class_3_tfidf_X_test, testY=class_3_y_test)


RandomForestClassifier(n_estimators=50, random_state=42)
Train accuracy score : 0.9999536922696962
Test accuracy score : 0.6468626482213439
              precision    recall  f1-score   support

           0       0.66      0.84      0.74      6072
           1       0.66      0.03      0.06      4048
           2       0.64      0.86      0.73      6072

    accuracy                           0.65     16192
   macro avg       0.65      0.58      0.51     16192
weighted avg       0.65      0.65      0.57     16192


 ----------------------------------------

RandomForestClassifier(random_state=42)
Train accuracy score : 0.9999536922696962
Test accuracy score : 0.6601408102766798
              precision    recall  f1-score   support

           0       0.67      0.85      0.75      6072
           1       0.81      0.02      0.04      4048
           2       0.65      0.89      0.75      6072

    accuracy                           0.66     16192
   macro avg       0.71      0.59      

# Splitting sentiment into 4 classes

In [None]:
# Independent copy of the training frame for the 4-class experiment; preview the first rows.
class_4_train = train.copy(deep=True)
class_4_train.head(5)

Unnamed: 0,Text,Sentiment
0,movie is funny suitable age is definitely fami...,6
1,old commercial blank audio cassette tag line w...,7
2,cinemascope color cinematography leon shamroy ...,4
3,get film possible will find really good perfor...,6
4,soundtrack is bit dated story is relevant yous...,6


In [None]:
# Independent copy of the test frame for the 4-class experiment; preview the first rows.
class_4_test = test.copy(deep=True)
class_4_test.head(5)

Unnamed: 0,Text,Sentiment
0,would highly recommend seeing movie viewing wi...,1
1,see everyone love film much true doe have good...,1
2,damn thought would seen bad western cannot top...,1
3,well certainly stunned believe someone made an...,1
4,jefferey dahmer wa one sick guy is much say al...,1


In [None]:
def divide_4(row):
  """Map a single sentiment score to one of four class labels.

  Despite its name, `row` is one scalar score, not a DataFrame row.

  Returns:
    0 if row <= 2, 1 if 2 < row <= 4, 2 if 4 < row <= 6, otherwise 3.
  """
  # Inclusive upper bound of each bucket in ascending order; the position of
  # the first bound the score fits under is its class label.
  for label, upper_bound in enumerate((2, 4, 6)):
    if row <= upper_bound:
      return label
  return 3

In [None]:
# Bucket every raw sentiment score into its 4-class label
class_4_train['Split'] = class_4_train['Sentiment'].map(divide_4)
class_4_train.head(n=5)

Unnamed: 0,Text,Sentiment,Split
0,movie is funny suitable age is definitely fami...,6,2
1,old commercial blank audio cassette tag line w...,7,3
2,cinemascope color cinematography leon shamroy ...,4,1
3,get film possible will find really good perfor...,6,2
4,soundtrack is bit dated story is relevant yous...,6,2


In [None]:
# Check how many training samples fall into each of the four classes
class_4_train['Split'].value_counts(sort=True)

2    16196
3    16196
1    16196
0    16196
Name: Split, dtype: int64

In [None]:
# Build the 4-class label column in this cell: 'Split' has not been created on
# class_4_test yet at this point in the notebook, so indexing it on a fresh
# kernel (Restart & Run All) would raise a KeyError or pick up stale labels.
class_4_test['Split'] = class_4_test['Sentiment'].apply(divide_4)

# Features are the review text; targets are the 4-class labels derived above
class_4_X_test = class_4_test['Text']
class_4_y_test = class_4_test['Split']

In [None]:
# Bucket every raw sentiment score of the test set into its 4-class label
class_4_test['Split'] = class_4_test['Sentiment'].map(divide_4)
class_4_test.head(n=5)

Unnamed: 0,Text,Sentiment,Split
0,would highly recommend seeing movie viewing wi...,1,0
1,see everyone love film much true doe have good...,1,0
2,damn thought would seen bad western cannot top...,1,0
3,well certainly stunned believe someone made an...,1,0
4,jefferey dahmer wa one sick guy is much say al...,1,0


In [None]:
# Features are the review text; targets are the 4-class 'Split' labels
class_4_X_train, class_4_y_train = class_4_train['Text'], class_4_train['Split']

# Vectorization

Bag of Words vectorizer

In [None]:
# Learn the vocabulary from the training text only, then encode both splits with it
bow_vectorizer = CountVectorizer()
class_4_bow_X_train = bow_vectorizer.fit_transform(class_4_X_train)
class_4_bow_X_test = bow_vectorizer.transform(class_4_X_test)

TF-IDF

In [None]:
# Uni-, bi- and tri-grams; drop n-grams seen in fewer than 2 documents or in
# more than 85% of them.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=2, max_df=0.85)

# Fit on the training *text* Series. Passing the whole DataFrame would make the
# vectorizer iterate over its column labels instead of the review documents.
class_4_tfidf_X_train = tfidf_vectorizer.fit_transform(class_4_X_train)
class_4_tfidf_X_test = tfidf_vectorizer.transform(class_4_X_test)

# Multinomial Naive Bayes for 4 classes

Multinomial Naive Bayes with BOW

In [None]:
# Sweep the smoothing parameter of Multinomial NB on the bag-of-words features
alphas = [0, 0.2, 0.6, 0.8, 1]

# The data splits are the same for every run, so bundle them once
class_4_bow_splits = dict(
    trainX=class_4_bow_X_train,
    trainY=class_4_y_train,
    testX=class_4_bow_X_test,
    testY=class_4_y_test,
)

for a in alphas:
    nb_model = MultinomialNB(alpha=a)
    train_and_eval(model=nb_model, **class_4_bow_splits)

  _warn_prf(average, modifier, msg_start, len(result))



MultinomialNB(alpha=0)
Train accuracy score : 0.9406489256606569
Test accuracy score : 0.3878458498023715
              precision    recall  f1-score   support

           0       0.70      0.54      0.61      6072
           1       0.33      0.27      0.29      4048
           2       0.51      0.31      0.39      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.39     16192
   macro avg       0.38      0.28      0.32     16192
weighted avg       0.53      0.39      0.45     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



MultinomialNB(alpha=0.2)
Train accuracy score : 0.884338725611262
Test accuracy score : 0.4342885375494071
              precision    recall  f1-score   support

           0       0.75      0.64      0.69      6072
           1       0.39      0.31      0.35      4048
           2       0.54      0.31      0.39      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.43     16192
   macro avg       0.42      0.32      0.36     16192
weighted avg       0.58      0.43      0.49     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



MultinomialNB(alpha=0.6)
Train accuracy score : 0.8566158310693999
Test accuracy score : 0.4357707509881423
              precision    recall  f1-score   support

           0       0.75      0.63      0.69      6072
           1       0.40      0.32      0.35      4048
           2       0.53      0.31      0.40      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.44     16192
   macro avg       0.42      0.32      0.36     16192
weighted avg       0.58      0.44      0.50     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



MultinomialNB(alpha=0.8)
Train accuracy score : 0.8478945418621882
Test accuracy score : 0.43570899209486164
              precision    recall  f1-score   support

           0       0.75      0.63      0.68      6072
           1       0.39      0.33      0.36      4048
           2       0.53      0.32      0.40      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.44     16192
   macro avg       0.42      0.32      0.36     16192
weighted avg       0.58      0.44      0.50     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



MultinomialNB(alpha=1)
Train accuracy score : 0.8413188441590517
Test accuracy score : 0.43601778656126483
              precision    recall  f1-score   support

           0       0.76      0.62      0.68      6072
           1       0.39      0.33      0.36      4048
           2       0.53      0.32      0.40      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.44     16192
   macro avg       0.42      0.32      0.36     16192
weighted avg       0.58      0.44      0.50     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Multinomial Naive Bayes with TF-IDF

In [None]:
# Sweep the smoothing parameter of Multinomial NB on the TF-IDF features
alphas = [0, 0.2, 0.6, 0.8, 1]

# The data splits are the same for every run, so bundle them once
class_4_tfidf_splits = dict(
    trainX=class_4_tfidf_X_train,
    trainY=class_4_y_train,
    testX=class_4_tfidf_X_test,
    testY=class_4_y_test,
)

for a in alphas:
    nb_model = MultinomialNB(alpha=a)
    train_and_eval(model=nb_model, **class_4_tfidf_splits)


MultinomialNB(alpha=0)
Train accuracy score : 0.2505402568535441
Test accuracy score : 0.3748147233201581
              precision    recall  f1-score   support

           0       0.38      0.99      0.55      6072
           1       0.00      0.00      0.00      4048
           2       0.45      0.01      0.02      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.37     16192
   macro avg       0.21      0.25      0.14     16192
weighted avg       0.31      0.37      0.21     16192


 ----------------------------------------

MultinomialNB(alpha=0.2)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train accuracy score : 0.2505402568535441
Test accuracy score : 0.3748147233201581
              precision    recall  f1-score   support

           0       0.38      0.99      0.55      6072
           1       0.00      0.00      0.00      4048
           2       0.45      0.01      0.02      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.37     16192
   macro avg       0.21      0.25      0.14     16192
weighted avg       0.31      0.37      0.21     16192


 ----------------------------------------

MultinomialNB(alpha=0.6)
Train accuracy score : 0.2505402568535441
Test accuracy score : 0.3748147233201581
              precision    recall  f1-score   support

           0       0.38      0.99      0.55      6072
           1       0.00      0.00      0.00      4048
           2       0.45      0.01      0.02      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.37     16192
   m

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.38      0.99      0.55      6072
           1       0.00      0.00      0.00      4048
           2       0.45      0.01      0.02      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.37     16192
   macro avg       0.21      0.25      0.14     16192
weighted avg       0.31      0.37      0.21     16192


 ----------------------------------------

MultinomialNB(alpha=1)
Train accuracy score : 0.2505402568535441
Test accuracy score : 0.3748147233201581
              precision    recall  f1-score   support

           0       0.38      0.99      0.55      6072
           1       0.00      0.00      0.00      4048
           2       0.45      0.01      0.02      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.37     16192
   macro avg       0.21      0.25      0.14     16192
weighted avg       0.31      0.37  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest for 4 classes

Random Forest with BOW

In [None]:
# Sweep the forest size of a Random Forest on the bag-of-words features
trees = [50, 100, 150, 200]

# The data splits are the same for every run, so bundle them once
class_4_bow_splits = dict(
    trainX=class_4_bow_X_train,
    trainY=class_4_y_train,
    testX=class_4_bow_X_test,
    testY=class_4_y_test,
)

for t in trees:
    # Fixed seed keeps each forest reproducible across runs
    clf = RandomForestClassifier(n_estimators=t, random_state=42)
    train_and_eval(model=clf, **class_4_bow_splits)


RandomForestClassifier(n_estimators=50, random_state=42)
Train accuracy score : 0.9999691281797974
Test accuracy score : 0.36765069169960474
              precision    recall  f1-score   support

           0       0.68      0.71      0.70      6072
           1       0.38      0.15      0.22      4048
           2       0.48      0.16      0.24      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.37     16192
   macro avg       0.39      0.26      0.29     16192
weighted avg       0.53      0.37      0.41     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



RandomForestClassifier(random_state=42)
Train accuracy score : 0.9999691281797974
Test accuracy score : 0.37648221343873517
              precision    recall  f1-score   support

           0       0.71      0.75      0.73      6072
           1       0.43      0.15      0.22      4048
           2       0.49      0.15      0.23      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.38     16192
   macro avg       0.41      0.26      0.30     16192
weighted avg       0.56      0.38      0.42     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



RandomForestClassifier(n_estimators=150, random_state=42)
Train accuracy score : 0.9999691281797974
Test accuracy score : 0.37493824110671936
              precision    recall  f1-score   support

           0       0.71      0.76      0.74      6072
           1       0.44      0.14      0.21      4048
           2       0.49      0.15      0.22      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.37     16192
   macro avg       0.41      0.26      0.29     16192
weighted avg       0.56      0.37      0.41     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



RandomForestClassifier(n_estimators=200, random_state=42)
Train accuracy score : 0.9999691281797974
Test accuracy score : 0.37716156126482214
              precision    recall  f1-score   support

           0       0.72      0.77      0.74      6072
           1       0.46      0.14      0.22      4048
           2       0.49      0.14      0.22      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.38     16192
   macro avg       0.42      0.26      0.29     16192
weighted avg       0.57      0.38      0.41     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest with TF-IDF

In [None]:
# Sweep the forest size of a Random Forest on the TF-IDF features
trees = [50, 100, 150, 200]

# The data splits are the same for every run, so bundle them once
class_4_tfidf_splits = dict(
    trainX=class_4_tfidf_X_train,
    trainY=class_4_y_train,
    testX=class_4_tfidf_X_test,
    testY=class_4_y_test,
)

for t in trees:
    # Fixed seed keeps each forest reproducible across runs
    clf = RandomForestClassifier(n_estimators=t, random_state=42)
    train_and_eval(model=clf, **class_4_tfidf_splits)


RandomForestClassifier(n_estimators=50, random_state=42)
Train accuracy score : 0.2512194368979995
Test accuracy score : 0.37685276679841895
              precision    recall  f1-score   support

           0       0.55      0.01      0.02      6072
           1       0.42      0.01      0.01      4048
           2       0.38      0.99      0.54      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.38     16192
   macro avg       0.34      0.25      0.14     16192
weighted avg       0.45      0.38      0.21     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



RandomForestClassifier(random_state=42)
Train accuracy score : 0.2512194368979995
Test accuracy score : 0.37685276679841895
              precision    recall  f1-score   support

           0       0.55      0.01      0.02      6072
           1       0.42      0.01      0.01      4048
           2       0.38      0.99      0.54      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.38     16192
   macro avg       0.34      0.25      0.14     16192
weighted avg       0.45      0.38      0.21     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



RandomForestClassifier(n_estimators=150, random_state=42)
Train accuracy score : 0.2512194368979995
Test accuracy score : 0.37685276679841895
              precision    recall  f1-score   support

           0       0.55      0.01      0.02      6072
           1       0.42      0.01      0.01      4048
           2       0.38      0.99      0.54      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.38     16192
   macro avg       0.34      0.25      0.14     16192
weighted avg       0.45      0.38      0.21     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



RandomForestClassifier(n_estimators=200, random_state=42)
Train accuracy score : 0.2512194368979995
Test accuracy score : 0.37685276679841895
              precision    recall  f1-score   support

           0       0.55      0.01      0.02      6072
           1       0.42      0.01      0.01      4048
           2       0.38      0.99      0.54      6072
           3       0.00      0.00      0.00         0

    accuracy                           0.38     16192
   macro avg       0.34      0.25      0.14     16192
weighted avg       0.45      0.38      0.21     16192


 ----------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


We get the best metrics when classifying into 2 classes (good/bad).