In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the pre-split training and test sets from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('ML_train.csv', 'ML_test.csv'))

In [3]:
# Preview the first rows of the raw training frame.
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,Text,Sentiment
0,0,movie is funny suitable age is definitely fami...,6
1,1,old commercial blank audio cassette tag line w...,7
2,2,cinemascope color cinematography leon shamroy ...,4
3,3,get film possible will find really good perfor...,6
4,4,soundtrack is bit dated story is relevant yous...,6


In [4]:
# Drop the leftover 'Unnamed: 0' column. errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError once the column
# is already gone.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Check the remaining columns after the drop.
train.head(n=5)

Unnamed: 0,Text,Sentiment
0,movie is funny suitable age is definitely fami...,6
1,old commercial blank audio cassette tag line w...,7
2,cinemascope color cinematography leon shamroy ...,4
3,get film possible will find really good perfor...,6
4,soundtrack is bit dated story is relevant yous...,6


In [6]:
# Features are the review text; the target is the 'Sentiment' column.
X_train, y_train = train['Text'], train['Sentiment']

In [7]:
# Preview the first rows of the raw test frame.
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,Text,Sentiment
0,0,would highly recommend seeing movie viewing wi...,1
1,1,see everyone love film much true doe have good...,1
2,2,damn thought would seen bad western cannot top...,1
3,3,well certainly stunned believe someone made an...,1
4,4,jefferey dahmer wa one sick guy is much say al...,1


In [8]:
# Drop the leftover 'Unnamed: 0' column. errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError once the column
# is already gone.
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

In [9]:
# Check the remaining columns after the drop.
test.head(n=5)

Unnamed: 0,Text,Sentiment
0,would highly recommend seeing movie viewing wi...,1
1,see everyone love film much true doe have good...,1
2,damn thought would seen bad western cannot top...,1
3,well certainly stunned believe someone made an...,1
4,jefferey dahmer wa one sick guy is much say al...,1


In [10]:
# Same feature/target split as the training set, applied to the held-out data.
X_test, y_test = test['Text'], test['Sentiment']

# Vectorization

Bag of Words Vectorizer

In [34]:
# Learn the vocabulary from the training text only and build its count matrix
# in a single pass (fit_transform is equivalent to fit followed by transform,
# without processing the corpus twice).
bow_vectorizer = CountVectorizer()
bow_X_train = bow_vectorizer.fit_transform(X_train)

# The test text is only transformed, so it reuses the training vocabulary.
bow_X_test = bow_vectorizer.transform(X_test)

TF-IDF Vectorizer

In [35]:
# Learn the vocabulary and IDF weights from the training text only and build
# its TF-IDF matrix in a single pass (fit_transform is equivalent to fit
# followed by transform, without processing the corpus twice).
tfidf_vectorizer = TfidfVectorizer()
tfidf_X_train = tfidf_vectorizer.fit_transform(X_train)

# The test text is only transformed, so it reuses the training vocabulary/IDF.
tfidf_X_test = tfidf_vectorizer.transform(X_test)

# Function to help us test the models

In [17]:
def train_and_eval(model, trainX, trainY, testX, testY):
    """Fit ``model`` on the training split and report train/test performance.

    Prints the model, its accuracy on both splits and the classification
    report for the test split, then returns the scores so that callers
    running a hyperparameter sweep can compare runs programmatically
    instead of reading the printed text.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    trainX, trainY : features and labels used for fitting
    testX, testY : held-out features and labels used for evaluation

    Returns
    -------
    dict
        ``{"model": fitted estimator, "train_accuracy": float,
        "test_accuracy": float}``
    """
    # training the model
    fitted_model = model.fit(trainX, trainY)

    # getting predictions
    y_preds_train = fitted_model.predict(trainX)
    y_preds_test = fitted_model.predict(testX)

    # compute each score once so the printed and returned values agree
    train_accuracy = accuracy_score(trainY, y_preds_train)
    test_accuracy = accuracy_score(testY, y_preds_test)

    # evaluating the model
    print()
    print(model)
    print(f"Train accuracy score : {train_accuracy}")
    print(f"Test accuracy score : {test_accuracy}")
    print(classification_report(testY, y_preds_test))
    print('\n',40*'-')

    return {
        "model": fitted_model,
        "train_accuracy": train_accuracy,
        "test_accuracy": test_accuracy,
    }

# Random Forest Classifier

Fitting random forest with bag of words

In [18]:
# Sweep the number of trees for a random forest on the bag-of-words features.
trees = [50, 100, 150, 200]

for t in trees:
    clf = RandomForestClassifier(n_estimators=t, random_state=42)
    train_and_eval(clf, bow_X_train, y_train, bow_X_test, y_test)


RandomForestClassifier(n_estimators=50, random_state=42)
Train accuracy score : 0.999938256359595
Test accuracy score : 0.27445652173913043
              precision    recall  f1-score   support

           1       0.27      0.83      0.40      2024
           2       0.37      0.07      0.12      2024
           3       0.34      0.09      0.15      2024
           4       0.33      0.12      0.18      2024
           5       0.36      0.12      0.18      2024
           6       0.28      0.13      0.18      2024
           7       0.35      0.07      0.11      2024
           8       0.25      0.76      0.38      2024

    accuracy                           0.27     16192
   macro avg       0.32      0.27      0.21     16192
weighted avg       0.32      0.27      0.21     16192


 ----------------------------------------

RandomForestClassifier(random_state=42)
Train accuracy score : 0.999938256359595
Test accuracy score : 0.2799530632411067
              precision    recall  f1-scor

Fitting random forest with TF-IDF

In [19]:
# Sweep the number of trees for a random forest on the TF-IDF features.
trees = [50, 100, 150, 200]

for t in trees:
    clf = RandomForestClassifier(n_estimators=t, random_state=42)
    train_and_eval(clf, tfidf_X_train, y_train, tfidf_X_test, y_test)


RandomForestClassifier(n_estimators=50, random_state=42)
Train accuracy score : 0.999938256359595
Test accuracy score : 0.2813735177865613
              precision    recall  f1-score   support

           1       0.27      0.85      0.41      2024
           2       0.31      0.05      0.09      2024
           3       0.34      0.08      0.13      2024
           4       0.34      0.15      0.21      2024
           5       0.39      0.14      0.21      2024
           6       0.30      0.15      0.20      2024
           7       0.40      0.07      0.13      2024
           8       0.26      0.76      0.38      2024

    accuracy                           0.28     16192
   macro avg       0.33      0.28      0.22     16192
weighted avg       0.33      0.28      0.22     16192


 ----------------------------------------

RandomForestClassifier(random_state=42)
Train accuracy score : 0.999938256359595
Test accuracy score : 0.28649950592885376
              precision    recall  f1-scor

The Random Forest classifier appears to be overfitting: it reaches 99.9% accuracy on the training data but only 28.6% accuracy on the test data.

Will need to address overfitting of the model





# Multinomial Naive Bayes

Fitting Naive Bayes with Bag of Words

In [20]:
# Sweep MultinomialNB's alpha parameter on the bag-of-words features.
alphas = [0, 0.2, 0.6, 0.8, 1]

for a in alphas:
    nb_model = MultinomialNB(alpha=a)
    # fit, predict and print the metrics for this alpha
    train_and_eval(nb_model, bow_X_train, y_train, bow_X_test, y_test)




MultinomialNB(alpha=0)
Train accuracy score : 0.9337336379352926
Test accuracy score : 0.2741477272727273
              precision    recall  f1-score   support

           1       0.31      0.57      0.40      2024
           2       0.26      0.13      0.18      2024
           3       0.25      0.17      0.20      2024
           4       0.28      0.24      0.26      2024
           5       0.24      0.19      0.21      2024
           6       0.23      0.24      0.24      2024
           7       0.27      0.14      0.18      2024
           8       0.29      0.51      0.37      2024

    accuracy                           0.27     16192
   macro avg       0.27      0.27      0.25     16192
weighted avg       0.27      0.27      0.25     16192


 ----------------------------------------

MultinomialNB(alpha=0.2)
Train accuracy score : 0.8694430723635466
Test accuracy score : 0.32806324110671936
              precision    recall  f1-score   support

           1       0.38      0.70 

Fitting Naive Bayes with TF-IDF

In [21]:
# Sweep MultinomialNB's alpha parameter on the TF-IDF features.
alphas = [0, 0.2, 0.6, 0.8, 1]

for a in alphas:
    nb_model = MultinomialNB(alpha=a)
    # fit, predict and print the metrics for this alpha
    train_and_eval(nb_model, tfidf_X_train, y_train, tfidf_X_test, y_test)




MultinomialNB(alpha=0)
Train accuracy score : 0.9632779698691035
Test accuracy score : 0.2625370553359684
              precision    recall  f1-score   support

           1       0.31      0.52      0.39      2024
           2       0.24      0.13      0.17      2024
           3       0.24      0.17      0.20      2024
           4       0.28      0.25      0.26      2024
           5       0.23      0.19      0.21      2024
           6       0.22      0.23      0.22      2024
           7       0.27      0.15      0.19      2024
           8       0.27      0.46      0.34      2024

    accuracy                           0.26     16192
   macro avg       0.26      0.26      0.25     16192
weighted avg       0.26      0.26      0.25     16192


 ----------------------------------------

MultinomialNB(alpha=0.2)
Train accuracy score : 0.9011638676216349
Test accuracy score : 0.33547430830039526
              precision    recall  f1-score   support

           1       0.43      0.63 

# Changing sentiments to only good or bad

In [27]:
# Work on a copy so the frame used for the 8-class experiments is not modified.
trial_train = train.copy(deep=True)
trial_train.head(n=5)

Unnamed: 0,Text,Sentiment
0,movie is funny suitable age is definitely fami...,6
1,old commercial blank audio cassette tag line w...,7
2,cinemascope color cinematography leon shamroy ...,4
3,get film possible will find really good perfor...,6
4,soundtrack is bit dated story is relevant yous...,6


In [22]:
def good_bad(row, threshold=4):
  """Collapse a numeric sentiment class into a binary good/bad label.

  Parameters
  ----------
  row : a single sentiment value (despite the name, this is one cell, as
      passed by ``Series.apply``), compared numerically against ``threshold``.
  threshold : cutoff; values strictly greater than it count as good.
      Defaults to 4, the cutoff this function originally hard-coded.

  Returns
  -------
  int
      1 when ``row > threshold``, otherwise 0.
  """
  return 1 if row > threshold else 0

In [28]:
# Derive the binary target from the sentiment column, one value at a time.
trial_train['Good'] = trial_train['Sentiment'].map(good_bad)
trial_train.head(n=5)

Unnamed: 0,Text,Sentiment,Good
0,movie is funny suitable age is definitely fami...,6,1
1,old commercial blank audio cassette tag line w...,7,1
2,cinemascope color cinematography leon shamroy ...,4,0
3,get film possible will find really good perfor...,6,1
4,soundtrack is bit dated story is relevant yous...,6,1


In [29]:
# Binary task: text features with the derived 'Good' label as target.
trial_X_train, trial_y_train = trial_train['Text'], trial_train['Good']

In [30]:
# Work on a copy so the frame used for the 8-class experiments is not modified.
trial_test = test.copy(deep=True)
trial_test.head(n=5)

Unnamed: 0,Text,Sentiment
0,would highly recommend seeing movie viewing wi...,1
1,see everyone love film much true doe have good...,1
2,damn thought would seen bad western cannot top...,1
3,well certainly stunned believe someone made an...,1
4,jefferey dahmer wa one sick guy is much say al...,1


In [31]:
# Derive the binary target from the sentiment column, one value at a time.
trial_test['Good'] = trial_test['Sentiment'].map(good_bad)
trial_test.head(n=5)

Unnamed: 0,Text,Sentiment,Good
0,would highly recommend seeing movie viewing wi...,1,0
1,see everyone love film much true doe have good...,1,0
2,damn thought would seen bad western cannot top...,1,0
3,well certainly stunned believe someone made an...,1,0
4,jefferey dahmer wa one sick guy is much say al...,1,0


In [32]:
# Held-out split for the binary task.
trial_X_test, trial_y_test = trial_test['Text'], trial_test['Good']

# Vectorization

Bag of Words vectorizer

In [36]:
# Fit a fresh bag-of-words vocabulary on the binary-task training text and
# build its count matrix in a single pass (fit_transform is equivalent to fit
# followed by transform, without processing the corpus twice).
# Note: this rebinds the name `bow_vectorizer`.
bow_vectorizer = CountVectorizer()
trial_bow_X_train = bow_vectorizer.fit_transform(trial_X_train)

# The test text is only transformed, so it reuses the training vocabulary.
trial_bow_X_test = bow_vectorizer.transform(trial_X_test)

TF-IDF

In [37]:
# Fit a fresh TF-IDF vocabulary on the binary-task training text and build its
# matrix in a single pass (fit_transform is equivalent to fit followed by
# transform, without processing the corpus twice).
# Note: this rebinds the name `tfidf_vectorizer`.
tfidf_vectorizer = TfidfVectorizer()
trial_tfidf_X_train = tfidf_vectorizer.fit_transform(trial_X_train)

# The test text is only transformed, so it reuses the training vocabulary/IDF.
trial_tfidf_X_test = tfidf_vectorizer.transform(trial_X_test)

# Logistic Regression with BOW

In [39]:
# Sweep the C parameter of logistic regression on the binary bag-of-words task.
# NOTE(review): the recorded output of this cell includes
# "TOTAL NO. of ITERATIONS REACHED LIMIT" messages, so some fits appear to stop
# at max_iter=500 before converging; consider a larger max_iter or scaled
# features - TODO confirm which C values are affected.
C = [0.001, 0.01, 0.1, 1, 10]

for c in C:
    log_model = LogisticRegression(max_iter=500, C=c)
    # fit, predict and print the metrics for this C
    train_and_eval(log_model, trial_bow_X_train, trial_y_train, trial_bow_X_test, trial_y_test)


LogisticRegression(C=0.001, max_iter=500)
Train accuracy score : 0.008582366016300321
Test accuracy score : 0.009263833992094862
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      8096
           1       0.85      0.87      0.86      8096

    accuracy                           0.86     16192
   macro avg       0.86      0.86      0.86     16192
weighted avg       0.86      0.86      0.86     16192


 ----------------------------------------

LogisticRegression(C=0.01, max_iter=500)
Train accuracy score : 0.005572363546554705
Test accuracy score : 0.008337450592885376
              precision    recall  f1-score   support

           0       0.89      0.88      0.89      8096
           1       0.88      0.89      0.89      8096

    accuracy                           0.89     16192
   macro avg       0.89      0.89      0.89     16192
weighted avg       0.89      0.89      0.89     16192


 ----------------------------------------



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



LogisticRegression(C=1, max_iter=500)
Train accuracy score : 0.00023153865151889354
Test accuracy score : 0.008522727272727272
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      8096
           1       0.88      0.88      0.88      8096

    accuracy                           0.88     16192
   macro avg       0.88      0.88      0.88     16192
weighted avg       0.88      0.88      0.88     16192


 ----------------------------------------

LogisticRegression(C=10, max_iter=500)
Train accuracy score : 0.0
Test accuracy score : 0.0089550395256917
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      8096
           1       0.87      0.87      0.87      8096

    accuracy                           0.87     16192
   macro avg       0.87      0.87      0.87     16192
weighted avg       0.87      0.87      0.87     16192


 ----------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Logistic Regression with TF-IDF

In [40]:
# Sweep the C parameter of logistic regression on the binary TF-IDF task.
C = [0.001, 0.01, 0.1, 1, 10]

for c in C:
    log_model = LogisticRegression(max_iter=500, C=c)
    # fit, predict and print the metrics for this C
    train_and_eval(log_model, trial_tfidf_X_train, trial_y_train, trial_tfidf_X_test, trial_y_test)


LogisticRegression(C=0.001, max_iter=500)
Train accuracy score : 0.018060014818473698
Test accuracy score : 0.017601284584980236
              precision    recall  f1-score   support

           0       0.82      0.74      0.78      8096
           1       0.76      0.84      0.80      8096

    accuracy                           0.79     16192
   macro avg       0.79      0.79      0.79     16192
weighted avg       0.79      0.79      0.79     16192


 ----------------------------------------

LogisticRegression(C=0.01, max_iter=500)
Train accuracy score : 0.01367621634971598
Test accuracy score : 0.013957509881422924
              precision    recall  f1-score   support

           0       0.85      0.78      0.81      8096
           1       0.80      0.86      0.83      8096

    accuracy                           0.82     16192
   macro avg       0.82      0.82      0.82     16192
weighted avg       0.82      0.82      0.82     16192


 ----------------------------------------

L

It might be better to classify reviews as simply good or bad, rather than predicting 8 different classes.

For binary classification, best models so far would be:

```
LogisticRegression(C=1, max_iter=500)
LogisticRegression(C=10, max_iter=500)
```

