In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the training and test sets from CSV files given as relative paths.
train = pd.read_csv('ML_train.csv')
test = pd.read_csv('ML_test.csv')

In [3]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0.1,Unnamed: 0,Text,Sentiment
0,0,movie is funny suitable age is definitely fami...,6
1,1,old commercial blank audio cassette tag line w...,7
2,2,cinemascope color cinematography leon shamroy ...,4
3,3,get film possible will find really good perfor...,6
4,4,soundtrack is bit dated story is relevant yous...,6


In [4]:
# Drop the leftover 'Unnamed: 0' column from the training frame.
# errors='ignore' keeps this cell idempotent: because it rebinds `train`,
# re-running it after the column is already gone would otherwise raise KeyError.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')
train.head()

Unnamed: 0,Text,Sentiment
0,movie is funny suitable age is definitely fami...,6
1,old commercial blank audio cassette tag line w...,7
2,cinemascope color cinematography leon shamroy ...,4
3,get film possible will find really good perfor...,6
4,soundtrack is bit dated story is relevant yous...,6


In [5]:
# Work on a copy so the label column added in a later cell does not modify `train`.
trial_train = train.copy()
trial_train.head()

Unnamed: 0,Text,Sentiment
0,movie is funny suitable age is definitely fami...,6
1,old commercial blank audio cassette tag line w...,7
2,cinemascope color cinematography leon shamroy ...,4
3,get film possible will find really good perfor...,6
4,soundtrack is bit dated story is relevant yous...,6


In [6]:
def good_bad(row, threshold=4):
  """Binarize a sentiment score.

  Despite its name, `row` is a single score value: the function is applied
  element-wise to the 'Sentiment' column via Series.apply.

  Args:
    row: the sentiment score to classify.
    threshold: scores strictly greater than this value count as good.
      Defaults to 4, the cutoff that was previously hard-coded.

  Returns:
    1 if `row` is strictly greater than `threshold`, otherwise 0.
  """
  return 1 if row > threshold else 0

In [7]:
# Derive the binary target column 'Good' by applying good_bad to each 'Sentiment' value.
trial_train['Good'] = trial_train['Sentiment'].apply(good_bad)
trial_train.head()

Unnamed: 0,Text,Sentiment,Good
0,movie is funny suitable age is definitely fami...,6,1
1,old commercial blank audio cassette tag line w...,7,1
2,cinemascope color cinematography leon shamroy ...,4,0
3,get film possible will find really good perfor...,6,1
4,soundtrack is bit dated story is relevant yous...,6,1


In [8]:
# Two-class training data: the 'Text' column is the input, the binary 'Good' column is the target.
class2_X_train = trial_train['Text']
class2_y_train = trial_train['Good']

In [9]:
# Work on a copy so the label column added in a later cell does not modify `test`.
trial_test = test.copy()
trial_test.head()

Unnamed: 0.1,Unnamed: 0,Text,Sentiment
0,0,would highly recommend seeing movie viewing wi...,1
1,1,see everyone love film much true doe have good...,1
2,2,damn thought would seen bad western cannot top...,1
3,3,well certainly stunned believe someone made an...,1
4,4,jefferey dahmer wa one sick guy is much say al...,1


In [10]:
# Build the test target with the same good_bad mapping used for the training data.
trial_test['Good'] = trial_test['Sentiment'].apply(good_bad)
trial_test.head()

Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Good
0,0,would highly recommend seeing movie viewing wi...,1,0
1,1,see everyone love film much true doe have good...,1,0
2,2,damn thought would seen bad western cannot top...,1,0
3,3,well certainly stunned believe someone made an...,1,0
4,4,jefferey dahmer wa one sick guy is much say al...,1,0


In [11]:
# Two-class test data: the 'Text' column is the input, the binary 'Good' column is the target.
class2_X_test = trial_test['Text']
class2_y_test = trial_test['Good']

# Vectorization

BOW Vectorizer

In [12]:
# Learn the bag-of-words vocabulary from the training text only and build the
# training matrix in the same pass. fit_transform replaces the separate
# fit + transform calls, which processed the training corpus twice.
bow_vectorizer = CountVectorizer()
class2_bow_X_train = bow_vectorizer.fit_transform(class2_X_train)

# The test text is only transformed, so it is encoded with the vocabulary
# learned from the training data.
class2_bow_X_test = bow_vectorizer.transform(class2_X_test)

TF-IDF

In [13]:

# Learn the TF-IDF vocabulary and weights from the training text only and build
# the training matrix in the same pass. fit_transform replaces the separate
# fit + transform calls, which processed the training corpus twice.
tfidf_vectorizer = TfidfVectorizer()
class2_tfidf_X_train = tfidf_vectorizer.fit_transform(class2_X_train)

# The test text is only transformed, so it is encoded with the statistics
# learned from the training data.
class2_tfidf_X_test = tfidf_vectorizer.transform(class2_X_test)

# Tuning Random Forest with BOW

In [77]:
# Candidate hyperparameter values for the random forest; passed to the search as
# `param_distributions` in the next cell.
# NOTE(review): scikit-learn appears to document 'entropy' and 'log_loss' as the same
# Shannon-information-gain criterion, so listing both may spend search budget on
# duplicate candidates — confirm against the installed version.
param_grid = {
    'n_estimators': [50, 100, 150, 200, 500, 1000],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [20, 30, 50, 75, 100],
    'min_samples_split': [10, 25, 50, 100, 150, 200, 400],
    'min_samples_leaf': [25, 50, 100]
}

In [78]:
# Base estimator for the search: seeded forest with n_jobs=-1.
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)

# Randomized search over param_grid: 20 sampled candidates x 3 folds,
# scored by accuracy, with a fixed seed for the sampling.
clf = RandomizedSearchCV(
    rf_model,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    random_state=42,
    verbose=4,
)

In [79]:
# Run the randomized search on the BOW training matrix; the result is kept as
# best_clf and queried for best_params_ / best_score_ in the following cells.
best_clf = clf.fit(class2_bow_X_train, class2_y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3] END criterion=entropy, max_depth=100, min_samples_leaf=100, min_samples_split=200, n_estimators=100;, score=0.846 total time=  19.4s
[CV 2/3] END criterion=entropy, max_depth=100, min_samples_leaf=100, min_samples_split=200, n_estimators=100;, score=0.837 total time=  17.7s
[CV 3/3] END criterion=entropy, max_depth=100, min_samples_leaf=100, min_samples_split=200, n_estimators=100;, score=0.808 total time=   4.4s
[CV 1/3] END criterion=entropy, max_depth=75, min_samples_leaf=50, min_samples_split=10, n_estimators=500;, score=0.863 total time= 1.8min
[CV 2/3] END criterion=entropy, max_depth=75, min_samples_leaf=50, min_samples_split=10, n_estimators=500;, score=0.859 total time= 2.0min
[CV 3/3] END criterion=entropy, max_depth=75, min_samples_leaf=50, min_samples_split=10, n_estimators=500;, score=0.831 total time=  31.2s
[CV 1/3] END criterion=log_loss, max_depth=75, min_samples_leaf=100, min_samples_split=100, n_es

In [80]:
# Best hyperparameter combination found by the BOW search
best_clf.best_params_

{'n_estimators': 200,
 'min_samples_split': 100,
 'min_samples_leaf': 25,
 'max_depth': 100,
 'criterion': 'log_loss'}

In [81]:
# Best score reported by the BOW search (configured with scoring='accuracy', cv=3)
best_clf.best_score_

0.8586531118965618

Function to help us evaluate model performance

In [82]:
def train_and_eval(model, trainX, trainY, testX, testY):
    """Fit `model` on the training split and print its evaluation.

    Prints the model, the accuracy on the training and test splits, the
    classification report for the test split, and a separator line.
    Nothing is returned.

    Args:
        model: estimator exposing fit(X, y) and predict(X).
        trainX, trainY: training inputs and labels.
        testX, testY: test inputs and labels.
    """
    # fit once and predict with whatever fit() hands back
    estimator = model.fit(trainX, trainY)

    train_predictions = estimator.predict(trainX)
    test_predictions = estimator.predict(testX)

    # report: blank line, model description, accuracies, per-class report
    print()
    print(model)

    train_accuracy = accuracy_score(trainY, train_predictions)
    print(f"Train accuracy score : {train_accuracy}")

    test_accuracy = accuracy_score(testY, test_predictions)
    print(f"Test accuracy score : {test_accuracy}")

    test_report = classification_report(testY, test_predictions)
    print(test_report)

    # visual separator between consecutive evaluations
    print('\n',40*'-')

Run the cells below for this section, editing the hard-coded hyperparameters to match the best parameters reported by the search above.

In [84]:
# Random forest configured with hard-coded hyperparameters; the values match the
# best_params_ output shown above for the BOW search.
# Unlike the estimator used inside the search, n_jobs is not set here.
best_rf = RandomForestClassifier(
    random_state=42,
    n_estimators=200,
    min_samples_split=100,
    min_samples_leaf=25,
    max_depth=100,
    criterion='log_loss'
)

In [85]:
# Fit best_rf on the BOW training matrix and print train/test accuracy and the test classification report.
train_and_eval(best_rf, class2_bow_X_train, class2_y_train, class2_bow_X_test, class2_y_test)


RandomForestClassifier(criterion='log_loss', max_depth=100, min_samples_leaf=25,
                       min_samples_split=100, n_estimators=200,
                       random_state=42)
Train accuracy score : 0.8809119535687824
Test accuracy score : 0.8536931818181818
              precision    recall  f1-score   support

           0       0.87      0.84      0.85      8096
           1       0.84      0.87      0.86      8096

    accuracy                           0.85     16192
   macro avg       0.85      0.85      0.85     16192
weighted avg       0.85      0.85      0.85     16192


 ----------------------------------------


The best Random Forest model with BOW features reaches about 85.4% accuracy on the test data (0.8537).

# Tuning Random Forest with TF-IDF

In [70]:
# Candidate hyperparameter values for the TF-IDF search; the values are identical to
# the grid defined in the BOW section (presumably repeated so this section can be
# run on its own — confirm).
# NOTE(review): scikit-learn appears to document 'entropy' and 'log_loss' as the same
# Shannon-information-gain criterion, so listing both may spend search budget on
# duplicate candidates — confirm against the installed version.
param_grid = {
    'n_estimators': [50, 100, 150, 200, 500, 1000],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [20, 30, 50, 75, 100],
    'min_samples_split': [10, 25, 50, 100, 150, 200, 400],
    'min_samples_leaf': [25, 50, 100]
}

In [71]:
# Base estimator for the TF-IDF search: seeded forest with n_jobs=-1.
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)

# Randomized search over param_grid: 20 sampled candidates x 3 folds,
# scored by accuracy, with a fixed seed for the sampling.
clf = RandomizedSearchCV(
    rf_model,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    random_state=42,
    verbose=4,
)

In [72]:
# Run the randomized search on the TF-IDF training matrix; the result is kept as
# best_clf and queried for best_params_ / best_score_ in the following cells.
best_clf = clf.fit(class2_tfidf_X_train, class2_y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3] END criterion=entropy, max_depth=100, min_samples_leaf=100, min_samples_split=200, n_estimators=100;, score=0.842 total time=  20.2s
[CV 2/3] END criterion=entropy, max_depth=100, min_samples_leaf=100, min_samples_split=200, n_estimators=100;, score=0.842 total time=  19.4s
[CV 3/3] END criterion=entropy, max_depth=100, min_samples_leaf=100, min_samples_split=200, n_estimators=100;, score=0.802 total time=   6.5s
[CV 1/3] END criterion=entropy, max_depth=75, min_samples_leaf=50, min_samples_split=10, n_estimators=500;, score=0.860 total time= 2.0min
[CV 2/3] END criterion=entropy, max_depth=75, min_samples_leaf=50, min_samples_split=10, n_estimators=500;, score=0.861 total time= 2.1min
[CV 3/3] END criterion=entropy, max_depth=75, min_samples_leaf=50, min_samples_split=10, n_estimators=500;, score=0.828 total time=  35.1s
[CV 1/3] END criterion=log_loss, max_depth=75, min_samples_leaf=100, min_samples_split=100, n_es

In [73]:
# Best hyperparameter combination found by the TF-IDF search
best_clf.best_params_

{'n_estimators': 200,
 'min_samples_split': 100,
 'min_samples_leaf': 25,
 'max_depth': 100,
 'criterion': 'log_loss'}

In [74]:
# Best score reported by the TF-IDF search (configured with scoring='accuracy', cv=3)
best_clf.best_score_

0.8639785023136602

In [75]:
# Random forest configured with hard-coded hyperparameters; the values match the
# best_params_ output shown above for the TF-IDF search.
# Unlike the estimator used inside the search, n_jobs is not set here.
best_rf = RandomForestClassifier(
    random_state=42,
    n_estimators=200,
    min_samples_split=100,
    min_samples_leaf=25,
    max_depth=100,
    criterion='log_loss'
)

In [76]:
# Fit best_rf on the TF-IDF training matrix and print train/test accuracy and the test classification report.
train_and_eval(best_rf, class2_tfidf_X_train, class2_y_train, class2_tfidf_X_test, class2_y_test)


RandomForestClassifier(criterion='log_loss', max_depth=100, min_samples_leaf=25,
                       min_samples_split=100, n_estimators=200,
                       random_state=42)
Train accuracy score : 0.8982619165225981
Test accuracy score : 0.8515316205533597
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      8096
           1       0.85      0.86      0.85      8096

    accuracy                           0.85     16192
   macro avg       0.85      0.85      0.85     16192
weighted avg       0.85      0.85      0.85     16192


 ----------------------------------------


The best Random Forest classifier with TF-IDF features reaches about 85.2% accuracy on the test data (0.8515).