In [12]:
import time
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [2]:
# The first column of the CSV has no header (pandas would name it
# 'Unnamed: 0') and holds 0, 1, 2, ... -- it looks like a row index that was
# saved along with the data. Load it as the index so it does not show up as
# a meaningless extra column.
df = pd.read_csv('../data/final_data.csv', index_col=0)
df.head()

Unnamed: 0,cleaned_review,sentiment
0,one of the other reviewer have mention that af...,positive
1,a wonderful little production the filming t...,positive
2,think this be a wonderful way to spend time o...,positive
3,basically there s a family where a little boy ...,negative
4,Petter Matteis Love in the Time of Money be a ...,positive


In [3]:
# Features: the pre-cleaned review text.
X = df['cleaned_review']

# Target: sentiment strings encoded as integers (positive -> 1, negative -> 0).
# `map` is used instead of `replace` because replacing strings with ints
# depends on implicit object -> int downcasting, which newer pandas versions
# deprecate (FutureWarning) and eventually stop doing.
label_to_id = {'positive': 1, 'negative': 0}
y = df['sentiment'].map(label_to_id)

# `map` yields NaN for any value missing from the mapping; fail loudly here
# rather than passing unlabeled rows on to the classifiers.
if y.isna().any():
    raise ValueError("'sentiment' contains values other than 'positive'/'negative'")
y = y.astype(int)

In [4]:
# TF-IDF vectorizer built with the library defaults (no arguments are passed).
tfidf = TfidfVectorizer()

In [5]:
%%time
X = tfidf.fit_transform(X)

CPU times: user 8.82 s, sys: 212 ms, total: 9.03 s
Wall time: 9.03 s


In [6]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)

In [7]:
# Sanity check: shapes of the four pieces produced by the split.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(42500, 155043) (7500, 155043) (42500,) (7500,)


In [13]:
# Candidate classifiers to compare. Each is fitted on the same training split.
models = [
    MultinomialNB(),
    LogisticRegression(n_jobs=-1),
    # Random forests are randomised (bootstrap samples, feature subsets);
    # without a seed the reported scores change from run to run. The seed
    # matches the one used for the train/test split.
    RandomForestClassifier(n_jobs=-1, random_state=0),
    # Seeded explicitly as well so the result does not depend on the
    # library's default seeding.
    XGBClassifier(n_jobs=-1, random_state=0),
    # Left disabled by the author; uncomment to include them in the comparison.
    # DecisionTreeClassifier(),
    # GradientBoostingClassifier()
]


def display_metrics(true, pred):
    """Print F1, precision and recall as whole-number percentages.

    Args:
        true: Ground-truth labels, forwarded to each scorer as ``y_true``.
        pred: Predicted labels, forwarded to each scorer as ``y_pred``.

    Returns:
        None. The three scores are written to stdout, one per line.
    """

    def as_percent(scorer):
        # Scale the scorer's result by 100 and round to the nearest integer.
        return round(scorer(y_true=true, y_pred=pred) * 100)

    # All three scores are computed (in this order) before anything is printed.
    f1, precision, recall = map(as_percent, (f1_score, precision_score, recall_score))

    print(f'F1: {f1}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')

In [15]:
%%time
# Fit every candidate model on the training split, report its accuracy,
# F1, precision and recall on the test split, and record how long fitting took.
# Fitted estimators are kept in `trained_models`, keyed by class name.
trained_models = dict()

for model in models:
    name = type(model).__name__
    print(f'Training -> {name}')

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    fitted = model.fit(X_train, y_train)
    elapsed = time.perf_counter() - start
    trained_models[name] = fitted

    # Only fitting is timed; prediction happens outside the measured interval.
    preds = fitted.predict(X_test)
    acc = round(accuracy_score(y_true=y_test, y_pred=preds) * 100)
    print(f'Acc: {acc}')
    display_metrics(true=y_test, pred=preds)
    print(f'Training time: {round(elapsed)} seconds')
    print('-' * 10)

Training -> MultinomialNB
Acc: 86
F1: 86
Precision: 88
Recall: 84
Training time: 0 seconds
----------
Training -> LogisticRegression
Acc: 89
F1: 89
Precision: 88
Recall: 90
Training time: 6 seconds
----------
Training -> RandomForestClassifier
Acc: 84
F1: 83
Precision: 84
Recall: 83
Training time: 47 seconds
----------
Training -> XGBClassifier
Acc: 86
F1: 86
Precision: 85
Recall: 88
Training time: 161 seconds
----------
CPU times: user 8min 46s, sys: 1.4 s, total: 8min 48s
Wall time: 3min 34s
