In [8]:
from warnings import filterwarnings
# Silence every warning for the rest of the session: no category or module
# filter is given, so nothing is let through.
# NOTE(review): this also hides deprecation notices raised by the libraries
# imported below — consider narrowing the filter.
filterwarnings('ignore')

import time
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [2]:
# Load the review dataset. Later cells read its 'cleaned_review' (text) and
# 'sentiment' (label) columns. The path is relative, so it depends on the
# directory the kernel was started from.
df = pd.read_csv('../data/final_data.csv')
# Preview the first rows as a sanity check on the load.
df.head()

Unnamed: 0,cleaned_review,sentiment
0,one of the other reviewer have mention that af...,positive
1,a wonderful little production the filming t...,positive
2,think this be a wonderful way to spend time o...,positive
3,basically there s a family where a little boy ...,negative
4,Petter Matteis Love in the Time of Money be a ...,positive


In [3]:
# Raw review text; it is vectorised in a later cell.
X = df['cleaned_review']
# Encode the label as 1 (positive) / 0 (negative).
# `.map` is used instead of `.replace`: replacing strings with ints relies on
# pandas implicitly downcasting the object column to an integer dtype, which
# recent pandas releases deprecate. `.map` builds the numeric column directly.
# Any label other than 'positive'/'negative' becomes NaN rather than silently
# staying a string.
y = df['sentiment'].map({'positive': 1, 'negative': 0})

In [4]:
# TF-IDF vectoriser with all-default settings. It is fitted on the review text
# in a later cell and saved with joblib at the end of the notebook.
tfidf = TfidfVectorizer()

In [5]:
%%time
X = tfidf.fit_transform(X)

Wall time: 11.9 s


In [6]:
# Hold out 20% of the rows for evaluation; the seed fixes which rows those are.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Show the shape of each split, in the order: train X, test X, train y, test y.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(40000, 155043) (10000, 155043) (40000,) (10000,)


In [9]:
# Seed shared by every estimator that accepts one; same value the
# train/test split uses.
SEED = 0

# Candidate classifiers, at library defaults apart from parallelism and seeding.
# `random_state` is pinned wherever the estimator exposes it so that a fresh
# Restart & Run All reproduces the same scores and the same persisted model.
# MultinomialNB takes no seed.
models = [
    MultinomialNB(),
    LogisticRegression(n_jobs=-1, random_state=SEED),
    RandomForestClassifier(n_jobs=-1, random_state=SEED),
    LinearSVC(random_state=SEED),
    XGBClassifier(n_jobs=-1, random_state=SEED),
    DecisionTreeClassifier(random_state=SEED)
]


def display_metrics(true, pred):
    """Print F1, precision and recall as rounded percentages.

    Parameters
    ----------
    true : ground-truth labels, passed to each scorer as ``y_true``.
    pred : predicted labels, passed to each scorer as ``y_pred``.

    Returns
    -------
    None. The three scores are written to stdout, one per line.
    """
    scorers = (
        ('F1', f1_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
    )

    # Compute every score before printing anything, so a failing scorer
    # leaves no partial report behind.
    report = [
        (label, round(scorer(y_true=true, y_pred=pred) * 100))
        for label, scorer in scorers
    ]

    for label, percent in report:
        print(f'{label}: {percent}')

In [10]:
%%time
trained_models = dict()

for model in models:
    print(f'Training -> {model.__class__.__name__}')
    s = time.time()
    trained_models[model.__class__.__name__] = model.fit(X_train, y_train)
    e = time.time()
    preds = trained_models[model.__class__.__name__].predict(X_test)
    acc = round(accuracy_score(y_true=y_test, y_pred=preds) * 100)
    print(f'Acc: {acc}')
    display_metrics(true=y_test, pred=preds)
    print(f'Training time: {round(e - s)} seconds')
    print('-' * 10)

Training -> MultinomialNB
Acc: 86.0
F1: 86.0
Precision: 88.0
Recall: 84.0
Training time: 0 seconds
----------
Training -> LogisticRegression
Acc: 89.0
F1: 89.0
Precision: 88.0
Recall: 90.0
Training time: 7 seconds
----------
Training -> RandomForestClassifier
Acc: 84.0
F1: 84.0
Precision: 84.0
Recall: 84.0
Training time: 129 seconds
----------
Training -> LinearSVC
Acc: 90.0
F1: 89.0
Precision: 89.0
Recall: 90.0
Training time: 1 seconds
----------
Training -> XGBClassifier
Acc: 86.0
F1: 86.0
Precision: 85.0
Recall: 87.0
Training time: 110 seconds
----------
Training -> DecisionTreeClassifier
Acc: 71.0
F1: 71.0
Precision: 71.0
Recall: 71.0
Training time: 113 seconds
----------
Wall time: 6min 1s


In [11]:
import joblib

In [12]:
# Persist the fitted LinearSVC from the comparison run above.
joblib.dump(trained_models['LinearSVC'], '../models/linear_svm.joblib')

['../models/linear_svm.joblib']

In [14]:
# Persist the fitted vectoriser alongside the model, so new text can be
# transformed with the same vocabulary.
joblib.dump(tfidf, '../models/tfidf_vectorizer.joblib')

['../models/tfidf_vectorizer.joblib']