In [29]:
import pandas as pd 
import numpy as np

from collections import Counter

In [32]:
# Load the IMDB reviews; the rest of the notebook relies on the 'review'
# and 'sentiment' columns of this frame.
df = pd.read_csv('./data/IMDB Dataset.csv')

# Fail loudly on an unexpected label (typo, different casing, missing value)
# instead of silently encoding it as 0 below.
unexpected_labels = set(df['sentiment'].unique()) - {'positive', 'negative'}
assert not unexpected_labels, f"unexpected sentiment labels: {unexpected_labels}"

# Features: the raw review text as a NumPy array.
X = df['review'].values

# Binary target: 1 for 'positive', 0 otherwise. The vectorized comparison
# replaces a per-row Python lambda; int64 matches what the lambda produced.
y = (df['sentiment'] == 'positive').astype('int64').values

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [33]:
# Tally how many reviews carry each label to check the class balance.
label_counts = Counter(y)
label_counts

Counter({1: 25000, 0: 25000})

In [49]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed random_state keeps
# the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# GridSearch CV


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer

# Hyper-parameters to search, addressed as '<step name>__<parameter>'.
# Only the logistic-regression C actually varies; the vocabulary size is
# pinned to a single value.
grid = {
    'lr__C': [0.1, 1.0, 5.0],
    'tfidf__max_features': [20000],
}

# TF-IDF features -> row normalisation -> logistic regression.
# NOTE(review): TfidfVectorizer's default norm='l2' should already yield
# unit-length rows, so the 'normalizer' step looks redundant -- confirm
# before removing, since it changes the steps of the persisted pipeline.
pipeline_steps = [
    ('tfidf', TfidfVectorizer(strip_accents='ascii', lowercase=True)),
    ('normalizer', Normalizer()),
    ('lr', LogisticRegression(max_iter=200)),
]
pipe = Pipeline(pipeline_steps)

# 5-fold cross-validated search over the grid, using all available cores.
grid_cv = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    cv=5,
    n_jobs=-1,
)

grid_cv.fit(X_train, y_train)

In [None]:
# Summarise the search: one row per parameter combination with its mean
# cross-validated score, best-ranked combination first.
summary_columns = [
    'mean_test_score',
    'rank_test_score',
    'param_lr__C',
    'param_tfidf__max_features',
]
cv_results = pd.DataFrame(grid_cv.cv_results_)
cv_results[summary_columns].sort_values('rank_test_score')

# Best Model

In [None]:
from joblib import dump, load

# grid_cv was created with GridSearchCV's default refit=True, so
# best_estimator_ is already the winning pipeline refitted on the whole of
# X_train / y_train. Calling fit again would only repeat that training.
model = grid_cv.best_estimator_

# Persist the fitted pipeline so it can be reloaded without re-running the
# grid search.
dump(model, 'baseline.joblib')

# Evaluate

In [None]:
from sklearn.metrics import classification_report

# Predict on both splits so held-out performance can be compared with the
# fit on the training data (a large gap would point to overfitting).
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

# classification_report returns a multi-line, column-aligned table. Start it
# on its own line so the "Test:" / "Train:" label does not push the header
# row out of alignment with the rows below it.
print(f"Test:\n{classification_report(y_test, y_pred_test)}")
print(f"Train:\n{classification_report(y_train, y_pred_train)}")