In [5]:
import pandas as pd 
import numpy as np
import re

from collections import Counter

In [6]:
# Load the IMDB reviews CSV; the code below reads its 'review' and 'sentiment' columns.
df = pd.read_csv('../data/IMDB Dataset.csv')

def preprocess_imdb_raw_data(x):
    """Replace HTML line-break tags in a raw review with a single space.

    Matches ``<br>``, ``<br/>`` and ``<br />`` (any amount of whitespace before
    the optional slash) in any letter case, because HTML tag names are
    case-insensitive and ``<BR>`` would otherwise leak into the features.

    Parameters
    ----------
    x : str
        Raw review text.

    Returns
    -------
    str
        The text with every line-break tag replaced by one space; all other
        characters are left untouched.
    """
    return re.sub(r"<br\s*/?>", " ", x, flags=re.IGNORECASE)

# Features: every raw review run through the HTML clean-up helper.
X = list(map(preprocess_imdb_raw_data, df['review'].values))

# Targets: 1 where the sentiment string is 'positive', 0 for anything else.
y = df['sentiment'].map(lambda label: int(label == 'positive')).to_numpy()

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# Tally how many samples fall under each label value in y (class-balance check).
Counter(y)

Counter({1: 25000, 0: 25000})

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# GridSearch CV


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer

# Hyper-parameter sweep: 3 regularisation strengths x 3 vocabulary caps = 9 candidates.
grid = dict(
    lr__C=[0.1, 1.0, 5.0],
    tfidf__max_features=[20000, 30000, 40000],
)

# Keys in `grid` use the "<step name>__<parameter>" form, so the step names
# below must stay in sync with it.
pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(strip_accents='ascii', lowercase=True)),
    # NOTE(review): TfidfVectorizer L2-normalises rows by default, so this extra
    # Normalizer looks redundant -- confirm before removing.
    ('normalizer', Normalizer()),
    ('lr', LogisticRegression(max_iter=200)),
])

# 5-fold cross-validation over every grid combination, using all CPU cores.
grid_cv = GridSearchCV(pipe, grid, cv=5, n_jobs=-1)

grid_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [10]:
# Rank the grid-search candidates by mean cross-validated score (best first).
(
    pd.DataFrame(grid_cv.cv_results_)
    .loc[:, ['mean_test_score', 'rank_test_score', 'param_lr__C', 'param_tfidf__max_features']]
    .sort_values(by='rank_test_score')
)

Unnamed: 0,mean_test_score,rank_test_score,param_lr__C,param_tfidf__max_features
8,0.90035,1,5.0,40000
7,0.899625,2,5.0,30000
6,0.898425,3,5.0,20000
5,0.89435,4,1.0,40000
4,0.893975,5,1.0,30000
3,0.893625,6,1.0,20000
0,0.8632,7,0.1,20000
1,0.8621,8,0.1,30000
2,0.8618,9,0.1,40000


# Best Model

In [11]:
from joblib import dump, load

# GridSearchCV refits the winning configuration on the full training data it was
# given (refit=True is the default), so best_estimator_ is already trained on
# X_train / y_train -- fitting it again would only repeat that work.
model = grid_cv.best_estimator_

# Persist the fitted pipeline (vectoriser + normaliser + classifier) for reuse.
dump(model, '../models/baseline.joblib')

['../models/baseline.joblib']

# Evaluate

In [12]:
from sklearn.metrics import classification_report

# Predict on the held-out split first, then on the training split; comparing the
# two reports shows how far training performance runs ahead of test performance.
y_pred_test, y_pred_train = (model.predict(split) for split in (X_test, X_train))

print(
    f"Test: {classification_report(y_test, y_pred_test)}",
    f"Train: {classification_report(y_train, y_pred_train)}",
    sep="\n",
)

Test:               precision    recall  f1-score   support

           0       0.91      0.90      0.90      5044
           1       0.90      0.91      0.90      4956

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Train:               precision    recall  f1-score   support

           0       0.97      0.96      0.97     19956
           1       0.96      0.97      0.97     20044

    accuracy                           0.97     40000
   macro avg       0.97      0.97      0.97     40000
weighted avg       0.97      0.97      0.97     40000

