In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from utils import preprocess
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import sys
import joblib
from jupyterthemes import jtplot
jtplot.style(context="notebook", theme="monokai", ticks=True)

In [2]:
df = pd.read_csv("./DATA/IMDB.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
texts = df.review.values
print(texts[:5])

["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the f

In [4]:
texts = preprocess(texts, stem=True)
print(texts[:5])

['one review mention watch 1 oz episod hooked. right  exactli happen me.         first thing struck oz brutal unflinch scene violenc  set right word go. trust  show faint heart timid. show pull punch regard drug  sex violence. hardcor  classic use word.         call oz nicknam given oswald maximum secur state penitentary. focus mainli emerald citi  experiment section prison cell glass front face inward  privaci high agenda. em citi home many..aryan  muslim  gangsta  latino  christian  italian  irish ... .so scuffl  death stare  dodgi deal shadi agreement never far away.         would say main appeal show due fact goe show would dare. forget pretti pictur paint mainstream audienc  forget charm  forget romanc ... oz mess around. first episod ever saw struck nasti surreal  could say readi  watch  develop tast oz  got accustom high level graphic violence. violenc  injustic  crook guard sold nickel  inmat kill order get away  well manner  middl class inmat turn prison bitch due lack street 

In [5]:
sentiments = df["sentiment"].values

In [6]:
# Split the data
Xtrain, Xtest, ytrain, ytest = train_test_split(texts, sentiments)

print(f"Length of reviews in training set : {len(Xtrain)}")
print(f"Length of sentiments in training set : {len(ytrain)}")
print("-"*10)
print(f"Length of reviews in test set : {len(Xtest)}")
print(f"Length of sentiments in test set : {len(ytest)}")

Length of reviews in training set : 37500
Length of sentiments in training set : 37500
----------
Length of reviews in test set : 12500
Length of sentiments in test set : 12500


In [7]:
tfidf_1gram = TfidfVectorizer(min_df=5, max_df=0.5)
tfidf_2gram = TfidfVectorizer(min_df=5, max_df=0.5, ngram_range=(1, 2))

logreg_1gram = LogisticRegression()
logreg_2gram = LogisticRegression()

In [8]:
features_1gram_train = tfidf_1gram.fit_transform(Xtrain)
features_1gram_test = tfidf_1gram.transform(Xtest)

In [9]:
# Fit Logistic regression on 1 grams
logreg_1gram.fit(features_1gram_train, ytrain)

print(f"Score on Training data : {logreg_1gram.score(features_1gram_train, ytrain)}")
print(f"Score on Test data : {logreg_1gram.score(features_1gram_test, ytest)}")

Score on Training data : 0.9267733333333333
Score on Test data : 0.89192


In [10]:
features_2gram_train = tfidf_2gram.fit_transform(Xtrain)
features_2gram_test = tfidf_2gram.transform(Xtest)

In [11]:
# Fit Logistic regression on 2grams
logreg_2gram.fit(features_2gram_train, ytrain)

print(f"Score on Traininig data : {logreg_2gram.score(features_2gram_train, ytrain)}")
print(f"Score on Test data : {logreg_2gram.score(features_2gram_test, ytest)}")

Score on Traininig data : 0.9446933333333334
Score on Test data : 0.89512


In [14]:
result_1gram = pd.DataFrame({"text":tfidf_1gram.get_feature_names(), "weights":logreg_1gram.coef_.reshape(-1,)})
result_1gram = result_1gram.sort_values(by="weights", ascending=False)

In [15]:
result_2gram = pd.DataFrame({"text":tfidf_2gram.get_feature_names(), "weights":logreg_2gram.coef_.reshape(-1,)})
result_2gram = result_2gram.sort_values(by="weights", ascending=False)

In [16]:
# print top 5 words in positive place for 1 gram
result_1gram.head(5)

Unnamed: 0,text,weights
10096,great,7.785242
8070,excel,7.137094
2462,best,5.581592
17327,perfect,5.524824
7736,enjoy,5.442074


In [17]:
# print bottom 5 words in negative place for 1 gram
result_1gram.tail(5)

Unnamed: 0,text,weights
2918,bore,-7.038613
1817,aw,-7.063951
1925,bad,-8.399024
25450,wast,-9.028823
25975,worst,-10.769187


In [18]:
# print top 5 words in positive place for 2 gram
result_2gram.head(5)

Unnamed: 0,text,weights
52708,great,8.971438
38359,excel,7.399745
72438,love,6.039971
12487,best,5.863861
35186,enjoy,5.851382


In [19]:
# print bottom 5 words in negative place for 2 gram
result_2gram.tail(5)

Unnamed: 0,text,weights
9315,aw,-7.074951
14760,bore,-7.466444
134669,wast,-8.501928
10107,bad,-10.086912
139679,worst,-10.277416


In [20]:
# Test a sample
sample_txt = ["This is the worst movie"]
sample_txt = preprocess(sample_txt)

_features = tfidf_2gram.transform(sample_txt)
print(f"Review : {logreg_2gram.predict(_features)}")

Review : ['negative']


In [22]:
# Test a sample
sample_txt2 = ["This is the best movie"]
sample_txt2 = preprocess(sample_txt2)

_features2 = tfidf_2gram.transform(sample_txt2)
print(f"Review : {logreg_2gram.predict(_features2)}")

Review : ['positive']


In [24]:
# Save the model

if not os.path.exists("./MODELS/"):
    os.mkdir("./MODELS/")
    
joblib.dump(logreg_2gram,"./MODELS/Logistic_regression_2_grams.sav")
joblib.dump(logreg_1gram, "./MODELS/Logistic_regression_1_grams.sav")

joblib.dump(tfidf_1gram, "./MODELS/TFIDF_1_grams.sav")
joblib.dump(tfidf_2gram, "./MODELS/TFIDF_2_grams.sav")

['./MODELS/TFIDF_2_grams.sav']