# Movie Reviews Sentiment Analysis

### Problem Statement
- In this project, we try to estimate the sentiment from a movie review.

### Dataset
- Source - https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
- The data contains 50,000 reviews, 25,000 of them are positive and 25,000 of them are negative.

In this notebook, we perform training on following models - 
- Logistic Regresssion

### Importing required packages

In [49]:
import numpy as np 
import scipy
import pandas as pd
import re
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from nltk.stem.porter import PorterStemmer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline  
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import pickle
import os

### Importing data

In [2]:
df = pd.read_csv('data/imdb_data.csv')
df.head()

Unnamed: 0,Review,Rating,Sentiment
0,Imagine The Big Chill with a cast of twenty-so...,2,0
1,I'd have to say that I've seen worse Sci Fi Ch...,3,0
2,Director Fabio Barreto got a strange Academy N...,1,0
3,Pretty bad PRC cheapie which I rarely bother t...,4,0
4,This is a very intriguing short movie by David...,8,1


In [4]:
print(f'Shape = {df.shape}')

Shape = (50000, 3)


### Preprocessing text

In [6]:
def text_preprocessing(text):
    # Replacing n't with not since it could be really important in sentiment analysis
    text = re.sub("n't", ' not ', text)
    # Removing URLs
    text = re.sub('(http).*\/', ' ', text)
    # Removing HTML tags
    text = re.sub('<[^>]*>', '', text)
    # Extracting emoticons
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|P|D|]|})', text)
    # Removing punctuations
    text = re.sub('[\W]+', ' ', text.lower())
    # Adding emoticons at end and converting :-) to :)
    text = text + ' ' + ' '.join(emoticons).replace('-', '')
    return text   

In [7]:
df['Review'] = df['Review'].apply(text_preprocessing)
df.head()

Unnamed: 0,Review,Rating,Sentiment
0,imagine the big chill with a cast of twenty so...,2,0
1,i d have to say that i ve seen worse sci fi ch...,3,0
2,director fabio barreto got a strange academy n...,1,0
3,pretty bad prc cheapie which i rarely bother t...,4,0
4,this is a very intriguing short movie by david...,8,1


### Creating Feature Matrix and Target Vector

In [8]:
X = df['Review'].values 
y = df['Sentiment'].values 

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)

Tokenizer-1 : we simply split the text 

In [13]:
def tokenizer(text):
    return text.split()

Tokenizer-2 : we use a tokenizer which contains stems of words as tokens. We use PorterStemmer() module to stem the words.

In [17]:
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

Removing Stopwords

In [21]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mndpp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
print(f'Number of stopwords = {len(stop)}')

Number of stopwords = 179


In [23]:
print(f'Stopwords = {stop}')

Stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so

There are certain stopwords which are relevant so we don't to remove those words

In [26]:
relevant_stopwords = ['no', 'not', 'ain', 'don', "don't", 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
new_stop = list(set(stop) - set(relevant_stopwords))
print(f'Stopwords remaining = {len(new_stop)}')

Stopwords remaining = 140


## Models

In [42]:
accuracy_models = {}

Post Training

In [43]:
def post_training(rs, model_name):
    print(f'Best Accuracy = {rs.best_score_:.4f}')
    print(f'Best Parameters = \n{rs.best_params_}')
    model = rs.best_estimator_
    model.fit(X_train, y_train)
    print(f'Accuracy on test set = {model.score(X_test, y_test)}')
    accuracy_models[model_name] = model.score(X_test, y_test)
    if not os.path.exists('models'):
        os.mkdir('models')
    model_filename = f'models/{model_name}_model.pickle'
    pickle.dump(model, open(model_filename, 'wb'))
    return model

### 1. Logistic Regression

In [44]:
model_name = 'logistic_regression'
tfidf = TfidfVectorizer(lowercase = False,
                        preprocessor=None,
                        min_df=5)
lr_tfidf_pipe = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(solver='liblinear'))
])
np.random.seed(42)
param_range = scipy.stats.loguniform(0.0001, 1000.0)
param_grid_rs = [
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'clf__penalty'      : ['l1', 'l2'],
        'clf__C'            : param_range
    },
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'vect__use_idf'     : [False],
        'vect__norm'        : [None],
        'clf__penalty'      : ['l1','l2'],
        'clf__C'            : param_range
    }
]
rs_lr = RandomizedSearchCV(lr_tfidf_pipe, param_grid_rs,
                           scoring='accuracy', cv=5,
                           verbose = 2, n_jobs= -1,
                           random_state = 42, n_iter = 2)
rs_lr.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits




In [45]:
lr_model = post_training(rs_lr, model_name)

Best Accuracy = 0.8696
Best Parameters = 
{'clf__C': 37.65311476616745, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 'vect__stop_words': ['be', 'from', 'how', 'him', 'only', 'very', 'ours', 'if', 'both', 'of', 'will', 'down', 'can', 'who', 'for', 'same', 'y', 'me', 'by', 'am', 'herself', 'an', 'off', 'or', 'here', 'his', "you've", 'most', 'some', 'when', 'himself', 'that', 'just', 'have', 'because', 'yours', 'this', 've', 'against', 're', 'above', 'up', 'again', 'm', 'over', 'to', 'between', 'below', 'further', 'in', 'once', 'on', 'more', 'he', "that'll", 's', "you'd", 'so', 'our', 'as', 'do', 't', 'at', 'hers', 'your', 'you', 'did', 'these', 'all', 'll', 'own', 'then', 'was', 'about', 'which', 'theirs', 'other', 'whom', 'it', "it's", 'been', 'there', 'nor', 'themselves', 'they', 'before', 'a', 'now', 'through', 'are', 'ourselves', 'had', 'than', 'what', 'their', "you'll", 'being', "should've", 'too', 'o', 'does', 'out', 'where', 'but', 'doing', 'is', 'with', 'under', 'its', 'why



Accuracy on test set = 0.87084


### 2. Naive Bayes

In [46]:
model_name = 'naive_bayes'
tfidf = TfidfVectorizer(lowercase = False,
                        preprocessor=None,
                        min_df=5)
nb_tfidf_pipe = Pipeline([
    ('vect', tfidf),
    ('clf', MultinomialNB())
])
np.random.seed(42)
param_range = scipy.stats.loguniform(0.00001, 10000.0)
param_grid_rs = [
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'clf__alpha'        : param_range
    },
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'vect__use_idf'     : [False],
        'vect__norm'        : [None],
        'clf__alpha'        : param_range
    }
]
rs_nb = RandomizedSearchCV(nb_tfidf_pipe, param_grid_rs,
                           scoring='accuracy', cv=5,
                           verbose = 2, n_jobs= -1,
                           random_state = 42, n_iter = 2)
rs_nb.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits




In [47]:
model_nb = post_training(rs_nb, 'naive_bayes')

Best Accuracy = 0.8773
Best Parameters = 
{'clf__alpha': 2.353159805263749, 'vect__ngram_range': (1, 2), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x0000021F8B9D0280>}




Accuracy on test set = 0.87364


### 3. SVM

In [51]:
model_name = 'SVM'
tfidf = TfidfVectorizer(lowercase = False,
                        preprocessor=None,
                        min_df=5)
svm_tfidf_pipe = Pipeline([
    ('vect', tfidf),
    ('clf', SVC())
])
np.random.seed(42)
param_range = scipy.stats.loguniform(0.00001, 10000.0)
param_distributions = [
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'clf__kernel' : ['linear'],
        'clf__degree' : [1, 2, 3, 4, 5],
        'clf__C'      : scipy.stats.loguniform(0.0001, 1000.0),
    },
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'clf__kernel' : ['rbf', 'poly', 'sigmoid'],
        'clf__gamma'  : ['scale','auto'],
        'clf__C'      : scipy.stats.loguniform(0.0001, 1000.0),
    },
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'clf__kernel' : ['rbf', 'poly', 'sigmoid'],
        'clf__gamma'  : scipy.stats.loguniform(0.0001, 1000.0),
        'clf__C'      : scipy.stats.loguniform(0.0001, 1000.0),
    },
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'vect__use_idf'     : [False],
        'vect__norm'        : [None],
        'clf__kernel' : ['linear'],
        'clf__degree' : [1, 2, 3, 4, 5],
        'clf__C'      : scipy.stats.loguniform(0.0001, 1000.0),
    },
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'vect__use_idf'     : [False],
        'vect__norm'        : [None],
        'clf__kernel' : ['rbf', 'poly', 'sigmoid'],
        'clf__gamma'  : ['scale','auto'],
        'clf__C'      : scipy.stats.loguniform(0.0001, 1000.0),
    },
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'vect__use_idf'     : [False],
        'vect__norm'        : [None],
        'clf__kernel' : ['rbf', 'poly', 'sigmoid'],
        'clf__gamma'  : scipy.stats.loguniform(0.0001, 1000.0),
        'clf__C'      : scipy.stats.loguniform(0.0001, 1000.0),
    },
]
rs_svm = RandomizedSearchCV(svm_tfidf_pipe, param_distributions,
                            scoring='accuracy', cv=5, verbose=2,
                            n_jobs = -1, random_state=42,
                            n_iter = 2, refit = True)
rs_svm.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


KeyboardInterrupt: 

In [None]:
svm_model = post_training(rs_svm, model_name)

### 4. Random Forest

In [None]:
model_name = 'random_forest'
tfidf = TfidfVectorizer(lowercase = False,
                        preprocessor=None,
                        min_df=5)
rf_tfidf_pipe = Pipeline([
    ('vect', tfidf),
    ('clf', RandomForestClassifier())
])
np.random.seed(42)
param_range = scipy.stats.loguniform(0.00001, 10000.0)
param_distributions = [
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'clf__n_estimators' : [10, 20, 30, 40, 50, 75, 100, 150, 200],
        'clf__criterion'    : ['gini', 'entropy', 'log_loss'],
        'clf__max_depth'    : [ 2, 4, 6, 8, 10, 15, 20],
        'clf__max_features' : ['sqrt', 'log2', None]
    },
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'vect__use_idf'     : [False],
        'vect__norm'        : [None],
        'clf__n_estimators' : [10, 20, 30, 40, 50, 75, 100, 150, 200],
        'clf__criterion'    : ['gini', 'entropy', 'log_loss'],
        'clf__max_depth'    : [ 2, 4, 6, 8, 10, 15, 20],
        'clf__max_features' : ['sqrt', 'log2', None]
    },
]
rs_rf = RandomizedSearchCV(rf_tfidf_pipe, param_distributions,
                            scoring='accuracy', cv=5, verbose=2,
                            n_jobs = -1, random_state=42,
                            n_iter = 2, refit = True)
rs_rf.fit(X_train, y_train)

In [None]:
rf_model = post_training(rs_rf, model_name)

### 5. K Nearest Neighbors

In [None]:
model_name = 'knn'
tfidf = TfidfVectorizer(lowercase = False,
                        preprocessor=None,
                        min_df=5)
knn_tfidf_pipe = Pipeline([
    ('vect', tfidf),
    ('clf', KNeighborsClassifier())
])
np.random.seed(42)
param_range = scipy.stats.loguniform(0.00001, 10000.0)
param_distributions = [
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'clf__n_neighbors' : np.arange(1, 20).tolist(),
        'clf__weights'     : ['uniform', 'distance'],
        'clf__algorithm'   : ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'clf__leaf_size'   : [10, 15, 20, 25, 30, 35, 40]
    },
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'vect__use_idf'     : [False],
        'vect__norm'        : [None],
        'clf__n_neighbors' : np.arange(1, 20).tolist(),
        'clf__weights'     : ['uniform', 'distance'],
        'clf__algorithm'   : ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'clf__leaf_size'   : [10, 15, 20, 25, 30, 35, 40]
    },
]
rs_knn = RandomizedSearchCV(knn_tfidf_pipe, param_distributions,
                            scoring='accuracy', cv=5, verbose=2,
                            n_jobs = -1, random_state=42,
                            n_iter = 2, refit = True)
rs_knn.fit(X_train, y_train)

In [None]:
knn_model = post_training(rs_knn, model_name)

### 6. XG Boost

In [None]:
model_name = 'xgboost'
tfidf = TfidfVectorizer(lowercase = False,
                        preprocessor=None,
                        min_df=5)
xgb_tfidf_pipe = Pipeline([
    ('vect', tfidf),
    ('clf', XGBClassifier())
])
np.random.seed(42)
param_range = scipy.stats.loguniform(0.00001, 10000.0)
param_distributions = [
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'clf__n_estimators' : [10, 20, 30, 40, 50, 75, 100, 150, 200],
        'clf__max_depth'    : [2, 4, 6, 8, 10, 15, 20],
        'clf__subsample'    : [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'clf__colsample_bytree' : [0.2, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'clf__colsample_bylevel' : [0.2, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'clf__min_child_weight':  scipy.stats.uniform(0.5, 10.0),
        'clf__gamma': [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
        'clf__reg_lambda': scipy.stats.loguniform(0.1, 100.0),
        'clf__learning_rate': scipy.stats.loguniform(0.0001, 10.0)
    },
    {
        'vect__ngram_range' : [(1, 1), (1, 2)],
        'vect__stop_words'  : [None, new_stop],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'vect__use_idf'     : [False],
        'vect__norm'        : [None],
        'clf__n_estimators' : [10, 20, 30, 40, 50, 75, 100, 150, 200],
        'clf__max_depth'    : [2, 4, 6, 8, 10, 15, 20],
        'clf__subsample'    : [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'clf__colsample_bytree' : [0.2, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'clf__colsample_bylevel' : [0.2, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'clf__min_child_weight':  scipy.stats.uniform(0.5, 10.0),
        'clf__gamma': [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
        'clf__reg_lambda': scipy.stats.loguniform(0.1, 100.0),
        'clf__learning_rate': scipy.stats.loguniform(0.0001, 10.0)
    },
]
rs_xgb = RandomizedSearchCV(xgb_tfidf_pipe, param_distributions,
                            scoring='accuracy', cv=5, verbose=2,
                            n_jobs = -1, random_state=42,
                            n_iter = 2, refit = True)
rs_xgb.fit(X_train, y_train)

In [None]:
xgb_model = post_training(rs_xgb, model_name)