Problem Statement -  Predict the sentiments based on reviews

Install necessary libraries

In [5]:
!pip install contractions
!pip install textsearch
!pip install tqdm
import nltk 
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Loading the dataset in pandas dataframe

In [9]:
import pandas as pd

dataset = pd.read_csv("movie_reviews.csv")

In [11]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Train and test dataset 

In [12]:
reviews = dataset["review"].values
sentiment = dataset["sentiment"].values

train_reviews = reviews[:35000]

test_reviews = reviews[35000:]

train_sentiments = sentiment[:35000]

test_sentiments = sentiment[35000:]

Text wrangling and normalization

In [13]:
import contractions
from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata

def strip_html_tags(text):
  soup = BeautifulSoup(text, "html.parser")
  [s.extract() for s in soup(['iframe', 'script'])]
  stripped_text = soup.get_text()
  stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
  return stripped_text

def remove_accented_chars(text):
  text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  return text

def pre_process_corpus(docs):
  norm_docs = []
  for doc in tqdm.tqdm(docs):
    doc = strip_html_tags(doc)
    doc = doc.translate(doc.maketrans("\n\t\r", "   "))
    doc = doc.lower()
    doc = remove_accented_chars(doc)
    doc = contractions.fix(doc)
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = re.sub(' +', ' ', doc)
    doc = doc.strip()  
    norm_docs.append(doc)
  
  return norm_docs

In [14]:
%%time

norm_train_reviews = pre_process_corpus(train_reviews)
norm_test_reviews = pre_process_corpus(test_reviews)

100%|██████████| 35000/35000 [00:16<00:00, 2107.74it/s]
100%|██████████| 15000/15000 [00:07<00:00, 2127.98it/s]

CPU times: user 23.3 s, sys: 224 ms, total: 23.6 s
Wall time: 23.7 s





Feature engineering

In [42]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer

# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=5, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(norm_train_reviews)

CPU times: user 28.7 s, sys: 592 ms, total: 29.3 s
Wall time: 29.4 s


In [43]:
%%time

# transform test reviews into features
tv_test_features = tv.transform(norm_test_reviews)

CPU times: user 6.35 s, sys: 8.76 ms, total: 6.36 s
Wall time: 6.37 s


In [44]:
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

TFIDF model:> Train features shape: (35000, 194919)  Test features shape: (15000, 194919)


Applying ML models

In [47]:
%%time
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs', random_state=42)
# Logistic Regression model on TF-IDF features

# train model
lr.fit(tv_train_features, train_sentiments)

# predict on test data
lr_tfidf_predictions = lr.predict(tv_test_features)

CPU times: user 3.85 s, sys: 2.45 s, total: 6.3 s
Wall time: 3.3 s


In [48]:
print(classification_report(test_sentiments,lr_tfidf_predictions))

              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      7490
    positive       0.90      0.91      0.90      7510

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000



In [50]:
labels = ["negative","positive"]
print(pd.DataFrame(confusion_matrix(test_sentiments,lr_tfidf_predictions),index = labels, columns=labels))

          negative  positive
negative      6688       802
positive       666      6844
