In [60]:

import html
import time
import functools
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from nltk.corpus import stopwords as sw
import string
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
#Load the data set
data =pd.read_csv('Reviews.csv')

In [67]:
data.head(4)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,pos_neg
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,pos
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,neg
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,pos
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,neg


In [25]:
data.columns.values

array(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype=object)

In [28]:
# Make a new column named 'pos_neg', which has value 'neg' if the overall rating is 1, 2 ,or 3, 
# and has value 'pos' of the overall rating is 4 or 5. 
data.loc[data.Score.isin([1,2,3]), 'pos_neg'] = 'neg'
data.loc[data.Score.isin([4,5]), 'pos_neg'] = 'pos'
df = data[['pos_neg', 'Text']]

In [29]:
df.head()

Unnamed: 0,pos_neg,Text
0,pos,I have bought several of the Vitality canned d...
1,neg,Product arrived labeled as Jumbo Salted Peanut...
2,pos,This is a confection that has been around a fe...
3,neg,If you are looking for the secret ingredient i...
4,pos,Great taffy at a great price. There was a wid...


In [30]:
# Check whethere there is any missing data
df.isnull().sum() 

pos_neg    0
Text       0
dtype: int64

In [31]:
data.shape

(568454, 11)

In [32]:
# Check the distribution of the positive and negative reviews
df.pos_neg.value_counts()

pos    443777
neg    124677
Name: pos_neg, dtype: int64

In [33]:
# Sample positive reveiws to get a balanced dataset
neg = df.loc[df.pos_neg=='neg']
pos = df.loc[df.pos_neg=='pos'].sample(n=df.pos_neg.value_counts()['neg'], random_state=42)

In [34]:
print(type(pos))
print("pos:", len(pos), ", neg:", len(neg))

<class 'pandas.core.frame.DataFrame'>
pos: 124677 , neg: 124677


In [43]:
#Data preprocessing
lemmatizer = nltk.WordNetLemmatizer()
stopwords = sw.words('english')
stopwords = stopwords + ['not_' + w for w in stopwords]

# transform punctuation to blanks
trans_punct = str.maketrans(string.punctuation,' '*len(string.punctuation)) 

# pad punctuation with blanks
pad_punct = str.maketrans({key: " {0} ".format(key) for key in string.punctuation}) 
# remove "_" from string.punctuation
invalidChars = str(string.punctuation.replace("_", ""))  

In [45]:
def preprocessing(line, ngram=1, neg_handling=True, remove_stop=False):
    """
    Preprocessing the review texts
    @params:
        line                       - Required: the input text (Str)
        ngram                  - Optional: number n in the n-gram model(Int, 1, 2, or 3)
        neg_handling       - Optional: whether to perform negation handling (Boolean)
        remove_stop        -Optional: whether to remove the stop words (Boolean)
    """
        
    line = html.unescape(str(line))
    line = str(line).replace("can't", "can not")
    line = str(line).replace("n't", " not")
    
    if neg_handling:
        line = str(line).translate(pad_punct)  # If performing negation handling, pad punctuations with blanks
        line = nltk.word_tokenize(line.lower()) # Word normalization and tokenization
        tokens = []
        negated = False
        for t in line:
            if t in ['not', 'no']:
                negated = not negated
            elif t in string.punctuation or not t.isalpha():
                negated = False
            else:
                tokens.append('not_' + t if negated else t)  # add "not_" prefix to words behind "not", or "no"     
    else:
        line = str(line).translate(trans_punct)  # If not performing negation handling, remove punctuations
        line = nltk.word_tokenize(line.lower()) # Word normalization and tokenization
        tokens = line
        
        if ngram==2:
         bi_tokens = list(nltk.bigrams(line))
         bi_tokens = list(map('_'.join, bi_tokens))
        bi_tokens = [i for i in bi_tokens if all(j not in invalidChars for j in i)]
        tokens = tokens + bi_tokens

    if ngram==3:
        bi_tokens = list(nltk.bigrams(line))
        bi_tokens = list(map('_'.join, bi_tokens))
        bi_tokens = [i for i in bi_tokens if all(j not in invalidChars for j in i)]
        tri_tokens = list(nltk.trigrams(line))
        tri_tokens = list(map('_'.join, tri_tokens))
        tri_tokens = [i for i in tri_tokens if all(j not in invalidChars for j in i)]
        tokens = tokens + bi_tokens + tri_tokens    
     
    if remove_stop:
        line = [lemmatizer.lemmatize(t) for t in tokens if t not in stopwords]
    else:
        line = [lemmatizer.lemmatize(t) for t in tokens] 
    
    return ' '.join(line)

In [68]:
# Preprocessing the positive reveiws
pos_data = []
n_pos = len(pos)
for i, p in enumerate(pos['Text']):
    pos_data.append(preprocessing(p, ngram=3))
    

In [69]:
# Preprocessing the negative reveiws
neg_data = []
n_neg = len(neg)
for i, n in enumerate(neg['Text']):
    neg_data.append(preprocessing(n, ngram=3))
    

In [70]:
# Combine the preprocessed data
data = pos_data + neg_data
labels = np.concatenate((pos['pos_neg'].values, neg['pos_neg'].values))

In [71]:
# split the dataset to training, validation, test sets by 60-20-20
train_data, rest_data, train_labels, rest_labels = train_test_split(data, labels, test_size=0.4, 
                                                                    stratify=labels, random_state=1234)
valid_data, test_data, valid_labels, test_labels = train_test_split(rest_data, rest_labels, test_size=0.5, 
                                                                    stratify=rest_labels, random_state=1234)
print("training size = ", len(train_data), "validation size = ", len(valid_data), "testing size = ", len(test_data))

training size =  149612 validation size =  49871 testing size =  49871


In [72]:
#Computing the frequencu of words
# Push all tokens and compute the frequency of words
tokens = [word for line in train_data for word in nltk.word_tokenize(line)]
word_features = nltk.FreqDist(tokens)

In [73]:
print(word_features)

<FreqDist with 4173490 samples and 33080505 outcomes>


In [74]:
# Print the 10 most common words
word_features.most_common(10)

[('the', 485414),
 ('i', 458634),
 ('a', 362638),
 ('and', 321562),
 ('it', 276055),
 ('to', 246623),
 ('of', 206679),
 ('is', 191803),
 ('br', 184557),
 ('this', 168636)]

In [75]:
# Remove features (words) which occur only once (This is to be used in the basic modeling process)
topwords = [fpair[0] for fpair in list(word_features.most_common(len(word_features))) if fpair[1]>=2] 
len(topwords) 

1477440

In [76]:
# Convert a collection of raw documents to a matrix of TF-IDF features.
# Equivalent to CountVectorizer followed by TfidfTransformer.
tf_vec = TfidfVectorizer()

tf_vec.fit_transform([' '.join(topwords)])

<1x1477353 sparse matrix of type '<class 'numpy.float64'>'
	with 1477353 stored elements in Compressed Sparse Row format>

In [77]:
# Extract features from training set
# Vocabulary is from topwords
train_features = tf_vec.transform(train_data)

In [78]:
train_features.shape

(149612, 1477353)

In [79]:
# Extract features from test set
test_features = tf_vec.transform(test_data)

In [80]:
test_features.shape

(49871, 1477353)

In [81]:
#Basic modelling
#Naive Bayes model
mnb_model = MultinomialNB()
mnb_model
# Train Model
mnb_model.fit(train_features, train_labels)

In [82]:
# Predict
pred = mnb_model.predict(test_features)
print(pred)

['neg' 'neg' 'pos' ... 'neg' 'neg' 'pos']


In [83]:
# Metrics
accuracy = metrics.accuracy_score(test_labels, pred)
print(accuracy)

0.8920214152513485


In [84]:
print(metrics.classification_report(y_true=test_labels, y_pred=pred, digits=4))

              precision    recall  f1-score   support

         neg     0.8709    0.9205    0.8950     24935
         pos     0.9157    0.8636    0.8889     24936

    accuracy                         0.8920     49871
   macro avg     0.8933    0.8920    0.8919     49871
weighted avg     0.8933    0.8920    0.8919     49871



In [85]:
#Logistic regression 
lgr_model = LogisticRegression()
print(lgr_model, end='\n'*2)


lgr_model.fit(train_features, train_labels)
lgr_pred = lgr_model.predict(test_features)

print('Accuracy = %.5f' % metrics.accuracy_score(test_labels, lgr_pred))
print(metrics.classification_report(y_pred=lgr_pred, y_true=test_labels, digits=4))

LogisticRegression()



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy = 0.89693
              precision    recall  f1-score   support

         neg     0.8898    0.9061    0.8979     24935
         pos     0.9044    0.8878    0.8960     24936

    accuracy                         0.8969     49871
   macro avg     0.8971    0.8969    0.8969     49871
weighted avg     0.8971    0.8969    0.8969     49871



In [86]:
#SVM : Linear Support Vector Model
svc_model = LinearSVC()
print(svc_model, end='\n'*2)

svc_model.fit(train_features, train_labels)
svc_pred = svc_model.predict(test_features)

print('Accuracy = %.5f' % metrics.accuracy_score(test_labels, svc_pred))
print(metrics.classification_report(y_pred=svc_pred, y_true=test_labels, digits=4))

LinearSVC()

Accuracy = 0.92254
              precision    recall  f1-score   support

         neg     0.9189    0.9269    0.9229     24935
         pos     0.9263    0.9182    0.9222     24936

    accuracy                         0.9225     49871
   macro avg     0.9226    0.9225    0.9225     49871
weighted avg     0.9226    0.9225    0.9225     49871

