In [3]:
#Given a review, determine whether the review is positive (Rating of 4 or 5) or negative (rating of 1 or 2).
#We could use the Score/Rating. A rating of 4 or 5 could be considered a positive review.
#A review of 1 or 2 could be considered negative. A review of 3 is neutral and ignored.
#This is an approximate and proxy way of determining the polarity (positivity/negativity) of a review.

#loading data

import sqlite3
import pandas as pd
import numpy as np
import nltk
#nltk.download('stopwords')
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

#1) Reading Data

# Open a connection to the SQLite file that stores the raw reviews.
con = sqlite3.connect('./database.sqlite')

# Load every review except the neutral ones: rows whose Score equals 3
# are excluded by the query, so only polar reviews reach the DataFrame.
filtered_data = pd.read_sql_query(
    sql=""" 
SELECT *
FROM Reviews
WHERE Score != 3 
""",
    con=con,
)

# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
def partition(x):
    """Translate a numeric review score into a polarity label.

    Scores below 3 are labelled 'negative'; every other score is
    labelled 'positive'.
    """
    return 'negative' if x < 3 else 'positive'

#Replacing the numeric Score with a polarity label: scores below 3 become
#'negative' and all remaining scores become 'positive' (see partition)
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative

#looking at the number of attributes and size of the data
print("Number of data points in our data", filtered_data.shape)
print("Filtered data:")
#NOTE(review): a bare expression in the middle of a notebook cell is not
#rendered (the captured output shows no table after "Filtered data:") -
#confirm whether an explicit print was intended
filtered_data.head()

#2) Data Cleaning: (1)Deduplication

# Order the rows by ProductId (ascending) so that `keep='first'` below
# retains, for each group of duplicates, the row that sorts first.
sorted_data = filtered_data.sort_values(by='ProductId')

# Rows that agree on user, profile name, timestamp and review text are
# treated as the same review; only the first occurrence survives.
final = sorted_data.drop_duplicates(
    subset=["UserId", "ProfileName", "Time", "Text"],
    keep='first',
)
print("Number of data points after deduplication:")
final.shape

# Percentage of the filtered rows that survived deduplication.
print("Number of remaining data:")
final['Id'].size / filtered_data['Id'].size * 100

# (2) HelpfulnessNumerator can never legitimately exceed
# HelpfulnessDenominator, so rows where it does are treated as corrupt
# and removed from further calculations.

# Fetch the two known offending rows for inspection.  The Id alternatives
# are parenthesised because SQL's AND binds tighter than OR: without the
# parentheses the Score filter would apply to Id=44737 only.
# NOTE(review): in IPython/Jupyter the name `display` shadows the built-in
# display helper; the name is kept because later cells may read it.
display = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND (Id=44737 OR Id=64422)
ORDER BY ProductID
""", con)
print("After cleaning data:")
display.head()

# Keep only the rows with a consistent helpfulness ratio.  The explicit
# copy makes `final` an independent DataFrame, so adding a column to it
# later does not trigger pandas' SettingWithCopyWarning.
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator].copy()

#Before starting the next phase of preprocessing lets see the number of entries left
print("Number of remaining data:")
print(final.shape)

#How many positive and negative reviews are present in our dataset?
print("Number of positive and negative reviews :")
final['Score'].value_counts()

#3) Featurization:

# Bag of words: fit the vocabulary on the raw review text and build the
# document-term count matrix in a single step.
count_vect = CountVectorizer()  # from scikit-learn
final_counts = count_vect.fit_transform(final['Text'].values)
print("the type of count vectorizer ", type(final_counts))
print("the shape of out text BOW vectorizer ", final_counts.shape)

#4)Text Preprocessing.
#English stop words from NLTK's corpus, held in a set for membership tests
#NOTE(review): the printed set includes negations such as 'not' and "isn't";
#dropping them may discard polarity cues - confirm this is intended
stop = set(stopwords.words('english'))#set of stop words
#Snowball stemmer for English, used to stem every word that is kept
sno = nltk.stem.SnowballStemmer('english')#initialising the Snowball Stemmer

def cleanhtml(sentence):
    """Return `sentence` with every HTML-style tag removed.

    A tag is the shortest run of characters that starts with '<' and ends
    with '>' without crossing a line break; each one is replaced by the
    empty string.
    """
    return re.sub('<.*?>', '', sentence)

def cleanpunc(sentence):
    """Return `sentence` with punctuation and special characters removed.

    Deletes every occurrence of the characters ? ! ' " # . , ( ) / |
    and the backslash.  Whitespace is left untouched, so callers can
    still split the result into words.

    The previous patterns contained stray spaces inside the character
    classes and a literal trailing space after the first class, so a
    token such as "great!" was returned unchanged and the backslash was
    never stripped.
    """
    # One character class covers both of the original substitutions.
    # '|' stays in the class because the original patterns removed it too.
    return re.sub(r"[?!'\"#.,()\\/|]", '', sentence)
#Show the stop-word set that is filtered out of every review
print("set of stop words:")
print(stop)
print("***********************************************")
#Sanity check of the stemmer on a sample word
print("Stemming of word 'tasty'")
print(sno.stem('tasty'))

#code for implementing step-by-step the checks mentioned in pre-processing
def _clean_review_words(review):
    """Return the cleaned, stemmed words of one raw review.

    Steps, in order: strip HTML tags, split on whitespace, strip
    punctuation from each token, keep only purely alphabetic words longer
    than two characters, lower-case them, drop stop words, and stem what
    is left.

    NOTE(review): each word is returned as UTF-8 encoded bytes, exactly as
    before; downstream consumers are not visible here, so the encoding is
    kept unchanged.
    """
    kept = []
    for token in cleanhtml(review).split():
        for word in cleanpunc(token).split():
            if not (word.isalpha() and len(word) > 2):
                continue
            lowered = word.lower()
            if lowered in stop:
                continue
            kept.append(sno.stem(lowered).encode('utf8'))
    return kept


final_string = []
all_positive_words = [] #store words from +ve reviews here
all_negative_words = [] #store words from -ve reviews here
# Pair every review with its label once, instead of rebuilding
# final['Score'].values for each kept word and tracking the row index
# by hand.
for sent, label in zip(final['Text'].values, final['Score'].values):
    filtered_sentence = _clean_review_words(sent)
    if label == 'positive':
        all_positive_words.extend(filtered_sentence)
    elif label == 'negative':
        all_negative_words.extend(filtered_sentence)
    # final string of clean words (bytes, because the words are bytes)
    final_string.append(b" ".join(filtered_sentence))

#adding a column
final['CleanedText'] = final_string

print("Processed review of CleanedText : ")
final.head()

#store final table into an SQLite table for future use
conn = sqlite3.connect('final.sqlite')
#NOTE(review): this cursor is not used in the lines visible here - confirm
#that a later cell needs it
c = conn.cursor()
#have sqlite3 hand TEXT values back as str
conn.text_factory = str
#if_exists='replace' overwrites a 'Reviews' table that already exists
final.to_sql('Reviews', conn, schema = None, if_exists='replace')

#Frequency distributions of the cleaned words, one per polarity
freq_dist_positive = nltk.FreqDist(all_positive_words)
freq_dist_negative = nltk.FreqDist(all_negative_words)
#Print the 20 most frequent words of each class together with their counts
print("Most Common Positive Words : ",freq_dist_positive.most_common(20))
print("Most Common Negative Words : ",freq_dist_negative.most_common(20))
    





Number of data points in our data (525814, 10)
Filtered data:
Number of data points after deduplication:
Number of remaining data:
After cleaning data:
Number of remaining data:
(364171, 10)
Number of positive and negative reviews :
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (364171, 115281)
set of stop words:
{'mustn', 'why', 'both', 'd', 'did', 'after', 'yours', 'its', 'ourselves', 'where', 'until', 'isn', 'do', 'were', 'other', 'hadn', "isn't", 'each', 'mightn', 'have', 'him', 'his', 'below', "hadn't", 'couldn', "you've", 'some', 'we', 'aren', "doesn't", 'a', 'before', 'and', 'this', 'with', 'doesn', 'ma', 'from', 'in', 'her', 'hers', 'having', 'i', 'not', 'ours', 'down', 'now', 'm', "shan't", "needn't", 'between', 'themselves', "aren't", 'about', 'off', 'is', 'nor', 'there', "you'd", 'they', 'during', 'y', "you'll", 'should', 'll', 'only', 'our', 'was', 'your', 'wouldn', 've', 'won', 'shouldn', "didn't", 'whom', "mightn