In [52]:
%matplotlib inline
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve,auc
from nltk.stem.porter import PorterStemmer

In [4]:
con=sqlite3.connect('database.sqlite')

In [5]:
filtered_data=pd.read_sql_query("SELECT * FROM REVIEWS WHERE Score!=3",con)

In [6]:
def partition(x):
    if x<3: 
        return "negative"
    else:
        return "positive"

In [7]:
actualScore=filtered_data['Score']
positiveNegative=actualScore.map(partition)
filtered_data['Score']=positiveNegative

In [8]:
filtered_data.shape


(525814, 10)

In [9]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [10]:
#Data de-deuplication
sorted_data=filtered_data.sort_values('ProductId',axis=0,ascending=True)

In [11]:
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"},keep='first',inplace=False)

In [13]:
final.shape

(364173, 10)

In [13]:
(final['Id'].size*1.0)/(filtered_data['Id'].size*1.00)*100

69.25890143662969

In [14]:
display=pd.read_sql_query("SELECT * FROM Reviews WHERE Score!=3 AND ID=44737 OR ID=64422 ORDER BY ProductId",con)

In [15]:
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

In [16]:
print(final.shape)

(364171, 10)


In [18]:
final['Score'].value_counts()

positive    307061
negative     57110
Name: Score, dtype: int64

In [20]:
final.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc..."
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...


# BAG OF WORDS Model

In [21]:
#Bag Of Words Model
count_vect=CountVectorizer()

In [22]:
final_counts=count_vect.fit_transform(final['Text'].values)

In [23]:
type(final_counts)

scipy.sparse.csr.csr_matrix

In [24]:
final_counts.get_shape()

(364171, 115281)

# TEXT PREPROCESSING

In [25]:
#Removing stopwords
import re

In [28]:
#removing html tags
i=0
for sent in final['Text'].values:
    if(len(re.findall('<.*?>',sent))):
        print(i)
        print(sent)
        break;
    i +=1 

6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [29]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [30]:
stop=set(stopwords.words('english'))
snp=nltk.stem.SnowballStemmer('english')

In [31]:
def cleanhtml(sentence):
    cleanr=re.compile('<.*?>')
    cleantext=re.sub(cleanr,'',sentence)
    return cleantext

In [32]:
def cleanpunc(sentence):
    cleaned=re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned=re.sub(r'[.|,|)|\|/]',r'',cleaned)
    return cleaned

In [33]:
print(stop)
print("**************************************************************")
print(snp.stem('tasty'))

{'needn', 'up', 'other', 'his', 'too', "that'll", 'then', 'hers', 'only', 'at', 'of', 'haven', 'was', 'there', 'who', 'by', 'aren', "you'd", 'her', 'than', 't', "mightn't", 'how', 'wasn', "haven't", 'why', "don't", 'has', 'above', 'll', 'just', 'your', "doesn't", 'ourselves', 'when', 'yours', 'theirs', 've', 'which', "isn't", 'have', 'wouldn', 'didn', 'further', "mustn't", 'yourself', 'she', 'such', 'had', "should've", 'that', 'you', 'they', 'our', 'down', 'any', 'hasn', 'itself', "aren't", 'can', 'about', 'after', 'more', 'are', 'now', 'it', 'm', 'him', 'will', 'below', 'if', 'don', 'isn', 'in', 'until', 'own', 'once', 'again', 'mightn', 'weren', 'should', 'while', 'their', 'against', "won't", 'and', 'myself', 'into', 'as', 'himself', 'for', 'a', 'its', 'so', 'doesn', "weren't", 'do', "wasn't", 'those', 'me', 'ma', 'o', "wouldn't", 'where', 'yourselves', 'won', "you're", 'themselves', 'am', 'couldn', 'because', 'most', 'doing', 'here', 'be', 'being', 'is', 'out', 'hadn', "you'll", 'du

In [34]:
i=0
str1=''
final_string=[]
all_positive_words=[]
all_negative_words=[]
s=''
for sent in final['Text'].values:
    filtered_sentence=[]
    sent=cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):
                if(cleaned_words.lower() not in stop):
                    s=(snp.stem(cleaned_words.lower()).encode('utf-8'))
                    filtered_sentence.append(s)
                    if(final['Score'].values)[i]== 'positive': 
                        all_positive_words.append(s)
                    if(final['Score'].values)[i]=='negative':
                        all_negative_words.append(s)
                else:
                    continue
                    
            else:
                continue
    
    str1=b"".join(filtered_sentence)
               
    final_string.append(str1)
    i+=1
                       
                   

In [35]:
final['CleanedText']=final_string

In [None]:
final.head(3)
conn=sqlite3.connect('final_sqlite')
c=conn.cursor()
conn.text_factory=str
final.to_sql('Reviews',conn,flavor=None,schema=None,if_exists='replace')

In [32]:
final.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,b'wittilittlbookmakesonlaughloudrecitcardrivea...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",b'grewreadsendakbookwatchreallirosimoviincorpo...
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,b'funwaychildrenlearnmonthyearlearnpoemthrough...


# BIGRAMS and N-GRAMS

In [37]:
freq_dist_positive=nltk.FreqDist(all_positive_words)
freq_dist_negative=nltk.FreqDist(all_negative_words)
print("Most common positive words",freq_dist_positive.most_common(20))
print("Most common negative words",freq_dist_negative.most_common(20))

Most common positive words [(b'like', 137024), (b'tast', 125919), (b'good', 109663), (b'love', 106431), (b'flavor', 106341), (b'use', 102607), (b'great', 100996), (b'one', 93986), (b'product', 88441), (b'tri', 84990), (b'tea', 81771), (b'coffe', 76581), (b'make', 74336), (b'get', 71653), (b'food', 62998), (b'would', 55093), (b'buy', 53536), (b'time', 53533), (b'realli', 52313), (b'eat', 51294)]
Most common negative words [(b'tast', 33801), (b'like', 31835), (b'product', 27390), (b'one', 20066), (b'flavor', 18884), (b'would', 17852), (b'tri', 17498), (b'use', 15111), (b'good', 14579), (b'coffe', 14280), (b'get', 13698), (b'buy', 13605), (b'order', 12691), (b'food', 12369), (b'dont', 11577), (b'tea', 11337), (b'even', 10809), (b'box', 10547), (b'make', 9775), (b'time', 9557)]


In [42]:
count_vect=CountVectorizer(ngram_range=(1,2))
final_bigram_counts=count_vect.fit_transform(final['Text'].values)

In [None]:
final_bigram_counts.get_shape()

# TF_IDF

In [51]:
tf_idf_vect=TfidfVectorizer(ngram_range=(1,2))
final_tf_idf=tf_idf_vect.fit_transform(final['Text'].values)

In [53]:
final_tf_idf.get_shape()

(364171, 2910192)

In [54]:
features=tf_idf_vect.get_feature_names()
len(features)

2910192

In [55]:
features[1000000:1000010]

['for samples',
 'for sampling',
 'for samu',
 'for san',
 'for sanc',
 'for sand',
 'for sanding',
 'for sandpaper',
 'for sandwhich',
 'for sandwhiches']

In [56]:
print(final_tf_idf[3,:].toarray()[0])

[0. 0. 0. ... 0. 0. 0.]


In [57]:
def top_tfidf_feats(row,features,top_n=25):
    top_ids=np.argsort(row)[::-1][:top_n]
    top_feats=[(features[i],row[i]) for i in top_ids] 
    df=pd.DataFrame(top_feats)
    df.columns=['feature','tfidf']
    return df
top_tfidf=top_tfidf_feats(final_tf_idf[1,:].toarray()[0],features,25)

In [58]:
top_tfidf

Unnamed: 0,feature,tfidf
0,sendak books,0.173437
1,rosie movie,0.173437
2,paperbacks seem,0.173437
3,cover version,0.173437
4,these sendak,0.173437
5,the paperbacks,0.173437
6,pages open,0.173437
7,really rosie,0.168074
8,incorporates them,0.168074
9,paperbacks,0.168074


# Word2Vec

In [None]:
#Not suitable for lower memories
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

model=KeyedVectors.load_word2vec_format('Google-news-vectors-negative300.bin')

model.wv['computer']

model.wv.similarity('woman','man')

model.wv.most_similar('woman')

In [46]:
#Training own word2vec
import gensim
i=0
list_of_sent=[]
for sent in final['Text'].values:
    filtered_sentence=[]
    sent=cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if(cleaned_words.isalpha()):
                filtered_sentence.append(cleaned_words.lower())
            else:
                continue
    list_of_sent.append(filtered_sentence)

In [49]:
print(final['Text'].values[0])
print("***************************************************************")
print(list_of_sent[0])

this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college
***************************************************************
['this', 'witty', 'little', 'book', 'makes', 'my', 'son', 'laugh', 'at', 'loud', 'i', 'recite', 'it', 'in', 'the', 'car', 'as', 'were', 'driving', 'along', 'and', 'he', 'always', 'can', 'sing', 'the', 'refrain', 'hes', 'learned', 'about', 'whales', 'india', 'drooping', 'i', 'love', 'all', 'the', 'new', 'words', 'this', 'book', 'introduces', 'and', 'the', 'silliness', 'of', 'it', 'all', 'this', 'is', 'a', 'classic', 'book', 'i', 'am', 'willing', 'to', 'bet', 'my', 'son', 'will', 'still', 'be', 'able', 'to', 'recite', 'from', 'memory', 'when', 'he', 'is', 

In [50]:
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=50,workers=4)

  "C extension not loaded, training will be slow. "


KeyboardInterrupt: 

In [None]:
words=list(w2v_model.wv.vocab)
print(len(words))

In [None]:
w2v_model.wv.most_similar('tasty')

In [None]:
count_vect_feat=count_vect.get_feature_names()
count_vect_feat.index('like')
print(count_vect_feat[64055])

# Avg word2Vec, TFIDF Word2Vec

In [47]:
#Avg Word2Vec
sent_vectors=[]
for sent in list_of_sent:
    sent_vec=np.zeros(50)
    cnt_words=0
    for word in sent:
        try:
            vec=w2v_model.wv[word]
            sent_vec+=vec
            cnt_words+=1
        except:
            pass
    sent_vec/=cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

  if sys.path[0] == '':


364171
50


In [59]:
#TF-IDF weighted word2vec
tfidf_feat=tf_idf_vect.get_feature_names()
tfidf_sent_vectors=[]
row=0
for sent in list_of_sent:
    sent_vec=np.zeros(50)
    weight_sum=0;
    for word in sent:
        try:
            vec=w2v.model_wv[word]
            tfidf=final_tf_idf[row,tfidf_feat.index(word)]
            sent_vec+=(vec*tf_idf)
            weight_sum+=tf_idf
        except:
            pass
        sent_vec/=weight_sum
        tfidf_sent_vectors.append(sent_vec)
        row+=1
        
    

  app.launch_new_instance()
