### Load packages

In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm

### Load the Dataset

In [2]:
#using SQLite Table to read data.
con=sqlite3.connect('database.sqlite')

In [3]:
#filtering only positive and negative reviews so not considering reviews with score=3
filtered_df= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score!=3
""",con)

In [4]:
filtered_df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
525809,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
525810,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
525811,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
525812,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [5]:
def partition(x):
    if x < 3:
        return 0
    return 1

*We are replacing the Score ratings with Positive and negative as if Score<3 : negative else positive*

In [6]:
actualScore=filtered_df['Score']
positiveNegative=actualScore.map(partition)
filtered_df['Score']=positiveNegative

In [7]:
filtered_df.shape

(525814, 10)

In [8]:
filtered_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Data Cleaning

In [9]:
# a person can have multiple accounts meaning not necessarily ProfileName and UserID has to be same
print(filtered_df['ProfileName'].unique().shape)
print(filtered_df['UserId'].unique().shape)

(208273,)
(243414,)


In [10]:
display=pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score!=3 and UserId= 'AR5J8UI46CURR'
""",con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


*Clearly it is observable that in same timestamp Miss Geetha is giving the exact reviews for all the products she purchased. This has been identified with ProductId that manufacturer for all of these products are same so maybe if she's giving review for one, this happen for the others too which is a clear case of duplication.*

In [11]:
sorted_df=filtered_df.sort_values('ProductId',axis=0,ascending=True)

In [12]:
#removing duplicates
final_df=sorted_df.drop_duplicates(subset={'UserId','ProfileName','Time','Text'},keep='first',inplace=False)
final_df.shape

(364173, 10)

In [13]:
#how much data still remains
(final_df['Id'].size*1.0/filtered_df['Id'].size*1.0)*100

69.25890143662969

In [14]:
final_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,1,939340800,EVERY book is educational,this witty little book makes my son laugh at l...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,1,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc..."
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,1,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...
138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,1,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...
138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,1,1018396800,A great way to learn the months,This is a book of poetry about the months of t...


*Another logic says HelpfulnessNumerator is always should be equal to HelpfulnessDenominator*

**if HelpfulnessNumerator is Yes then HelpfulnessDenominator is Yes+No**

In [15]:
final_df[final_df['HelpfulnessNumerator']>final_df['HelpfulnessDenominator']]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
59301,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,1,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
41159,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,1,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


*Only two rows are showing such issue.*

In [16]:
final_df=final_df[final_df['HelpfulnessNumerator']<=final_df['HelpfulnessDenominator']]

In [17]:
print(final_df.shape)
final_df['Score'].value_counts()

(364171, 10)


Score
1    307061
0     57110
Name: count, dtype: int64

In [18]:
final_df[final_df['ProfileName']=='Ram'].head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
500884,541550,B00014JYNO,A2V0I904FH7ABY,Ram,2,3,0,1338336000,Caution: contains Hydrogenated oil,The ingredients lists hydrogenated oil as one ...
2383,2595,B000EGX2EG,A2V0I904FH7ABY,Ram,1,3,0,1306195200,Great taste - but honestly what goes inside ? ...,I used to buy these as healthy snacks at my of...
51357,55796,B000HD7N7I,A2V0I904FH7ABY,Ram,1,4,0,1327190400,Contains cancer causing artificial butter flavor,"I love this popcorn, until I researched on art..."


## Preprocessing

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally Snowball Stemming the word (it was obsereved to be better than Porter Stemming)

In [19]:
#optional
stop=set(stopwords.words('english')) #set of stopwords
snow=nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #to clean the word of any html-tags
    cleanr=re.compile('<.*?>')
    cleantext=re.sub(cleanr,' ',sentence)
    return cleantext
def cleanpunc(sentence): #to clean the word of any punctuation
    cleaned=re.sub('r[?|!|\'|"|#]',r'',sentence)
    cleaned=re.sub('r[.|,|)|(|\|/]',r' ',cleaned)
    return cleaned
print(stop)
print('*************************************')
print(snow.stem('tasty'))

{"hadn't", 'aren', 'yourselves', 'theirs', 'has', 'herself', "mustn't", 'doing', 'between', 'into', "weren't", 'are', 'is', 'if', "it'd", 'her', 'just', 'the', "won't", 'each', 'off', "they'll", 'but', 'does', "i'd", "she'll", "he'd", 'wasn', "she's", "we'd", 'you', 'had', "i'm", 'too', 'ourselves', 'that', 'than', 'been', 'few', "aren't", 'other', 'my', 'our', 'shan', 'their', 'y', 'wouldn', 'all', 'don', "hasn't", 'he', "i've", 'didn', 'about', 'on', 'how', 'there', "couldn't", "isn't", 'no', 'nor', 'have', 't', 'again', "doesn't", 'above', 'they', 'hasn', 'i', 'which', 'both', 'for', 'mightn', 'some', 'against', "don't", "it'll", "they'd", 'of', 'mustn', 'a', 'couldn', 'in', 'hadn', 'what', 'and', 'when', "should've", 'me', 'yourself', "it's", 'o', 'below', 'itself', 'its', 'now', 'so', 'weren', 'myself', 'shouldn', "we've", "shan't", 'haven', "i'll", 'an', 'those', "you're", 'over', 'your', "needn't", 'can', 'm', "she'd", 'his', 'after', 'd', 'yours', 'who', 'will', 'own', 'ma', "w

### Text & Summary Preprocessing

In [20]:
def no_stopwords_preprocessed(data,colname):
    i=0
    str=' '
    final_df_string=[]
    all_positive_words=[]
    all_negative_words=[]
    all_positive_words1=[]
    all_negative_words1=[]
    s=''
    for sent in data[colname].values:
        #print(sent)
        filtered_sentence=[]
        sent=cleanhtml(sent)
        for w in sent.split():
            for cleaned_words in cleanpunc(w).split():
                if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):
                    s=(snow.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if(data[colname]==final_df['Text']).all():
                            if(data['Score'].values)[i] == 'positive':
                                all_positive_words.append(s)
                            elif(data['Score'].values)[i] == 'negative':
                                all_negative_words.append(s)
                            else:
                                continue
                    else:
                        if(data['Score'].values)[i] == 'positive':
                            all_positive_words1.append(s)
                        elif(data['Score'].values)[i] == 'negative':
                            all_negative_words1.append(s)
                        else:
                            continue
                        
                else:
                    continue
                
        #print(filtered_sentence)
        str=b" ".join(filtered_sentence)
    
        final_df_string.append(str)
        i=i+1
    return final_df_string

In [21]:
final_df['CleanedText_nostopwords']=no_stopwords_preprocessed(final_df,'Text')

In [22]:
final_df['CleanedSummary_nostopwords']=no_stopwords_preprocessed(final_df,'Summary')

In [23]:
final_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText_nostopwords,CleanedSummary_nostopwords
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,1,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,this witti littl book make son laugh recit the...,everi book educ
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,1,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew read these sendak and watch the realli ro...,love the miss the hard cover version
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,1,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,this fun way for children learn their month th...,chicken soup with rice month
138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,1,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,this great littl book read has nice rhythm wel...,good swingi rhythm for read aloud
138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,1,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,this book poetri about the month the yea goe t...,great way learn the month


In [26]:
#Implementing step-by-step for Text column
def stopwords_preprocessed(data,colname):
    i=0
    str=' '
    final_df_string=[]
    all_positive_words=[]
    all_negative_words=[]
    s=''
    for sent in data[colname].values:
        #print(sent)
        filtered_sentence=[]
        sent=cleanhtml(sent)
        for w in sent.split():
            for cleaned_words in cleanpunc(w).split():
                if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):
                    if(cleaned_words.lower() not in stop):
                        s=(snow.stem(cleaned_words.lower())).encode('utf8')
                        filtered_sentence.append(s)
                        if(data[colname]==final_df['Text']).all():
                            if(data['Score'].values)[i] == 'positive':
                                all_positive_words.append(s)
                            elif(data['Score'].values)[i] == 'negative':
                                all_negative_words.append(s)
                            else:
                                continue
                        else:
                            if(data['Score'].values)[i] == 'positive':
                                all_positive_words1.append(s)
                            elif(data['Score'].values)[i] == 'negative':
                                all_negative_words1.append(s)
                            else:
                                continue
                        
                    else:
                        continue
                else:
                    continue
        #print(filtered_sentence)
        str=b" ".join(filtered_sentence)

        final_df_string.append(str)
        i=i+1
    return final_df_string

In [25]:
final_df['CleanedText_stopwords']=stopwords_preprocessed(final_df,'Text')

In [27]:
final_df['CleanedSummary_stopwords']=stopwords_preprocessed(final_df,'Summary')

In [28]:
final_df.shape

(364171, 14)

## Featurization


1. Bag of Words (BoW)
2. Bi-gram and n-gram
3. TF-IDF
4. Word2Vec
5. Avg Word2Vec

### Bag of Words (BoW)

In [29]:
count_vec= CountVectorizer()
final_text_counts=count_vec.fit_transform(final_df['CleanedText_stopwords'].values)
final_summary_counts=count_vec.fit_transform(final_df['CleanedSummary_stopwords'].values)

In [30]:
type(final_text_counts)

scipy.sparse._csr.csr_matrix

In [31]:
print(final_text_counts.get_shape())
print(final_summary_counts.get_shape())

(364171, 59850)
(364171, 20894)


#### Bi-Grams and n-Grams.

Possibly there can be common words both in positive and negative words, for e.g. like so there could be a chance that in negative words it was 'not like'. But with stopwords we removed 'not' so bi-gram can be performed efficiently if we are careful with stopwords.

In [32]:
#for CleanedText
count_vec=CountVectorizer(ngram_range=(1,2))
final_bigram_counts_text=count_vec.fit_transform(final_df['CleanedText_nostopwords'].values)
#for CleanedSummary
count_vec1=CountVectorizer(ngram_range=(1,2))
final_bigram_counts_summary=count_vec1.fit_transform(final_df['CleanedSummary_nostopwords'].values)

In [33]:
print(final_bigram_counts_text.get_shape())
print(final_bigram_counts_summary.get_shape())

(364171, 2064429)
(364171, 223935)


### TF-IDF

In [37]:
# for CleanedText
tf_idf_vec_bigram_text = TfidfVectorizer(ngram_range=(1, 2))
final_tf_idf_bigram_text = tf_idf_vec_bigram_text.fit_transform(final_df['CleanedText_nostopwords'].values)
tf_idf_vec_text= TfidfVectorizer()
final_tf_idf_text = tf_idf_vec_text.fit_transform(final_df['CleanedText_nostopwords'].values)

# for CleanedSummary
tf_idf_vec_bigram_summary = TfidfVectorizer(ngram_range=(1, 2))
final_tf_idf_bigram_summary = tf_idf_vec_bigram_summary.fit_transform(final_df['CleanedSummary_nostopwords'].values)
tf_idf_vec_summary= TfidfVectorizer()
final_tf_idf_summary = tf_idf_vec_text.fit_transform(final_df['CleanedText_nostopwords'].values)

In [38]:
print(final_tf_idf_bigram_text.get_shape())
print(final_tf_idf_bigram_summary.get_shape())
print(final_tf_idf_text.get_shape())
print(final_tf_idf_summary.get_shape())

(364171, 2064429)
(364171, 223935)
(364171, 59895)
(364171, 59895)


In [39]:
final_df['bow_text'] = list(final_text_counts)

In [40]:
final_df['bow_summary']=list(final_summary_counts)

In [41]:
final_df['bigram_text']=list(final_bigram_counts_text)

In [42]:
final_df['bigram_summary']=list(final_bigram_counts_summary)

In [43]:
final_df['tf_idf_bigram_text']=list(final_tf_idf_bigram_text)

In [44]:
final_df['tf_idf_bigram_summary']=list(final_tf_idf_bigram_summary)

In [45]:
final_df['tf_idf_summary']=list(final_tf_idf_summary)

In [46]:
final_df['tf_idf_text']=list(final_tf_idf_text)

In [47]:
final_df.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,...,CleanedText_stopwords,CleanedSummary_stopwords,bow_text,bow_summary,bigram_text,bigram_summary,tf_idf_bigram_text,tf_idf_bigram_summary,tf_idf_summary,tf_idf_text
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,1,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,...,witti littl book make son laugh recit car driv...,everi book educ,"(0, 58502)\t1\n (0, 30286)\t1\n (0, 5933)\...","(0, 6010)\t1\n (0, 1970)\t1\n (0, 5631)\t1","(0, 1811754)\t3\n (0, 2025177)\t1\n (0, 10...","(0, 62516)\t1\n (0, 23759)\t1\n (0, 59526)...","(0, 1811754)\t0.051041095713488575\n (0, 20...","(0, 62516)\t0.2873281713225465\n (0, 23759)...","(0, 52944)\t0.09469757133165593\n (0, 58544...","(0, 52944)\t0.09469757133165593\n (0, 58544..."
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,1,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",...,grew read sendak watch realli rosi movi incorp...,love miss hard cover version,"(0, 48708)\t1\n (0, 30735)\t2\n (0, 22708)...","(0, 10649)\t1\n (0, 11621)\t1\n (0, 8205)\...","(0, 1615298)\t1\n (0, 1762805)\t4\n (0, 67...","(0, 115904)\t1\n (0, 191881)\t2\n (0, 1245...","(0, 1615298)\t0.07802797290183838\n (0, 176...","(0, 115904)\t0.11710115920089506\n (0, 1918...","(0, 48735)\t0.1624591971671617\n (0, 52695)...","(0, 48735)\t0.1624591971671617\n (0, 52695)..."
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,1,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,...,fun way children learn month yea learn poem th...,chicken soup rice month,"(0, 29575)\t2\n (0, 20706)\t1\n (0, 57545)...","(0, 3179)\t1\n (0, 17027)\t1\n (0, 15291)\...","(0, 1811754)\t1\n (0, 1762805)\t4\n (0, 98...","(0, 36385)\t1\n (0, 176004)\t1\n (0, 21631...","(0, 1811754)\t0.02453952192578791\n (0, 176...","(0, 36385)\t0.2591073252978461\n (0, 176004...","(0, 52944)\t0.04805963230241763\n (0, 52695...","(0, 52944)\t0.04805963230241763\n (0, 52695..."


### Word2vec

In [49]:
# After cleaning + removing stopwords
sentences_nostop_text = [text.split() for text in final_df['CleanedText_nostopwords'].values]
sentences_nostop_summary = [text.split() for text in final_df['CleanedSummary_nostopwords'].values]
sentences_widstop_text = [text.split() for text in final_df['CleanedText_stopwords'].values]
sentences_widstop_summary = [text.split() for text in final_df['CleanedSummary_stopwords'].values]
# Train Word2Vec
w2v_model1 = Word2Vec(sentences_nostop_text, vector_size=100, window=5, min_count=2, sg=1)
w2v_model2 = Word2Vec(sentences_nostop_summary, vector_size=100, window=5, min_count=2, sg=1)
w2v_model3 = Word2Vec(sentences_widstop_text, vector_size=100, window=5, min_count=2, sg=1)
w2v_model4 = Word2Vec(sentences_widstop_summary, vector_size=100, window=5, min_count=2, sg=1)

In [50]:
w2v_model1

<gensim.models.word2vec.Word2Vec at 0x1dcb723af10>

In [51]:
w2v_model2

<gensim.models.word2vec.Word2Vec at 0x1da85641410>

In [52]:
w2v_model3

<gensim.models.word2vec.Word2Vec at 0x1dadade2b10>

In [53]:
w2v_model4

<gensim.models.word2vec.Word2Vec at 0x1dcb723b690>

In [54]:
def compute_avg_w2v(sentence, model, vector_size):
    words = sentence.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    else:
        return np.mean(vectors, axis=0)

In [55]:
final_df['w2v_nostop_text']=final_df['CleanedText_nostopwords'].apply(lambda x: compute_avg_w2v(x, w2v_model1, 100))

In [56]:
final_df['w2v_nostop_summary']=final_df['CleanedSummary_nostopwords'].apply(lambda x: compute_avg_w2v(x, w2v_model2, 100))

In [57]:
final_df['w2v_widstop_text']=final_df['CleanedText_stopwords'].apply(lambda x: compute_avg_w2v(x, w2v_model3, 100))

In [58]:
final_df['w2v_widstop_summary']=final_df['CleanedSummary_stopwords'].apply(lambda x: compute_avg_w2v(x, w2v_model4, 100))

In [59]:
final_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,...,bigram_text,bigram_summary,tf_idf_bigram_text,tf_idf_bigram_summary,tf_idf_summary,tf_idf_text,w2v_nostop_text,w2v_nostop_summary,w2v_widstop_text,w2v_widstop_summary
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,1,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,...,"(0, 1811754)\t3\n (0, 2025177)\t1\n (0, 10...","(0, 62516)\t1\n (0, 23759)\t1\n (0, 59526)...","(0, 1811754)\t0.051041095713488575\n (0, 20...","(0, 62516)\t0.2873281713225465\n (0, 23759)...","(0, 52944)\t0.09469757133165593\n (0, 58544...","(0, 52944)\t0.09469757133165593\n (0, 58544...","[-0.15729389, 0.21413487, 0.0957333, 0.2648131...","[-0.20650323, 0.016876014, -0.024918543, -0.15...","[0.1592763, 0.0053047296, -0.30231035, -0.0203...","[-0.22778238, 0.3157369, 0.026158238, 0.056651..."
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,1,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",...,"(0, 1615298)\t1\n (0, 1762805)\t4\n (0, 67...","(0, 115904)\t1\n (0, 191881)\t2\n (0, 1245...","(0, 1615298)\t0.07802797290183838\n (0, 176...","(0, 115904)\t0.11710115920089506\n (0, 1918...","(0, 48735)\t0.1624591971671617\n (0, 52695)...","(0, 48735)\t0.1624591971671617\n (0, 52695)...","[-0.13379656, 0.18996346, 0.14569032, 0.387300...","[-0.31087092, 0.039597157, 0.030838296, -0.048...","[0.002982815, 0.023110123, -0.26565325, -0.050...","[-0.13022096, 0.5719431, 0.023160476, -0.45553..."
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,1,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,...,"(0, 1811754)\t1\n (0, 1762805)\t4\n (0, 98...","(0, 36385)\t1\n (0, 176004)\t1\n (0, 21631...","(0, 1811754)\t0.02453952192578791\n (0, 176...","(0, 36385)\t0.2591073252978461\n (0, 176004...","(0, 52944)\t0.04805963230241763\n (0, 52695...","(0, 52944)\t0.04805963230241763\n (0, 52695...","[-0.07200277, 0.12802768, 0.122765966, 0.25638...","[-0.0828864, 0.25313708, 0.16929294, -0.084005...","[0.21172859, -0.03823375, -0.27492884, -0.0270...","[-0.5748682, 0.84234554, 0.029315952, -0.23654..."
138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,1,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,...,"(0, 1811754)\t2\n (0, 2025177)\t1\n (0, 10...","(0, 83658)\t1\n (0, 185196)\t1\n (0, 16159...","(0, 1811754)\t0.03334287857204083\n (0, 202...","(0, 83658)\t0.09499835323359049\n (0, 18519...","(0, 52944)\t0.07004756649907445\n (0, 58544...","(0, 52944)\t0.07004756649907445\n (0, 58544...","[-0.07425427, 0.16350126, 0.11767914, 0.329750...","[-0.13789484, 0.21989323, -0.2082287, -0.00527...","[0.097942606, 0.012185157, -0.20204568, -0.009...","[-0.22883458, 0.3958758, 0.04987474, 0.0389433..."
138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,1,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,...,"(0, 1811754)\t4\n (0, 1014483)\t1\n (0, 21...","(0, 191881)\t1\n (0, 126076)\t1\n (0, 8702...","(0, 1811754)\t0.057517507543626445\n (0, 10...","(0, 191881)\t0.12518856905749115\n (0, 1260...","(0, 52944)\t0.10591207792971696\n (0, 30305...","(0, 52944)\t0.10591207792971696\n (0, 30305...","[-0.07507194, 0.119822145, 0.1550656, 0.330353...","[-0.2072709, 0.2381827, 0.009692291, 0.0918947...","[0.13621019, 0.012776502, -0.22061805, -0.0229...","[-0.13643925, 0.4677559, 0.07931543, 0.0335286..."


In [159]:
## final_df.to_csv('processed_df.csv',index=False)