In [1]:
# Supress Warnings

import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('Reviews.csv')

In [4]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Subtask 1.2: Inspect the dataframe

Inspect the dataframe's columns, shapes, variable types etc.

In [5]:
# Check the number of rows and columns in the dataframe

df.shape

(568454, 10)

In [6]:
# Check the column-wise info of the dataframe

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [7]:
# Get a summary of the dataframe using 'describe()'

df.describe()

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,1.0,0.0,0.0,1.0,939340800.0
25%,142114.25,0.0,0.0,4.0,1271290000.0
50%,284227.5,0.0,1.0,5.0,1311120000.0
75%,426340.75,2.0,2.0,5.0,1332720000.0
max,568454.0,866.0,923.0,5.0,1351210000.0


## Task 2: Cleaning the Data

-  ### Subtask 2.1: Inspect Null values

Find out the number of Null values in all the columns and rows. Also, find the percentage of Null values in each column. Round-off the percentages upto two decimal places.

In [8]:
# Get the column-wise Null count using 'is.null()' alongwith the 'sum()' function

df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

### Subtask 2.2: Drop duplicate values

Text Processing 

In [12]:
# value range of score 
df['Score'].unique()

array([5, 1, 4, 2, 3])

In [13]:
#converting reviews into postive and negative category 

def category(x):
    if x<3:
        return('negative')
    return('positive')

In [14]:
df['Score'] = df['Score'].apply(category)

In [15]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## Data Cleaning and Depulication

In [16]:
#sort the data
df.sort_values(by=['ProductId','UserId','ProfileName',], inplace=True)

In [17]:
# removing duplicate values 
final = df.drop_duplicates(subset={'UserId','ProfileName','Time','Text'},keep = 'first')

In [18]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
150518,150519,6641040,A12HY5OZ2QNK4N,Elizabeth H. Roessner,0,0,positive,1256774400,It's a great book!,I've always loved chicken soup and rice. My la...
150510,150511,6641040,A1C9K534BCI9GO,Laura Purdie Salas,0,0,positive,1344211200,Charming and childlike,"A charming, rhyming book that describes the ci..."
150496,150497,6641040,A1HKYQOFC8ZZCH,"Maria Apolloni ""lanarossa""",2,2,negative,1334707200,"The story is great, the softcover book is disa...",I give five stars to the Maurice Sendak story....
150499,150500,6641040,A1IJKK6Q1GTEAY,A Customer,2,2,positive,1009324800,It Was a favorite!,This was a favorite book of mine when I was a ...
150506,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...


In [19]:
final.shape

(256059, 10)

In [26]:
final['Score'].value_counts()

positive    215428
negative     40631
Name: Score, dtype: int64

## Bag of words

In [28]:
#BOW
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()

In [29]:
final_counts = count_vec.fit_transform(final['Text'].values)

In [30]:
type(final_counts)

scipy.sparse.csr.csr_matrix

In [31]:
final_counts.get_shape()

(256059, 93455)

Row corresponds to each documnet and column is unique words, hence the dimension is 256059, 93455

## Text Pre-Processing 

In [49]:
import re
# https://pymotw.com/2/re/
import nltk 

import string 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
Porter = PorterStemmer()
#from nltk.stem.wordnet import WordNetLematizer 
from nltk.stem import WordNetLemmatizer


In [50]:
stop = set(stopwords.words('english'))
sno = nltk.stem.SnowballStemmer('english')

In [52]:
def cleanhtml(senetence):
    cleanr = re.compile('<.*?>')
    cleanext = re.sub(cleanr,' ',senetence)
    return cleanext

def cleanpunc (senetence):
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',senetence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return cleaned

print(stop)
print('******************************')
print(sno.stem('tasty'))

{'what', 'as', 'was', 'if', 'yours', 'over', 'until', "isn't", 'against', 'wouldn', 'having', 'haven', 'myself', 'ours', 't', 'too', 'more', 'doesn', "should've", 'on', "you'd", 'each', "you'll", 'o', 'yourself', 'your', 'other', 'then', 'i', 'nor', 'after', 'not', "won't", 'here', 'of', 'further', 'themselves', 'weren', 'himself', 'am', 'been', 'my', "you're", 'needn', 'yourselves', 'didn', 'both', "hadn't", 'should', 'off', 've', 'mustn', 'we', 'through', 'isn', 'because', 'wasn', 'be', 'won', 'but', 'about', 'can', 'me', "doesn't", 'her', 'had', 're', 'the', 'hadn', 'for', 'same', 'his', 'from', 'just', 'such', 'by', 'these', 'where', "haven't", 'don', "it's", 'couldn', 'below', 'how', 'an', 'most', 'their', "wouldn't", 'doing', 'hasn', 'and', "shouldn't", 'ourselves', 'once', "you've", 'in', 'ma', 'y', 'again', 'do', "shan't", 'its', 'whom', 's', 'll', 'mightn', 'than', 'so', 'herself', "wasn't", 'this', "mustn't", 'before', 'above', "didn't", 'any', "she's", 'ain', 'theirs', 'to',

In [55]:
i = 0
str1 = ' '
final_string = []
all_positive_words = []
all_negative_words = []
s = ''

for sent in final['Text'].values:
    filtered_sentence=[]
    sent = cleanhtml(sent) # clean HTML tag
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words) > 2)):
                if(cleaned_words.lower() not in stop):
                    s = (sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if(final['Score'].values)[i]== 'positive':
                        all_positive_words.append(s)
                    if(final['Score'].values)[i]== 'negative':
                        all_negative_words.append(s)
            else:
                continue
        else:
            continue
            
    str1 = b" ".join(filtered_sentence)
    
    final_string.append(str1)
    i+=1

In [56]:
final['cleanedText'] = final_string

In [57]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,cleanedText
150518,150519,6641040,A12HY5OZ2QNK4N,Elizabeth H. Roessner,0,0,positive,1256774400,It's a great book!,I've always loved chicken soup and rice. My la...,b'ive alway love chicken soup rice late ethel ...
150510,150511,6641040,A1C9K534BCI9GO,Laura Purdie Salas,0,0,positive,1344211200,Charming and childlike,"A charming, rhyming book that describes the ci...",b'charm rhyme book describ circumst eat dont c...
150496,150497,6641040,A1HKYQOFC8ZZCH,"Maria Apolloni ""lanarossa""",2,2,negative,1334707200,"The story is great, the softcover book is disa...",I give five stars to the Maurice Sendak story....,b'give five star mauric sendak stori one star ...
150499,150500,6641040,A1IJKK6Q1GTEAY,A Customer,2,2,positive,1009324800,It Was a favorite!,This was a favorite book of mine when I was a ...,b'favorit book mine littl girl would read time...
150506,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,b'fun way children learn month year learn poem...


## Bi-Gram and n-Gram

In [71]:
from nltk import FreqDist

freq_dist_postive = FreqDist(all_positive_words)
freq_dist_negative = FreqDist(all_negative_words)

print('freq_dist_postive',freq_dist_postive.most_common(10))
print('freq_dist_negative',freq_dist_negative.most_common(10))

freq_dist_postive [(b'like', 82522), (b'tast', 79723), (b'love', 71817), (b'good', 69049), (b'great', 68855), (b'use', 65651), (b'product', 63406), (b'flavor', 62261), (b'one', 59436), (b'tri', 56365)]
freq_dist_negative [(b'tast', 21416), (b'like', 19827), (b'product', 19509), (b'one', 13494), (b'would', 11866), (b'tri', 11384), (b'flavor', 11166), (b'coffe', 10293), (b'order', 10207), (b'use', 9922)]


In [72]:
#bigram and n-grams 

count_vect = CountVectorizer(ngram_range=(1,2))
final_bigram_counts = count_vect.fit_transform(final['cleanedText'].values)

In [73]:
final_bigram_counts.get_shape()

(256059, 2193560)

## TFIDF

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [78]:
tf_idf_Vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_Vect.fit_transform(final['cleanedText'].values)

In [79]:
final_tf_idf.get_shape()

(256059, 2193560)

In [80]:
features = tf_idf_Vect.get_feature_names()

In [81]:
len(features)

2193560

In [86]:
features[1000:1010]

['abl fight',
 'abl figur',
 'abl file',
 'abl fill',
 'abl final',
 'abl find',
 'abl fine',
 'abl fing',
 'abl finish',
 'abl first']

In [88]:
print(final_tf_idf[3,:].toarray()[0])

[0. 0. 0. ... 0. 0. 0.]


In [90]:
#top tfidf features 

def top_tfidf_feat(row, features, top_n = 25):
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feature = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feature)
    df.columns = ['features', 'tfidf']
    return df

top_tfidf = top_tfidf_feat(final_tf_idf[1,:].toarray()[0],features, top_n = 25)


In [91]:
top_tfidf

Unnamed: 0,features,tfidf
0,childlik skill,0.177334
1,describ circumst,0.177334
2,charm rhyme,0.177334
3,teacher crazi,0.177334
4,catchi sound,0.177334
5,sing drive,0.177334
6,theyr recess,0.177334
7,recess sing,0.177334
8,cute catchi,0.177334
9,realli childlik,0.177334


### Word to Vec 
using gensim library 

In [94]:
!python -m pip install -U gensim

Collecting gensim
  Downloading gensim-3.8.3-cp37-cp37m-macosx_10_9_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 2.9 MB/s eta 0:00:01
Collecting smart-open>=1.8.1
  Downloading smart_open-2.1.0.tar.gz (116 kB)
[K     |████████████████████████████████| 116 kB 4.1 MB/s eta 0:00:01
Collecting boto3
  Downloading boto3-1.14.14-py2.py3-none-any.whl (128 kB)
[K     |████████████████████████████████| 128 kB 5.5 MB/s eta 0:00:01
Collecting s3transfer<0.4.0,>=0.3.0
  Downloading s3transfer-0.3.3-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 2.9 MB/s eta 0:00:01
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting botocore<1.18.0,>=1.17.14
  Downloading botocore-1.17.14-py2.py3-none-any.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 2.4 MB/s eta 0:00:01     |█████████████████▊              | 3.5 MB 5.8 MB/s eta 0:00:01
[?25hCollecting docutils<0.16,>=0.10
  Downl

In [95]:
import gensim
from gensim.models import Word2Vec
from gensim.models import keyedVectors
import picke

ImportError: cannot import name 'keyedVectors' from 'gensim.models' (/Users/neha/opt/anaconda3/lib/python3.7/site-packages/gensim/models/__init__.py)

In [None]:
!python -m pip uninstall scipy
!python -m conda install scipy

Found existing installation: scipy 1.4.1
Uninstalling scipy-1.4.1:
  Would remove:
    /Users/neha/opt/anaconda3/lib/python3.7/site-packages/scipy-1.4.1.dist-info/*
    /Users/neha/opt/anaconda3/lib/python3.7/site-packages/scipy/*
Proceed (y/n)? ^C
[31mERROR: Operation cancelled by user[0m


In [None]:
y