## Random Forest Classification on Amazon Fine food Dataset from kaggle

## Dataset
This dataset consists of reviews of fine foods from amazon. The data span a period of more than 10 years, including all ~500,000 reviews up to October 2012. 
## Task
 
#### Input - ProductId, UserId, ProfileName, HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text 
#### Ouput- To determine polarity if a review is positive/negative.

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve,auc
from nltk.stem.porter import PorterStemmer

In [2]:
con=sqlite3.connect('database.sqlite')

In [3]:
filtered_data=pd.read_sql_query("""select * from Reviews where score!=3 """,con)

#### We classify the score(4-5) as positive and score(1-2) as negative and discard score=3

In [4]:
filtered_data

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
525809,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
525810,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
525811,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
525812,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [5]:
def partition(x):
    if x<3:
        return 'negative'
    else:
        return 'positive'

In [6]:
real_score=filtered_data['Score']
positivenegative=real_score.map(partition)
filtered_data['Score']=positivenegative

In [7]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Data Cleaning

In [8]:
sorted_df=filtered_data.sort_values('ProductId',axis=0,ascending=True)

In [9]:
sorted_df.shape

(525814, 10)

#### Removing duplicates for same user with text for different products on same time

In [10]:
sorted_df=sorted_df.drop_duplicates(subset={'UserId','ProfileName','Time','Text'},keep='first',inplace=False)

In [11]:
sorted_df.shape

(364173, 10)

#### Data remaining after removing duplicates = 69% (intial = 525814, now=364173)

In [12]:
final=sorted_df[sorted_df.HelpfulnessNumerator <= sorted_df.HelpfulnessDenominator]

In [13]:
final.shape

(364171, 10)

#### Helpfulness numerator is total positive reviews and helpfullness denominator is total of both +ve,-ve. so, removing redudunant rows

In [14]:
final['Score'].value_counts()

positive    307061
negative     57110
Name: Score, dtype: int64

### Text to vector mapping
### 1. Bag of Words

In [15]:
count_vec=CountVectorizer()
# It will convert all text to lower case by default
final_counts=count_vec.fit_transform(final['Text'].values)

In [16]:
#### final_counts is a sparse matrix represenattion
final_counts.get_shape()

(364171, 115281)

### Text-preprocessing - Stopwords,lemmatization,stemming

1. Removing HTML tags
2. Removing punctuations
3. Check for english letter and word is not alpha-numeric
4. len(words)>2
5. converting words to lowercase 
6. stemming

In [17]:
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

stop=set(stopwords.words('english'))
snow=nltk.stem.SnowballStemmer('english')

In [18]:
def clean_html(sent):
    clean=re.compile('<.*?>')
    cleantext=re.sub(clean,' ',sent)
    return cleantext

In [19]:
def cleanpunc(sent):
    cleaned=re.sub(r'[?|!|\'|"|#]',r'',sent)
    cleaned=re.sub(r'[.|,|)|(|/]',r' ',cleaned)
    return cleaned
             

In [20]:
i=0
final_string=[]
pos_words=[]
neg_words=[]
for sent in final['Text'].values:
    fil_sen=[]
    sent=clean_html(sent)
    for w in sent.split():
        for clean_words in cleanpunc(w).split():
            if((clean_words.isalpha())&(len(clean_words)>2)):
                if(clean_words.lower() not in stop):
                    s=(snow.stem(clean_words.lower())).encode('utf8')
                    fil_sen.append(s)
                    if((final['Score'].values)[i]=='positive'):
                        pos_words.append(s)
                    else:
                        neg_words.append(s)
                else:
                    continue
            else:
                continue
        
    strl=b" ".join(fil_sen)
    final_string.append(strl)
    i+=1

In [21]:
final['cleaned_text']=final_string

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
final[['Text','cleaned_text']].head()

Unnamed: 0,Text,cleaned_text
138706,this witty little book makes my son laugh at l...,b'witti littl book make son laugh loud recit c...
138688,"I grew up reading these Sendak books, and watc...",b'grew read sendak book watch realli rosi movi...
138689,This is a fun way for children to learn their ...,b'fun way children learn month year learn poem...
138690,This is a great little book to read aloud- it ...,b'great littl book read nice rhythm well good ...
138691,This is a book of poetry about the months of t...,b'book poetri month year goe month cute littl ...


### Bi-Grams and N-grams

In [23]:
# calculating freq of +ve and -ve words
freq_dist_pos=nltk.FreqDist(pos_words)
freq_dist_neg=nltk.FreqDist(neg_words)


In [24]:
print('Most common +ve words',freq_dist_pos.most_common(20))
print('Most common -ve words',freq_dist_neg.most_common(20))

Most common +ve words [(b'like', 139429), (b'tast', 129047), (b'good', 112766), (b'flavor', 109624), (b'love', 107357), (b'use', 103888), (b'great', 103870), (b'one', 96726), (b'product', 91033), (b'tri', 86791), (b'tea', 83888), (b'coffe', 78814), (b'make', 75107), (b'get', 72125), (b'food', 64802), (b'would', 55568), (b'time', 55264), (b'buy', 54198), (b'realli', 52715), (b'eat', 52004)]
Most common -ve words [(b'tast', 34585), (b'like', 32330), (b'product', 28218), (b'one', 20569), (b'flavor', 19575), (b'would', 17972), (b'tri', 17753), (b'use', 15302), (b'good', 15041), (b'coffe', 14716), (b'get', 13786), (b'buy', 13752), (b'order', 12871), (b'food', 12754), (b'dont', 11877), (b'tea', 11665), (b'even', 11085), (b'box', 10844), (b'amazon', 10073), (b'make', 9840)]


#### Considering bigram,trigram,ngram because of overlap of words frokm positive and negative freq dist eg. like occured both in +ve and -ve reviews

In [213]:
count_vec=CountVectorizer(ngram_range=(1,2))
final_counts=count_vec.fit_transform(final['cleaned_text'].values)

In [214]:
final_counts.get_shape()

(364171, 2923725)

#### After calculating bigrams dimensions increased from (364171, 115281) to (364171, 2923725)

### TF-IDF

In [215]:
tf_vec=TfidfVectorizer(ngram_range=(1,2))
final_tf_idf=tf_vec.fit_transform(final['cleaned_text'].values)

In [216]:
final_tf_idf.get_shape()

(364171, 2923725)

In [217]:
features=tf_vec.get_feature_names()

In [218]:
features[10000:10009]

['acess keurig',
 'acesufam',
 'acesufam potassium',
 'acesuflamek',
 'acesuflamek sacharin',
 'acesulf',
 'acesulf also',
 'acesulf found',
 'acesulfam']

In [219]:
def top_tfidf_features(rows,features,top_n=25):
    top_ids=np.argsort(rows)[::-1][:top_n]
    top_feat=[(features[i],rows[i]) for i in top_ids]
    df=pd.DataFrame(top_feat)
    df.columns=['features','tfidf']
    return df
top_idf=top_tfidf_features(final_tf_idf[1,:].toarray()[0],features,25)

In [32]:
top_idf

Unnamed: 0,features,tfidf
0,sendak books,0.173437
1,rosie movie,0.173437
2,paperbacks seem,0.173437
3,cover version,0.173437
4,these sendak,0.173437
5,the paperbacks,0.173437
6,pages open,0.173437
7,really rosie,0.168074
8,incorporates them,0.168074
9,paperbacks,0.168074


### Word2Vector

In [33]:
import gensim
i=0
list_sent=[]
for sent in final['Text'].values:
    fil_sen=[]
    sent=clean_html(sent)
    for w in sent.split():
        for clean_words in cleanpunc(w).split():
            if((clean_words.isalpha())):
                fil_sen.append(clean_words)
            else:
                continue
    list_sent.append(fil_sen)
                

In [34]:
print(final['Text'].values[0])
print("******************************************************")
print(list_sent[0])

this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college
******************************************************
['this', 'witty', 'little', 'book', 'makes', 'my', 'son', 'laugh', 'at', 'loud', 'i', 'recite', 'it', 'in', 'the', 'car', 'as', 'were', 'driving', 'along', 'and', 'he', 'always', 'can', 'sing', 'the', 'refrain', 'hes', 'learned', 'about', 'whales', 'India', 'drooping', 'i', 'love', 'all', 'the', 'new', 'words', 'this', 'book', 'introduces', 'and', 'the', 'silliness', 'of', 'it', 'all', 'this', 'is', 'a', 'classic', 'book', 'i', 'am', 'willing', 'to', 'bet', 'my', 'son', 'will', 'STILL', 'be', 'able', 'to', 'recite', 'from', 'memory', 'when', 'he', 'is', 'in', 'co

In [35]:
# min_count indicates words with min 5 occurances and construct a vector
# size - dimensions of vectors to be constructed.(more corpus, size should be high)
# workers - for using cores in cpu
w2v_model=gensim.models.Word2Vec(list_sent,min_count=5,size=50,workers=4)

In [36]:
words=list(w2v_model.wv.vocab)
print(len(words))

44605


In [37]:
w2v_model.wv.most_similar('tasty')

[('tastey', 0.9135609865188599),
 ('yummy', 0.8866933584213257),
 ('satisfying', 0.8614116907119751),
 ('delicious', 0.8450340032577515),
 ('filling', 0.8422430753707886),
 ('flavorful', 0.8268660306930542),
 ('addicting', 0.797648549079895),
 ('versatile', 0.7878106236457825),
 ('nutritious', 0.770317554473877),
 ('delish', 0.7684949636459351)]

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.model_selection  import cross_val_score
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.metrics import roc_auc_score

In [39]:
def part(x):
    if x=="positive":
        return 1
    else:
        return 0

In [40]:
X=final_tf_idf
y_map=final['Score'].map(part)

In [41]:
final['score_n']=y_map

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [42]:
y=np.array(final['score_n'])

In [43]:
df_samp_pos=final[final['score_n']==1].sample(n=6000)

In [44]:
df_samp_neg=final[final['score_n']==0].sample(n=6000)

In [46]:
df_samp=pd.concat([df_samp_pos,df_samp_neg])

In [47]:
# Sorting data based on time
df_samp['Time'] = pd.to_datetime(df_samp['Time'], unit = 's')
df_samp = df_samp.sort_values(by = 'Time')

In [48]:
df_samp.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,cleaned_text,score_n
346041,374343,B00004CI84,A1B2IZU1JLZA6,Wes,19,23,negative,2000-01-19,WARNING: CLAMSHELL EDITION IS EDITED TV VERSION,"I, myself always enjoyed this movie, it's very...",b'alway enjoy movi funni entertain didnt hesit...,0
346054,374358,B00004CI84,A1HWMNSQF14MP8,will@socialaw.com,1,2,positive,2000-12-30,A Afterlife Success,"Many movies, have dealt with the figure of dea...",b'mani movi dealt figur death dead good angel ...,1
346077,374382,B00004CI84,A3C3BAQDZWH5YE,Kushana no shinryaku (Kushana's invasion),0,1,positive,2002-02-26,...,"It was on the other night, and, having been a ...",b'night big fan cartoon shown decid watch also...,1
121056,131233,B00004RAMX,A1PYZPS1QYR036,"Kazantzakis ""hinterlands""",5,8,negative,2003-10-29,Woodstream Gopher Trap 0610,This is a poor excuse for a gopher trap. I hav...,b'poor excus gopher trap lot gopher use trap r...,0
264269,286453,B0000DJ7WI,A375QRG43POEW6,dndnd,23,25,negative,2003-11-28,Stay Away...,This product ships in great packaging and it l...,b'product ship great packag look good tast hor...,0


In [49]:
df_samp.shape

(12000, 12)

As dealing with timestamp, we take first 8400 timestamps as train and remaining as test data instead of random split because reviews change according to time and random split performance is less than training into certain intervals of time and predicting next 

In [50]:
datasplit_train=df_samp.iloc[:8400,:]
datasplit_test=df_samp.iloc[8400:,:]

In [51]:
datasplit_train.shape

(8400, 12)

In [52]:
datasplit_test.shape

(3600, 12)

## 1. Applying BOW vectorization on text column

In [252]:
count_vec=CountVectorizer()
# It will convert all text to lower case by default
final_counts=count_vec.fit_transform(datasplit_train['cleaned_text'].values)

In [253]:
X=final_counts

In [254]:
Y=np.array(datasplit_train['score_n'])

### Don't perform fit_transform for text data as count_vect is already trained use transform and use count_vect from train set

In [255]:

# It will convert all text to lower case by default
final_counts=count_vec.transform(datasplit_test['cleaned_text'].values)

In [256]:
X_test=final_counts

In [257]:
Y_test=np.array(datasplit_test['score_n'])

In [258]:
# further dividing data into training and cross-validation sets
X_tr, X_cv, y_tr, y_cv = train_test_split(X, Y, test_size=0.3)

### Simple cross validation

In [327]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
scalar = StandardScaler(with_mean=False)
X_tr= scalar.fit_transform(X_tr)
X_test= scalar.transform(X_test)
X_cv=scalar.transform(X_cv)

base_learners = [20,40,60,80,100,120]
depths=[1,5,10,50,100,500,1000]
param_grid={'n_estimators': base_learners, 'max_depth':depths}
rf = RandomForestClassifier(max_features='sqrt')
model=GridSearchCV(rf,param_grid,scoring='roc_auc',n_jobs=-1,cv=3)
model.fit(X_tr,y_tr)
print("optimal n_estimators",model.best_estimator_.n_estimators)
print("optimal max_depth",model.best_estimator_.max_depth)

optimal n_estimators 120
optimal max_depth 100


### Observation:
1. optimal base models (n_estimators) :  120
2. optimal max tree Depth  :  100

In [328]:
rf=RandomForestClassifier(max_features='sqrt',max_depth=100,n_estimators=120)
rf.fit(X_tr,y_tr)
pred=rf.predict(X_test)
acc=accuracy_score(Y_test,pred)*100

In [329]:
acc

82.83333333333334

### Total accuracy = 82.8%

In [330]:
from sklearn.metrics import precision_score
precision=precision_score(Y_test,pred)

In [331]:
precision

0.8045454545454546

### Precision=80.4%

In [332]:
from sklearn.metrics import recall_score
rec_score=recall_score(Y_test,pred)

In [333]:
rec_score

0.8378698224852071

### Recall=83.78%

In [334]:
from sklearn.metrics import confusion_matrix
c_mat=confusion_matrix(Y_test,pred)

### Confusion_matrix

In [335]:
c_mat

array([[1566,  344],
       [ 274, 1416]], dtype=int64)

## 2. Applying TFIDF vectorization 

In [336]:
tf_vec=TfidfVectorizer(ngram_range=(1,2))
final_tf_idf=tf_vec.fit_transform(datasplit_train['cleaned_text'].values)

In [337]:
X=final_tf_idf

In [338]:
Y=np.array(datasplit_train['score_n'])

In [339]:
final_counts=tf_vec.transform(datasplit_test['cleaned_text'].values)

In [340]:
X_test=final_counts

In [341]:
Y_test=np.array(datasplit_test['score_n'])

In [342]:
X_tr, X_cv, y_tr, y_cv = train_test_split(X, Y, test_size=0.3)

In [343]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
scalar = StandardScaler(with_mean=False)
X_tr= scalar.fit_transform(X_tr)
X_test= scalar.transform(X_test)
X_cv=scalar.transform(X_cv)

base_learners = [20,40,60,80,100,120]
depths=[1,5,10,50,100,500,1000]
param_grid={'n_estimators': base_learners, 'max_depth':depths}
rf = RandomForestClassifier(max_features='sqrt')
model=GridSearchCV(rf,param_grid,scoring='roc_auc',n_jobs=-1,cv=3)
model.fit(X_tr,y_tr)
print("optimal n_estimators",model.best_estimator_.n_estimators)
print("optimal max_depth",model.best_estimator_.max_depth)

optimal n_estimators 100
optimal max_depth 100


### Observation:
1. optimal base models (n_estimators) :  100
2. optimal max tree Depth  :  100

In [344]:
rf=RandomForestClassifier(max_features='sqrt',max_depth=100,n_estimators=100)
rf.fit(X_tr,y_tr)
pred=rf.predict(X_test)
acc=accuracy_score(Y_test,pred)*100



In [345]:
acc

82.86111111111111

### Total accuracy = 82.6%

In [346]:
from sklearn.metrics import precision_score
precision=precision_score(Y_test,pred)

In [347]:
precision

0.8043108338060124

### Precision=80.4%

In [348]:
from sklearn.metrics import recall_score
rec_score=recall_score(Y_test,pred)

In [349]:
rec_score

0.8390532544378698

### Recall=83.9%