In [1]:
import numpy as np
import pandas as pd
# for basic visualizations
import matplotlib.pyplot as plt
import seaborn as sns
# for advanced visualizations
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected = True)
import plotly.figure_factory as ff
import pyrsm as rsm

In [2]:
df = pd.read_csv("data/amazon_baby.csv")
df.shape

(183531, 3)

In [3]:
df.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [4]:
# Classify the rating to different sentiment level. 
df['sentiment']=rsm.ifelse(df.rating>=4,'positive',rsm.ifelse(df.rating==3,'neutual','negative'))

df.head()

Unnamed: 0,name,review,rating,sentiment
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,neutual
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,positive
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,positive
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,positive
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,positive


In [5]:
df.isnull().sum()

name         318
review       829
rating         0
sentiment      0
dtype: int64

In [6]:
# Drop rows with Null value
df=df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
df.isnull().sum()

name         0
review       0
rating       0
sentiment    0
dtype: int64

In [7]:
df.shape

(182384, 4)

### Build Word2Vec and Glove model

Use 100 dimension to build the model

In [8]:
import nltk
nltk.download('punkt') 
nltk.download('stopwords') 

from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer 

ps = PorterStemmer() 

def pre_processing_by_nltk(doc, stemming = True, need_sent = False):
    # step 1: get sentences
    sentences = sent_tokenize(doc)
    # step 2: get tokens
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
#         step 3 (optional): stemming
        if stemming:
            words = [ps.stem(word) for word in words]
        if need_sent:
            tokens.append(words)
        else:
            tokens += words
    return [w.lower() for w in tokens if w.lower() not in stop]

In [10]:
# Build the dataset for later use.
# Stemming dataset
sentences=df.review.apply(pre_processing_by_nltk).values

In [11]:
y = df['sentiment'].values
len(y)

182384

In [12]:
from gensim.models.word2vec import Word2Vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
model = Word2Vec(vector_size=100, min_count=2, epochs=5)

In [13]:
# Build Glove model.

glove_file = 'glove.6B.100d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)

model_glove = KeyedVectors.load_word2vec_format(tmp_file)


Call to deprecated `glove2word2vec` (KeyedVectors.load_word2vec_format(.., binary=False, no_header=True) loads GLoVE text vectors.).



In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sentences, y, test_size = 0.2,random_state=42,shuffle=True)

In [15]:
model.build_vocab(X_train)
model.train(X_train, total_examples=model.corpus_count, epochs=model.epochs)
print(len(model.wv.key_to_index))

32709


In [16]:
# Define definition to get average vectoers.
import numpy as np

# For Word2Vec model
def get_mean_vector(doc, wv):
    vecs = []
    for token in doc:
        try:
            vecs.append(wv.wv[token])
        except KeyError:
            pass
    return np.mean(vecs, axis=0)

# For GloVe model
def get_mean_vector1(doc, wv):
    vecs = []
    for token in doc:
        try:
            vecs.append(wv[token])
        except KeyError:
            pass
    return np.mean(vecs, axis=0)

In [17]:
import pickle
embedding_weights = pickle.load(open('data/embedding_weights.pkl', 'rb'))
avg=embedding_weights.mean(axis=0)

In [18]:
embedding_weights_test = pickle.load(open('data/embedding_weights_test.pkl', 'rb'))
avg_test=embedding_weights_test.mean(axis=0)

In [19]:
len(avg_test)

100

In [20]:
# Average train vector for stemming model.
import numpy
all_vectors_train = []
for i in range(0,len(X_train)):
    vec = get_mean_vector(X_train[i],model)
    if numpy.isnan(vec).all():
        all_vectors_train.append(avg)
    else:
        all_vectors_train.append(vec)
    i+=1


Mean of empty slice.


invalid value encountered in double_scalars



In [21]:
all_vectors_train=np.hstack(np.array(all_vectors_train, dtype="object")).reshape((len(all_vectors_train),100))

In [22]:
all_vectors_train.shape

(145907, 100)

In [23]:
# Average train vector for Glove model.
all_vectors_train_glove = []
for i in range(0,len(X_train)):
    vec =  get_mean_vector1(X_train[i],model_glove)
    if numpy.isnan(vec).all():
        all_vectors_train_glove.append(avg)
    else:
        all_vectors_train_glove.append(vec)
    i+=1
    
all_vectors_train_glove=np.hstack(np.array(all_vectors_train_glove,dtype="object")).reshape(len(all_vectors_train_glove),100)
all_vectors_train_glove.shape

(145907, 100)

In [24]:
all_vectors_train_glove=np.hstack(np.array(all_vectors_train_glove, dtype="object")).reshape(len(all_vectors_train_glove),100)

In [25]:
# Average test vector for stemming model.
all_vectors_test = []
for i in range(0,len(X_test)):
    vec = get_mean_vector(X_test[i],model)
    if numpy.isnan(vec).all():
        all_vectors_test.append(avg)
    else:
        all_vectors_test.append(vec)
    i+=1
    
# Average test vector for Glove model.Here I use unstemming dataset.
all_vectors_test_glove = []
for i in range(0,len(X_test)):
    vec =  get_mean_vector1(X_test[i],model_glove)
    if numpy.isnan(vec).all():
        all_vectors_test_glove.append(avg)
    else:
        all_vectors_test_glove.append(vec)
    i+=1
    
# reshape the vectors for logistic model
all_vectors_test=np.hstack(np.array(all_vectors_test, dtype="object")).reshape((len(all_vectors_test),100))
all_vectors_test_glove=np.hstack(np.array(all_vectors_test_glove, dtype="object")).reshape(len(all_vectors_test_glove),100)

#### Logistic Regression Model

In [27]:
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV


cv= LogisticRegressionCV(cv=5,scoring='accuracy',random_state=42,n_jobs=-1,verbose=3,max_iter=1000).fit(all_vectors_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.3min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.4min finished


In [38]:
cv1= LogisticRegressionCV(cv=5,scoring='accuracy',random_state=42,n_jobs=-1,verbose=3,max_iter=1000).fit(all_vectors_train_glove, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.0min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.1min finished


In [28]:
print("Training Accuracy :", cv.score(all_vectors_train, y_train))
print("Testing Accuracy :", cv.score(all_vectors_test, y_test))

Training Accuracy : 0.8158484514108302
Testing Accuracy : 0.8150889601666804


In [39]:
print("Training Accuracy :", cv1.score(all_vectors_train_glove, y_train))
print("Testing Accuracy :", cv1.score(all_vectors_test_glove, y_test))

Training Accuracy : 0.7802161650914624
Testing Accuracy : 0.7808207911834855


In [40]:
y_pred_proba = cv.predict_proba(all_vectors_test)
y_pred = cv.predict(all_vectors_test)

y_pred_proba1 = cv1.predict_proba(all_vectors_test_glove)
y_pred1 = cv1.predict(all_vectors_test_glove)

auc_val = metrics.roc_auc_score(y_test, y_pred_proba,multi_class='ovo')
auc_val1 = metrics.roc_auc_score(y_test, y_pred_proba1,multi_class='ovo')

print(f"""The AUC for word2vec model is {auc_val}.""")
print(f"""The AUC for Glove model is {auc_val1}.""")

The AUC for word2vec model is 0.810579812895571.
The AUC for Glove model is 0.740645583399823.


In [41]:
MicroF1_word2vec=f1_score(y_test, y_pred, average='micro')
MacroF1_word2vec=f1_score(y_test, y_pred, average='macro')

print(f"""The MicroF1 for word2vec model is {MicroF1_word2vec}.""")
print(f"""The MacroF1 for word2vec model is {MacroF1_word2vec}.""")

The MicroF1 for word2vec model is 0.8150889601666805.
The MacroF1 for word2vec model is 0.527113591669395.


In [42]:
MicroF1_Glove=f1_score(y_test, y_pred1, average='micro')
MacroF1_Glove=f1_score(y_test, y_pred1, average='macro')

print(f"""The MicroF1 for Glove model is {MicroF1_Glove}.""")
print(f"""The MacroF1 for Glove model is {MacroF1_Glove}.""")

The MicroF1 for Glove model is 0.7808207911834855.
The MacroF1 for Glove model is 0.4214751867154936.


### Random Forest 

In [33]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth = 8, random_state=42, max_leaf_nodes = 25, min_samples_leaf=25)
clf.fit(all_vectors_train, y_train)

RandomForestClassifier(max_depth=8, max_leaf_nodes=25, min_samples_leaf=25,
                       random_state=42)

In [34]:
print("Training Accuracy :", clf.score(all_vectors_train, y_train))
print("Testing Accuracy :", clf.score(all_vectors_test, y_test))

Training Accuracy : 0.7724303837375863
Testing Accuracy : 0.7725964306275187


In [43]:
clf1 = RandomForestClassifier(max_depth = 8, random_state=42, max_leaf_nodes = 25, min_samples_leaf=25)
clf1.fit(all_vectors_train_glove, y_train)

RandomForestClassifier(max_depth=8, max_leaf_nodes=25, min_samples_leaf=25,
                       random_state=42)

In [35]:
y_pred_proba_rf = clf.predict_proba(all_vectors_test)
y_pred_rf = clf.predict(all_vectors_test)

auc_val = metrics.roc_auc_score(y_test, y_pred_proba_rf,multi_class='ovo')
MicroF1_W_RF=f1_score(y_test, y_pred_rf, average='micro')
MacroF1_W_RF=f1_score(y_test, y_pred_rf, average='macro')

print(f"""The AUC for word2vec vectors using Random Forest Model is {auc_val}.""")
print(f"""The MicroF1 for word2vec vector using Random Forest Model is {MicroF1_W_RF}.""")
print(f"""The MacroF1 for word2vec vector using Random Forest Model is {MacroF1_W_RF}.""")

The AUC for word2vec vectors using Random Forest Model is 0.7313125104128649.
The MicroF1 for word2vec vector using Random Forest Model is 0.7725964306275187.
The MacroF1 for word2vec vector using Random Forest Model is 0.3352105110204975.


In [44]:
print("Training Accuracy :", clf1.score(all_vectors_train_glove, y_train))
print("Testing Accuracy :", clf1.score(all_vectors_test_glove, y_test))

Training Accuracy : 0.7639797953490922
Testing Accuracy : 0.7634399758752091


In [45]:
y_pred_proba1_rf = clf1.predict_proba(all_vectors_test_glove)
y_pred1_rf = clf1.predict(all_vectors_test_glove)

auc_val_rf_g = metrics.roc_auc_score(y_test, y_pred_proba1_rf,multi_class='ovo')
MicroF1_G_RF=f1_score(y_test, y_pred1_rf, average='micro')
MacroF1_G_RF=f1_score(y_test, y_pred1_rf, average='macro')

print(f"""The AUC for Glove vectors using Random Forest Model is {auc_val_rf_g}.""")
print(f"""The MicroF1 for Glove vector using Random Forest Model is {MicroF1_G_RF}.""")
print(f"""The MacroF1 for Glove vector using Random Forest Model is {MacroF1_G_RF}.""")

The AUC for Glove vectors using Random Forest Model is 0.6646588099260203.
The MicroF1 for Glove vector using Random Forest Model is 0.763439975875209.
The MacroF1 for Glove vector using Random Forest Model is 0.28861769659282294.
