In [1]:
import numpy as np
import pandas as pd

# for basic visualizations
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# for advanced visualizations
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected = True)
import plotly.figure_factory as ff
import pyrsm as rsm

In [2]:
# Load the review dataset (product name, review text, rating) from CSV.
df = pd.read_csv("data/amazon_baby.csv")

In [3]:
# Map the numeric rating onto a three-level sentiment label:
# rating >= 4 -> 'positive', rating == 3 -> 'neutual', anything else -> 'negative'.
# NOTE(review): 'neutual' looks like a misspelling of 'neutral'. It is the label
# value stored in the column, so fix it consistently if it is ever renamed.
df['sentiment']=rsm.ifelse(df.rating>=4,'positive',rsm.ifelse(df.rating==3,'neutual','negative'))
df.head()

Unnamed: 0,name,review,rating,sentiment
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,neutual
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,positive
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,positive
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,positive
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,positive


In [4]:
# Drop every row that has a missing value in any column, then confirm that no
# nulls remain. The dropna() defaults already mean axis=0 / how="any"; spelling
# out how= and thresh= together raises TypeError on recent pandas releases.
df = df.dropna()
df.isnull().sum()

name         0
review       0
rating       0
sentiment    0
dtype: int64

In [5]:
# (rows, columns) remaining after the rows with missing values were dropped.
df.shape

(182384, 4)

# Phrase Mining

## 1. Pre-processing

In [6]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Work on a separate 'text' column so the original 'review' text stays available.
df['text'] = df['review'].copy()

In [8]:
# Porter stemmer instance; preprocess_df reads this module-level name.
ps = PorterStemmer() 

In [9]:
def preprocess_df(df):
    """Lower-case, stop-word-filter and stem the ``text`` column of ``df``.

    Each entry is lower-cased and split on whitespace. Tokens that are English
    stop words or exactly one character long are dropped; the remaining tokens
    are stemmed with the module-level Porter stemmer ``ps`` and re-joined with
    single spaces. Punctuation is not stripped, so it stays attached to the
    token it was written next to.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column holding strings (no missing values).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its ``text`` column is overwritten in place.
    """
    # Build the stop-word set once so each membership test is a hash lookup.
    stop_words = set(stopwords.words('english'))

    def _clean(sent):
        # Lower-case, split on whitespace, filter, stem and re-join one entry.
        tokens = sent.lower().strip().split()
        return " ".join(ps.stem(tok) for tok in tokens
                        if tok not in stop_words and len(tok) != 1)

    # Iterate the column directly: iterrows() would materialise a Series for
    # every row only to read a single field from it.
    df["text"] = [_clean(sent) for sent in df["text"]]
    return df

In [10]:
# Replace df['text'] with the cleaned, stemmed text (preprocess_df also mutates df in place).
df = preprocess_df(df)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [15]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the shuffle repeatable.
train, test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [16]:
import spacy
# Small English spaCy pipeline. In the cells visible here it is only referenced
# by the commented-out noun-chunk extraction code.
nlp = spacy.load("en_core_web_sm")

### Divide text into phrases for training

In [19]:
# tagged_data = [[i.text for i in nlp(_d).noun_chunks] for i, _d in enumerate(train["text"])]

In [20]:
# import pickle
# with open("data/tagged_train_phrase.pkl", "wb") as fp:   #Pickling
#     pickle.dump(tagged_data, fp)

In [17]:
# Load the cached per-document phrase lists for the training split (presumably
# written by the commented-out noun-chunk cells above — verify the cache matches
# the current train split before relying on it).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
tagged_data = pd.read_pickle('data/tagged_train_phrase.pkl')

In [48]:
# Original review text of one training row (index label 131499).
print(train.loc[131499,'review'])

great video monitor, it works awesome and it has great reception from one room to another. great quality sound and video. good price


In [47]:
# The same row after preprocessing (lower-cased, stop words removed, stemmed).
print(train.loc[131499,'text'])

great video monitor, work awesom great recept one room another. great qualiti sound video. good price


In [38]:
# Phrase list of the first cached training document (it appears to correspond
# to the row printed above — TODO confirm the ordering assumption).
tagged_data[0]

['great video monitor',
 'awesom great recept',
 'one room',
 'great qualiti sound video',
 'good price']

In [19]:
import itertools
from collections import Counter

In [20]:
def build_vocab(sentences):
    """Count tokens and derive frequency-ordered index mappings.

    Parameters
    ----------
    sentences : iterable of iterables
        Each inner iterable yields the hashable tokens of one document.

    Returns
    -------
    word_counts : collections.Counter
        Occurrence count of every token across all documents.
    vocabulary : dict
        Token -> index, where index 0 is the most frequent token.
    vocabulary_inv : list
        Index -> token, i.e. the tokens ordered from most to least frequent.
    """
    # Flatten all documents into one token stream and tally it.
    word_counts = Counter(itertools.chain.from_iterable(sentences))
    # most_common() yields (token, count) pairs in descending count order.
    vocabulary_inv = [token for token, _count in word_counts.most_common()]
    # Invert the list so each token maps back to its position.
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return word_counts, vocabulary, vocabulary_inv

In [21]:
# Build the phrase vocabulary from the training phrase lists.
word_counts, vocabulary, vocabulary_inv = build_vocab(tagged_data)

In [63]:
# Number of distinct phrases in the vocabulary.
len(vocabulary_inv)

710431

In [22]:
# Encode every training document as a list of vocabulary indices.
inp_data = [[vocabulary[word] for word in text] for text in tagged_data]

In [23]:
from gensim.models import word2vec
from sklearn.linear_model import LogisticRegression
import os
import string





In [67]:
# Learn embeddings with gensim's Word2Vec and align them with the vocabulary.
def get_embeddings(inp_data, vocabulary_inv, size_features=100,
                   mode='skipgram',
                   min_word_count=2,
                   context=5,
                   num_workers=15,
                   downsampling=1e-3):
    """Train a Word2Vec model and build a vocabulary-aligned weight matrix.

    Parameters
    ----------
    inp_data : list of list of int
        Documents encoded as indices into ``vocabulary_inv``.
    vocabulary_inv : list
        Index -> token mapping used to turn ``inp_data`` back into tokens.
    size_features : int, default 100
        Dimensionality of the embedding vectors (``vector_size``).
    mode : {'skipgram', 'cbow'}, default 'skipgram'
        Training architecture; mapped to Word2Vec's ``sg`` flag (1 / 0).
    min_word_count : int, default 2
        Passed to Word2Vec as ``min_count``.
    context : int, default 5
        Passed to Word2Vec as ``window``.
    num_workers : int, default 15
        Number of worker threads passed to Word2Vec.
    downsampling : float, default 1e-3
        Passed to Word2Vec as ``sample`` (down-sampling of frequent tokens).

    Returns
    -------
    embedding_weights : numpy.ndarray
        Array of shape ``(len(vocabulary_inv), size_features)``. Row ``i`` is
        the trained vector of ``vocabulary_inv[i]``; tokens the trained model
        does not contain get a uniform random vector in [-0.25, 0.25).
    embedding_model : gensim Word2Vec
        The trained model.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'skipgram'`` nor ``'cbow'``.
    """
    # mode -> (value of Word2Vec's sg flag, progress message)
    architectures = {
        'skipgram': (1, 'Model: skip-gram'),
        'cbow': (0, 'Model: CBOW'),
    }
    # Reject an unknown mode before doing any work, instead of failing later
    # on an unbound `sg`.
    if mode not in architectures:
        raise ValueError(
            "mode must be 'skipgram' or 'cbow', got {!r}".format(mode))
    sg, banner = architectures[mode]

    model_name = "embedding"
    print('Training Word2Vec model...')
    # use inp_data and vocabulary_inv to reconstruct sentences
    sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
    print(banner)
    embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                        sg=sg,
                                        vector_size=size_features,
                                        min_count=min_word_count,
                                        window=context,
                                        sample=downsampling)
    # NOTE: nothing is written to disk here despite the message; persisting
    # the returned objects is left to the caller.
    print("Saving Word2Vec model {}".format(model_name))
    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i, word in enumerate(vocabulary_inv):
        if word in embedding_model.wv:
            embedding_weights[i] = embedding_model.wv[word]
        else:
            # Token absent from the trained model: random vector drawn from
            # NumPy's global RNG (not seeded here).
            embedding_weights[i] = np.random.uniform(-0.25, 0.25,
                                                     embedding_model.vector_size)
    return embedding_weights, embedding_model

### Model Training

In [68]:
# Train skip-gram embeddings (default settings) on the index-encoded training phrases.
embedding_weights_skipgram,model = get_embeddings(inp_data, vocabulary_inv)

Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding


In [70]:
import pickle
# Persist the vocabulary-aligned weight matrix.
with open("data/embedding_weights_skipgram.pkl", "wb") as fp:   # pickle to disk
    pickle.dump(embedding_weights_skipgram, fp)
# Persist the trained Word2Vec model object as well.
with open("data/model.pkl", "wb") as fp:   # pickle to disk
    pickle.dump(model, fp)

In [25]:
# Reload the pickled weight matrix from disk.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
embedding_weights_skipgram = pd.read_pickle('data/embedding_weights_skipgram.pkl')

In [73]:
# Number of tokens the trained Word2Vec model actually holds a vector for.
len(model.wv.key_to_index)

66426

### Process training and test data

In [31]:
# tagged_train_data = tagged_data.copy()
# tagged_test_data = [[i.text for i in nlp(_d).noun_chunks] for i, _d in enumerate(test["text"])]

In [32]:
# with open("data/tagged_test_phrase.pkl", "wb") as fp:   #Pickling
#     pickle.dump(tagged_test_data, fp)

In [26]:
# Training phrase lists: a copy of the container loaded earlier
# (presumably a shallow copy — the inner lists would be shared; confirm the type).
tagged_train_data = tagged_data.copy()
# Cached phrase lists for the held-out split.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
tagged_test_data = pd.read_pickle('data/tagged_test_phrase.pkl')

### Average the phrase embeddings of each review (fall back to the mean embedding when no phrase is known)

In [28]:
# Column-wise mean over all embedding rows; used below as the fallback vector
# for documents without any usable phrase.
avg = embedding_weights_skipgram.mean(axis=0)

In [29]:
# Represent every training document by the mean embedding of its known phrases.
train_vec = []
for doc in tagged_train_data:
    # Embedding-row indices of the phrases found in the vocabulary. Unknown
    # phrases are skipped with an explicit membership test rather than a bare
    # `except:`, which would also hide unrelated errors and interrupts.
    rows = [vocabulary[w] for w in doc if w in vocabulary]
    if rows:
        # Integer-array indexing gathers the rows (repeats included); their
        # mean is the document vector.
        vec = embedding_weights_skipgram[rows].mean(axis=0)
    else:
        # Empty document, or none of its phrases is known: fall back to the
        # global mean embedding (copied so list entries never alias `avg`).
        vec = avg.copy()
    train_vec.append(vec)

In [30]:
# Represent every held-out document by the mean embedding of its known phrases.
test_vec = []
for doc in tagged_test_data:
    # Embedding-row indices of the phrases found in the vocabulary. Phrases
    # that are not in the vocabulary are skipped with an explicit membership
    # test rather than a bare `except:`, which would also hide unrelated
    # errors and interrupts.
    rows = [vocabulary[w] for w in doc if w in vocabulary]
    if rows:
        # Integer-array indexing gathers the rows (repeats included); their
        # mean is the document vector.
        vec = embedding_weights_skipgram[rows].mean(axis=0)
    else:
        # Empty document, or none of its phrases is known: fall back to the
        # global mean embedding (copied so list entries never alias `avg`).
        vec = avg.copy()
    test_vec.append(vec)

In [135]:
# check = []

# for i in range(len(test_vec)):
#     if str(test_vec[i].shape) != '(100,)':
#         check += i

In [31]:
# Features: one averaged embedding vector per document.
X_train = train_vec
X_test = test_vec
# Targets: the sentiment label of the matching rows.
# NOTE(review): this relies on the cached phrase lists being in the same row
# order as `train` / `test` — confirm when regenerating the caches.
y_train = train['sentiment']
y_test = test['sentiment']

In [32]:
from sklearn.linear_model import LogisticRegressionCV

In [33]:
# Logistic regression with built-in 5-fold cross-validation scored on accuracy.
clf = LogisticRegressionCV(
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,      # use every available core
    verbose=3,
    max_iter=1000,
)
# fit() returns the estimator itself, so `clf` stays bound to the fitted model.
clf = clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.4min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.6min finished


In [49]:
# Predicted sentiment labels for the held-out documents.
pred = clf.predict(X_test)

In [50]:
# Mean accuracy on the held-out set.
clf.score(X_test, y_test)

0.761219398525098

In [51]:
# Macro F1: unweighted mean of the per-class F1 scores.
f1_score(y_test, pred, average='macro')

0.35373743337715996

In [53]:
# Micro F1: computed from the pooled counts over all classes.
f1_score(y_test, pred, average='micro')

0.761219398525098

In [52]:
# One-vs-rest ROC AUC computed from the predicted class probabilities.
roc_auc_score(y_test,clf.predict_proba(X_test),multi_class='ovr')

0.6754736519553314

### Tune model

In [81]:
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV
# # clf_new = LogisticRegression(max_iter=100000)
# pipe = Pipeline([('classifier' , LogisticRegression())])

# param_grid = [
#     {'classifier' : [LogisticRegression(max_iter=100000)],
#      'classifier__penalty' : ['l1','l2'],
#     'classifier__C' : [0.001,0.01,0.1,1, 10, 20, 50],
#     'classifier__solver':['liblinear']}
# ]

# clf_cv = GridSearchCV(pipe, param_grid = param_grid, cv=5,verbose=True, n_jobs=-1)
# best_clf = clf_cv.fit(X_train, y_train)
# best_clf.best_params_

Fitting 5 folds for each of 14 candidates, totalling 70 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



{'classifier': LogisticRegression(C=0.001, max_iter=100000, penalty='l1', solver='liblinear'),
 'classifier__C': 0.001,
 'classifier__penalty': 'l1',
 'classifier__solver': 'liblinear'}

In [82]:
# import pickle
# with open("data/best_clf.pkl", "wb") as fp:   #Pickling
#     pickle.dump(best_clf, fp)

# pred = best_clf.predict(X_test)

# best_clf.score(X_test, y_test)

# f1_score(y_test, pred, average='macro')

# roc_auc_score(y_test,best_clf.predict_proba(X_test),multi_class='ovr')

## Random forest

In [54]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
# Random forest with capped tree size (depth <= 8, <= 25 leaves, >= 25 samples
# per leaf) and a fixed seed.
rf = RandomForestClassifier(max_depth = 8, random_state=42, max_leaf_nodes = 25, min_samples_leaf=25)

In [88]:
# rf = RandomForestClassifier(max_depth=2, random_state=0)

In [56]:
# Fit the forest on the averaged-embedding features.
rf_model = rf.fit(X_train, y_train)

In [58]:
# Predicted sentiment labels for the held-out documents.
pred_rf = rf_model.predict(X_test)

In [59]:
# Mean accuracy on the held-out set.
rf_model.score(X_test, y_test)

0.7640430956493133

In [60]:
# Macro F1: unweighted mean of the per-class F1 scores.
f1_score(y_test, pred_rf, average='macro')

0.29189111954872726

In [61]:
# Micro F1: computed from the pooled counts over all classes.
f1_score(y_test, pred_rf, average='micro')

0.7640430956493133

In [62]:
# One-vs-rest ROC AUC computed from the predicted class probabilities.
roc_auc_score(y_test,rf_model.predict_proba(X_test),multi_class='ovr')

0.6476475793271231

### Tune Random Forest

In [103]:
# from sklearn.model_selection import RandomizedSearchCV
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 20, stop = 1000, num = 10)]
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # # Minimum number of samples required to split a node
# # min_samples_split = [2, 5, 10]
# # # Minimum number of samples required at each leaf node
# # min_samples_leaf = [1, 2, 4]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_depth': max_depth,
# #                'min_samples_split': min_samples_split,
# #                'min_samples_leaf': min_samples_leaf
#               }

In [104]:
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)


In [None]:
# best_rf = rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



In [None]:
# best_rf.best_params_

In [None]:
# import pickle
# with open("data/best_rf.pkl", "wb") as fp:   #Pickling
#     pickle.dump(best_rf, fp)

In [None]:
# pred_rf = best_rf.predict(X_test)

In [None]:
# rf_model.score(X_test, y_test)

In [None]:
# f1_score(y_test, pred_rf, average='macro')

In [None]:
# roc_auc_score(y_test,rf_model.predict_proba(X_test),multi_class='ovr')