In [1]:
import pickle

import pandas as pd
from helpers import *

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import numpy as np

from nltk.corpus import stopwords
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier



In [2]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix

def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max column equals the label.

    Parameters
    ----------
    predictions : array with a ``shape`` attribute; ``np.argmax`` is taken
        along axis 1, so each row yields one predicted column index.
    labels : values compared element-wise against the predicted indices.

    Returns
    -------
    ``100.0 * matches / predictions.shape[0]``.
    """
    predicted_classes = np.argmax(predictions, axis=1)
    n_correct = np.sum(predicted_classes == labels)
    n_samples = predictions.shape[0]
    return 100.0 * n_correct / n_samples

## Load dataset

In [5]:
# Deserialize the prepared data object from disk; the following cell indexes
# it like a nested dict (data_dic['sample_df'][...]).
# NOTE: pickle.load can execute arbitrary code during deserialization, so this
# path must only ever point at a pickle file from a trusted source.
with open('../data/prepared_data.pickle', 'rb') as data:
    data_dic = pickle.load(data)

In [6]:
# Pull the three splits stored under data_dic['sample_df'] into their own names.
train, test, valid = (
    data_dic['sample_df'][split_name] for split_name in ('train', 'test', 'valid')
)

## Baseline model

In [7]:
class LemmaTokenizer(object):
    """Callable that tokenizes a document and lemmatizes every token.

    Tokenization is done with NLTK's ``word_tokenize`` and each token is then
    passed to ``WordNetLemmatizer.lemmatize``. No part-of-speech tag is
    supplied, so the lemmatizer's default is used for every token.
    """

    def __init__(self):
        # One lemmatizer instance is created up front and reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens for ``doc``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

In [8]:
# TF-IDF over bigrams and trigrams only (ngram_range=(2, 3)), capped at the
# 20000 highest-frequency n-grams, with sublinear (1 + log tf) term scaling.
# NOTE(review): stop_words='english' is combined with a lemmatizing tokenizer;
# lemmas such as "wa" and "doe" show up in the outputs further down, which
# suggests the stop list and the tokenizer output do not fully agree -- confirm
# whether that is intended.
vectorizer = TfidfVectorizer(max_features=20000,
                             ngram_range=(2, 3),
                             sublinear_tf=True,
                             stop_words='english',
                             tokenizer=LemmaTokenizer())

# Fit the vocabulary on the training texts and build the document-term matrix.
tfidf = vectorizer.fit_transform(train["text"].tolist())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available, fall back on older installs, and keep
# feature_names a plain list in both cases so downstream indexing is unchanged.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [9]:
def get_n_typical_words(indices, n, matrix=None, names=None):
    """Print the ``n`` features with the highest mean weight over selected rows.

    Parameters
    ----------
    indices : iterable of int
        Row positions of ``matrix`` to average over.
    n : int
        Number of top-scoring features to print.
    matrix : sparse or dense 2-D matrix, optional
        Document-term matrix to read from. Defaults to the notebook-level
        ``tfidf`` matrix.
    names : sequence of str, optional
        Feature name for each column of ``matrix``. Defaults to the
        notebook-level ``feature_names``.

    Returns
    -------
    None. One line per feature is printed: the name left-aligned in a
    20-character field, followed by its mean score. Ties keep column order.
    Nothing is printed when ``indices`` is empty.
    """
    if matrix is None:
        matrix = tfidf
    if names is None:
        names = feature_names

    indices = list(indices)
    if not indices:
        # No rows selected: there is nothing to average, so print nothing
        # instead of failing while stacking an empty list of rows.
        return

    # Select the rows first and only then reduce, so the full matrix is never
    # converted to a dense array just to average a subset of its rows.
    mean_scores = np.asarray(matrix[indices].mean(axis=0)).ravel().tolist()

    # sorted() is stable, so equal scores stay in ascending column order.
    ranked = sorted(enumerate(mean_scores), key=lambda pair: -pair[1])
    for word_id, score in ranked[:n]:
        print('{0: <20} {1}'.format(names[word_id], score))

In [10]:
# Row positions of the training examples whose "negative" value equals 1.
negative_indices = [
    row for row, label in enumerate(train["negative"].tolist()) if label == 1
]

get_n_typical_words(negative_indices, 20)

don t                0.020146844424370983
didn t               0.015458757011469499
waste money          0.014435666057173066
doesn t              0.012609677502916269
t work               0.00800776901327242
wa disappointed      0.006576013300820232
year old             0.006457295032417125
wouldn t             0.006413649594494231
won t                0.006083701056422039
t waste              0.005661080290291162
don t waste          0.005479035225266659
t buy                0.005360911014563476
product wa           0.0050786153961288084
did work             0.004882766977229225
t waste money        0.004782422779032631
isn t                0.004773405725060026
couldn t             0.004736789478096593
wasn t               0.004711132774878644
doe work             0.004539677490344062
save money           0.004385822625330937


In [11]:
# Row positions of the training examples whose "negative" value equals 0.
positive_indices = [
    row for row, label in enumerate(train["negative"].tolist()) if label == 0
]

get_n_typical_words(positive_indices, 20)

don t                0.0102543242350656
year old             0.009783246577421734
highly recommend     0.006746180459742179
doesn t              0.00645556001586401
great product        0.006324090579373664
work great           0.005804038614406916
month old            0.004984026116021764
didn t               0.004821667607014165
easy use             0.004250342175830946
long time            0.0033554554490061615
recommend product    0.003319034230402814
great price          0.0031700183297970687
really like          0.0031622437728557364
s great              0.0030500269527403965
good product         0.0028547404844335637
love product         0.0028490032980773507
year ago             0.0028487390723376652
haven t              0.0028163438026409565
couldn t             0.0027857207669553428
wa great             0.002715725301379599


In [13]:
# Dense copy of the TF-IDF matrix used as the training design matrix.
train_data_features = tfidf.toarray()

# Random forests are stochastic (bootstrap samples and random feature subsets),
# so the seed is fixed to make the fitted model, and every score derived from
# it, reproducible across kernel restarts.
forest = RandomForestClassifier(n_estimators=500, random_state=42)
forest = forest.fit(train_data_features, train["negative"])

In [None]:
# Vectorize the test texts with the vocabulary already fitted on the training set.
test_data_features = vectorizer.transform(test["text"].tolist()).toarray()

# Keep only the second probability column (index 1) for each test row.
result = list(forest.predict_proba(test_data_features)[:, 1])