In [1]:
import pickle

import pandas as pd
from helpers import *

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import numpy as np

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk

import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix

## Load dataset

In [3]:
with open('../data/prepared_data.pickle', 'rb') as data:
    data_dic = pickle.load(data)

In [4]:
train = data_dic['sample_df']['train']
test  = data_dic['sample_df']['test']
valid = data_dic['sample_df']['valid']

## Baseline model

In [5]:
stemmer = PorterStemmer()

def stem_tokens(text, stemmer=stemmer):
    tokens = nltk.word_tokenize(text)
    #tokens = [word for word in tokens if word not in stopwords.words('english')]
    stemmed = list(map(lambda x: stemmer.stem(x), tokens))
    return ' '.join(stemmed)

In [None]:
train['text'] = train['text'].apply(lambda x: stem_tokens(x))
test ['text'] = test['text'].apply(lambda x: stem_tokens(x))
valid['text'] = valid['text'].apply(lambda x: stem_tokens(x))

In [None]:
vectorizer = TfidfVectorizer( max_features = 40000,
                              ngram_range = ( 2, 3),
                              sublinear_tf = True)

tfidf = vectorizer.fit_transform(train["text"].tolist())
feature_names = vectorizer.get_feature_names()

In [None]:
def get_n_typical_words(indices,n):
    dense = tfidf.todense()
    
    x=list()
    for i in indices:
        x.append((dense[i]))
        
    dense1=np.vstack(x)
    
    text = np.mean(dense1, axis=0).tolist()[0]
    phrase_scores = [pair for pair in zip(range(0, len((text))), text)]    
    sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
    for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:n]:
        print('{0: <20} {1}'.format(phrase, score))

In [None]:
negative_indices=list()
for i,e in enumerate(train["negative"].tolist()):
    if e==1:
        negative_indices.append(i)

get_n_typical_words(negative_indices,20)

thi product          0.01197494245907462
of the               0.01075732133279488
it wa                0.009890409329617034
in the               0.009787350115405722
it is                0.009038507882953592
at all               0.007674663542569006
on the               0.007663317242389606
and the              0.007270087396399754
is not               0.007222589652192479
did not              0.007127918482168111
do not               0.007086543551265832
and it               0.00691181829891982
the product          0.006547978806706766
doe not              0.006530855979233705
would not            0.006395957307088153
to be                0.006255088639550243
to the               0.006190764382456677
if you               0.006075925383968358
thi is               0.006046358480545815
but it               0.005819082549833437


In [None]:
positive_indices=list()
for i,e in enumerate(train["negative"].tolist()):
    if e==0:
        positive_indices.append(i)

get_n_typical_words(positive_indices,20)

it is                0.01072390550946904
thi is               0.010526932966740954
thi product          0.008542418465225483
of the               0.008041869577336486
in the               0.007976180337618637
love it              0.00785084905514311
for my               0.007811410717202243
and it               0.007515880374773389
and the              0.006330764826507945
the best             0.006321994835523731
love thi             0.0062580463563512295
is the               0.00615627895850716
easi to              0.0061064305135728825
for the              0.005983840041575632
use it               0.0058645638127210195
is great             0.005806288525950364
on the               0.005580906653090528
it wa                0.005546914570059048
with the             0.005476764742793365
year old             0.005219769787075618


In [None]:
train_data_features=tfidf.toarray()
model = MultinomialNB() 
nb = model.fit(train_data_features, train["cat1_num"] )

In [None]:
test_data_features = vectorizer.transform(test["text"].tolist()).toarray()
result = nb.predict(test_data_features)

In [None]:
print("Test accuracy: %.1f%%" % (100*accuracy_score(result,test["cat1_num"])))
confusion_matrix(result,test["negative"])

Test accuracy: 73.5%


array([[ 2500,   494,     0,     0,     0,     0],
       [ 5465,   851,     0,     0,     0,     0],
       [ 2669,   335,     0,     0,     0,     0],
       [11064,  2909,     0,     0,     0,     0],
       [ 2720,   508,     0,     0,     0,     0],
       [ 9239,  1446,     0,     0,     0,     0]])