In [2]:
%matplotlib inline

import time
import functools
import sqlite3
import pandas as pd
import numpy as np
import nltk
nltk.data.path.append('/Users/Hugo/nltk_data/')
import string
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sparse

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from nltk.corpus import stopwords
from sklearn.cross_validation import StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

### Get Familiar with dataset

In [3]:
# Load csv file into DataFrame
kindle_data = pd.read_csv('sampled_data.csv')
type(kindle_data)

pandas.core.frame.DataFrame

In [4]:
# Print first row
# Format: data_frame.col_nam[row]
print("overall    :", kindle_data.overall[0])
print("reviewText :", kindle_data.reviewText[0])

('overall    :', 'pos')
('reviewText :', 'This book ended even before it started and it made me want for more. Oh oh such a teaser. I want the book now please. So exciting.')


In [5]:
# Length of kindle_data
len(kindle_data)

126871

In [6]:
# Get a sample (head) of the data frame
kindle_data.head()

Unnamed: 0,overall,reviewText
0,pos,This book ended even before it started and it ...
1,pos,This is a great read with so much emotion you ...
2,pos,"It&#8217;s Christmas Eve and miraculously, Sal..."
3,pos,I enjoyed meeting the character of Cassandra. ...
4,pos,"Can I be the next Hunter wife? Again, I have ..."


In [7]:
# Statics on tags
kindle_data.overall.value_counts()

pos    64559
neg    62312
Name: overall, dtype: int64

In [8]:
def splitPosNeg(data_):
    neg = data_.loc[data_.overall=='neg']
    pos = data_.loc[data_.overall=='pos']
    return [pos,neg]

[pos,neg] = splitPosNeg(kindle_data)

In [9]:
print(type(pos))
print("pos:", len(pos), ", neg:", len(neg))

<class 'pandas.core.frame.DataFrame'>
('pos:', 64559, ', neg:', 62312)


### Preprocessing

In [10]:
lemmatizer = nltk.WordNetLemmatizer()
stop = stopwords.words('english')
translation = string.maketrans(string.punctuation,' '*len(string.punctuation))

In [11]:
translation

'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f                0123456789       ABCDEFGHIJKLMNOPQRSTUVWXYZ      abcdefghijklmnopqrstuvwxyz    \x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'

In [12]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
transtbl = string.maketrans('abc','def')
'ababc'.translate(transtbl)

'dedef'

In [21]:
def preprocessing(line):
    tokens=[]
    line = str(line).translate(translation)  # Replace punctuation
    line = nltk.word_tokenize(line.lower())  # Tokenize
    
    for t in line:
        # Remove stopwords
        if t not in stop:
            stemmed = lemmatizer.lemmatize(t)
            tokens.append(stemmed)
    
    return ' '.join(tokens)

In [None]:
# Yet a more compact way to write the code
def preprocessing(line: str) -> str:
    line = str(line).translate(translation)
    line = nltk.word_tokenize(line.lower())
    
    line = [lemmatizer.lemmatize(t) for t in line if t not in stop]
    return ' '.join(line)

In [22]:
nltk.data.path.append('/Users/Hugo/nltk_data1/')
test_str = "I bought it yesterday and I really love it!"
preprocessing(test_str)

'bought yesterday really love'

In [23]:
# Preprocess all data
start = time.time()

pos_data = [preprocessing(p) for p in pos['reviewText']]
neg_data = [preprocessing(p) for p in neg['reviewText']]

end = time.time()
print(end - start)

251.536102057


In [105]:
type(pos_data)

list

In [24]:
# Yet a more modern way to write code
pos_data = list(map(preprocessing, pos['reviewText']))
neg_data = list(map(preprocessing, neg['reviewText']))

### Some modern functions to introduce
- map
- reduce
- filter

They are very useful when running the project on a cluster or distributed compute system like Hadoop or Spark.

In [25]:
# Some useful modern functions
l = [0,1,2,3,4,5,6,7,8,9]

# Map
def square(x: int) -> int:
    return x * x

print( list(map(square, l)) )

SyntaxError: invalid syntax (<ipython-input-25-f8028472e0a0>, line 5)

In [None]:
# Using lambda function
print( list(map(lambda x: x * x, l)) )

In [None]:
# Reduce
# reduce function is moved to functools
def add(x, y):
    return x + y

rst = functools.reduce(add, l)
print ("reduce", l, "by add:", rst)

In [None]:
# Using lambda function
rst = functools.reduce(lambda x, y: x + y, l)
print ("reduce", l, "by add:", rst)

In [None]:
rst = functools.reduce(lambda x, y: max(x, y), l)
print ("reduce", l, "by max:", rst)

In [None]:
# Filter
# Much faster than loop, similar with list comprehension
list(filter(lambda x: x < 5, l))

### Split Training Data & Test Data

In [26]:
data = pos_data + neg_data
# remember this is sampled
labels = np.concatenate((pos['overall'].values,neg['overall'].values))

In [27]:
# Split data into training set and testing set (20:80)
# stratify: make sure pos/neg remains the same in training set and testing set
train_data, test_data, train_labels, test_labels = \
train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=1234)

In [28]:
print("training size = ", len(train_data), "testing size = ", len(test_data))

('training size = ', 101496, 'testing size = ', 25375)


#### Underfitting vs Overfitting
![](http://scikit-learn.org/stable/_images/sphx_glr_plot_underfitting_overfitting_001.png)

In [29]:
# Push all tokens and compute frequency of words
t = []
for line in train_data:
    l = nltk.word_tokenize(line)
    for w in l:
        t.append(w)
        
word_features = nltk.FreqDist(t)

In [31]:
t[:10]

['wife',
 'got',
 'book',
 'loved',
 'reminded',
 'important',
 'santa',
 'thanks',
 'much',
 'writing']

In [None]:
# Yet another more python-y style
tokens = [word for line in train_data \
               for word in nltk.word_tokenize(line)]

word_features = nltk.FreqDist(tokens)

In [33]:
print(word_features)

<FreqDist with 84617 samples and 4195256 outcomes>


In [34]:
word_features.most_common(10)

[('book', 124073),
 (u'story', 62132),
 (u'read', 55777),
 (u'one', 36880),
 ('like', 32032),
 (u'character', 30265),
 (u'good', 28930),
 (u'would', 27660),
 (u'author', 24824),
 (u'love', 24805)]

In [35]:
topwords = [fpair[0] for fpair in list(word_features.most_common(10000))]

In [69]:
print type(topwords)
print topwords[:10]
len(topwords)

<type 'list'>
['book', u'story', u'read', u'one', 'like', u'character', u'good', u'would', u'author', u'love']


10000

### Vectorizer

In [70]:
cnt_vec = CountVectorizer()
cnt_vec

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [72]:
cnt_fit = cnt_vec.fit_transform([' '.join(topwords)])
cnt_fit

<1x9969 sparse matrix of type '<type 'numpy.int64'>'
	with 9969 stored elements in Compressed Sparse Row format>

In [73]:
cnt_vec.vocabulary_

{u'unimaginative': 9371,
 u'foul': 3643,
 u'four': 3649,
 u'hanging': 4076,
 u'gabriella': 3747,
 u'payoff': 6485,
 u'increase': 4530,
 u'eligible': 2923,
 u'electricity': 2910,
 u'unanswered': 9304,
 u'originality': 6286,
 u'opener': 6245,
 u'lori': 5366,
 u'starsi': 8457,
 u'lore': 5365,
 u'lord': 5364,
 u'immature': 4452,
 u'hormone': 4319,
 u'rational': 7163,
 u'delf': 2355,
 u'callie': 1315,
 u'tantalizing': 8832,
 u'yellow': 9926,
 u'politician': 6704,
 u'bringing': 1199,
 u'scholar': 7810,
 u'wooden': 9837,
 u'succession': 8650,
 u'stereotypical': 8500,
 u'straight': 8542,
 u'annie': 483,
 u'charter': 1503,
 u'specially': 8358,
 u'tired': 9060,
 u'miller': 5743,
 u'preface': 6799,
 u'bacon': 796,
 u'pulse': 7024,
 u'sanderson': 7740,
 u'elegant': 2913,
 u'second': 7861,
 u'loathing': 5325,
 u'ruthless': 7686,
 u'thunder': 9025,
 u'cooking': 1992,
 u'contributed': 1956,
 u'designing': 2438,
 u'increasing': 4532,
 u'groupie': 4004,
 u'admiral': 250,
 u'specialist': 8357,
 u'hero':

#### Tf–idf term weighting

- Tf: term-frequency
- idf: inverse document-frequency
- Tf-idf = $tf(t,d) \times idf(t)$

$$
idf(t) = log{\frac{1 + nd}{1 + df(d, t)}} + 1
$$

![](http://www.onemathematicalcat.org/Math/Algebra_II_obj/Graphics/log_base_gt1.gif)

> Sentent 1: The boy **love** the toy

> Sentent 2: The boy **hate** the toy

In [47]:
transformer = TfidfTransformer(smooth_idf=False)
transformer

TfidfTransformer(norm=u'l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True)

In [48]:
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]
tfidf = transformer.fit_transform(counts)
tfidf

<6x3 sparse matrix of type '<type 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [49]:
tfidf.toarray()

array([[ 0.81940995,  0.        ,  0.57320793],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.47330339,  0.88089948,  0.        ],
       [ 0.58149261,  0.        ,  0.81355169]])

In [58]:
tf_trans = TfidfTransformer()
tf_fit = tf_trans.fit_transform(cnt_fit)
tf_fit

<1x9969 sparse matrix of type '<type 'numpy.float64'>'
	with 9969 stored elements in Compressed Sparse Row format>

In [59]:
tf_fit.toarray()

array([[ 0.01001554,  0.01001554,  0.01001554, ...,  0.01001554,
         0.01001554,  0.01001554]])

In [74]:
# Since CountVectorizer and TfidTransformer are often used together
# There is a class named TfidfVectorizer that combine these two steps
tf_vec = TfidfVectorizer(min_df=1)
tf_fit = tf_vec.fit_transform([' '.join(topwords)])
tf_fit

<1x9969 sparse matrix of type '<type 'numpy.float64'>'
	with 9969 stored elements in Compressed Sparse Row format>

In [75]:
tf_vec.vocabulary_

{u'unimaginative': 9371,
 u'foul': 3643,
 u'four': 3649,
 u'hanging': 4076,
 u'gabriella': 3747,
 u'payoff': 6485,
 u'increase': 4530,
 u'eligible': 2923,
 u'electricity': 2910,
 u'unanswered': 9304,
 u'originality': 6286,
 u'opener': 6245,
 u'lori': 5366,
 u'starsi': 8457,
 u'lore': 5365,
 u'lord': 5364,
 u'immature': 4452,
 u'hormone': 4319,
 u'rational': 7163,
 u'delf': 2355,
 u'callie': 1315,
 u'tantalizing': 8832,
 u'yellow': 9926,
 u'politician': 6704,
 u'bringing': 1199,
 u'scholar': 7810,
 u'wooden': 9837,
 u'succession': 8650,
 u'stereotypical': 8500,
 u'straight': 8542,
 u'annie': 483,
 u'charter': 1503,
 u'specially': 8358,
 u'tired': 9060,
 u'miller': 5743,
 u'preface': 6799,
 u'bacon': 796,
 u'pulse': 7024,
 u'sanderson': 7740,
 u'elegant': 2913,
 u'second': 7861,
 u'loathing': 5325,
 u'ruthless': 7686,
 u'thunder': 9025,
 u'cooking': 1992,
 u'contributed': 1956,
 u'designing': 2438,
 u'increasing': 4532,
 u'groupie': 4004,
 u'admiral': 250,
 u'specialist': 8357,
 u'hero':

### Feature Extraction

In [76]:
# Extract features from training set
# Vocabulary is from topwords
train_features = tf_vec.transform(train_data)

# cnt_train_features = cnt_vec.transform(train_data)
# train_features = tf_trans.transform(cnt_train_features)

In [81]:
# Array[n_train_data * n_features]
train_features.shape

(101496, 9969)

In [82]:
# Extract features from test set
test_features = tf_vec.transform(test_data)

# cnt_test_features = cnt_vec.transform(test_data)
# test_features = tf_trans.transform(cnt_test_features)

In [83]:
# (Uni+Bi)-Gram
bg_tf_vec = TfidfVectorizer(ngram_range=(1,2))
bg_tf_vec.fit([' '.join(topwords)])
bg_train_features = bg_tf_vec.transform(train_data)

bg_train_features.shape

# Array[n_train_data * (uni_gram_features + bi_gram_features)]

(101496, 19937)

In [84]:
# Extract (uni+bi)-gram test features
bg_test_features = bg_tf_vec.transform(test_data)

In [89]:
for n in bg_test_features.toarray()[1,:]:
    if n !=0:
        print n

0.258198889747
0.516397779494
0.258198889747
0.258198889747
0.258198889747
0.258198889747
0.258198889747
0.258198889747
0.258198889747
0.258198889747
0.258198889747
0.258198889747


### [Multinomial NB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

The multinomial Naive Bayes classifier is suitable for **classification with discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [90]:
from sklearn.naive_bayes import MultinomialNB

In [91]:
mnb_model = MultinomialNB()

In [92]:
# Train Model
start = time.time()
mnb_model.fit(train_features, train_labels)
end = time.time()

print("Multinomial NB model trained in %f seconds" % (end-start))

Multinomial NB model trained in 0.370143 seconds


In [93]:
# Predict
pred = mnb_model.predict(test_features)
print(pred)

['neg' 'pos' 'pos' ..., 'neg' 'pos' 'neg']


In [94]:
# Metrics
# mtrics.accuracy_score(y_true, y_pred)
accuracy = metrics.accuracy_score(pred,test_labels)
print(accuracy)

0.819901477833


In [95]:
# Use keyword arguments to set arguments explicitly
print(metrics.classification_report(y_true=test_labels, y_pred=pred))

             precision    recall  f1-score   support

        neg       0.83      0.80      0.81     12463
        pos       0.81      0.84      0.83     12912

avg / total       0.82      0.82      0.82     25375



In [96]:
# Example from sklearn documentation

y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
target_names = ['class 0', 'class 1', 'class 2']
print(metrics.classification_report(y_true, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.50      1.00      0.67         1
    class 1       0.00      0.00      0.00         1
    class 2       1.00      0.67      0.80         3

avg / total       0.70      0.60      0.61         5



#### Train & test using Uni-Gram + Bi-Gram features

In [97]:
# Train & test using (uni+bi)-gram features
bg_mnb_model = MultinomialNB()
bg_mnb_model.fit(bg_train_features, train_labels)
bg_pred = bg_mnb_model.predict(bg_test_features)
print(bg_pred)

['neg' 'pos' 'pos' ..., 'neg' 'pos' 'neg']


In [98]:
# Statistics
bg_accuracy = metrics.accuracy_score(bg_pred,test_labels)
print(bg_accuracy)

0.819665024631


In [99]:
print(metrics.classification_report(y_true=test_labels, y_pred=bg_pred))

             precision    recall  f1-score   support

        neg       0.83      0.80      0.81     12463
        pos       0.81      0.84      0.83     12912

avg / total       0.82      0.82      0.82     25375



### Predict new sentences

In [101]:
# Predict a new sentence
# vectorizer needs to be pre-fitted
# At the end of the project, the function signature should be something like:
# predict_new(sentent: str, vec, model) -> str

def predict_new(sentence):
    sentence = preprocessing(sentence)
    features = tf_vec.transform([sentence])
    pred = mnb_model.predict(features)
    return pred[0]

In [102]:
predict_new("I love it")

'pos'

### Save model

In [103]:
import pickle

# Save vectorizer
with open('tf_vec.pkl', 'wb') as pkl_file:
    pickle.dump(tf_vec, pkl_file)

In [104]:
with open('mnb_model.pkl', 'wb') as pkl_file:
    pickle.dump(mnb_model, pkl_file)