In [1]:
from textblob.classifiers import NaiveBayesClassifier
import os
import unidecode
from textblob import TextBlob
import random
import numpy as np
import io
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re
import hashlib
from collections import Counter
import glob
import io
import math
import matplotlib.pyplot as plt
%matplotlib inline
from __future__ import print_function
import subprocess
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [2]:
path = 'manual_labels'
print('subdirectories are:' + str(os.listdir(path)))

subdirectories are:['test', 'train']


In [3]:
def get_files(path):
    return sorted([path+ os.sep +f for f in os.listdir(path) if f.endswith(".txt")])

In [4]:
pos_train_files = get_files(path + os.sep + 'train' + os.sep + 'pos')
neg_train_files = get_files(path + os.sep + 'train' + os.sep + 'neg')
all_train_files = pos_train_files + neg_train_files

pos_test_files = get_files(path + os.sep + 'test' + os.sep + 'pos')
neg_test_files = get_files(path + os.sep + 'test' + os.sep + 'neg')
all_test_files = pos_test_files + neg_test_files

print('found %d positive and %d negative training files' %
      (len(pos_train_files), len(neg_train_files)))

print('found %d positive and %d negative test files' %
      (len(pos_test_files), len(neg_test_files)))

print('first positive file: %s' % pos_train_files[0])
print('first negative file: %s' % neg_train_files[0])

found 79 positive and 33 negative training files
found 26 positive and 7 negative test files
first positive file: manual_labels/train/pos/1.txt
first negative file: manual_labels/train/neg/102.txt


In [5]:
train = []
for pos_file in pos_train_files:
    with open(pos_file, 'r') as content_file:
        content = content_file.read()
    train.append((unicode(content, 'unicode-escape'),'pos'))
    
for neg_file in neg_train_files:
    with open(neg_file, 'r') as content_file:
        content = content_file.read()
    train.append((unicode(content, 'unicode-escape'),'neg'))

In [6]:
cl = NaiveBayesClassifier(train)

In [7]:
test = []
for pos_file in pos_test_files:
    with open(pos_file, 'r') as content_file:
        content = content_file.read()
    test.append((unicode(content, 'unicode-escape'),'pos'))
    
for neg_file in neg_test_files:
    with open(neg_file, 'r') as content_file:
        content = content_file.read()
    test.append((unicode(content, 'unicode-escape'),'neg'))

In [8]:
# just test the accuracy score of the Naivebayes classifier
cl.accuracy(test)

0.8181818181818182

In [9]:
# test the most informative features
cl.show_informative_features(5)

Most Informative Features
            contains(ok) = True              neg : pos    =      8.6 : 1.0
           contains(All) = True              neg : pos    =      8.6 : 1.0
          contains(oily) = True              neg : pos    =      7.1 : 1.0
          contains(With) = True              neg : pos    =      7.1 : 1.0
         contains(still) = True              neg : pos    =      6.1 : 1.0


In [14]:
def do_evaluation (pairs, pos_cls='pos', verbose=False):
    N = len(pairs)
    (ctr,correct, tp, tn, fp,fn) = (0,0,0,0,0,0)
    for (predicted, actual) in pairs:
        ctr += 1
        if predicted == actual:
            correct += 1
            if actual == pos_cls:
                tp += 1
            else:
                tn += 1
        else:
            if actual == pos_cls:
                fp += 1
            else:
                fn += 1
    (accuracy, precision, recall) = (float(correct)/N,float(tp)/(tp + fp),float(tp)/(tp + fn))
    if verbose:
        print_results(precision, recall, accuracy, pos_cls)
    return (accuracy, precision, recall)

def print_results (precision, recall, accuracy, pos_cls):
    banner =  'Evaluation with pos_cls = %s' % pos_cls
    print 
    print (banner)
    print ('=' + '*' + len(banner))
    print ('Precision',precision*100)
    print ('Recall',recall*100)
    print ('Accuracy',accuracy*100)

In [15]:
pairs = [(cl.classify(predicted), actual)
            for (predicted, actual) in test]
do_evaluation (pairs)
do_evaluation (pairs, pos_cls='neg')

(0.8181818181818182, 0.14285714285714285, 1.0)

In [16]:
def get_true_labels(file_names):
    labels = []
    for f in file_names:
        if 'pos' in f:
            labels.append(1);
        else:
            labels.append(0);
    return np.array(labels)

labels = get_true_labels(all_train_files)
print('first 3 and last 3 labels are: %s' % str(labels[[1,2,3,-3,-2,-1]]))

first 3 and last 3 labels are: [1 1 1 0 0 0]


In [17]:
def file2string(filename):
    return io.open(filename, encoding='utf8').readlines()[0]
    
file2string(pos_train_files[20])

u'Great place to have dinner with my best friend. I have been in New York for 6 years and I came from Shanghai. I feel Yaso Tangbao is the closest taste to my hometown. We ordered Yaso Pork Soup Dumplings, Pork And Bok Choy Wonton, Spicy Diced Pork Noodle , Chicken Curry Noodle Soup, Braised Pork Meatballs Over Rice With Eggs, Wine Chicken, Garlic Cucumber Salad and Spicy Cabbage Salad. They all taste so local and I love the environment very much. Strongly recommend!!!'

In [18]:
def tokenize(text):
    return filter(None,re.split(r"[\W]",text.lower()))

In [19]:
def tokenize_with_punct(text):
    tokens = re.findall(r"\w+|\S", text.lower(),flags = re.L)
    return [t.encode('utf-8').decode('utf-8').encode('utf-8') for t in tokens]

In [20]:
def tokenize_with_not(text):
    tokens = tokenize_with_punct(text)
    for i,token in enumerate(tokens):
        if (token == 'not'):
            if (i+1 <= len(tokens)-1):
                tokens[i+1] = token + '_' + tokens[i+1]
            if (i+2 <= len(tokens)-1):
                tokens[i+2] = token + '_' + tokens[i+2]
    return tokens

In [21]:
def do_vectorize(filenames, tokenizer_fn=tokenize, min_df=1,
                 max_df=1., binary=True,stop_words=None, ngram_range=(1,1)):
    if type(filenames) is np.ndarray:
        filenames = filenames.tolist()
    vec = CountVectorizer(input='content',tokenizer=tokenizer_fn, min_df=min_df,
                 max_df=max_df, binary=binary, ngram_range=ngram_range,stop_words=stop_words)
    data  = []
    for fil in filenames:
        f = open(fil, 'r')
        data.append(unicode(f.read(), 'unicode-escape'))
    return vec.fit_transform(data),vec
    
matrix, vec = do_vectorize(all_train_files)
print ('matrix represents %d documents with %d features' % (matrix.shape[0], matrix.shape[1]))
print('first doc has terms:\n%s' % (str(sorted(matrix[0].nonzero()[1]))))

matrix represents 112 documents with 2294 features
first doc has terms:
[46, 65, 75, 84, 87, 98, 103, 105, 110, 111, 139, 151, 160, 193, 207, 216, 231, 246, 263, 270, 284, 288, 304, 333, 355, 380, 391, 414, 514, 515, 589, 612, 614, 615, 660, 763, 765, 780, 783, 817, 825, 841, 894, 915, 944, 959, 975, 1004, 1007, 1036, 1108, 1118, 1125, 1172, 1247, 1249, 1266, 1287, 1293, 1301, 1307, 1332, 1360, 1388, 1414, 1475, 1477, 1592, 1639, 1678, 1781, 1834, 1922, 1956, 2002, 2004, 2006, 2007, 2014, 2022, 2046, 2069, 2080, 2124, 2147, 2200, 2206, 2209, 2223, 2230, 2234, 2244, 2251, 2285]


In [22]:
vec.get_feature_names()

[u'00',
 u'1',
 u'10',
 u'100th',
 u'11',
 u'1130am',
 u'11am',
 u'11pm',
 u'12',
 u'14',
 u'140',
 u'15',
 u'16',
 u'18',
 u'192',
 u'1st',
 u'2',
 u'20',
 u'21',
 u'24',
 u'25',
 u'28',
 u'2nd',
 u'3',
 u'30',
 u'30pm',
 u'36',
 u'3pm',
 u'4',
 u'400',
 u'45pm',
 u'5',
 u'50',
 u'5ish',
 u'5pm',
 u'6',
 u'7',
 u'75',
 u'7pm',
 u'8',
 u'875',
 u'8pm',
 u'9',
 u'95',
 u'_',
 u'___________',
 u'a',
 u'a4',
 u'able',
 u'about',
 u'absolutely',
 u'abundance',
 u'accepts',
 u'accessible',
 u'accidentally',
 u'accompanying',
 u'accustomed',
 u'achar',
 u'acquire',
 u'across',
 u'act',
 u'active',
 u'actual',
 u'actually',
 u'add',
 u'added',
 u'addition',
 u'additional',
 u'admit',
 u'adorable',
 u'advise',
 u'aesthetics',
 u'affordability',
 u'affordable',
 u'aficionados',
 u'after',
 u'afternoon',
 u'afterwards',
 u'again',
 u'against',
 u'ago',
 u'agree',
 u'ahead',
 u'ahhh',
 u'air',
 u'aka',
 u'alcohol',
 u'all',
 u'allergic',
 u'allow',
 u'allows',
 u'almonds',
 u'almost',
 u'alone',


In [23]:
def repeatable_random(seed):
    hash = str(seed)
    while True:
        hash = hashlib.md5(hash).digest()
        for c in hash:
            yield ord(c)

def repeatable_shuffle(X, y, filenames):
    r = repeatable_random(42) 
    indices = sorted(range(X.shape[0]), key=lambda x: next(r))
    return X[indices], y[indices], np.array(filenames)[indices]

X, y, filenames = repeatable_shuffle(matrix, labels, all_train_files)

print('first shuffled document %s has label %d and terms: %s' % 
      (filenames[0], y[0], sorted(X[0].nonzero()[1])))

first shuffled document manual_labels/train/pos/62.txt has label 1 and terms: [1, 4, 8, 10, 11, 16, 17, 20, 23, 27, 29, 35, 39, 40, 42, 46, 49, 54, 56, 63, 76, 87, 88, 98, 102, 106, 107, 110, 111, 115, 127, 129, 133, 135, 137, 139, 143, 145, 147, 151, 155, 168, 169, 180, 182, 189, 190, 193, 194, 198, 203, 213, 215, 219, 221, 222, 229, 233, 239, 252, 290, 292, 294, 297, 303, 304, 311, 312, 316, 323, 325, 336, 355, 356, 357, 363, 370, 373, 378, 391, 397, 398, 402, 438, 456, 473, 489, 497, 505, 515, 518, 525, 540, 545, 547, 555, 564, 578, 584, 587, 593, 597, 608, 611, 612, 614, 616, 621, 622, 624, 634, 642, 643, 649, 654, 655, 668, 683, 686, 703, 712, 724, 743, 744, 753, 760, 762, 763, 765, 772, 780, 782, 788, 815, 818, 821, 829, 835, 848, 849, 853, 860, 878, 885, 887, 889, 890, 894, 898, 907, 911, 915, 918, 942, 944, 953, 957, 959, 961, 967, 975, 978, 988, 989, 999, 1004, 1007, 1011, 1012, 1020, 1024, 1036, 1057, 1068, 1083, 1095, 1100, 1102, 1114, 1116, 1130, 1131, 1135, 1139, 1143, 115

In [24]:
def get_clf():
    return LogisticRegression(random_state=42)

In [25]:
def do_cross_validation(X, y, n_folds=5, verbose=False):
    kf = KFold(X.shape[0],n_folds,shuffle=False,random_state=None)
    c = tot =0
    clf = get_clf()
    for k, (train, test) in enumerate(kf):
        clf.fit(X[train], y[train])
        score = accuracy_score(y[test], clf.predict(X[test]))
        if (verbose):
            print "fold", c ,"accuracy=",score
            c+=1
        tot += score
    return (1.* tot)/ n_folds
    
print('average cross validation accuracy=%.4f' %
      do_cross_validation(X, y, verbose=True))

SyntaxError: invalid syntax (<ipython-input-25-79e9801650a8>, line 9)

In [26]:
def do_expt(filenames, y, tokenizer_fn=tokenize,
            min_df=1, max_df=1., binary=True,
            ngram_range=(1,1), n_folds=5):
    return do_cross_validation(do_vectorize(filenames.tolist(),binary=binary,tokenizer_fn=tokenizer_fn,min_df=min_df,max_df=max_df)[0], y,n_folds=n_folds)

In [27]:
print('accuracy using default settings: %.4g' % do_expt(filenames, y))

NameError: global name 'do_cross_validation' is not defined

In [28]:
def compare_n_folds(filenames, y):
    arr = [2,5,10,20,30,40,50,60,70]
    val = [do_expt(filenames, y,n_folds=i) for i in arr]
    plt.figure()
    plt.xlabel('n_folds')
    plt.ylabel('accuracy')
    plt.plot(arr,val,'bo-')
    return val
    
compare_n_folds(filenames, y)

# considering 50 folds for later experiments

NameError: global name 'do_cross_validation' is not defined

In [29]:
def compare_binary(filenames, y):
    return [do_expt(filenames, y,binary=True),do_expt(filenames, y,binary=False)]
          
compare_binary(filenames, y)

NameError: global name 'do_cross_validation' is not defined

In [30]:
def tokenizer_expt(all_train_files, y):
    return [do_expt(all_train_files, y,tokenizer_fn=tokenize),do_expt(all_train_files, y,tokenizer_fn=tokenize_with_punct),do_expt(all_train_files, y,tokenizer_fn=tokenize_with_not)]

tokenizer_expt(filenames, y)

NameError: global name 'do_cross_validation' is not defined

In [31]:
def min_df_expt(filenames, y):
    mindf = range(1,11)
    val = [do_expt(filenames, y,min_df=i,tokenizer_fn=tokenize) for i in mindf]
    plt.figure()
    plt.xlabel('min_df')
    plt.ylabel('accuracy')
    plt.plot(mindf,val,'bo-')
    return val

min_df_expt(filenames, y)

NameError: global name 'do_cross_validation' is not defined

In [32]:
def max_df_expt(filenames, y):
    maxdf = np.linspace(0.1,1,10)
    print maxdf
    val = [do_expt(filenames, y,min_df=6,tokenizer_fn=tokenize,max_df=i) for i in maxdf]
    plt.figure()
    plt.xlabel('max_df')
    plt.ylabel('accuracy')
    plt.plot(maxdf,val,'bo-')
    return val
    
max_df_expt(filenames, y)

SyntaxError: invalid syntax (<ipython-input-32-bf08a50ef9de>, line 3)

In [33]:
"""So, based on the above experiments, we set:
binary=True
tokenizer=tokenize
min_df=6
max_df=1."""

X, vec = do_vectorize(filenames, tokenizer_fn=tokenize,
                      binary=True, min_df=6, max_df=1.)
clf = get_clf()
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0)

In [34]:
print type(clf.coef_)
print clf.coef_.shape

SyntaxError: invalid syntax (<ipython-input-34-27b338df0748>, line 1)

In [35]:
print(clf.coef_[0][:10])
print(vec.get_feature_names()[:10])

[-0.38535502 -0.16709321  0.06644954  0.04920645 -0.02750608 -0.06655038
  0.20261799 -0.06001036  0.10430076  0.09163123]
[u'10', u'2', u'3', u'4', u'5', u'50', u'6', u'8', u'a', u'about']


In [36]:
def get_top_coefficients(clf, vec, n=10):
    sorted_tups = sorted(zip(vec.get_feature_names(),clf.coef_[0].tolist()), key=lambda tup: tup[1],reverse=True)
    return sorted_tups[:5],sorted_tups[:-6:-1]

pos_coef, neg_coef = get_top_coefficients(clf, vec, n=5)
print('top positive coefs: %s' % str(pos_coef))
print('top negative coefs: %s' % str(neg_coef))

top positive coefs: [(u'definitely', 0.840714144602624), (u'great', 0.8309602535321319), (u'ever', 0.7905831652749882), (u'the', 0.6043520265517477), (u'very', 0.5551453772535825)]
top negative coefs: [(u'not', -0.8964606207435467), (u'on', -0.6371309514160892), (u'nothing', -0.5890143378280941), (u'taste', -0.5074490348623851), (u'bit', -0.48402064120734045)]


In [37]:
X_test = vec.transform(all_test_files)
y_test = np.array([1] * len(pos_test_files) + [0] * len(neg_test_files))
print('X_test represents %d documents with %d features' % (X_test.shape[0], X_test.shape[1]))
print('y_test has %d positive and %d negative labels' % (len(np.where(y_test==1)[0]),
                                                          len(np.where(y_test==0)[0])))
print('first testing file is %s' % all_test_files[0])
print('last testing file is %s' % all_test_files[-1])
print('testing accuracy=%.10g' % accuracy_score(y_test, clf.predict(X_test)))

X_test represents 33 documents with 325 features
y_test has 26 positive and 7 negative labels
first testing file is manual_labels/test/pos/100.txt
last testing file is manual_labels/test/neg/97.txt
testing accuracy=0.7878787879


In [38]:
def index_of_term(vec, term):
    return vec.get_feature_names().index(term)

index_of_term(vec, 'the')

272

In [39]:
def train_after_removing_features(X, y, vec, features_to_remove):
    listofcols = [index_of_term(vec, f) for f in features_to_remove]
    for i in listofcols:
        X[:,i] = 0
    clf = get_clf()
    clf.fit(X,y)
    return clf
    
clf = train_after_removing_features(X.copy(), y, vec, ['the'])
print('testing accuracy=%.10g' % accuracy_score(y_test, clf.predict(X_test)))

testing accuracy=0.7878787879




In [40]:
def get_top_errors(X_test, y_test, filenames, clf, n=10):
    truth_array = clf.predict(X_test)
    prob_list = clf.predict_proba(X_test)
    mis_classified = sorted([{'filename':filenames[i],'index':i,'predicted':truth_array[i],'probas':prob_list[i][truth_array[i]],'truth':y_test[i]} for i,t in enumerate(truth_array) if t != y_test[i]],key=lambda d: d['probas'],reverse=True)[:10]
    for m in mis_classified:
        m['probas'] = prob_list[m['index']]
    return mis_classified
    
errors = get_top_errors(X_test, y_test, all_test_files, clf)
errors

[{'filename': 'manual_labels/test/neg/105.txt',
  'index': 26,
  'predicted': 1,
  'probas': array([ 0.4040528,  0.5959472]),
  'truth': 0},
 {'filename': 'manual_labels/test/neg/122.txt',
  'index': 27,
  'predicted': 1,
  'probas': array([ 0.4040528,  0.5959472]),
  'truth': 0},
 {'filename': 'manual_labels/test/neg/148.txt',
  'index': 28,
  'predicted': 1,
  'probas': array([ 0.4040528,  0.5959472]),
  'truth': 0},
 {'filename': 'manual_labels/test/neg/40.txt',
  'index': 29,
  'predicted': 1,
  'probas': array([ 0.4040528,  0.5959472]),
  'truth': 0},
 {'filename': 'manual_labels/test/neg/84.txt',
  'index': 30,
  'predicted': 1,
  'probas': array([ 0.4040528,  0.5959472]),
  'truth': 0},
 {'filename': 'manual_labels/test/neg/94.txt',
  'index': 31,
  'predicted': 1,
  'probas': array([ 0.4040528,  0.5959472]),
  'truth': 0},
 {'filename': 'manual_labels/test/neg/97.txt',
  'index': 32,
  'predicted': 1,
  'probas': array([ 0.4040528,  0.5959472]),
  'truth': 0}]