## Get the Training and Test data

In [1]:
%pylab inline
import numpy as np
import pandas as pd
import math
from pandas import DataFrame, read_csv
from __future__ import division

%cd ~/mhardist/DSE210/homework2/20news-bydate/

# Training split. The label files carry no document id, so rows are numbered
# 1..N in file order to give a doc_idx column that can be joined with the data.
X = pd.read_csv(
    '../20news-bydate2/matlab/train.data',
    delim_whitespace=True,
    names=['doc_idx', 'word_idx', 'occurance_count']
)
Y = pd.read_csv(
    '../20news-bydate2/matlab/train.label',
    delim_whitespace=True,
    names=['label_idx']
)
Y['doc_idx'] = Y.index + 1

# Test split, read and numbered exactly like the training split.
X_test = pd.read_csv(
    '../20news-bydate2/matlab/test.data',
    delim_whitespace=True,
    names=['doc_idx', 'word_idx', 'occurance_count']
)
Y_test = pd.read_csv(
    '../20news-bydate2/matlab/test.label',
    delim_whitespace=True,
    names=['label_idx']
)
Y_test['doc_idx'] = Y_test.index + 1

Populating the interactive namespace from numpy and matplotlib
/Users/mikihardisty/Documents/DSE/mhardist/DSE210/homework2/20news-bydate


In [2]:
# Vocabulary table: one word per row of vocabulary.txt, numbered from 1 in
# file order. The constant `key` column is the join key used later to pair
# every word with every label.
V = pd.read_csv('../vocabulary.txt', header=None)
V.columns = ['word']
V['word_idx'] = V.index + 1
V['key'] = 1
V

Unnamed: 0,word,word_idx,key
0,archive,1,1
1,name,2,1
2,atheism,3,1
3,resources,4,1
4,alt,5,1
5,last,6,1
6,modified,7,1
7,december,8,1
8,version,9,1
9,atheist,10,1


In [3]:
# Cross join of the vocabulary with the 20 class labels: both tables share the
# constant `key` column, so merging on it pairs every word with every label.
s = pd.DataFrame({'label_idx': list(range(1, 21))}, index=list(range(1, 21)))
s['key'] = 1
s = V.merge(s, on='key')

## Build a multinomial Naive Bayes model using the training data

# &pi;<sub>j</sub> 
### Get the fraction of documents that belong to each class

In [4]:
# Class prior Pi_j: number of training documents per label divided by the
# total number of training documents.
class_prob = (
    Y.groupby('label_idx')['doc_idx'].count()
    .div(len(Y))
    .to_frame('Pi_j')
    .reset_index()
)
class_prob

Unnamed: 0,label_idx,Pi_j
0,1,0.042595
1,2,0.051557
2,3,0.050759
3,4,0.05209
4,5,0.051025
5,6,0.052533
6,7,0.051646
7,8,0.052533
8,9,0.052888
9,10,0.052711


# P<sub>j</sub>
### Find the probability distribution over V that models the documents for each class

In [5]:
# Total occurrences of each word within each class.
P_jw = (
    pd.merge(X, Y, how='inner', on='doc_idx')
    .groupby(['word_idx', 'label_idx'])['occurance_count'].sum()
    .reset_index()
)

# Line the counts up against the full word x label grid; pairs never seen in
# training get a count of 0, then every count is incremented by one (add-one
# smoothing) so no probability is zero.
P_jw = pd.merge(s, P_jw, how='outer', on=['word_idx', 'label_idx']).fillna(0)
P_jw['x_smooth'] = P_jw['occurance_count'] + 1

# Normalise the smoothed counts within each label so they sum to 1.
P_jw = P_jw.groupby(['label_idx', 'word_idx'])['x_smooth'].sum()
P_jw = (P_jw / P_jw.groupby(level=0).transform('sum')).to_frame('P_j').reset_index()

# Sanity check: the distribution for label 20 should sum to ~1.
P_jw.loc[P_jw['label_idx'] == 20, 'P_j'].sum()

1.0000000000016915

## Classifying new documents
This function takes a dataframe with `doc_idx`, `word_idx`, and `occurance_count` columns and returns a dataframe with the `doc_idx` and the predicted `label_idx` of each document.

In [6]:
def classify(data):
    """Label each document in ``data`` with its highest-scoring class.

    The score of class j for a document is
    ``log(Pi_j) + sum_w count_w * log(P_j[w])``, using the module-level
    ``P_jw`` (word probabilities per label) and ``class_prob`` (class priors)
    tables.

    Parameters
    ----------
    data : DataFrame
        Columns ``doc_idx``, ``word_idx`` and ``occurance_count``.

    Returns
    -------
    DataFrame
        Columns ``doc_idx`` and ``label_idx``. If several classes tie for the
        maximum score of a document, one row per tied class is returned.
    """
    # Attach the per-class word probability and the class prior to every
    # (document, word) row; inner joins drop words absent from P_jw.
    scored = pd.merge(pd.merge(data, P_jw, on='word_idx'), class_prob, on='label_idx')

    # np.log is applied to the whole column at once instead of relying on the
    # bare `log` name that %pylab injects into the namespace.
    scored['xlogp'] = scored['occurance_count'] * np.log(scored['P_j'])
    per_class = scored.groupby(['doc_idx', 'label_idx', 'Pi_j'])['xlogp'].sum().reset_index()
    per_class['calc'] = np.log(per_class['Pi_j']) + per_class['xlogp']

    # Keep the row(s) whose score equals the per-document maximum.
    is_best = per_class.groupby('doc_idx')['calc'].transform('max') == per_class['calc']
    return per_class.loc[is_best, ['doc_idx', 'label_idx']].reset_index(drop=True)

# Smoke test: classify a single test document (doc_idx 4).
test_classifier_data = X_test.loc[X_test['doc_idx'] == 4]
classify(test_classifier_data)

Unnamed: 0,doc_idx,label_idx
0,4,16


## Checking Accuracy
This checks the accuracy of the model against all of the test data.

In [7]:
def check_accuracy(data, label):
    """Return the percentage of documents whose predicted label is correct.

    ``data`` is classified with ``classify`` and the predictions are joined to
    ``label`` on ``doc_idx``; the result is the share of joined rows where the
    true and predicted ``label_idx`` agree, expressed in percent.
    """
    merged = pd.merge(label, classify(data), on=['doc_idx'])
    # After the merge the true label is label_idx_x and the prediction is label_idx_y.
    n_correct = int((merged['label_idx_x'] == merged['label_idx_y']).sum())
    return n_correct * 100 / len(merged)
    
# Smoke test: accuracy on the test documents with doc_idx below 100.
test_classifier_data = X_test.loc[X_test['doc_idx'] < 100]
test_classifier_label = Y_test.loc[Y_test['doc_idx'] < 100]
check_accuracy(test_classifier_data, test_classifier_label)

76.76767676767676

In [8]:
# Score the classifier on the complete test set.
c = check_accuracy(X_test, Y_test)
# NOTE(review): c is the percentage of correctly labelled documents (overall
# accuracy), even though the printed label says "Precision".
print "Precision:", c
# The error rate is the complement of that percentage.
print "Error Rate: ", 100-c

Precision: 78.107928048
Error Rate:  21.892071952


## Better-performing Model
Goal: get a better-performing model. Currently, I'm at an accuracy of 78% on the test data; the experiments below try to improve on that.

#### Split the training model into a smaller training set and validation set

In [9]:
#define the vocab
def vocab(words,label_count,reduce_vocab=False):
    """Pair every vocabulary row with every class label.

    Parameters
    ----------
    words : DataFrame
        Vocabulary table; must contain a ``key`` column, which is the join key.
    label_count : int
        Number of classes; labels are numbered 1..label_count.
    reduce_vocab : bool, optional
        When true, only the first 40000 rows of ``words`` are used.

    Returns
    -------
    DataFrame
        The columns of ``words`` followed by ``label_idx``. When every row of
        ``words`` has ``key == 1`` this is the full word x label cross join.
    """
    if reduce_vocab:
        # Vocabulary tweak: truncate to the first 40000 rows.
        words = words[:40000]

    # Both the values and the index are sized from label_count. The index was
    # previously hard-coded to 20 entries, which raised a length-mismatch
    # error for any other label_count.
    label_ids = list(range(1, label_count + 1))
    labels = pd.DataFrame({'label_idx': label_ids, 'key': 1},
                          index=label_ids, columns=['label_idx', 'key'])
    return pd.merge(words, labels, on='key')

#next get Pi_j
def Pi_j(label):
    """Estimate the class priors from a label table.

    Parameters
    ----------
    label : DataFrame
        Columns ``label_idx`` and ``doc_idx``, one row per document.

    Returns
    -------
    DataFrame
        Columns ``label_idx`` and ``Pi_j``, where ``Pi_j`` is the fraction of
        the rows of ``label`` that carry that label.
    """
    counts = label.groupby(['label_idx']).agg({'doc_idx': 'count'})
    # Normalise by the size of the table that was passed in, not by the global
    # Y, so the priors of a training split sum to 1 and the function does not
    # depend on notebook state.
    priors = (counts / len(label)).rename(columns={'doc_idx': 'Pi_j'})
    return priors.reset_index()

#now get P_jw
def P_jw(data, label, vocab):
    """Estimate the add-one-smoothed word distribution of every class.

    Parameters
    ----------
    data : DataFrame
        Columns ``doc_idx``, ``word_idx`` and ``occurance_count``.
    label : DataFrame
        Columns ``doc_idx`` and ``label_idx``.
    vocab : DataFrame
        The (word, label) grid to model; must contain ``word_idx`` and
        ``label_idx``.

    Returns
    -------
    DataFrame
        Columns ``label_idx``, ``word_idx`` and ``P_j``; within each label the
        ``P_j`` values sum to 1.
    """
    # Total occurrences of each word within each class.
    labelled = pd.merge(data, label, how='inner', on='doc_idx')
    counts = labelled.groupby(['word_idx', 'label_idx'])['occurance_count'].sum().reset_index()

    # Left join on the vocabulary grid: pairs never seen in training get a
    # count of 0, and words that are NOT in the supplied vocabulary are
    # dropped. (An outer join would let them leak back in, so a reduced
    # vocabulary would not actually be enforced.)
    grid = pd.merge(vocab, counts, how='left', on=['word_idx', 'label_idx'])
    grid['x_smooth'] = grid['occurance_count'].fillna(0) + 1

    # Normalise the smoothed counts within each label.
    smoothed = grid.groupby(['label_idx', 'word_idx'])['x_smooth'].sum()
    probs = smoothed / smoothed.groupby(level=0).transform('sum')
    return probs.to_frame('P_j').reset_index()

def classify(data, P_jw, Pi_j):
    """Label each document in ``data`` with its highest-scoring class.

    The score of class j for a document is
    ``log(Pi_j) + sum_w count_w * log(P_j[w])``.

    Parameters
    ----------
    data : DataFrame
        Columns ``doc_idx``, ``word_idx`` and ``occurance_count``.
    P_jw : DataFrame
        Columns ``label_idx``, ``word_idx`` and ``P_j``.
    Pi_j : DataFrame
        Columns ``label_idx`` and ``Pi_j``.

    Returns
    -------
    DataFrame
        Columns ``doc_idx`` and ``label_idx``. If several classes tie for the
        maximum score of a document, one row per tied class is returned.
    """
    # Attach the per-class word probability and the class prior to every
    # (document, word) row; inner joins drop words absent from P_jw.
    scored = pd.merge(pd.merge(data, P_jw, on='word_idx'), Pi_j, on='label_idx')

    # np.log is applied to the whole column at once instead of relying on the
    # bare `log` name that %pylab injects into the namespace.
    scored['xlogp'] = scored['occurance_count'] * np.log(scored['P_j'])
    per_class = scored.groupby(['doc_idx', 'label_idx', 'Pi_j'])['xlogp'].sum().reset_index()
    per_class['calc'] = np.log(per_class['Pi_j']) + per_class['xlogp']

    # Keep the row(s) whose score equals the per-document maximum.
    is_best = per_class.groupby('doc_idx')['calc'].transform('max') == per_class['calc']
    return per_class.loc[is_best, ['doc_idx', 'label_idx']].reset_index(drop=True)

def check_accuracy(data, label, P_jw, Pi_j):
    """Return the percentage of documents whose predicted label is correct.

    ``data`` is classified with ``classify`` using the supplied ``P_jw`` and
    ``Pi_j`` tables, the predictions are joined to ``label`` on ``doc_idx``,
    and the share of joined rows where the true and predicted ``label_idx``
    agree is returned in percent.
    """
    merged = pd.merge(label, classify(data, P_jw, Pi_j), on=['doc_idx'])
    # After the merge the true label is label_idx_x and the prediction is label_idx_y.
    n_correct = int((merged['label_idx_x'] == merged['label_idx_y']).sum())
    return n_correct * 100 / len(merged)


# Hold-out validation. For each modulus i, the documents whose doc_idx is a
# multiple of i form the validation set (roughly 1/i of the documents: ~25%
# for i=4, ~20% for i=5, and so on) and the remaining documents train the model.
results = []     # [i, validation accuracy in percent]
validation = []  # [i, validation error rate in percent]

# The word x label grid is the same for every split, so build it once.
full_vocab = vocab(V,20)

for i in range(4,25):
    X_validate = X[X['doc_idx']%i == 0]
    X_split = X[X['doc_idx']%i != 0]

    Y_validate = Y[Y['doc_idx']%i == 0]
    Y_split = Y[Y['doc_idx']%i != 0]

    df_Pi_j = Pi_j(Y_split)
    df_P_jw = P_jw(X_split, Y_split, full_vocab)

    # Score once and record both views of it, so that `results` is actually
    # populated for the cell that displays it.
    accuracy = check_accuracy(X_validate, Y_validate, df_P_jw, df_Pi_j)
    results.append([i, accuracy])
    validation.append([i, 100-accuracy])
validation

[[4, 15.548455804046853],
 [5, 15.002219263204623],
 [6, 13.84451544195953],
 [7, 14.045991298943449],
 [8, 13.849431818181813],
 [9, 11.341853035143771],
 [10, 13.94316163410302],
 [11, 14.0625],
 [12, 12.034078807241741],
 [13, 14.31870669745959],
 [14, 13.805970149253724],
 [15, 13.98135818908122],
 [16, 14.630681818181813],
 [17, 13.74622356495469],
 [18, 11.821086261980824],
 [19, 13.827993254637434],
 [20, 12.433392539964473],
 [21, 14.179104477611943],
 [22, 13.671875],
 [23, 13.70143149284253],
 [24, 12.153518123667382]]

In [None]:
# Display the `results` list.
results
# After running this, modulo 9 appears to give the highest accuracy on the validation set.

[[4, 84.45154419595315],
 [5, 84.99778073679538],
 [6, 86.15548455804047],
 [7, 85.95400870105655],
 [8, 86.15056818181819],
 [9, 88.65814696485623],
 [10, 86.05683836589698],
 [11, 85.9375],
 [12, 87.96592119275826],
 [13, 85.68129330254041],
 [14, 86.19402985074628],
 [15, 86.01864181091878],
 [16, 85.36931818181819],
 [17, 86.25377643504531],
 [18, 88.17891373801918],
 [19, 86.17200674536257],
 [20, 87.56660746003553],
 [21, 85.82089552238806],
 [22, 86.328125],
 [23, 86.29856850715747],
 [24, 87.84648187633262]]

#### Now let's tweak the model by eliminating the stop words

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#define the vocab
def vocab(words,label_count,reduce_vocab=False):
    """Pair every vocabulary row with every class label.

    Parameters
    ----------
    words : DataFrame
        Vocabulary table with ``word``, ``word_idx`` and ``key`` columns;
        ``key`` is the join key.
    label_count : int
        Number of classes; labels are numbered 1..label_count.
    reduce_vocab : bool, optional
        When true, rows whose ``word`` appears in the module-level
        ``stop_words`` list are removed first.

    Returns
    -------
    DataFrame
        The vocabulary columns followed by ``label_idx``. When every row of
        ``words`` has ``key == 1`` this is the full word x label cross join.
    """
    if reduce_vocab:
        # Vocabulary tweak: drop the stop words. Selecting columns by name
        # avoids depending on the positional layout of `words`.
        keep = ~words['word'].isin(stop_words)
        words = words.loc[keep, ['word_idx', 'word', 'key']].reset_index(drop=True)

    # Both the values and the index are sized from label_count. The index was
    # previously hard-coded to 20 entries, which raised a length-mismatch
    # error for any other label_count.
    label_ids = list(range(1, label_count + 1))
    labels = pd.DataFrame({'label_idx': label_ids, 'key': 1},
                          index=label_ids, columns=['label_idx', 'key'])
    return pd.merge(words, labels, on='key')

# Build both word x label grids once; they do not depend on the split.
vocab_reduced = vocab(V,20,reduce_vocab=True)
vocab_full = vocab(V,20,reduce_vocab=False)

# Each entry: [modulus, validation error rate in percent, stop words removed?].
# Both variants record the error rate so they can be compared directly.
validation = []
for i in range(5,20):
    X_validate = X[X['doc_idx']%i == 0]
    X_split = X[X['doc_idx']%i != 0]

    Y_validate = Y[Y['doc_idx']%i == 0]
    Y_split = Y[Y['doc_idx']%i != 0]

    df_Pi_j = Pi_j(Y_split)
    for stop_words_removed, vocab_table in ((True, vocab_reduced), (False, vocab_full)):
        df_P_jw = P_jw(X_split, Y_split, vocab_table)
        validation.append([i, 100-check_accuracy(X_validate, Y_validate, df_P_jw, df_Pi_j), stop_words_removed])
validation

#### Now let's try replacing the frequency (f) of a word by log(1+f)
We will not use the modified Vocabulary (without stop_words), because it didn't really improve the overall precision of the model.

In [None]:
def P_jw(data, label, vocab):
    """Estimate a log-damped word distribution for every class.

    Same inputs and output as the count-based estimator, but each per-class
    word total ``f`` contributes the weight ``log(f + 2)`` instead of
    ``f + 1``.

    Parameters
    ----------
    data : DataFrame
        Columns ``doc_idx``, ``word_idx`` and ``occurance_count``.
    label : DataFrame
        Columns ``doc_idx`` and ``label_idx``.
    vocab : DataFrame
        The (word, label) grid to model; must contain ``word_idx`` and
        ``label_idx``.

    Returns
    -------
    DataFrame
        Columns ``label_idx``, ``word_idx`` and ``P_j``; within each label the
        ``P_j`` values sum to 1.
    """
    # Total occurrences of each word within each class.
    labelled = pd.merge(data, label, how='inner', on='doc_idx')
    counts = labelled.groupby(['word_idx', 'label_idx'])['occurance_count'].sum().reset_index()

    # Left join on the vocabulary grid: unseen pairs get a count of 0 and
    # words outside the supplied vocabulary are dropped (an outer join would
    # let them leak back in).
    grid = pd.merge(vocab, counts, how='left', on=['word_idx', 'label_idx'])

    # Weight is log(f + 2), i.e. log(1 + f') applied to the add-one-smoothed
    # total f' = f + 1, so an unseen pair still gets log(2) > 0.
    # NOTE(review): the transform is applied to the per-class total, not to
    # each document's count - confirm that is the intended variant.
    grid['x_smooth'] = np.log(grid['occurance_count'].fillna(0) + 2)

    # Normalise the weights within each label.
    weights = grid.groupby(['label_idx', 'word_idx'])['x_smooth'].sum()
    probs = weights.groupby(level=0).transform(lambda x: x/x.sum())
    return probs.to_frame('P_j').reset_index()

# Each entry: [modulus, validation error rate in percent, stop words removed?]
results = []
for i in range(9,10):
    X_validate = X[X['doc_idx']%i == 0]
    X_split = X[X['doc_idx']%i != 0]

    Y_validate = Y[Y['doc_idx']%i == 0]
    Y_split = Y[Y['doc_idx']%i != 0]

    # Recompute the class priors for this split instead of reusing whatever
    # df_Pi_j an earlier cell left behind for a different split.
    df_Pi_j = Pi_j(Y_split)
    df_P_jw = P_jw(X_split, Y_split, vocab(V,20))
    results.append([i, 100-check_accuracy(X_validate, Y_validate, df_P_jw, df_Pi_j), False])
results
#The results are not as good...

The best accuracy out of the options I've tried comes from splitting the training data by modulo 9... and it doesn't matter whether we pull out the stop_words from the vocabulary.

In [None]:
def P_jw(data, label, vocab):
    """Estimate the add-one-smoothed word distribution of every class.

    This definition replaces the log-weighted variant with the plain
    count-based estimator.

    Parameters
    ----------
    data : DataFrame
        Columns ``doc_idx``, ``word_idx`` and ``occurance_count``.
    label : DataFrame
        Columns ``doc_idx`` and ``label_idx``.
    vocab : DataFrame
        The (word, label) grid to model; must contain ``word_idx`` and
        ``label_idx``.

    Returns
    -------
    DataFrame
        Columns ``label_idx``, ``word_idx`` and ``P_j``; within each label the
        ``P_j`` values sum to 1.
    """
    # Total occurrences of each word within each class.
    labelled = pd.merge(data, label, how='inner', on='doc_idx')
    counts = labelled.groupby(['word_idx', 'label_idx'])['occurance_count'].sum().reset_index()

    # Left join on the vocabulary grid: unseen pairs get a count of 0 and
    # words outside the supplied vocabulary are dropped (an outer join would
    # let them leak back in).
    grid = pd.merge(vocab, counts, how='left', on=['word_idx', 'label_idx'])
    grid['x_smooth'] = grid['occurance_count'].fillna(0) + 1

    # Normalise the smoothed counts within each label.
    smoothed = grid.groupby(['label_idx', 'word_idx'])['x_smooth'].sum()
    probs = smoothed / smoothed.groupby(level=0).transform('sum')
    return probs.to_frame('P_j').reset_index()

# Retrain with the count-based estimator on the modulo-9 training split.
results = []
i = 9

X_validate = X[X['doc_idx']%i == 0]
X_split = X[X['doc_idx']%i != 0]

Y_validate = Y[Y['doc_idx']%i == 0]
Y_split = Y[Y['doc_idx']%i != 0]

# Recompute the class priors for this split instead of reusing whatever
# df_Pi_j an earlier cell left behind for a different split.
df_Pi_j = Pi_j(Y_split)
df_P_jw = P_jw(X_split, Y_split, vocab(V,20))

# NOTE(review): this scores on the complete training data (X, Y), which
# includes the documents the model was fitted on - confirm whether
# (X_validate, Y_validate) or (X_test, Y_test) was intended.
results.append([i, 100-check_accuracy(X, Y, df_P_jw, df_Pi_j), False])
results