# Environment Setup

The files used in this .ipynb file were sourced from http://qwone.com/~jason/20Newsgroups/. In particular, analysis was conducted on the "Matlab/Octave" processed set of files.

In [1]:
# downloading matlab processed version of "20 Newsgroups" data set
import numpy as np
import requests

matlab_url = 'http://qwone.com/~jason/20Newsgroups/20news-bydate-matlab.tgz'
vocab_url = 'http://qwone.com/~jason/20Newsgroups/vocabulary.txt'

# fetch each file, fail loudly on an HTTP error instead of saving an error page,
# and close every file handle deterministically
for url, filename in [(matlab_url, '20news-bydate-matlab.tgz'), (vocab_url, 'vocabulary.txt')]:
    r = requests.get(url, timeout = 60)
    r.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(r.content)

494093

In [2]:
# unpacking tar files and cleaning up directory
# start from a fresh ./data directory so leftovers from an earlier run are discarded
!rm -rf data
!mkdir data
# extract the downloaded archive (-v lists every extracted file in the cell output)
!tar -xvzf 20news-bydate-matlab.tgz
# gather the extracted matlab files and the vocabulary file under ./data
!mv ./20news-bydate/matlab/* ./data
!mv vocabulary.txt ./data
# remove the leftover extraction directory and the downloaded archive
!rm -r 20news-bydate 20news-bydate-matlab.tgz

20news-bydate/matlab/
20news-bydate/matlab/train.data
20news-bydate/matlab/train.label
20news-bydate/matlab/train.map
20news-bydate/matlab/test.data
20news-bydate/matlab/test.label
20news-bydate/matlab/test.map


In [3]:
# reading the space-delimited train / test matrices and their label vectors
train_data, train_labels = (
    np.loadtxt(path, delimiter = ' ')
    for path in ('./data/train.data', './data/train.label')
)

test_data, test_labels = (
    np.loadtxt(path, delimiter = ' ')
    for path in ('./data/test.data', './data/test.label')
)

In [4]:
# reading the vocabulary and the train / test mapper files as object arrays
vocabulary = np.loadtxt('./data/vocabulary.txt', dtype = object)
train_map, test_map = (
    np.loadtxt(path, dtype = object, delimiter = ' ')
    for path in ('./data/train.map', './data/test.map')
)

In [5]:
# casting the loaded arrays to integers
train_data, test_data = (arr.astype(int) for arr in (train_data, test_data))
train_labels, test_labels = (arr.astype(int) for arr in (train_labels, test_labels))
# the mapper arrays stay object arrays; only their second column is converted
test_map[:, 1] = test_map[:, 1].astype(int)
train_map[:, 1] = train_map[:, 1].astype(int)

# Classifier Construction

This multinomial naive Bayes classifier relies on two main calculations: label priors (the fraction of training documents attributed to each label) and word probabilities (the probability of each word appearing in documents of a given label).

In [6]:
# calculating log label priors
import pandas as pd

doc_total = train_labels.shape[0]
# keep the label ids returned by np.unique so the priors are indexed by the labels
# that actually occur, instead of assuming exactly 20 labels numbered 1..20
label_ids, label_counts = np.unique(train_labels, return_counts = True)
data = np.log(label_counts/doc_total) # log of the fraction of documents carrying each label
df_label_fracs = pd.DataFrame(data, index = label_ids, columns = ['fracs'])
df_label_fracs


Unnamed: 0,fracs
1,-3.156025
2,-2.96506
3,-2.980672
4,-2.954786
5,-2.975441
6,-2.946304
7,-2.96334
8,-2.946304
9,-2.93957
10,-2.942932


In [7]:
# calculating word probabilities per class
df_data = pd.DataFrame(train_data, columns = ['docId', 'wordId', 'count'])
df_labels = pd.DataFrame(train_labels, columns = ['classId'])
df_labels['docId'] = df_labels.index + 1 # document ids are the 1-based row positions of the label array
df_merge = df_data.merge(df_labels, how = 'left', left_on = 'docId', right_on = 'docId')
vocab_len = vocabulary.shape[0] # total vocabulary count
vocab_ids = range(1, vocab_len + 1)
# empty frame indexed by every word id; loop-invariant, so built once (kept at notebook level for later cells)
df_dummy_vocab = pd.DataFrame([], index = vocab_ids)
df_word_probs_by_class = pd.DataFrame([], index = range(1, 21), columns = vocab_ids) # dataframe to house word probabilities
for i in range(1, 21):
    # summed occurrences of every word in the documents of class i
    df_word_counts = df_merge.loc[df_merge.classId == i].groupby('wordId')['count'].sum()
    class_vocab_len = df_word_counts.sum() # total number of word occurrences in class i
    # words that never occur in the class get a count of 0
    df_total_word_counts = df_word_counts.reindex(vocab_ids, fill_value = 0)
    # add-one smoothed log probability, computed for the whole vocabulary at once
    df_word_probs_by_class.loc[i, :] = np.log((df_total_word_counts + 1)/(class_vocab_len + vocab_len))
df_word_probs_by_class

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,61179,61180,61181,61182,61183,61184,61185,61186,61187,61188
1,-9.615805,-8.09598,-6.634462,-9.952278,-7.836022,-8.517193,-10.308953,-11.561716,-8.699515,-7.306103,...,-12.254863,-12.254863,-12.254863,-12.254863,-12.254863,-12.254863,-12.254863,-12.254863,-12.254863,-12.254863
2,-7.941733,-7.958262,-12.052607,-9.162235,-9.344557,-7.975069,-9.654711,-10.106697,-6.899315,-10.666312,...,-12.052607,-12.052607,-12.052607,-12.052607,-12.052607,-12.052607,-12.052607,-12.052607,-12.052607,-12.052607
3,-9.446433,-7.682844,-11.93134,-9.040968,-8.840297,-8.375992,-10.832727,-10.832727,-6.954606,-11.93134,...,-11.93134,-11.93134,-11.93134,-11.93134,-11.93134,-11.93134,-11.93134,-11.93134,-11.93134,-11.93134
4,-9.78779,-8.519279,-11.985014,-11.985014,-9.587119,-8.093194,-10.886402,-11.291867,-8.093194,-11.985014,...,-11.985014,-11.985014,-11.985014,-11.985014,-11.985014,-11.985014,-11.985014,-11.985014,-11.985014,-11.985014
5,-9.954846,-8.374395,-11.900756,-11.207609,-11.207609,-8.029555,-11.207609,-11.900756,-8.029555,-11.900756,...,-11.900756,-11.900756,-11.900756,-11.900756,-11.900756,-11.900756,-11.900756,-11.900756,-11.900756,-11.900756
6,-8.402689,-6.866718,-12.27389,-7.891864,-9.501301,-8.303598,-9.182848,-10.887596,-6.801619,-12.27389,...,-12.27389,-12.27389,-12.27389,-12.27389,-12.27389,-12.27389,-12.27389,-12.27389,-12.27389,-12.27389
7,-11.714085,-8.346789,-11.714085,-10.615473,-10.615473,-8.217578,-11.714085,-11.714085,-8.346789,-10.327791,...,-11.714085,-11.714085,-11.714085,-11.714085,-11.714085,-11.714085,-11.714085,-11.714085,-11.714085,-11.714085
8,-9.771612,-8.066864,-12.074197,-12.074197,-9.43514,-7.608289,-9.994755,-10.687903,-9.129758,-12.074197,...,-12.074197,-12.074197,-12.074197,-12.074197,-12.074197,-12.074197,-12.074197,-12.074197,-12.074197,-12.074197
9,-9.298467,-7.78701,-12.006517,-10.39708,-10.39708,-7.575701,-10.620223,-11.31337,-10.39708,-12.006517,...,-12.006517,-12.006517,-12.006517,-12.006517,-12.006517,-12.006517,-12.006517,-12.006517,-12.006517,-12.006517
10,-11.345016,-8.511802,-12.038163,-10.93955,-11.345016,-6.32443,-11.345016,-12.038163,-10.93955,-12.038163,...,-12.038163,-12.038163,-12.038163,-12.038163,-12.038163,-12.038163,-12.038163,-12.038163,-12.038163,-12.038163


In [8]:
# function for document classification
def predict_document_label(docId):
    '''
    Returns predicted label of passed document id.

    Every class is scored as its log prior plus the sum, over the words of the
    document, of log(count + 1) times the class weight of that word; the index
    label of the best-scoring class is returned.

    Reads the notebook-level frames df_test_data, df_word_probs_by_class and
    df_label_fracs, which must exist before the function is called.
    '''
    doc_words = df_test_data.loc[df_test_data.docId == docId, ['wordId', 'count']]
    # word ids without a column in the probability table cannot contribute to the score
    doc_words = doc_words.loc[doc_words.wordId.isin(df_word_probs_by_class.columns)]
    log_counts = np.log(doc_words['count'].values + 1) # log smoothed counts
    # words absent from the document have a weight of 0, so only the columns of the
    # document's own words need to be summed (no full-vocabulary vector is built)
    class_word_probs = df_word_probs_by_class.loc[:, doc_words.wordId.values].values.astype(float)
    word_scores = pd.Series(class_word_probs.dot(log_counts), index = df_word_probs_by_class.index)
    df_label_probs = word_scores + df_label_fracs.loc[:, 'fracs'] # log probabilities calculation
    # idxmax returns the class label itself, so no "position + 1" assumption is needed
    label_prediction = df_label_probs.astype(float).idxmax()
    return label_prediction

# Prediction Accuracy

In [9]:
# predicting labels, takes a while to run, with 5 cores -> 4 minutes
df_test_data = pd.DataFrame(test_data, columns = ['docId', 'wordId', 'count'])
df_predictions = pd.DataFrame(test_labels, index = range(1, test_labels.shape[0] + 1), columns = ['realLabel'])
# document ids are the 1-based index of df_predictions; the whole prediction column is
# assigned at once instead of writing into the frame one row at a time
df_predictions['predLabel'] = [predict_document_label(doc_id) for doc_id in df_predictions.index]

In [10]:
# share of test documents whose predicted label differs from the real one
wrong_predictions = int((df_predictions['realLabel'] != df_predictions['predLabel']).sum())
error_rate = round(100*wrong_predictions/len(df_predictions), 2)
print(f'Error rate: {error_rate}%')

Error rate: 20.92%


This first version of the classifier is reasonably accurate (about 79% of the test documents are labelled correctly), but it can likely be improved. The code above already applies add-one smoothing to avoid zero probabilities and log-scales the word counts to compensate for word burstiness. The next step to improve accuracy is to downweight words that appear in many documents.

# Model Improvement



In [11]:
# inverse document frequencies per word
document_count = df_data.docId.unique().shape[0]
# number of distinct training documents containing each word; word ids that never
# occur in the training data get a document frequency of 0
doc_freq = df_data.groupby('wordId')['docId'].nunique().reindex(range(1, vocab_len + 1), fill_value = 0)
# idf = log(2N / (1 + df)), so unseen words receive log(2N);
# the result is a single 'idf' row with one column per word id
df_word_weights = np.log(2*document_count/(1 + doc_freq)).to_frame('idf').T
df_word_weights

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,61179,61180,61181,61182,61183,61184,61185,61186,61187,61188
idf,5.132609,3.357274,5.378567,5.235466,4.744843,2.857465,5.388229,5.65351,3.845014,5.269368,...,10.022958,10.022958,10.022958,10.022958,10.022958,10.022958,10.022958,10.022958,10.022958,10.022958


In [12]:
# applying weights to word probabilities
# NOTE: this overwrites df_word_probs_by_class, so running the cell twice applies the
# weights twice; re-run the word probability cell before re-running this one
# one aligned multiply scales every class row by the per-word idf weight
df_word_probs_by_class = df_word_probs_by_class.mul(df_word_weights.loc['idf'], axis = 'columns')
df_word_probs_by_class

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,61179,61180,61181,61182,61183,61184,61185,61186,61187,61188
1,-49.354169,-27.180425,-35.683899,-52.104815,-37.180698,-24.337578,-55.546998,-65.364277,-33.449755,-38.498544,...,-122.829976,-122.829976,-122.829976,-122.829976,-122.829976,-122.829976,-122.829976,-122.829976,-122.829976,-122.829976
2,-40.761809,-26.718069,-64.825755,-47.968573,-44.338457,-22.788478,-52.021797,-57.138312,-26.527963,-56.204724,...,-120.802772,-120.802772,-120.802772,-120.802772,-120.802772,-120.802772,-120.802772,-120.802772,-120.802772,-120.802772
3,-48.484847,-25.793417,-64.173512,-47.333683,-41.945826,-23.934099,-58.369217,-61.242935,-26.740557,-62.870618,...,-119.587317,-119.587317,-119.587317,-119.587317,-119.587317,-119.587317,-119.587317,-119.587317,-119.587317,-119.587317
4,-50.236897,-28.601555,-64.462205,-62.747139,-45.489379,-23.126016,-58.658428,-63.838687,-31.118444,-63.15345,...,-120.125297,-120.125297,-120.125297,-120.125297,-120.125297,-120.125297,-120.125297,-120.125297,-120.125297,-120.125297
5,-51.094331,-28.115143,-64.009015,-58.677058,-53.178349,-22.944169,-60.389164,-67.281045,-30.873751,-62.709461,...,-119.280778,-119.280778,-119.280778,-119.280778,-119.280778,-119.280778,-119.280778,-119.280778,-119.280778,-119.280778
6,-43.127717,-23.053457,-66.015942,-41.317586,-45.082187,-23.727238,-49.479287,-61.553134,-26.152322,-64.675642,...,-123.020686,-123.020686,-123.020686,-123.020686,-123.020686,-123.020686,-123.020686,-123.020686,-123.020686,-123.020686
7,-60.123818,-28.022462,-63.004994,-55.57695,-50.368756,-23.481437,-63.118174,-66.2257,-32.093521,-54.420929,...,-117.409784,-117.409784,-117.409784,-117.409784,-117.409784,-117.409784,-117.409784,-117.409784,-117.409784,-117.409784
8,-50.153863,-27.082675,-64.94188,-63.214052,-44.76826,-21.740416,-53.854032,-60.424167,-35.104047,-63.623386,...,-121.01917,-121.01917,-121.01917,-121.01917,-121.01917,-121.01917,-121.01917,-121.01917,-121.01917,-121.01917
9,-47.725396,-26.143128,-64.57786,-54.43356,-49.332514,-21.647296,-57.224195,-63.960254,-39.976916,-63.266757,...,-120.340821,-120.340821,-120.340821,-120.340821,-120.340821,-120.340821,-120.340821,-120.340821,-120.340821,-120.340821
10,-58.229528,-28.576455,-64.748067,-57.273648,-53.830322,-18.071835,-61.129543,-68.057876,-42.062724,-63.433508,...,-120.658,-120.658,-120.658,-120.658,-120.658,-120.658,-120.658,-120.658,-120.658,-120.658


In [13]:
# predicting again and checking accuracy
df_test_data = pd.DataFrame(test_data, columns = ['docId', 'wordId', 'count'])
df_predictions = pd.DataFrame(test_labels, index = range(1, test_labels.shape[0] + 1), columns = ['realLabel'])
# document ids are the 1-based index of df_predictions; the whole prediction column is
# assigned at once instead of writing into the frame one row at a time
df_predictions['predLabel'] = [predict_document_label(doc_id) for doc_id in df_predictions.index]
wrong_predictions = df_predictions.query('realLabel != predLabel').shape[0]
error_rate = round(100*wrong_predictions/df_predictions.shape[0], 2)
print(f'Error rate: {error_rate}%')


Error rate: 18.09%
