In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Read the corpus CSV; the file is decoded as Latin-1 instead of the
# default UTF-8.
CSV_PATH = 'dm-end2-1.csv'
df = pd.read_csv(CSV_PATH, encoding='latin-1')

In [4]:
# Show the loaded frame (pandas truncates long frames in the rendered output)
# to check its columns before splitting.
df

Unnamed: 0,text,label
0,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2
1,From: danmg@grok85.ColumbiaSC.NCR.COM (Daniel ...,7
2,From: PA146008@utkvm1.utk.edu (David Veal)\nSu...,18
3,From: u895027@franklin.cc.utas.edu.au (Mark Ma...,1
4,From: cescript@mtu.edu (Charles Scripter)\nSub...,16
...,...,...
1995,From: mccullou@snake2.cs.wisc.edu (Mark McCull...,0
1996,From: josephc@cco.caltech.edu (Joseph Chiu)\nS...,12
1997,From: khioe@juno.jpl.nasa.gov (Kent Hioe)\nSub...,3
1998,From: rscharfy@magnus.acs.ohio-state.edu (Ryan...,18


In [5]:
# Hold out 40% of the documents for testing; the fixed random_state makes
# the shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.40,
    random_state=22,
)
# Report the sizes of the train split first, then the test split.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(X_part.shape, y_part.shape)

(1200,) (1200,)
(800,) (800,)


In [6]:
# Learn the bag-of-words vocabulary from the training documents only; the
# test documents are later transformed with this same fitted vocabulary.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
# 1.2. Prefer `get_feature_names_out()` and fall back to the old accessor
# only on installations that predate it. The new method returns an ndarray,
# so convert it to a plain list to keep `vocab` the same type as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

print('Vocabulary size:', len(vocab))
print(vocab[:10])  # peek at the first few terms as a sanity check

Vocabulary size: 37308
['00', '000', '00000000', '0000000005', '00000074', '000062david42', '000100255pixel', '000152', '0002', '0005111312']


In [7]:
# Encode both splits as term-count matrices using the fitted vocabulary.
X_train_bow = vectorizer.transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Print a short summary (repr) of each matrix under its label.
for heading, matrix in (('X_train_bow:', X_train_bow), ('X_test_bow:', X_test_bow)):
    print(heading)
    print(repr(matrix))

X_train_bow:
<1200x37308 sparse matrix of type '<class 'numpy.int64'>'
	with 191804 stored elements in Compressed Sparse Row format>
X_test_bow:
<800x37308 sparse matrix of type '<class 'numpy.int64'>'
	with 108873 stored elements in Compressed Sparse Row format>


In [8]:
# Dense view of the training counts: rows are indexed by the training
# labels and columns are named after the vocabulary terms.
# NOTE: `.toarray()` materialises the full documents-by-terms matrix in memory.
Xbow = pd.DataFrame(
    data=X_train_bow.toarray(),
    columns=vocab,
    index=y_train,
)
display(Xbow.head(5))

Unnamed: 0_level_0,00,000,00000000,0000000005,00000074,000062david42,000100255pixel,000152,0002,0005111312,...,zznkzz,zzo,zzt,zzzzzz,âªlâ,â³ation,âº,âº_________________________________________________âº_____________________âº,âºnd,ã½ã
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Fit a multinomial naive Bayes classifier on the training counts with
# smoothing parameter alpha=1.0, then list the class labels it learned.
model = MultinomialNB(alpha=1.0).fit(X_train_bow, y_train)
print(model.classes_)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [10]:
# Score the model on the same data it was fitted on.
train_score = model.score(X=X_train_bow, y=y_train)
print('Train accuracy:', train_score)

Train accuracy: 0.8941666666666667


In [11]:
# Predict a label for every test document and place it next to the true
# label, one row per document.
y_test_pred = model.predict(X_test_bow)
df_pred = pd.DataFrame({'pred': y_test_pred, 'true': y_test})
# Discard the original row labels carried over from y_test so rows are
# numbered 0..n-1.
df_pred = df_pred.reset_index(drop=True)
display(df_pred.head())

Unnamed: 0,pred,true
0,16,12
1,12,12
2,17,13
3,0,0
4,16,8


In [12]:
# Score the model on the held-out test split.
test_score = model.score(X=X_test_bow, y=y_test)
print('Test accuracy:', test_score)

Test accuracy: 0.4675


In [13]:
# Cross-tabulate predictions against ground truth:
# rows are predicted labels, columns are true labels.
ctab = pd.crosstab(index=df_pred['pred'], columns=df_pred['true'])
display(ctab)

true,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,8,27,33,33,4,17,0,1,0,0,0,6,0,1,1,0,0,0,1
4,0,1,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,1,4,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,1,0,5,27,5,2,1,0,2,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,12,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,24,0,0,0,0,0,0,0,0,0,0
