Reference: https://github.com/jerry-shijieli/Text_Classification_Using_EM_And_Semisupervied_Learning/blob/master/code/EM_NB_text_classification_v1.ipynb

In [1]:
# Import packages and libraries
import numpy as np
import random as rnd

from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from pprint import pprint

In [2]:
# Load train and test data set with class labels 
train_Xy = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_Xy = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [50]:
train_Xy

{'data': ['I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
  "A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and 

In [146]:
# Convert all text data into tf-idf vectors
vectorizer = TfidfVectorizer(stop_words='english', min_df=3, ngram_range=(1,2))
train_vec = vectorizer.fit_transform(train_Xy.data)
test_vec = vectorizer.transform(test_Xy.data)

In [169]:
# Divide train data set into labeled and unlabeled data sets
n_train_data = train_vec.shape[0]
split_ratio = 0.1 # labeled vs unlabeled
X_l, X_u, y_l, y_u = train_test_split(train_vec, train_Xy.target, train_size=split_ratio)
print(X_l.shape, X_u.shape)

(1131, 62739) (10183, 62739)


In [170]:
pd.Series(vectorizer.vocabulary_)
# .to_clipboard()

wondering         61201
enlighten         20833
car               11776
saw               49231
day               16627
                  ...  
bank account       8991
gun safe          26270
hunch             28040
traffic safety    56554
blanking          10204
Length: 62739, dtype: int64

In [171]:
pd.Series(vectorizer.get_feature_names())
# .to_clipboard()

0           00
1        00 00
2        00 01
3        00 02
4        00 03
         ...  
62734       zx
62735    zx 11
62736       zy
62737    zyxel
62738       zz
Length: 62739, dtype: object

In [172]:
# Train Naive Bayes classifier (imported) 
# using both labeled and unlabeled data set
clf = MultinomialNB(alpha=0.1)
clf.fit(X_l, y_l)
# clf.fit(train_vec, train_Xy.target)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [173]:
# Evaluate NB classifier using test data set
pred = clf.predict(test_vec)
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.47      0.18      0.26       319
           comp.graphics       0.68      0.34      0.46       389
 comp.os.ms-windows.misc       0.64      0.30      0.41       394
comp.sys.ibm.pc.hardware       0.29      0.80      0.43       392
   comp.sys.mac.hardware       0.72      0.05      0.10       385
          comp.windows.x       0.64      0.64      0.64       395
            misc.forsale       0.35      0.79      0.48       390
               rec.autos       0.64      0.51      0.56       396
         rec.motorcycles       0.65      0.57      0.61       398
      rec.sport.baseball       0.83      0.62      0.71       397
        rec.sport.hockey       0.67      0.83      0.74       399
               sci.crypt       0.51      0.72      0.59       396
         sci.electronics       0.48      0.30      0.37       393
                 sci.med       0.74      0.61      0.66       396
         

In [174]:
pprint(metrics.confusion_matrix(test_Xy.target, pred))


array([[ 57,   3,   0,   7,   0,   4,  16,   5,   6,   6,   7,  17,   6,
         12,   7, 123,  28,   6,   1,   8],
       [  1, 134,  21,  92,   2,  42,  24,   1,   2,   0,   1,  21,  18,
          1,  23,   2,   2,   1,   1,   0],
       [  1,  12, 118, 123,   2,  42,  39,   4,   2,   1,   7,  21,   2,
          3,  13,   0,   3,   0,   1,   0],
       [  0,   1,   7, 315,   3,   5,  34,   2,   0,   0,   4,   6,  11,
          0,   3,   0,   0,   1,   0,   0],
       [  1,   1,   6, 231,  21,   8,  65,   4,   3,   1,   3,  18,  14,
          1,   7,   1,   0,   0,   0,   0],
       [  0,  16,  18,  37,   0, 252,  22,   2,   7,   1,   2,  12,   6,
          1,  17,   0,   1,   0,   1,   0],
       [  0,   3,   2,  47,   0,   0, 310,   2,   3,   1,   9,   5,   2,
          1,   3,   0,   2,   0,   0,   0],
       [  3,   4,   3,  23,   0,   1,  61, 200,  38,   1,   7,   7,  11,
          4,  12,   2,  14,   2,   3,   0],
       [  1,   2,   0,  14,   0,   4,  43,  29, 227,   5,  11,  

In [175]:
print(metrics.accuracy_score(test_Xy.target, pred))


0.528146574614976


In [176]:
# from scipy.linalg import get_blas_funcs
# b_w_d = (X_u > 0).T.toarray()
# lp_w_c = (clf.feature_log_prob_)
# lp_d_c = get_blas_funcs("gemm", (lp_w_c, b_w_d))
# print type(lp_w_c), type(b_w_d), type(lp_d_c)
# # lp_d_c(alpha=1.0, a=lp_w_c, b=b_w_d.T, trans_a=True, trans_b=True)
# lp_d_c(alpha=1.0, a=lp_w_c, b=b_w_d).shape

In [177]:
# find the most informative features 
import numpy as np
def show_topK(classifier, vectorizer, categories, K=10):
    feature_names = np.asarray(vectorizer.get_feature_names())
    for i, category in enumerate(categories):
        topK = np.argsort(classifier.coef_[i])[-K:] 
        # coef_ gives per output category per feature_name a feaure_log_prob for that class
        # by sorting it and getting the last k names, we get the most strong predicting features of that output class
        print("%s: %s" % (category, ",".join(feature_names[topK])))

In [178]:
print((clf.coef_).shape)

(20, 62739)


In [179]:
show_topK(clf, vectorizer, train_Xy.target_names, K=20)


alt.atheism: blew bronx,beauchaine bobbe,loans,exactly,believe,viewpoint,evolution,beautiful,muslim,say,homosexuality,koresh,atheists,mean,don,islamic,people,god,islam,just
comp.graphics: virtual reality,color,thanks help,try,need,virtual,does,tea,help,windows,anybody,thanks,video,mail,know,keywords,format,files,graphics,agree
comp.os.ms-windows.misc: cica indiana,hi,smartdrv,advance,help,zip,program,mail,edu,desktop,thanks,problem,ftp,use,pc,files,dos,file,cica,windows
comp.sys.ibm.pc.hardware: drives,use,just,board,pc,thanks,disks,machine,memory,vl bus,monitor,isa,vl,does,drive,controller,scsi,ide,bus,card
comp.sys.mac.hardware: don,monitor,fax,ibm,keyboard,know,syquest,apple,portable,use,thanks,internal video,external,art,modem,duo,problem,internal,drive,mac
comp.windows.x: ftp,using,read,application,error,font,code,like,thanks,windows,x11r5,server,program,motif,use,running,xterm,window,hi,widget
misc.forsale: software,30,email,best offer,edu,000,make offer,mail,condition,monitor,25

### Building base semi supervised NB

In [180]:
%run Semi_EM_NB.ipynb
# from Semi_EM_NB import Semi_EM_MultinomialNB


In [181]:
print(train_vec.shape)
print(test_vec.shape)

(11314, 62739)
(7532, 62739)


In [160]:
# Divide train data set into labeled and unlabeled data sets
n_train_data = train_vec.shape[0]
split_ratio = 0.5 # labeled vs unlabeled
X_l, X_u, y_l, y_u = train_test_split(train_vec, train_Xy.target, train_size=split_ratio, stratify=train_Xy.target)
print(X_l.shape, X_u.shape)


(5657, 62739) (5657, 62739)


In [161]:
# Train Naive Bayes classifier (imported) 
# using labeled data set only
nb_clf = MultinomialNB(alpha=1e-2)
nb_clf.fit(X_l, y_l)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [162]:
# Evaluate original NB classifier using test data set
pred = nb_clf.predict(test_vec)
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_Xy.target, pred))

                          precision    recall  f1-score   support

             alt.atheism       0.58      0.45      0.50       319
           comp.graphics       0.59      0.67      0.63       389
 comp.os.ms-windows.misc       0.63      0.55      0.59       394
comp.sys.ibm.pc.hardware       0.62      0.66      0.64       392
   comp.sys.mac.hardware       0.68      0.66      0.67       385
          comp.windows.x       0.80      0.72      0.76       395
            misc.forsale       0.79      0.74      0.76       390
               rec.autos       0.72      0.67      0.69       396
         rec.motorcycles       0.75      0.69      0.72       398
      rec.sport.baseball       0.86      0.81      0.83       397
        rec.sport.hockey       0.58      0.90      0.71       399
               sci.crypt       0.76      0.69      0.73       396
         sci.electronics       0.65      0.54      0.59       393
                 sci.med       0.81      0.74      0.77       396
         

In [163]:
# Train Naive Bayes classifier (imported) 
# using both labeled and unlabeled data set
em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-2) # semi supervised EM based Naive Bayes classifier
em_nb_clf.fit(X_l, y_l, X_u)
# em_nb_clf.fit_with_clustering(X_l, y_l, X_u)
# em_nb_clf.partial_fit(X_l, y_l, X_u)

Initial expected log likelihood = -4215068.432

EM iteration #1
	Expected log likelihood = -3953194.860
EM iteration #2
	Expected log likelihood = -3953062.012
EM iteration #3
	Expected log likelihood = -3952966.748
EM iteration #4
	Expected log likelihood = -3952966.748


<__main__.Semi_EM_MultinomialNB at 0x1e84f490088>

In [164]:

# Evaluate semi-supervised EM NB classifier using test data set
pred = em_nb_clf.predict(test_vec)
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_Xy.target, pred))

                          precision    recall  f1-score   support

             alt.atheism       0.57      0.39      0.46       319
           comp.graphics       0.57      0.72      0.64       389
 comp.os.ms-windows.misc       0.69      0.54      0.60       394
comp.sys.ibm.pc.hardware       0.61      0.68      0.64       392
   comp.sys.mac.hardware       0.70      0.61      0.65       385
          comp.windows.x       0.80      0.72      0.76       395
            misc.forsale       0.76      0.75      0.75       390
               rec.autos       0.72      0.72      0.72       396
         rec.motorcycles       0.77      0.68      0.72       398
      rec.sport.baseball       0.91      0.76      0.83       397
        rec.sport.hockey       0.57      0.93      0.71       399
               sci.crypt       0.78      0.69      0.73       396
         sci.electronics       0.66      0.55      0.60       393
                 sci.med       0.84      0.76      0.79       396
         

In [165]:
show_topK(nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by semisupervised EM NB classifier

alt.atheism: moral,does,say,people,atheists,think,islam,don,atheism,god
comp.graphics: use,gif,color,3d,file,program,files,thanks,image,graphics
comp.os.ms-windows.misc: use,problem,card,files,drivers,dos,thanks,driver,file,windows
comp.sys.ibm.pc.hardware: dos,isa,pc,disk,ide,controller,card,bus,drive,scsi
comp.sys.mac.hardware: duo,problem,use,monitor,lc,know,simms,drive,apple,mac
comp.windows.x: application,x11r5,thanks,mit,use,widget,code,server,motif,window
misc.forsale: email,sell,price,interested,new,condition,offer,shipping,00,sale
rec.autos: new,just,oil,dealer,good,ford,like,engine,cars,car
rec.motorcycles: dog,just,motorcycle,like,bikes,dod,riding,helmet,ride,bike
rec.sport.baseball: hit,good,think,runs,players,games,baseball,game,team,year
rec.sport.hockey: playoffs,pens,season,nhl,games,play,players,team,hockey,game
sci.crypt: secure,algorithm,security,nsa,government,keys,chip,encryption,clipper,key
sci.electronics: electronics,voltage,don,know,like,power,output,does,use,c

In [166]:
show_topK(em_nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by semisupervised EM NB classifier

alt.atheism: does,atheists,religion,say,islam,people,atheism,think,don,god
comp.graphics: format,know,ftp,program,hi,file,files,image,thanks,graphics
comp.os.ms-windows.misc: card,program,use,thanks,drivers,driver,files,dos,file,windows
comp.sys.ibm.pc.hardware: isa,drives,pc,disk,ide,controller,bus,card,scsi,drive
comp.sys.mac.hardware: quadra,know,does,problem,monitor,simms,thanks,drive,apple,mac
comp.windows.x: program,x11r5,xterm,application,thanks,use,widget,motif,server,window
misc.forsale: interested,asking,email,price,new,condition,shipping,offer,00,sale
rec.autos: good,don,ford,new,dealer,just,like,engine,cars,car
rec.motorcycles: dog,like,motorcycle,helmet,riding,just,bikes,ride,dod,bike
rec.sport.baseball: think,braves,pitching,runs,hit,games,baseball,game,team,year
rec.sport.hockey: teams,year,nhl,season,play,games,players,hockey,team,game
sci.crypt: security,people,escrow,nsa,government,keys,chip,clipper,encryption,key
sci.electronics: good,used,does,voltage,know,thanks,li

In [167]:
print(nb_clf.class_log_prior_)
print(em_nb_clf.clf.class_log_prior_)


[-3.16001007 -2.96389519 -2.95028954 -2.95367364 -2.97422231 -2.94691686
 -2.96389519 -2.94691686 -2.94020542 -2.94020542 -2.93686652 -2.94691686
 -2.95028954 -2.94691686 -2.95028954 -2.93686652 -3.0311772  -2.99874192
 -3.19391162 -3.40420703]
[-3.24702145 -2.93187898 -3.00229433 -2.95367364 -2.99697044 -2.92856772
 -2.9952021  -2.95536999 -2.97942614 -2.99697044 -2.69653814 -2.99343687
 -2.95706923 -2.98991574 -2.9952021  -2.78128829 -2.98640697 -3.00407527
 -3.17681719 -3.64343672]


In [168]:
show_topK(nb_clf, vectorizer, train_Xy.target_names, K=100) # keywords for each class by semisupervised EM NB classifier

alt.atheism: qur,vice,saying,kent,life,did,rushdie,doesn,question,try,objective,bronx,manhattan,sank,alt,agree,theory,person,queens,alt atheism,didn,bob beauchaine,queens stay,manhattan sea,com said,sank manhattan,away sank,said queens,blew bronx,stay blew,beauchaine,beauchaine bobbe,bronx away,course,statement,cheers kent,make,yes,tek,animals,point,claim,bobbe vice,bobbe,science,good,allah,things,words,really,tek com,position,way,ico,wrong,post,ico tek,vice ico,species,book,mean,isn,atheist,exist,belief,example,like,right,fact,time,argument,explain,define,deletion,bobby,world,ve,sex,jesus,islamic,said,true,evidence,religious,believe,religion,know,morality,just,bible,moral,does,say,people,atheists,think,islam,don,atheism,god
comp.graphics: 3do,programs,cview,yes,amiga,problem,hope,unix,group,ray,viewer,mac,post,directory,look,radiosity,write,try,data,greatly appreciated,display,address,info,line,bmp,sure,lines,bit,mpeg,question,24,surface,source,256,formats,tiff,just,pub,anybody,don,co

In [None]:
topK = np.argsort(classifier.coef_[i])[-K:] 
        # coef_ gives per output category per feature_name a feaure_log_prob for that class
        # by sorting it and getting the last k names, we get the most strong predicting features of that output class
        print("%s: %s" % (category, ",".join(feature_names[topK])))