Reference : https://github.com/jerry-shijieli/Text_Classification_Using_EM_And_Semisupervied_Learning/blob/master/code/EM_NB_text_classification_v3.ipynb

In [2]:
# Import packages and libraries
import numpy as np
import random as rnd
import nltk as nk
import re

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn import metrics
from pprint import pprint
from copy import deepcopy

# from Semi_EM_NB import Semi_EM_MultinomialNB
%run Semi_EM_NB.ipynb
from time import time
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [3]:
# Load train and test data set with class labels 
train_Xy = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_Xy = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [4]:
def remove_noise(sentence):
    result = ''
    poster = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stopword_set = set(stopwords.words('english'))
    wordlist = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#()_+|~\-={}\[\]:\";'<>?,.\/\\]|[0-9]|[@]", ' ', sentence) # remove punctuation
    wordlist = re.sub('\s+', ' ', wordlist) # remove extra space
    wordlist_normal = [poster.stem(word.lower()) for word in wordlist.split()] # restore word to its original form (stemming)
    wordlist_normal = [lemmatizer.lemmatize(word, pos='v') for word in wordlist_normal] # restore word to its root form (lemmatization)
    wordlist_clean = [word for word in wordlist_normal if word not in stopword_set] # remove stopwords
    result = ' '.join(wordlist_clean)
    return result

In [5]:
# preprocess train and test text data
train_Xy.data_clean = map(remove_noise, train_Xy.data)
test_Xy.data_clean = map(remove_noise, test_Xy.data)


In [7]:
# Convert all text data into tf-idf vectors 
vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.95, ngram_range=(1,2))
# vectorizer = TfidfVectorizer()
train_vec = vectorizer.fit_transform(train_Xy.data_clean)
test_vec = vectorizer.transform(test_Xy.data_clean)
print(train_vec.shape, test_vec.shape)

(11314, 28591) (7532, 28591)


In [9]:
# Divide train data set into labeled and unlabeled data sets
n_train_data = train_vec.shape[0]
split_ratio = 0.2 # labeled vs total(labeled+unlabeled)
X_l, X_u, y_l, y_u = train_test_split(train_vec, train_Xy.target, train_size=split_ratio, stratify=train_Xy.target)
print(X_l.shape, X_u.shape)

(2262, 28591) (9052, 28591)


In [12]:
def cross_validation(clf, data_X, data_y, unlabeled=None, n_folds=5):
    print('=' * 80)
    print("Validation: ")
    print(clf)
    kf = StratifiedKFold(n_splits=n_folds)
    start_time = time()
    train_accuracies= list() # training accuracy
    fold_count = 1
    original_clf = deepcopy(clf)
    for train_ids, valid_ids in kf.split(data_X, data_y):
        cv_clf = deepcopy(original_clf)
        print("Fold # %d" % fold_count)
        fold_count += 1
        train_X, train_y, valid_X, valid_y = data_X[train_ids], data_y[train_ids], data_X[valid_ids], data_y[valid_ids]
        if unlabeled==None:
            cv_clf.fit(train_X, train_y)
        else:
            cv_clf.fit(train_X, train_y, unlabeled)
        pred = cv_clf.predict(valid_X)
        train_accuracies.append(metrics.accuracy_score(valid_y, pred))
    train_time = time() - start_time
    print("Validation time: %0.3f seconds" % train_time)
    print("Average training accuracy: %0.3f" % np.mean(np.array(train_accuracies)))
    return train_accuracies, train_time

In [13]:
# Cross validation for Naive Bayes classifier 
# using labeled data set only
nb_clf = MultinomialNB(alpha=1e-2)
cross_validation(nb_clf, X_l, y_l)

Validation: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
Fold # 1
Fold # 2
Fold # 3
Fold # 4
Fold # 5
Validation time: 0.122 seconds
Average training accuracy: 0.668


([0.6775599128540305,
  0.649890590809628,
  0.7120879120879121,
  0.6592427616926503,
  0.6402714932126696],
 0.12244248390197754)

In [14]:
# Cross validation for semisupervised EM Naive Bayes classifier 
# using both labeled and unlabeled data set
em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-2) # semi supervised EM based Naive Bayes classifier
cross_validation(em_nb_clf, X_l, y_l, X_u)

Validation: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
Fold # 1
Initial expected log likelihood = -5640060.023

EM iteration #1
	Expected log likelihood = -5193666.978
EM iteration #2
	Expected log likelihood = -5192338.230
EM iteration #3
	Expected log likelihood = -5192089.145
EM iteration #4
	Expected log likelihood = -5192089.145
Fold # 2
Initial expected log likelihood = -5643146.147

EM iteration #1
	Expected log likelihood = -5197182.405
EM iteration #2
	Expected log likelihood = -5194837.307
EM iteration #3
	Expected log likelihood = -5194576.083
EM iteration #4
	Expected log likelihood = -5194576.083
Fold # 3
Initial expected log likelihood = -5639953.609

EM iteration #1
	Expected log likelihood = -5198405.282
EM iteration #2
	Expected log likelihood = -5196805.531
EM iteration #3
	Expected log likelihood = -5196645.714
EM iteration #4
	Expected log likelihood = -5196645.714
Fold # 4
Initial expected log likelihood = -5635301.561

EM iteration #1
	Expected l

([0.690631808278867,
  0.687089715536105,
  0.7362637362637363,
  0.6592427616926503,
  0.6561085972850679],
 40.447001695632935)

In [15]:
# Evaluate original NB classifier using test data set
nb_clf = MultinomialNB(alpha=1e-2).fit(X_l, y_l)
pred = nb_clf.predict(test_vec)
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_Xy.target, pred))

                          precision    recall  f1-score   support

             alt.atheism       0.52      0.30      0.38       319
           comp.graphics       0.50      0.61      0.55       389
 comp.os.ms-windows.misc       0.59      0.48      0.53       394
comp.sys.ibm.pc.hardware       0.55      0.61      0.58       392
   comp.sys.mac.hardware       0.61      0.57      0.59       385
          comp.windows.x       0.67      0.66      0.67       395
            misc.forsale       0.70      0.65      0.68       390
               rec.autos       0.65      0.61      0.63       396
         rec.motorcycles       0.73      0.63      0.67       398
      rec.sport.baseball       0.85      0.76      0.80       397
        rec.sport.hockey       0.57      0.89      0.69       399
               sci.crypt       0.72      0.66      0.69       396
         sci.electronics       0.56      0.45      0.50       393
                 sci.med       0.74      0.62      0.67       396
         

In [16]:

# Evaluate semi-supervised EM NB classifier using test data set
em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-2).fit(X_l, y_l, X_u)
pred = em_nb_clf.predict(test_vec)
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_Xy.target, pred))

Initial expected log likelihood = -5602758.945

EM iteration #1
	Expected log likelihood = -5192818.083
EM iteration #2
	Expected log likelihood = -5191720.451
EM iteration #3
	Expected log likelihood = -5191615.689
EM iteration #4
	Expected log likelihood = -5191615.689
                          precision    recall  f1-score   support

             alt.atheism       0.65      0.26      0.37       319
           comp.graphics       0.51      0.67      0.58       389
 comp.os.ms-windows.misc       0.63      0.45      0.53       394
comp.sys.ibm.pc.hardware       0.54      0.68      0.60       392
   comp.sys.mac.hardware       0.68      0.56      0.62       385
          comp.windows.x       0.72      0.73      0.72       395
            misc.forsale       0.79      0.67      0.72       390
               rec.autos       0.73      0.64      0.68       396
         rec.motorcycles       0.69      0.68      0.69       398
      rec.sport.baseball       0.91      0.75      0.82       397
 

In [21]:
# find the most informative features 
import numpy as np
import matplotlib.pyplot as plt
# from wordcloud import WordCloud 
%matplotlib inline
def show_topK(classifier, vectorizer, categories, K=10):
    feature_names = np.asarray(vectorizer.get_feature_names())
#     fig, axes = plt.subplots(figsize=(50, 40), nrows=5, ncols=4)
    for i, category in enumerate(categories):
        topK = np.argsort(classifier.coef_[i])[-K:]
        text = ",".join(feature_names[topK])
        print("%s: %s" % (category, text))
#         wordcloud = WordCloud().generate(text)
#         axes[i//4, i%4].imshow(wordcloud, cmap=plt.cm.gray, interpolation='bilinear')
#         axes[i//4, i%4].axis("off")
#         axes[i//4, i%4].set_title(category, fontweight="bold", size=24)
#     plt.show()

In [22]:
show_topK(nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by original NB classifier


alt.atheism: behavior,wa,delet,make,think,say,peopl,moral,thi,god
comp.graphics: softwar,packag,imag,program,ani,thi,thank,file,use,graphic
comp.os.ms-windows.misc: card,win,version,thi,problem,font,driver,use,file,window
comp.sys.ibm.pc.hardware: control,ide,thi,disk,irq,scsi,card,use,monitor,drive
comp.sys.mac.hardware: fpu,card,doe,drive,appl,thi,use,lc,simm,mac
comp.windows.x: program,display,file,client,server,ani,use,thi,widget,window
misc.forsale: mail,condit,manual,price,includ,new,ship,offer,sell,sale
rec.autos: new,drive,like,wa,dealer,thi,auto,ford,engin,car
rec.motorcycles: ani,rear,thi,shaft,wa,dod,dog,motorcycl,ride,bike
rec.sport.baseball: basebal,player,hi,team,win,brave,cub,year,wa,game
rec.sport.hockey: win,pittsburgh,playoff,wa,year,player,hockey,play,team,game
sci.crypt: escrow,govern,nsa,use,clipper,secur,chip,thi,encrypt,key
sci.electronics: amp,detector,ani,current,power,radar,line,thi,signal,use
sci.med: dsl,shame surrend,cadr dsl,edu shame,pitt,gordon bank,pitt

In [23]:
show_topK(em_nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by semisupervised EM NB classifier


alt.atheism: wa,object,make,atheist,peopl,think,say,god,thi,moral
comp.graphics: know,anyon,program,ani,use,thi,imag,file,thank,graphic
comp.os.ms-windows.misc: card,problem,thank,program,font,thi,use,driver,file,window
comp.sys.ibm.pc.hardware: ide,problem,thi,disk,mb,bu,use,scsi,card,drive
comp.sys.mac.hardware: know,ha,thank,card,use,drive,thi,simm,appl,mac
comp.windows.x: display,ani,file,program,motif,widget,server,use,thi,window
misc.forsale: email,price,condit,pleas,new,includ,ship,sell,offer,sale
rec.autos: ani,new,dealer,buy,like,drive,engin,thi,wa,car
rec.motorcycles: dog,rider,helmet,like,dod,thi,motorcycl,wa,ride,bike
rec.sport.baseball: thi,win,hit,player,pitch,wa,hi,team,year,game
rec.sport.hockey: nhl,win,playoff,season,player,hockey,wa,play,team,game
sci.crypt: escrow,nsa,govern,secur,use,clipper,chip,thi,encrypt,key
sci.electronics: like,wire,good,ani,amp,voltag,power,circuit,thi,use
sci.med: gordon,geb,gordon bank,doctor,wa,food,patient,diseas,msg,thi
sci.space: satel