In [3]:
import numpy as np
import random as rnd
import re
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn import metrics

from time import time
from pprint import pprint
from copy import deepcopy
# from wordcloud import WordCloud 
# from Semi_EM_NB import Semi_EM_MultinomialNB
%run Semi_EM_NB.ipynb
from os import path
from PIL import Image

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
def remove_noise(sentence):
    result = ''
    poster = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stopword_set = set(stopwords.words('english'))
    wordlist = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#()_+|~\-={}\[\]:\";'<>?,.\/\\]|[0-9]|[@]", ' ', sentence) # remove punctuation
    wordlist = re.sub('\s+', ' ', wordlist) # remove extra space
    wordlist_normal = [poster.stem(word.lower()) for word in wordlist.split()] # restore word to its original form (stemming)
    wordlist_normal = [lemmatizer.lemmatize(word, pos='v') for word in wordlist_normal] # restore word to its root form (lemmatization)
    wordlist_clean = [word for word in wordlist_normal if word not in stopword_set] # remove stopwords
    result = ' '.join(wordlist_clean)
    return result


In [5]:
def cross_validation(clf, data_X, data_y, unlabeled=None, n_folds=5):
    print('=' * 80)
    print("Validation: ")
    print(clf)
    kf = StratifiedKFold(n_splits=n_folds)
    start_time = time()
    train_accuracies= list() # training accuracy
    fold_count = 1
    original_clf = deepcopy(clf)
    for train_ids, valid_ids in kf.split(data_X, data_y):
        cv_clf = deepcopy(original_clf)
        print("Fold # %d" % fold_count)
        fold_count += 1
        train_X, train_y, valid_X, valid_y = data_X[train_ids], data_y[train_ids], data_X[valid_ids], data_y[valid_ids]
        if unlabeled==None:
            cv_clf.fit(train_X, train_y)
        else:
            cv_clf.fit(train_X, train_y, unlabeled)
        pred = cv_clf.predict(valid_X)
        train_accuracies.append(metrics.accuracy_score(valid_y, pred))
    train_time = time() - start_time
    print("Validation time: %0.3f seconds" % train_time)
    print("Average training accuracy: %0.3f" % np.mean(np.array(train_accuracies)))
    return train_accuracies, train_time

In [7]:
def show_topK(classifier, vectorizer, categories, K=10):
    feature_names = np.asarray(vectorizer.get_feature_names())
    nrows, ncols = 5, 4
#     fig, axes = plt.subplots(figsize=(50, 40), nrows=nrows, ncols=ncols)
    #d = path.dirname(__file__)
#     circle_mask = np.array(Image.open(path.join('./', "circle.png")))
    for i, category in enumerate(categories):
        topK = np.argsort(classifier.coef_[i])[-K:]
        text = " ".join(feature_names[topK])
        print("%s: %s" % (category, text))
#         wordcloud = WordCloud(background_color="white", mask=circle_mask).generate(text)
#         axes[i//ncols, i%ncols].imshow(wordcloud, cmap=plt.cm.cool_r, interpolation='bilinear')
#         axes[i//ncols, i%ncols].axis("off")
#         axes[i//ncols, i%ncols].set_title(category, fontweight="bold", size=24)
    plt.show()

In [8]:
# Load data set with class labels and split into train and test set
test_size_ratio = 0.2
data_Xy = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'), shuffle=True)
category_names = data_Xy.target_names # text names of all categories
train_X, test_X, train_y, test_y = train_test_split(data_Xy.data, data_Xy.target, test_size=test_size_ratio, stratify=data_Xy.target)
print("Training set size: %8d\tTest set size: %8d" % (len(train_X), len(test_X)))

Training set size:    15076	Test set size:     3770


In [10]:
# preprocess train and test text data
train_X_clean = map(remove_noise, train_X)
test_X_clean = map(remove_noise, test_X)

In [13]:
# Convert all text data into tf-idf vectors 
vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.95, ngram_range=(1,2))
# vectorizer = TfidfVectorizer()
train_vec = vectorizer.fit_transform(train_X_clean)
test_vec = vectorizer.transform(test_X_clean)
print(train_vec.shape, test_vec.shape)

(15076, 38677) (3770, 38677)


In [14]:
# Divide train data set into labeled and unlabeled data sets
split_ratio = 0.1 # labeled vs total(labeled+unlabeled)
X_l, X_u, y_l, y_u = train_test_split(train_vec, train_y, train_size=split_ratio, stratify=train_y)
print(X_l.shape, X_u.shape)

(1507, 38677) (13569, 38677)


In [15]:
# Cross validation for Naive Bayes classifier 
# using labeled data set only
nb_clf = MultinomialNB(alpha=1e-2)
cross_validation(nb_clf, X_l, y_l)

Validation: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
Fold # 1
Fold # 2
Fold # 3
Fold # 4
Fold # 5
Validation time: 0.150 seconds
Average training accuracy: 0.589


([0.5522875816993464,
  0.6143790849673203,
  0.6085526315789473,
  0.5833333333333334,
  0.5876288659793815],
 0.15016531944274902)

In [16]:
# Cross validation for semisupervised EM Naive Bayes classifier 
# using both labeled and unlabeled data set
em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-2, tol=100, print_log_lkh=False) # semi supervised EM based Naive Bayes classifier
cross_validation(em_nb_clf, X_l, y_l, X_u)

Validation: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
Fold # 1
Fold # 2
Fold # 3
Fold # 4
Fold # 5
Validation time: 112.902 seconds
Average training accuracy: 0.664


([0.6666666666666666,
  0.6437908496732027,
  0.6907894736842105,
  0.6266666666666667,
  0.6941580756013745],
 112.90190744400024)

In [17]:

# Evaluate original NB classifier using test data set
nb_clf = MultinomialNB(alpha=1e-2).fit(X_l, y_l)
pred = nb_clf.predict(test_vec)
print(metrics.classification_report(test_y, pred, target_names=category_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_y, pred))

                          precision    recall  f1-score   support

             alt.atheism       0.61      0.42      0.50       160
           comp.graphics       0.54      0.60      0.57       195
 comp.os.ms-windows.misc       0.59      0.53      0.56       197
comp.sys.ibm.pc.hardware       0.53      0.53      0.53       196
   comp.sys.mac.hardware       0.57      0.64      0.60       193
          comp.windows.x       0.65      0.78      0.71       198
            misc.forsale       0.77      0.63      0.69       195
               rec.autos       0.64      0.63      0.63       198
         rec.motorcycles       0.46      0.58      0.51       199
      rec.sport.baseball       0.81      0.81      0.81       199
        rec.sport.hockey       0.84      0.77      0.80       200
               sci.crypt       0.67      0.72      0.69       198
         sci.electronics       0.59      0.53      0.56       197
                 sci.med       0.77      0.78      0.78       198
         

In [18]:
# Evaluate semi-supervised EM NB classifier using test data set
em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-2, tol=100, print_log_lkh=False).fit(X_l, y_l, X_u)
pred = em_nb_clf.predict(test_vec)
print(metrics.classification_report(test_y, pred, target_names=category_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_y, pred))

                          precision    recall  f1-score   support

             alt.atheism       0.66      0.33      0.44       160
           comp.graphics       0.52      0.60      0.56       195
 comp.os.ms-windows.misc       0.72      0.58      0.64       197
comp.sys.ibm.pc.hardware       0.60      0.62      0.61       196
   comp.sys.mac.hardware       0.63      0.68      0.66       193
          comp.windows.x       0.68      0.79      0.73       198
            misc.forsale       0.81      0.63      0.71       195
               rec.autos       0.69      0.72      0.71       198
         rec.motorcycles       0.77      0.60      0.68       199
      rec.sport.baseball       0.93      0.80      0.86       199
        rec.sport.hockey       0.90      0.82      0.86       200
               sci.crypt       0.80      0.78      0.79       198
         sci.electronics       0.73      0.55      0.63       197
                 sci.med       0.78      0.84      0.81       198
         

In [19]:
show_topK(nb_clf, vectorizer, category_names, K=30) # keywords for each class by original NB classifier


alt.atheism: wa goal mean nicknam true read argument conner believ bobbi world ha doe religion moral belief theism bcci peopl atheism exist make atheist think whi post thi say god islam
comp.graphics: print vesa hello ha comp color algorithm directori anyon comp graphic line help bite format veri mail know window viewer use thank program look ani tiff pleas thi graphic file imag
comp.os.ms-windows.misc: product font zip help program manag nt desktop need thank set cica tri icon mode problem group card ax ax ax ani run program ms thi directori win use driver file window
comp.sys.ibm.pc.hardware: like thi monitor doe anyon jumper buy control bu pleas motherboard video need dx video card help pc use pin modem ani thank work problem board scsi ide thi monitor drive card
comp.sys.mac.hardware: upgrad thank machin quadra know powerbook ha disk centri extern buy mb problem ani instal lc drive processor doe anyon vram card simm use comput color mac monitor thi appl
comp.windows.x: like help ke

In [20]:
show_topK(em_nb_clf, vectorizer, category_names, K=30) # keywords for each class by semisupervised EM NB classifier


alt.atheism: definit becaus agre argument thing know ani doe statement word mean delet whi belief religion peopl make wa post believ atheism exist think object atheist islam moral say thi god
comp.graphics: site anyon know inform edu softwar email appreci post color doe bite help window ftp convert mail look anyon gif know use program format pleas ani thi graphic thank imag file
comp.os.ms-windows.misc: know tri ha edu font chang instal directori ms like mode zip disk os cica ani ftp card thank work version win problem run program thi use driver file window
comp.sys.ibm.pc.hardware: isa port need buy pleas motherboard problem disk work driver ha video pc board know doe anyon monitor dx mb ani control thank thi ide use bu scsi drive card
comp.sys.mac.hardware: quadra price speed buy new centri lc wa comput machin scsi mb disk work mhz ha thank doe card know simm ani problem monitor anyon use thi drive appl mac
comp.windows.x: set client font like edu tri mail know anyon help doe code pl