In [1]:
import os
import sys
import numpy as np
import pandas as pd  
from sklearn import tree 
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Naive Bayes vs. Decision Tree on an "easy" pair of newsgroups
# (rec.autos vs. rec.sport.hockey), using the first 500 training posts.
easy = fetch_20newsgroups(subset='train',
                          categories=('rec.autos', 'rec.sport.hockey'),
                          remove=('headers', 'footers', 'quotes'))

# Unigram term counts with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary the former min_df=0 produced; an integer 0 is rejected by
# parameter validation in recent scikit-learn releases.
tf_vec = CountVectorizer(max_df=500, min_df=1, max_features=10000, ngram_range=(1, 1),
                         stop_words='english')
tf_easy_matrix = tf_vec.fit_transform(easy.data[:500])  # sparse matrix
t_easy = np.asarray(easy.target[:500])

# GaussianNB needs dense input. toarray() yields a plain ndarray (todense()
# returns the legacy np.matrix type), and get_feature_names_out() replaces
# get_feature_names(), which no longer exists in current scikit-learn.
full_easy_matrix = pd.DataFrame(tf_easy_matrix.toarray(),
                                columns=tf_vec.get_feature_names_out())
print("Easy Data set: %d rows and %d columns " %
      (full_easy_matrix.shape[0], full_easy_matrix.shape[1]))

# DataFrame.as_matrix() was removed from pandas; to_numpy() is its replacement.
xtrain, xtest, ytrain, ytest = train_test_split(full_easy_matrix.to_numpy(), t_easy,
                                                random_state=50)

# Gaussian Naive Bayes baseline.
clf = NB()
y_easy_pred_nb = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_easy_pred_nb != ytest).sum()
print("Easy Data set - NB Classifier: Number of mislabels out of %d points: %d"
      % (xtest.shape[0], error))

# Single decision tree. random_state pins the tie-breaking between equally good
# splits so that the reported error count is repeatable across runs.
clf = tree.DecisionTreeClassifier(random_state=10)
y_easy_pred_dt = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_easy_pred_dt != ytest).sum()
print("Easy Data set - DT Classifier: Number of mislabels out of %d points: %d" % (xtest.shape[0], error))

Easy Data set: 500 rows and 9241 columns 
Easy Data set - NB Classifier: Number of mislabels out of 125 points: 14
Easy Data set - DT Classifier: Number of mislabels out of 125 points: 22


In [3]:
# Naive Bayes vs. Decision Tree on a "difficult" pair of newsgroups
# (rec.motorcycles vs. rec.autos), using the first 500 training posts.
diff = fetch_20newsgroups(subset='train',
                          categories=('rec.motorcycles', 'rec.autos'),
                          remove=('headers', 'footers', 'quotes'))

# Unigram term counts with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary the former min_df=0 produced; an integer 0 is rejected by
# parameter validation in recent scikit-learn releases.
tf_vec = CountVectorizer(max_df=500, min_df=1, max_features=10000, ngram_range=(1, 1),
                         stop_words='english')
tf_diff_matrix = tf_vec.fit_transform(diff.data[:500])  # sparse matrix
t_diff = np.asarray(diff.target[:500])

# GaussianNB needs dense input. toarray() yields a plain ndarray (todense()
# returns the legacy np.matrix type), and get_feature_names_out() replaces
# get_feature_names(), which no longer exists in current scikit-learn.
full_diff_matrix = pd.DataFrame(tf_diff_matrix.toarray(),
                                columns=tf_vec.get_feature_names_out())
print("Difficult Data set: %d rows and %d columns " %
      (full_diff_matrix.shape[0], full_diff_matrix.shape[1]))

# DataFrame.as_matrix() was removed from pandas; to_numpy() is its replacement.
xtrain, xtest, ytrain, ytest = train_test_split(full_diff_matrix.to_numpy(), t_diff,
                                                random_state=50)

# Gaussian Naive Bayes baseline.
clf = NB()
y_diff_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_diff_pred != ytest).sum()
print("Difficult Data set - NB Classifier: Number of mislabels out of %d points: %d"
      % (xtest.shape[0], error))

# Single decision tree. random_state pins the tie-breaking between equally good
# splits so that the reported error count is repeatable across runs.
clf = tree.DecisionTreeClassifier(random_state=10)
y_diff_pred_dt = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_diff_pred_dt != ytest).sum()
print("Difficult Data set - DT Classifier: Number of mislabels out of %d points: %d" %
      (xtest.shape[0], error))

Difficult Data set: 500 rows and 8268 columns 
Difficult Data set - NB Classifier: Number of mislabels out of 125 points: 33
Difficult Data set - DT Classifier: Number of mislabels out of 125 points: 34


In [4]:
# Discussion:
# We can see that it is indeed easier to classify the easy data set (its misclassification
# error rate is lower).
# We can also see that the Naive Bayes classifier does better on this data set than a Decision Tree.

In [5]:
# Bagging: an ensemble of decision trees, each fitted on a bootstrap resample.
# Uses the xtrain/xtest split left in the kernel by the preceding split cell.
# random_state fixes the resampling and seeds each tree, so the reported error
# count is repeatable; without it every run gives a different number.
bagging = BaggingClassifier(tree.DecisionTreeClassifier(), random_state=10)
bagging.fit(xtrain, ytrain)
y_pred = bagging.predict(xtest)
error = (y_pred != ytest).sum()
print("Difficult Data set - DT with Bagging: Number of mislabels out of %d points: %d"
      % (xtest.shape[0], error))

Difficult Data set - DT with Bagging: Number of mislabels out of 125 points: 30


In [6]:
# AdaBoost: 300 boosting rounds of Gaussian Naive Bayes learners using the
# discrete SAMME variant, fitted on the split currently bound to xtrain/xtest.
clf = AdaBoostClassifier(NB(), algorithm="SAMME", n_estimators=300)
clf.fit(xtrain, ytrain)
y_pred = clf.predict(xtest)

# Count the test points whose predicted label differs from the true one.
error = np.sum(y_pred != ytest)
print("Difficult Data set - NB with AdaBoost: Number of mislabels out of %d points: %d"
      % (len(xtest), error))

Difficult Data set - NB with AdaBoost: Number of mislabels out of 125 points: 32


In [7]:
# Random Forest: 100 fully grown trees, fitted on the split currently bound to
# xtrain/xtest.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is no
# longer accepted by current scikit-learn, so the explicit value keeps the same
# model while letting the cell run on both old and new releases.
clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=10,
                             max_features='sqrt')
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print("Difficult Data set - RF: Number of mislabels out of %d points: %d"
      % (xtest.shape[0], error))

Difficult Data set - RF: Number of mislabels out of 125 points: 35


In [8]:
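# We can see that the ensembles do not all improve the classification, and not to the same degree:
# in the recorded run Bagging (30 mislabels) improves on the single Decision Tree (34) and
# AdaBoost (32) slightly improves on plain Naive Bayes (33), while the Random Forest (35) does not.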

In [9]:
# Four-class sample used for the multiclass (one-vs-rest / one-vs-one) experiments.
sample = fetch_20newsgroups(subset='train',
                            categories=('comp.graphics', 'rec.autos', 'talk.politics.guns',
                                        'soc.religion.christian'),
                            remove=('headers', 'footers', 'quotes'))

# Unigram term counts with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary the former min_df=0 produced; an integer 0 is rejected by
# parameter validation in recent scikit-learn releases.
tf_sample_vec = CountVectorizer(max_df=500, min_df=1, max_features=30000, ngram_range=(1, 1),
                                stop_words='english')
tf_sample_matrix = tf_sample_vec.fit_transform(sample.data)  # sparse matrix
print("Sample: data has %d rows and %d columns "
      % (tf_sample_matrix.shape[0], tf_sample_matrix.shape[1]))

# Dense copy of the count matrix (GaussianNB cannot take sparse input).
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix
# type), and get_feature_names_out() replaces get_feature_names(), which no
# longer exists in current scikit-learn.
full_sample_matrix = pd.DataFrame(tf_sample_matrix.toarray(),
                                  columns=tf_sample_vec.get_feature_names_out())
t_sample = np.asarray(sample.target)

# DataFrame.as_matrix() was removed from pandas; to_numpy() is its replacement.
xtrain, xtest, ytrain, ytest = train_test_split(full_sample_matrix.to_numpy(),
                                                t_sample, random_state=50)

Sample: data has 2323 rows and 29847 columns 


In [10]:
clf = NB()  # Gaussian Naive Bayes base learner

# Run the same base learner under both multiclass reductions and report the
# number of misclassified test points for each.
reductions = (
    (OneVsRestClassifier,
     "One-vs-All NB: Number of mislabels out of %d points in the test test: %d"),
    (OneVsOneClassifier,
     "All-vs-All NB: Number of mislabels out of %d points in the test test: %d"),
)
for make_multiclass, report in reductions:
    y_pred = make_multiclass(clf).fit(xtrain, ytrain).predict(xtest)
    error = np.sum(y_pred != ytest)
    print(report % (xtest.shape[0], error))

One-vs-All NB: Number of mislabels out of 581 points in the test test: 125
All-vs-All NB: Number of mislabels out of 581 points in the test test: 71


In [11]:
# Decision tree base learner. random_state pins the tie-breaking between
# equally good splits; the wrappers below clone this estimator (seed included),
# so both error counts are repeatable and their comparison is not run-to-run noise.
clf = tree.DecisionTreeClassifier(random_state=10)  # DT

# One-vs-rest: one binary tree per class.
y_pred = OneVsRestClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print("One-vs-All DT: Number of mislabels out of %d points in the test test: %d"
      % (xtest.shape[0], error))

# One-vs-one: one binary tree per pair of classes.
y_pred = OneVsOneClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print("All-vs-All DT: Number of mislabels out of %d points in the test test: %d"
      % (xtest.shape[0], error))

One-vs-All DT: Number of mislabels out of 581 points in the test test: 119
All-vs-All DT: Number of mislabels out of 581 points in the test test: 121


In [12]:
clf = LinearSVC(random_state=10)  # linear SVM base learner, seeded

# Run the same base learner under both multiclass reductions and report the
# number of misclassified test points for each.
reductions = (
    (OneVsRestClassifier,
     "One-vs-All LinearSVC: Number of mislabels out of %d points in the test test: %d"),
    (OneVsOneClassifier,
     "All-vs-All LinearSVC: Number of mislabels out of %d points in the test test: %d"),
)
for make_multiclass, report in reductions:
    y_pred = make_multiclass(clf).fit(xtrain, ytrain).predict(xtest)
    error = np.sum(y_pred != ytest)
    print(report % (xtest.shape[0], error))

One-vs-All LinearSVC: Number of mislabels out of 581 points in the test test: 92
All-vs-All LinearSVC: Number of mislabels out of 581 points in the test test: 100


In [13]:
# Ranking from best performer to worst performer:
# AvA NB > OvA LinearSVC > AvA LinearSVC > OvA DT > AvA DT > OvA NB 

In [14]:
# Given (x_1, y_1), ..., (x_n, y_n) and initial weights of data points w_i = 1/n, i = 1, ..., n
# Weights are updated according to the formula:
#   $w_i \leftarrow w_i \cdot \exp\big(c_m \cdot \mathbb{1}[y_i \neq f_m(x_i)]\big)$, i = 1, ..., n