In [14]:
import sklearn
from sklearn.datasets import fetch_20newsgroups, fetch_20newsgroups_vectorized

In [8]:
d = fetch_20newsgroups()
d.keys()

dict_keys(['data', 'description', 'DESCR', 'target', 'filenames', 'target_names'])

In [11]:
d['target_names']

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [21]:
dv = fetch_20newsgroups_vectorized()
type(dv)
dv.data

<11314x130107 sparse matrix of type '<class 'numpy.float64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [46]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
import numpy as np

a = np.array([ [ 1000., -1.,  2.],
               [ 2010.,  0.,  0.],
               [ 900.,  1., -1.]], dtype=np.float64)

print(np.mean(a, axis=(0,)), np.std(a, axis=(0,)))

std_scaler = StandardScaler()
std_scaler.fit(a)
s = std_scaler.transform(a)
print(s)

rob_scaler = RobustScaler()
rob_scaler.fit(a)
r = rob_scaler.transform(a)
print(r)

[  1.30333333e+03   0.00000000e+00   3.33333333e-01] [ 501.35372299    0.81649658    1.24721913]
[[-0.60502858 -1.22474487  1.33630621]
 [ 1.40951714  0.         -0.26726124]
 [-0.80448856  1.22474487 -1.06904497]]
[[ 0.         -1.          1.33333333]
 [ 1.81981982  0.          0.        ]
 [-0.18018018  1.         -0.66666667]]


In [47]:
from sklearn.cross_validation import train_test_split

In [51]:

splits = train_test_split(dv.data, dv.target)

In [52]:
splits

[<8485x130107 sparse matrix of type '<class 'numpy.float64'>'
 	with 1343713 stored elements in Compressed Sparse Row format>,
 <2829x130107 sparse matrix of type '<class 'numpy.float64'>'
 	with 443852 stored elements in Compressed Sparse Row format>,
 array([16,  9, 13, ..., 17,  2,  7]),
 array([ 2,  3, 15, ...,  6, 15,  6])]

In [85]:
from sklearn.cross_validation import KFold

In [86]:
dv.data.shape

(11314, 130107)

In [87]:
f = KFold(dv.data.shape[0])

In [88]:
f

sklearn.cross_validation.KFold(n=11314, n_folds=3, shuffle=False, random_state=None)

In [89]:
for tr, ts in f:
    print(tr, ts)

[ 3772  3773  3774 ..., 11311 11312 11313] [   0    1    2 ..., 3769 3770 3771]
[    0     1     2 ..., 11311 11312 11313] [3772 3773 3774 ..., 7540 7541 7542]
[   0    1    2 ..., 7540 7541 7542] [ 7543  7544  7545 ..., 11311 11312 11313]


In [58]:
cls = MultinomialNB()

In [59]:
cls.fit()

TypeError: fit() missing 2 required positional arguments: 'X' and 'y'

In [130]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [150]:
docs = ['hello world how are you how', 
        'able was I ere I saw elba', 
        'betty bought some butter but the butter was bitter so betty bought some more butter']

counts = CountVectorizer()
term_freqs = counts.fit_transform(docs)
print(counts.get_feature_names())
print(term_freqs)

['able', 'are', 'betty', 'bitter', 'bought', 'but', 'butter', 'elba', 'ere', 'hello', 'how', 'more', 'saw', 'so', 'some', 'the', 'was', 'world', 'you']
  (0, 9)	1
  (0, 17)	1
  (0, 10)	2
  (0, 1)	1
  (0, 18)	1
  (1, 0)	1
  (1, 16)	1
  (1, 8)	1
  (1, 12)	1
  (1, 7)	1
  (2, 16)	1
  (2, 2)	2
  (2, 4)	2
  (2, 14)	2
  (2, 6)	3
  (2, 5)	1
  (2, 15)	1
  (2, 3)	1
  (2, 13)	1
  (2, 11)	1


In [151]:
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(docs)
print(tfidf.get_feature_names())
print(tfidfs)
print(tfidf.idf_)

['able', 'are', 'betty', 'bitter', 'bought', 'but', 'butter', 'elba', 'ere', 'hello', 'how', 'more', 'saw', 'so', 'some', 'the', 'was', 'world', 'you']
  (0, 18)	0.353553390593
  (0, 1)	0.353553390593
  (0, 10)	0.707106781187
  (0, 17)	0.353553390593
  (0, 9)	0.353553390593
  (1, 7)	0.467350981811
  (1, 12)	0.467350981811
  (1, 8)	0.467350981811
  (1, 16)	0.35543246785
  (1, 0)	0.467350981811
  (2, 11)	0.193970456486
  (2, 13)	0.193970456486
  (2, 3)	0.193970456486
  (2, 15)	0.193970456486
  (2, 5)	0.193970456486
  (2, 6)	0.581911369459
  (2, 14)	0.387940912973
  (2, 4)	0.387940912973
  (2, 2)	0.387940912973
  (2, 16)	0.147519531834
[ 1.69314718  1.69314718  1.69314718  1.69314718  1.69314718  1.69314718
  1.69314718  1.69314718  1.69314718  1.69314718  1.69314718  1.69314718
  1.69314718  1.69314718  1.69314718  1.69314718  1.28768207  1.69314718
  1.69314718]


In [178]:
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler
from sklearn.cross_validation import train_test_split, KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import average_precision_score, recall_score, precision_score, f1_score, cohen_kappa_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost.sklearn import XGBClassifier

import numpy as np

from collections import Counter

In [80]:
data = fetch_20newsgroups_vectorized()
train_x, test_x, train_y, test_y = train_test_split(data.data, data.target)
print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)

(8485, 130107) (8485,)
(2829, 130107) (2829,)


In [174]:
bin_counts = np.bincount(data.target)
print(bin_counts / len(data.target) * 100.0)

[ 4.24253138  5.16174651  5.22361676  5.21477815  5.10871487  5.24129397
  5.17058512  5.25013258  5.28548701  5.2766484   5.30316422  5.25897119
  5.22361676  5.25013258  5.24129397  5.29432561  4.82587944  4.98497437
  4.10995227  3.33215485]


In [81]:
scaler = MaxAbsScaler()
scaler.fit(train_x)
scaled_train_x = scaler.transform(train_x)
scaled_test_x = scaler.transform(test_x)

In [82]:
cls = MultinomialNB()
cls.fit(scaled_train_x, train_y)
predicted_y = cls.predict(scaled_test_x)
print(predicted_y.shape)

(2829,)


In [83]:
acc_score = accuracy_score(test_y, predicted_y)
print("Accuracy:", acc_score)

Accuracy: 0.838812301166


In [84]:
precision, recall, fscore, support = precision_recall_fscore_support(test_y, predicted_y)
print("Precision:\n", precision)
print("Recall:\n", recall)
print("F1-score:\n", fscore)
print("Support:\n", support)

Precision:
 [ 0.91666667  0.71052632  0.77380952  0.65        0.90441176  0.7721519
  0.86363636  0.87012987  0.91891892  0.98726115  0.91975309  0.79190751
  0.88235294  0.82        0.89240506  0.65957447  0.86754967  0.92352941
  0.88990826  0.88888889]
Recall:
 [ 0.80487805  0.81818182  0.45774648  0.82394366  0.75460123  0.87142857
  0.67375887  0.87012987  0.95774648  0.90643275  0.98026316  0.93197279
  0.78947368  0.9389313   0.90967742  0.95384615  0.95620438  0.97515528
  0.81512605  0.42105263]
F1-score:
 [ 0.85714286  0.76056338  0.57522124  0.72670807  0.82274247  0.81879195
  0.75697211  0.87012987  0.93793103  0.94512195  0.94904459  0.85625
  0.83333333  0.87544484  0.90095847  0.77987421  0.90972222  0.94864048
  0.85087719  0.57142857]
Support:
 [123 132 142 142 163 140 141 154 142 171 152 147 152 131 155 130 137 161
 119  95]


In [201]:
def evaluate(dataset, classifier, scaler):
    
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    
    # Since class counts are slightly skewed, use StratifiedKFold instead of plain KFold.
    # StratifiedKFold preserves the proportions of classes while sampling.
    #
    # What's the difference between StratifiedKFold, StratifiedKFold with shuffle=True, 
    # and StratifiedShuffleSplit? This question 
    # http://stackoverflow.com/questions/37635460/stratifiedkfold-vs-stratifiedshufflesplit-vs-stratifiedkfold-shuffle
    # asks exactly that but doesn't provide an exact answer...
    # http://scikit-learn.org/stable/modules/cross_validation.html probably answers it better, 
    # atleast the difference between Cross validation and shuffle splits, but needs to be understood
    # in detail.
    #
    #folds = KFold(dataset.data.shape[0], n_folds = 10, shuffle = True)
    folds = StratifiedKFold(dataset.target, n_folds = 10, shuffle = False)
        
    for training_rows, testing_rows in folds:
        train_x = dataset.data[training_rows]
        train_y = dataset.target[training_rows]

        test_x = dataset.data[testing_rows]
        test_y = dataset.target[testing_rows]

        scaled_train_x = scaler.fit_transform(train_x)
        scaled_test_x = scaler.transform(test_x)

        cls.fit(scaled_train_x, train_y)
        predicted_y = cls.predict(scaled_test_x)
        #cls.fit(train_x, train_y)
        #predicted_y = cls.predict(test_x)

        acc_score = accuracy_score(test_y, predicted_y)
        accuracies.append(acc_score)

        # This is the average precision across all classes.
        avg_prec = precision_score(test_y, predicted_y, average='macro')
        precisions.append(avg_prec)

        # This is the average recall across all classes.
        avg_recall = recall_score(test_y, predicted_y, average='macro')
        recalls.append(avg_recall)
    
    print("Accuracy:", np.mean(accuracies))
    print("Precision:", np.mean(precisions))
    print("Recall:", np.mean(recalls))
    

In [197]:
evaluate(data, MultinomialNB(fit_prior=False), MaxAbsScaler())

Accuracy: 0.849587833524
Precision: 0.858766856998
Recalls: 0.842873622222


In [200]:
evaluate(data, MultinomialNB(fit_prior=True), MaxAbsScaler())

Accuracy: 0.852327518887
Precision: 0.861582207013
Recalls: 0.845618342495


In [189]:
evaluate(data, MultinomialNB(), StandardScaler(with_mean=False))

Accuracy: 0.773277734168
Precision: 0.774682391808
Recalls: 0.773356405237


In [191]:
evaluate(data, RandomForestClassifier(), MaxAbsScaler())

Accuracy: 0.851326717706
Precision: 0.86123036074
Recalls: 0.844607339581


In [192]:
evaluate(data, LogisticRegression(), MaxAbsScaler())

Accuracy: 0.850261687895
Precision: 0.858972829991
Recalls: 0.843628910402


In [193]:
evaluate(data, GradientBoostingClassifier(), MaxAbsScaler())

Accuracy: 0.850818580052
Precision: 0.859536632568
Recalls: 0.844620412125


In [194]:
evaluate(data, SGDClassifier(), MaxAbsScaler())

Accuracy: 0.852492931905
Precision: 0.86253961696
Recalls: 0.845750733135


In [195]:
evaluate(data, XGBClassifier(), MaxAbsScaler())

Accuracy: 0.853105486186
Precision: 0.861934433513
Recalls: 0.846761403924
