In [1]:
import numpy as np
import os
from os.path import join, isdir, getsize, exists
# from nltk.stem.snowball import SnowballStemmer
import json
np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt
%matplotlib notebook

# Load and preprocess documents
**Load document labels**

In [2]:
segmented_path = u'./corpus/segmented-docs' # it will listdir into unicode
doc_labels = [fn for fn in os.listdir(segmented_path) if isdir(join(segmented_path, fn))] # list only folders
doc_labels_idx = {}
n_labels = len(doc_labels)
for i, label in enumerate(doc_labels):
    print i, label
    doc_labels_idx[label] = i
print 'Total Labels:', n_labels

0 บริหารธุรกิจ
1 ประมง
2 มนุษยศาสตร์
3 วนศาสตร์
4 วิทยาการจัดการ
5 วิทยาศาสตร์
6 วิทยาศาสตร์การกีฬา
7 วิศวกรรมศาสตร์
8 ศิลปศาสตร์และวิทยาศาสตร์
9 ศึกษาศาสตร์
10 ศึกษาศาสตร์และพัฒนศาสตร์
11 สถาปัตยกรรมศาสตร์
12 สังคมศาสตร์
13 สัตวแพทยศาสตร์
14 สิ่งแวดล้อม
15 อุตสาหกรรมเกษตร
16 เกษตร
17 เศรษฐศาสตร์
18 โครงการจัดตั้งวิทยาเขตสุพรรณบุรี
19 โครงการสหวิทยาการระดับบัณฑิตศึกษา
Total Labels: 20


**Load dataset**

In [3]:
%%time
dataset_contents, dataset_labels = [], []
for i, label in enumerate(doc_labels):
    curr_dir = join(segmented_path, label)
    fns = os.listdir(curr_dir)
    for fn in fns:
        file_path = join(curr_dir, fn)
        with open(file_path, 'r') as f:
            content = unicode(f.read(), 'utf8')
            dataset_contents.append(content)
            dataset_labels.append(i)
N = len(dataset_contents)
print 'Total Segmented Documents:', N

Total Segmented Documents: 2549
Wall time: 1min 15s


**Test English word stemmer from Natural Language Toolkit**

In [8]:
# stemmer = SnowballStemmer('english')
# test_words = u'reply represent representation representative expression cats feeling นำเสนอนะ'.split()
# for word in test_words:
#     print word, stemmer.stem(word)

reply repli
represent repres
representation represent
representative repres
expression express
cats cat
feeling feel
นำเสนอนะ นำเสนอนะ


**Define a function that trims words and replaces every pipe separator (|) with a space (stemming is currently disabled)**

In [4]:
def pretty_trim(text):
    """Turn a pipe-segmented document into space-separated words.

    The text is split on u'|', every piece is stripped of surrounding
    whitespace, pieces that end up empty are dropped, and the remaining
    pieces are joined with single spaces.
    """
    # Stemming is currently not applied here; words are only trimmed.
    kept_words = []
    for piece in text.split(u'|'):
        piece = piece.strip()
        if piece:  # skip pieces that were empty or whitespace-only
            kept_words.append(piece)
    return u' '.join(kept_words)

**Show sample content**

In [5]:
print 'Content:', dataset_contents[1][:2**9], '...'
print 'Label:', dataset_labels[1]

Content: i|49737869| |i| |page| |I|วิทยานิพนธ์| |P|การ|วิเคราะห์|ต้นทุน|ต่อ|หน่วย|ผลผลิต| |ใน|การผลิต|บัณฑิต|ระดับ|ปริญญาตรี|ของ|วิทยาลัย|เอกชน| |จังหวัด|สุราษฎร์ธานี| |i| |analysi| |of| |cost| |per| |output| |unit| |in| |produc| |undergradu| |iof| |a| |privat| |colleg| |in| |surat| |thani| |prov| |inc| |I|นางสาว|กา|ญ|จน|ธัช| |บัว|พา| |I|บัณฑิต|วิทยาลัย| |มหาวิทยาลัยเกษตรศาสตร์| |I|พ| |ศ| |๒๕๕๔| |I|ใบรับรอง|วิทยานิพนธ์| |I|บัณฑิต|วิทยาลัย| |มหาวิทยาลัยเกษตรศาสตร์| |I|บัญชีมหาบัณฑิต| |I|ปริญญา| |I|บัญชี| |บัญชี| |I|ส ...
Label: 0


**Show sample content after pretty_trim()**

In [6]:
print 'Content:', pretty_trim(dataset_contents[1][:2**9]), '...'
print 'Label Str:', doc_labels[dataset_labels[1]]

Content: i 49737869 i page I วิทยานิพนธ์ P การ วิเคราะห์ ต้นทุน ต่อ หน่วย ผลผลิต ใน การผลิต บัณฑิต ระดับ ปริญญาตรี ของ วิทยาลัย เอกชน จังหวัด สุราษฎร์ธานี i analysi of cost per output unit in produc undergradu iof a privat colleg in surat thani prov inc I นางสาว กา ญ จน ธัช บัว พา I บัณฑิต วิทยาลัย มหาวิทยาลัยเกษตรศาสตร์ I พ ศ ๒๕๕๔ I ใบรับรอง วิทยานิพนธ์ I บัณฑิต วิทยาลัย มหาวิทยาลัยเกษตรศาสตร์ I บัญชีมหาบัณฑิต I ปริญญา I บัญชี บัญชี I ส ...
Label Str: บริหารธุรกิจ


## Trim all documents

In [7]:
%%time
fp = u'./corpus/dataset_contents_trimmed.json'
if exists(fp):
    print 'Loading trimmed documents ...'
    with open(fp, 'r') as f:
        dataset_contents_trimmed = json.load(f)
else:
    print 'Trimming documents ...'
    dataset_contents_trimmed = map(pretty_trim, dataset_contents)
    print 'Dumping ...'
    # dumb into a big file for later use because this list is very costful to compute
    with open(fp, 'w') as f:
        json.dump(dataset_contents_trimmed, f, ensure_ascii=True)
print 'Size in GB:', getsize(fp) / 1024.0 / 1024.0 / 1024.0

Loading trimmed documents ...
Size in GB: 1.80299591459
Wall time: 1min 23s


In [8]:
# The raw pipe-segmented documents are not referenced in the cells below; only
# dataset_contents_trimmed is, so drop the name bound to the raw list.
del dataset_contents

**Count the number of words in each document**

In [9]:
%time dataset_words_count = np.array([len(content.split()) for content in dataset_contents_trimmed])
print 'Words Count Mean: ', np.mean(dataset_words_count)
dataset_words_count[:min(40,N)]

Wall time: 10.6 s
Words Count Mean:  42285.5268733


array([45472, 78883, 26280, 45636, 40991, 43624, 51190, 58068, 35827,
       48881, 39265, 55482, 27362, 35126, 34087, 52162, 42342, 38450,
       40987, 34572, 32222, 27636, 28945, 43572, 53255, 54756, 42671,
       29462, 43227, 40838, 53799, 68424, 38497, 48087, 84428, 55090,
       52365, 74338, 60454, 45600])

**Show word count histogram**

In [10]:
# Distribution of document lengths (in words) over the whole corpus.
fig, ax = plt.subplots()
ax.hist(dataset_words_count, bins=200)
ax.set_xlabel('Words Count')
ax.set_ylabel('Document Frequency')
plt.show()

<IPython.core.display.Javascript object>

# Machine Learning section

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import RFE
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, classification_report
from sklearn.cross_validation import train_test_split, ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.learning_curve import learning_curve
from sklearn.neural_network import BernoulliRBM
from collections import Counter
from scipy.sparse import vstack

## Train/Test Split
Split dataset into 2 parts and leave the test part untouched (not fitting it with any model)

Splitting with stratified sampling is useful if you want to test all labels, including the skewed low-frequency ones.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(dataset_contents_trimmed, np.array(dataset_labels),
                                                    test_size=0.2, stratify=dataset_labels, random_state=42)
print 'Train Size:', len(X_train)
print 'Test Size:', len(X_test)
train_counter, test_counter = Counter(y_train), Counter(y_test)
print 'Un-trained label:', list(set(xrange(n_labels)) - set(train_counter))
print 'Un-tested label:', list(set(xrange(n_labels)) - set(test_counter))

Train Size: 2039
Test Size: 510
Un-trained label: []
Un-tested label: []


### Plot bar chart of dataset frequency per label

In [13]:
# Per-label document counts for the train part, the test part and the whole dataset.
train_label_freqs = np.zeros(n_labels, np.int32)
test_label_freqs = np.zeros(n_labels, np.int32)
dataset_label_freqs = np.zeros(n_labels, np.int32)
for k,v in train_counter.iteritems():
    train_label_freqs[k] = v
for k,v in test_counter.iteritems():
    test_label_freqs[k] = v
for k,v in Counter(dataset_labels).iteritems():
    dataset_label_freqs[k] = v

label_positions = np.arange(n_labels)
plt.figure()
# The three series are drawn on top of each other: whole dataset, then train, then test.
# align='center' is passed explicitly because the default bar alignment differs
# between matplotlib versions ('edge' before 2.0, 'center' since); centring a
# width-1 bar on k covers the same span as an edge-aligned bar starting at k-0.5.
plt.bar(label_positions, dataset_label_freqs, 1, color='b', align='center')
plt.bar(label_positions, train_label_freqs, 1, color='g', align='center')
plt.bar(label_positions, test_label_freqs, 1, color='r', align='center')
plt.xticks(label_positions)
plt.xlabel('Label')
plt.ylabel('Frequency')
plt.legend(['Before Split','Train', 'Test'], loc='best')
plt.grid()
plt.show()

<IPython.core.display.Javascript object>

## Feature Extraction
### Bag of Words Representation
Initialize a vectorizer that counts word instances and apply Tfidf (Term-Frequency * Inverse-Document-Frequency) to them

In [20]:
## saving English stop words to disk
# from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# stop_words = ENGLISH_STOP_WORDS
# fp = 'stop_words.txt'
# with open(fp, 'w') as f:
#     words = u'\n'.join(sorted(stop_words))
#     f.write(words.encode('utf-8'))
# print len(stop_words)

318


In [14]:
## reading stop words from disk
fp = 'stop_words_unique.txt'
with open(fp, 'r') as f:
    stop_words = unicode(f.read(), 'utf-8')
stop_words = stop_words.split(u'\n')
print 'Stop Words:', len(stop_words)

Stop Words: 928


In [15]:
%%time
print 'Counting...'
# Count unigrams and bigrams (ngram_range=(1,2)), ignoring the stop words loaded
# above and keeping at most 100000 features.
counter = CountVectorizer(encoding=u'utf-8', stop_words=stop_words, ngram_range=(1,2), max_features=100000)
# The vocabulary is fitted on the training documents only; the test documents
# are merely transformed with it.
%time X_train_count = counter.fit_transform(X_train)
%time X_test_count = counter.transform(X_test)

print 'Transforming Tf-Idf...'
tfidf = TfidfTransformer()
# Same rule for the tf-idf weights: fit on the train counts, then apply to the test counts.
%time X_train_vectorized = tfidf.fit_transform(X_train_count)
%time X_test_vectorized = tfidf.transform(X_test_count)

Wall time: 5min 33s
Wall time: 1min 28s
Wall time: 1.16 s
Wall time: 143 ms
Wall time: 7min 4s


**Save extracted feature names to disk**

In [16]:
%%time
feature_names = counter.get_feature_names()
fn = 'feature_names.txt'
with open(fn, 'w') as f:
    f.write(u'\n'.join(feature_names).encode('utf8'))
print 'Check file %s to see all extracted feature names' % fn
print 'Total names:', len(feature_names)

Check file feature_names.txt to see all extracted feature names
Total names: 100000
Wall time: 522 ms


**Vectorized Dataset Statistics**

In [17]:
print 'Train Shape:', X_train_vectorized.shape
print 'Sample content of type %s:' % type(X_train_vectorized)
print X_train_vectorized

Train Shape: (2039, 100000)
Sample content of type <class 'scipy.sparse.csr.csr_matrix'>:
  (0, 70781)	0.000334392792156
  (0, 48414)	0.000426716430159
  (0, 47270)	0.000354022518015
  (0, 70200)	0.000392114472283
  (0, 88436)	0.000350918750751
  (0, 19252)	0.000385705558385
  (0, 44093)	0.000472689820817
  (0, 87348)	0.0002762557756
  (0, 81345)	0.000345989810409
  (0, 41589)	0.000354022518015
  (0, 81346)	0.000256439822049
  (0, 89576)	0.000269877556425
  (0, 38140)	0.000282267905987
  (0, 88227)	0.000355083357605
  (0, 38035)	0.000338664230048
  (0, 49087)	0.000275022052566
  (0, 61403)	0.0005525115512
  (0, 88412)	0.000381203122338
  (0, 69862)	0.000392114472283
  (0, 75994)	0.000290204840401
  (0, 89375)	0.000339545452892
  (0, 88988)	0.000325702234665
  (0, 70481)	0.000308176996529
  (0, 88825)	0.000375555210436
  (0, 56487)	0.000218949431904
  :	:
  (2038, 57916)	0.0327968821608
  (2038, 56946)	0.00217381013027
  (2038, 97103)	0.00196081606049
  (2038, 7521)	0.00174326979335
  (

## Feature Selection

Recursive feature elimination using the weights of a linear model (a perceptron trained with stochastic gradient descent)

In [138]:
%%time
rfe = RFE(SGDClassifier(loss='perceptron', n_iter=7), n_features_to_select=20000, step=0.2, verbose=1)
X_train_selected = rfe.fit_transform(X_train_vectorized, y_train)
X_test_selected = rfe.transform(X_test_vectorized)
print X_train_selected.shape, X_test_selected.shape

Fitting estimator with 100000 features.
Fitting estimator with 80000 features.
Fitting estimator with 60000 features.
Fitting estimator with 40000 features.
(2039, 20000) (510, 20000)
Wall time: 19.5 s


**Save top feature names to file**

In [139]:
%%time
top_features = np.array([feature for feature, support in zip(feature_names, rfe.support_) if support])
file_name = 'feature_names_top.txt'
with open(file_name, 'w') as f:
    f.write(u'\n'.join(top_features).encode('utf8'))
print 'Go check file %s' % file_name

Go check file feature_names_top.txt
Wall time: 151 ms


**Save word count per label to disk**

In [140]:
%%time
X_train_count_support = X_train_count[:,rfe.support_].T
m, n = X_train_count_support.shape[0], len(doc_labels)
freqs = np.empty((m, n), np.int32)

for label in xrange(n):
    freqs[:,label] = X_train_count_support[:,y_train==label].sum(axis=1).flatten()

with open('word_count_per_label.txt', 'w') as f:
    for i in xrange(m):
        line = []
        for j in xrange(n):
            line.append(str(freqs[i,j]))
        f.write('\t'.join(line) + '\n')

print freqs.shape

(20000L, 20L)
Wall time: 1.88 s


## Dimensionality Reduction

#### Unsupervised non-linear dimension reduction

Pre-training with Bernoulli Restricted Boltzmann Machine

In [136]:
# %%time
# rbm = BernoulliRBM(n_components=50, learning_rate=0.2, batch_size=20, n_iter=20, random_state=42, verbose=1)
# X_train_rbm = rbm.fit_transform(X_train_selected)
# X_test_rbm = rbm.transform(X_test_selected)
# print X_train_rbm.shape

In [137]:
# print rbm.components_.shape
# print rbm.components_ # weight of each edge, components_[i,j] = weight of edge from hidden node i to visible node j

Truncated SVD (Singular Value Decomposition) is called Latent Semantic Analysis (LSA) in the context of text analysis

In [345]:
# %%time
# svd = TruncatedSVD(n_components=200) # works on sparse data
# X_train_reduced = svd.fit_transform(X_train_selected)
# X_test_reduced = svd.transform(X_test_selected)
# print 'Train Shape:', X_train_reduced.shape
# print 'Explained Variance Ratio Sum:', svd.explained_variance_ratio_.sum()
# print 'Top 5 Explained Variance Ratio:', svd.explained_variance_ratio_[:5]

Train Shape: (620L, 200L)
Explained Variance Ratio Sum: 0.680931519226
Top 5 Explained Variance Ratio: [ 0.08757407  0.0295904   0.01745794  0.01065465  0.01066833]
Wall time: 1.93 s


## Training models

In [141]:
# The models below are trained and scored on the RFE-selected features.
X_train_final, X_test_final = X_train_selected, X_test_selected

In [142]:
%%time
models = [MultinomialNB(), DecisionTreeClassifier(max_depth=15, min_samples_split=5, max_features=None),
          SGDClassifier(n_iter=20), RandomForestClassifier()]
for clf in models:
    print 'Training', type(clf).__name__
    %time clf.fit(X_train_final, y_train)

Training MultinomialNB
Wall time: 199 ms
Training DecisionTreeClassifier
Wall time: 7.99 s
Training SGDClassifier
Wall time: 3.2 s
Training RandomForestClassifier
Wall time: 1.09 s
Wall time: 12.5 s


## Models Scoring
Evaluate on both train and test set

In [143]:
# define decision tree evaluator
def write_dt_features(dt, filename='feature_names_decision-tree.txt', feature_names=None):
    """Write the features a fitted tree uses to a text file, most important first.

    Each output line is "<feature name><TAB><importance>". Features whose
    importance is zero are left out. The number of lines written is printed.

    Parameters
    ----------
    dt : fitted estimator exposing ``feature_importances_``.
    filename : path of the UTF-8 text file to (over)write.
    feature_names : sequence where ``feature_names[k]`` names column ``k`` of the
        matrix ``dt`` was fitted on. Defaults to the notebook-level ``top_features``.
    """
    import io  # local import: io.open handles the UTF-8 encoding on Python 2 and 3 alike

    if feature_names is None:
        feature_names = top_features
    im = dt.feature_importances_
    indices = np.argsort(im)[::-1]  # column indices by decreasing importance
    n_written = 0
    # `with` guarantees the file is closed even if a write fails half-way.
    with io.open(filename, 'w', encoding='utf8') as f:
        for idx in indices:
            name, val = feature_names[idx], im[idx]
            if not val:
                break  # sorted in decreasing order: first zero ends the useful part
            f.write(u'%s\t%s\n' % (name, str(val)))
            n_written += 1
    # Count the written lines explicitly: the loop index would be one short when
    # the loop ends without meeting a zero, and undefined for an empty input.
    print('Total important features: %d' % n_written)

In [144]:
# models[1] is the DecisionTreeClassifier from the models list
write_dt_features(models[1])

Total important features: 86


In [145]:
for clf in models:
    print type(clf).__name__
    for X,y,t in [(X_train_final, y_train, 'Train'), (X_test_final, y_test, 'Test')]:
        pred = clf.predict(X) # change to reduced or selected for 2 ways of reducing dimensions
        print t, 'dataset'
        print 'Accuracy Score:', accuracy_score(y, pred)
        print 'Precision Recall F-Score:\n', precision_recall_fscore_support(y, pred, average='weighted')
    print

MultinomialNB
Train dataset
Accuracy Score: 0.534575772437
Precision Recall F-Score:
(0.53503047668472015, 0.53457577243746934, 0.43501266845798869, None)
Test dataset
Accuracy Score: 0.490196078431
Precision Recall F-Score:
(0.38852503985501025, 0.49019607843137253, 0.37644081191898271, None)

DecisionTreeClassifier
Train dataset
Accuracy Score: 0.923982344286
Precision Recall F-Score:
(0.93696200297300425, 0.9239823442864149, 0.92373927734726691, None)
Test dataset
Accuracy Score: 0.817647058824
Precision Recall F-Score:
(0.81935841566979684, 0.81764705882352939, 0.81335647821424351, None)

SGDClassifier
Train dataset
Accuracy Score: 0.998528690535
Precision Recall F-Score:
(0.99853834713573619, 0.99852869053457582, 0.99840663231623172, None)
Test dataset
Accuracy Score: 0.782352941176
Precision Recall F-Score:
(0.76255834350035312, 0.78235294117647058, 0.76878058918888503, None)

RandomForestClassifier
Train dataset
Accuracy Score: 0.998038254046
Precision Recall F-Score:
(0.9980511

## Train a Model with Cross-Validation Set

Tune the model's hyper-parameters to give high K-Fold CV score

In [157]:
# Grid for the SGD classifier: hinge loss and 35 iterations are fixed, only alpha is searched.
params = {
    'loss': ['hinge'],
    'alpha': [5e-4, 1e-4, 5e-5],
    'n_iter': [35],
}
gs1 = GridSearchCV(SGDClassifier(random_state=42), params,
                   scoring='f1_weighted', cv=3, verbose=3)

In [158]:
# Grid for the decision tree: gini criterion is fixed, depth and split size are searched.
params = {
    'criterion': ['gini'],
    'max_depth': [5, 15],
    'min_samples_split': [2, 5, 10],
}
gs2 = GridSearchCV(DecisionTreeClassifier(random_state=42), params,
                   scoring='f1_weighted', cv=3, verbose=3)

In [160]:
gs_list = [gs1, gs2]
clfs = []
for gs in gs_list:
    print 'Training', type(gs.estimator).__name__
    %time gs.fit(X_train_final, y_train)
    print gs.best_estimator_
    clfs.append(gs.best_estimator_)
    print gs.best_params_
    print gs.best_score_

 Training SGDClassifier
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] alpha=0.0005, n_iter=35, loss=hinge .............................
[CV] .... alpha=0.0005, n_iter=35, loss=hinge, score=0.756310 -   3.8s
[CV] alpha=0.0005, n_iter=35, loss=hinge .............................
[CV] .... alpha=0.0005, n_iter=35, loss=hinge, score=0.748457 -   3.9s
[CV] alpha=0.0005, n_iter=35, loss=hinge .............................
[CV] .... alpha=0.0005, n_iter=35, loss=hinge, score=0.764411 -   3.9s
[CV] alpha=0.0001, n_iter=35, loss=hinge .............................
[CV] .... alpha=0.0001, n_iter=35, loss=hinge, score=0.763830 -   3.6s
[CV] alpha=0.0001, n_iter=35, loss=hinge .............................
[CV] .... alpha=0.0001, n_iter=35, loss=hinge, score=0.766016 -   3.6s
[CV] alpha=0.0001, n_iter=35, loss=hinge .............................
[CV] .... alpha=0.0001, n_iter=35, loss=hinge, score=0.766614 -   3.9s
[CV] alpha=5e-05, n_iter=35, loss=hinge .........................

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   34.4s finished


Wall time: 40.9 s
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=35, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)
{'alpha': 0.0001, 'n_iter': 35, 'loss': 'hinge'}
0.765478100082
Training DecisionTreeClassifier
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] min_samples_split=5, criterion=gini, max_depth=5 ................
[CV]  min_samples_split=5, criterion=gini, max_depth=5, score=0.516802 -   2.3s
[CV] min_samples_split=5, criterion=gini, max_depth=5 ................
[CV]  min_samples_split=5, criterion=gini, max_depth=5, score=0.544829 -   2.6s
[CV] min_samples_split=5, criterion=gini, max_depth=5 ................
[CV]  min_samples_split=5, criterion=gini, max_depth=5, score=0.558352 -   2.4s
[CV] min_samples_split=10, criterion=gini, max_depth=5 ...............
[CV]  

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed:  2.2min


[CV]  min_samples_split=10, criterion=gini, max_depth=None, score=0.773782 -   6.4s
[CV] min_samples_split=10, criterion=gini, max_depth=None ............
[CV]  min_samples_split=10, criterion=gini, max_depth=None, score=0.798882 -   5.6s
[CV] min_samples_split=20, criterion=gini, max_depth=None ............
[CV]  min_samples_split=20, criterion=gini, max_depth=None, score=0.800927 -   4.7s
[CV] min_samples_split=20, criterion=gini, max_depth=None ............
[CV]  min_samples_split=20, criterion=gini, max_depth=None, score=0.768715 -   5.1s
[CV] min_samples_split=20, criterion=gini, max_depth=None ............
[CV]  min_samples_split=20, criterion=gini, max_depth=None, score=0.789616 -   5.3s


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  2.7min finished


Wall time: 2min 48s
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best')
{'min_samples_split': 5, 'criterion': 'gini', 'max_depth': 15}
0.795589471826


**Save models' state to disk**

In [161]:
%%time

# SVM
coef = clfs[0].coef_.T
m, n = coef.shape
with open('coef.txt', 'w') as f:
    for i in xrange(m):
        line = []
        for j in xrange(n):
            line.append(str(coef[i,j]))
        f.write('\t'.join(line) + '\n')

print m, n

# decision tree
write_dt_features(clfs[1])

20000 20
Total important features: 84
Wall time: 777 ms


In [162]:
for clf in clfs:
    print type(clf).__name__
    for X,y,t in [(X_train_final, y_train, 'Train'), (X_test_final, y_test, 'Test')]:
        pred = clf.predict(X) # change to reduced or selected to change ways of reducing dimensions
        print '=>', t, 'dataset'
        print 'Accuracy Score:', accuracy_score(y, pred)
        print 'Precision Recall F-Score:\n', precision_recall_fscore_support(y, pred, average='weighted')
        print
print 'Baseline score by chance:', 1.0 / n_labels, '(assume that an algorithm randomly guesses the label)'

SGDClassifier
=> Train dataset
Accuracy Score: 0.998528690535
Precision Recall F-Score:
(0.99853834713573619, 0.99852869053457582, 0.99840663231623172, None)

=> Test dataset
Accuracy Score: 0.786274509804
Precision Recall F-Score:
(0.7671358535145143, 0.78627450980392155, 0.77119820668595818, None)

DecisionTreeClassifier
=> Train dataset
Accuracy Score: 0.923982344286
Precision Recall F-Score:
(0.93704357195575017, 0.9239823442864149, 0.92374477160591117, None)

=> Test dataset
Accuracy Score: 0.827450980392
Precision Recall F-Score:
(0.82845988847434315, 0.82745098039215681, 0.82211694844711702, None)

Baseline score by chance: 0.05 (assume that an algorithm randomly guesses the label)


## Model Evaluation Metrics
Visualize confusion matrix and show classification report

In [163]:
y_true = y_test
# test-set predictions of every tuned model, in the same order as clfs
y_preds = [clf.predict(X_test_final) for clf in clfs]

### Confusion Matrix
Visualize true positives and false positives

In [164]:
def plot_confusion_matrix(cm, title='Confusion matrix', model_name='Model', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array-like; rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    title : text of the plot title; ``model_name`` is appended in parentheses.
    model_name : name shown in the title.
    cmap : matplotlib colormap used for the cells.
    """
    cm = np.asarray(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title + ' (%s)' % model_name)
    plt.colorbar()
    # One tick per row/column of the matrix actually drawn (rather than the
    # notebook-level n_labels), so ticks stay aligned if cm covers fewer classes.
    plt.xticks(np.arange(cm.shape[1]), rotation=0)
    plt.yticks(np.arange(cm.shape[0]))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # tight_layout runs last so the axis labels are part of the layout it computes.
    plt.tight_layout()

In [167]:
# Two figures per model: raw counts, then the row-normalised matrix.
for clf, y_pred in zip(clfs, y_preds):
    model_name = type(clf).__name__
    cm = confusion_matrix(y_true, y_pred)
    plt.figure()
    plot_confusion_matrix(cm, model_name=model_name)

    # Normalize the confusion matrix by row (i.e by the number of samples in each class)
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm_normalized = cm.astype('float') / row_totals
    plt.figure()
    plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix', model_name=model_name)

plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Classification Report
Show scoring like precision, recall, f1 and their average for each label

In [168]:
for y_pred, clf in zip(y_preds, clfs):
    print type(clf).__name__
    print classification_report(y_true, y_pred, target_names=None)

SGDClassifier
             precision    recall  f1-score   support

          0       0.84      0.80      0.82        20
          1       0.71      0.67      0.69        15
          2       0.92      0.92      0.92        26
          3       0.74      0.83      0.78        24
          4       0.00      0.00      0.00         1
          5       0.62      0.70      0.66        50
          6       0.75      1.00      0.86         6
          7       0.84      0.95      0.89        94
          8       0.00      0.00      0.00         5
          9       0.88      0.97      0.92        62
         10       0.00      0.00      0.00         1
         11       1.00      0.33      0.50         3
         12       0.94      0.73      0.82        22
         13       0.00      0.00      0.00         4
         14       0.62      0.42      0.50        12
         15       0.79      0.66      0.72        41
         16       0.77      0.77      0.77        65
         17       0.78      0.9

## Learning Curves
Watch the performance of our chosen model as the training size increases, and check whether it suffers from high variance, high bias, or something in between

In [173]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 10)):
    """Plot weighted-F1 learning curves (training and cross-validation) for an estimator.

    Opens a new figure, runs ``learning_curve`` with ``scoring='f1_weighted'``,
    prints the mean train and cross-validation scores per training size, and
    draws both mean curves with a +/- one standard deviation band.

    Parameters
    ----------
    estimator : estimator passed on to ``learning_curve``.
    title : figure title.
    X, y : features and labels passed on to ``learning_curve``.
    ylim : optional (low, high) pair for the y axis.
    cv : passed on to ``learning_curve`` unchanged; when it is an integer it is
        also shown as the fold count in the legend.
    n_jobs : passed on to ``learning_curve``.
    train_sizes : training-set sizes passed on to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("F1-Score Weighted of CVs")
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, scoring='f1_weighted',
                                                            n_jobs=n_jobs, train_sizes=train_sizes, verbose=1)
    # Scores have one row per training size and one column per CV split:
    # aggregate over the splits.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    print('Train scores mean: %s' % (train_scores_mean,))
    print('Test scores mean: %s' % (test_scores_mean,))
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")

    # cv may be None (the default) or a cross-validation generator object, neither
    # of which can be formatted with %d; only mention a fold count for integers.
    if isinstance(cv, (int, np.integer)):
        cv_label = "%d-Fold Cross-validation score" % cv
    else:
        cv_label = "Cross-validation score"
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label=cv_label)

    plt.legend(loc="best")
    return plt

In [174]:
%%time
# cv = ShuffleSplit(X_train_selected.shape[0], n_iter=5, test_size=0.2, random_state=42)
X = vstack((X_train_final, X_test_final))
y = np.concatenate((y_train, y_test))
for clf in clfs:
    print type(clf).__name__
    title = 'Learning Curves (%s)' % type(clf).__name__
    %time plot_learning_curve(clf, title, X, y, ylim=(-0.05, 1.05), cv=5, n_jobs=1)
plt.show()

SGDClassifier


<IPython.core.display.Javascript object>

[learning_curve] Training set sizes: [ 203  406  609  812 1015 1218 1421 1624 1827 2031]


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:  3.0min


Train scores mean: [ 1.          1.          1.          1.          1.          1.
  0.99939981  0.99947476  0.99845987  0.99798545]
Test scores mean: [ 0.58489055  0.68523505  0.70647668  0.72289808  0.74125324  0.75372615
  0.76030212  0.77252941  0.78092697  0.78710877]
DecisionTreeClassifier


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.1min finished


<IPython.core.display.Javascript object>

[learning_curve] Training set sizes: [ 203  406  609  812 1015 1218 1421 1624 1827 2031]


  'recall', 'true', average, warn_for)
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:  3.1min


Train scores mean: [ 0.93235814  0.95539395  0.92653673  0.90515388  0.92825443  0.9361992
  0.93577326  0.93833518  0.94207887  0.92881786]
Test scores mean: [ 0.61234652  0.73650371  0.75653079  0.74532495  0.76704299  0.77331771
  0.78815906  0.7904081   0.8042665   0.80867882]
Wall time: 6min 24s


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.2min finished
