In [1]:
import numpy as np
import os
from os.path import join, isdir, getsize, exists
# from nltk.stem.snowball import SnowballStemmer
import json
np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt
%matplotlib notebook

# Load and preprocess documents
**Load document labels**

In [2]:
segmented_path = u'./corpus/segmented-docs' # it will listdir into unicode
doc_labels = [fn for fn in os.listdir(segmented_path) if isdir(join(segmented_path, fn))] # list only folders
doc_labels_idx = {}
n_labels = len(doc_labels)
for i, label in enumerate(doc_labels):
    print i, label
    doc_labels_idx[label] = i
print 'Total Labels:', n_labels

0 บริหารธุรกิจ
1 ประมง
2 มนุษยศาสตร์
3 วนศาสตร์
4 วิทยาการจัดการ
5 วิทยาศาสตร์
6 วิทยาศาสตร์การกีฬา
7 วิศวกรรมศาสตร์
8 ศิลปศาสตร์และวิทยาศาสตร์
9 ศึกษาศาสตร์
10 ศึกษาศาสตร์และพัฒนศาสตร์
11 สถาปัตยกรรมศาสตร์
12 สังคมศาสตร์
13 สัตวแพทยศาสตร์
14 สิ่งแวดล้อม
15 อุตสาหกรรมเกษตร
16 เกษตร
17 เศรษฐศาสตร์
18 โครงการจัดตั้งวิทยาเขตสุพรรณบุรี
19 โครงการสหวิทยาการระดับบัณฑิตศึกษา
Total Labels: 20


**Load dataset**

In [3]:
%%time
dataset_contents, dataset_labels = [], []
for i, label in enumerate(doc_labels):
    curr_dir = join(segmented_path, label)
    fns = os.listdir(curr_dir)
    for fn in fns:
        file_path = join(curr_dir, fn)
        with open(file_path, 'r') as f:
            content = unicode(f.read(), 'utf8')
            dataset_contents.append(content)
            dataset_labels.append(i)
N = len(dataset_contents)
print 'Total Segmented Documents:', N

Total Segmented Documents: 2549
Wall time: 1min 1s


**Test English word stemmer from Natural Language Toolkit**

In [8]:
# stemmer = SnowballStemmer('english')
# test_words = u'reply represent representation representative expression cats feeling นำเสนอนะ'.split()
# for word in test_words:
#     print word, stemmer.stem(word)

reply repli
represent repres
representation represent
representative repres
expression express
cats cat
feeling feel
นำเสนอนะ นำเสนอนะ


**Define a function that trims each word and replaces every pipe (`|`) delimiter with a single space** (stemming is currently disabled)

In [4]:
def pretty_trim(text):
    """Turn a pipe-delimited token string into a space-separated one.

    The text is cut at every `|`; each piece is stripped of surrounding
    whitespace, pieces that end up empty are discarded, and the remaining
    pieces are joined with single spaces.

    :param text: unicode string whose tokens are separated by `|`
    :return: unicode string with the non-empty tokens separated by one space
    """
    # NOTE: English stemming (stemmer.stem(word)) used to be applied here and is disabled.
    kept = []
    for piece in text.split(u'|'):
        token = piece.strip()
        if token:  # drop pieces that were empty or whitespace-only
            kept.append(token)
    return u' '.join(kept)

**Show sample content**

In [7]:
# Peek at the first 2**9 (= 512) characters of the raw, pipe-delimited text of
# document 1, followed by its numeric label id.
print 'Content:', dataset_contents[1][:2**9], '...'
print 'Label:', dataset_labels[1]

Content: i|49737869| |i| |page| |I|วิทยานิพนธ์| |P|การ|วิเคราะห์|ต้นทุน|ต่อ|หน่วย|ผลผลิต| |ใน|การผลิต|บัณฑิต|ระดับ|ปริญญาตรี|ของ|วิทยาลัย|เอกชน| |จังหวัด|สุราษฎร์ธานี| |i| |analysi| |of| |cost| |per| |output| |unit| |in| |produc| |undergradu| |iof| |a| |privat| |colleg| |in| |surat| |thani| |prov| |inc| |I|นางสาว|กา|ญ|จน|ธัช| |บัว|พา| |I|บัณฑิต|วิทยาลัย| |มหาวิทยาลัยเกษตรศาสตร์| |I|พ| |ศ| |๒๕๕๔| |I|ใบรับรอง|วิทยานิพนธ์| |I|บัณฑิต|วิทยาลัย| |มหาวิทยาลัยเกษตรศาสตร์| |I|บัญชีมหาบัณฑิต| |I|ปริญญา| |I|บัญชี| |บัญชี| |I|ส ...
Label: 0


**Show sample content after `pretty_trim()`**

In [6]:
# Same 512-character excerpt of document 1, passed through pretty_trim, and the
# label shown as its folder name instead of its numeric id.
print 'Content:', pretty_trim(dataset_contents[1][:2**9]), '...'
print 'Label Str:', doc_labels[dataset_labels[1]]

Content: i 49737869 i page I วิทยานิพนธ์ P การ วิเคราะห์ ต้นทุน ต่อ หน่วย ผลผลิต ใน การผลิต บัณฑิต ระดับ ปริญญาตรี ของ วิทยาลัย เอกชน จังหวัด สุราษฎร์ธานี i analysi of cost per output unit in produc undergradu iof a privat colleg in surat thani prov inc I นางสาว กา ญ จน ธัช บัว พา I บัณฑิต วิทยาลัย มหาวิทยาลัยเกษตรศาสตร์ I พ ศ ๒๕๕๔ I ใบรับรอง วิทยานิพนธ์ I บัณฑิต วิทยาลัย มหาวิทยาลัยเกษตรศาสตร์ I บัญชีมหาบัณฑิต I ปริญญา I บัญชี บัญชี I ส ...
Label Str: บริหารธุรกิจ


## Trim all documents

In [8]:
%%time
# Trim every document once and cache the result as JSON; later runs load the cache
# instead of recomputing.
# NOTE(review): the cached list is matched to dataset_labels by position only, so the
# cache is valid only while the documents are loaded in the same order.
fp = u'./corpus/dataset_contents_trimmed.json'
if exists(fp):
    print 'Loading trimmed documents ...'
    with open(fp, 'r') as f:
        dataset_contents_trimmed = json.load(f)
else:
    print 'Trimming documents ...'
    dataset_contents_trimmed = map(pretty_trim, dataset_contents)
    print 'Dumping ...'
    # dump into one big file for later reuse because this list is very costly to compute
    with open(fp, 'w') as f:
        json.dump(dataset_contents_trimmed, f, ensure_ascii=True)
# Report the size of the cache file on disk.
print 'Size in GB:', getsize(fp) / 1024.0 / 1024.0 / 1024.0

Trimming documents ...
Dumping ...
Size in GB: 1.80299591459
Wall time: 2min 6s


In [9]:
# Release the raw documents; the cells below work on dataset_contents_trimmed.
del dataset_contents

**Count the number of words in each document**

In [10]:
# Word count per document: trimmed documents are space-separated, so split() yields the words.
%time dataset_words_count = np.array([len(content.split()) for content in dataset_contents_trimmed])
print 'Words Count Mean: ', np.mean(dataset_words_count)
# Display the counts of (at most) the first 40 documents as the cell output.
dataset_words_count[:min(40,N)]

Wall time: 12.1 s
Words Count Mean:  42285.5268733


array([45472, 78883, 26280, 45636, 40991, 43624, 51190, 58068, 35827,
       48881, 39265, 55482, 27362, 35126, 34087, 52162, 42342, 38450,
       40987, 34572, 32222, 27636, 28945, 43572, 53255, 54756, 42671,
       29462, 43227, 40838, 53799, 68424, 38497, 48087, 84428, 55090,
       52365, 74338, 60454, 45600])

**Show word count histogram**

In [11]:
# Histogram of document lengths (in words) over the whole corpus, 200 bins.
fig, ax = plt.subplots()
ax.hist(dataset_words_count, bins=200)
ax.set_xlabel('Words Count')
ax.set_ylabel('Document Frequency')
plt.show()

<IPython.core.display.Javascript object>

# Machine Learning section

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import RFE
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, classification_report
from sklearn.cross_validation import train_test_split, ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.learning_curve import learning_curve
from sklearn.neural_network import BernoulliRBM
from collections import Counter
from scipy.sparse import vstack

## Train/Test Split
Split dataset into 2 parts and leave the test part untouched (not fitting it with any model)

Splitting with stratified sampling is useful if you want every label, including the skewed low-frequency ones, to be represented in the test set.

In [13]:
# Hold out 20% of the documents as the test set. The split is stratified on the labels
# and seeded (random_state=42) so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(dataset_contents_trimmed, np.array(dataset_labels),
                                                    test_size=0.2, stratify=dataset_labels, random_state=42)
print 'Train Size:', len(X_train)
print 'Test Size:', len(X_test)
# Per-label document counts in each part.
train_counter, test_counter = Counter(y_train), Counter(y_test)
# Label ids that have no document at all in the train / test part.
print 'Un-trained label:', list(set(xrange(n_labels)) - set(train_counter))
print 'Un-tested label:', list(set(xrange(n_labels)) - set(test_counter))

Train Size: 2039
Test Size: 510
Un-trained label: []
Un-tested label: []


### Plot bar chart of dataset frequency per label

In [14]:
def _label_freqs(counter):
    """Return an int32 vector whose k-th entry is counter[k] (0 when label k is absent)."""
    return np.array([counter[k] for k in xrange(n_labels)], dtype=np.int32)

train_label_freqs = _label_freqs(train_counter)
test_label_freqs = _label_freqs(test_counter)
dataset_label_freqs = _label_freqs(Counter(dataset_labels))

# Overlay the three series on the same bar positions: whole dataset first, then the
# train part, then the test part drawn on top.
bar_left = np.arange(n_labels) - 0.5
plt.figure()
for label_freqs, bar_color in ((dataset_label_freqs, 'b'),
                               (train_label_freqs, 'g'),
                               (test_label_freqs, 'r')):
    plt.bar(bar_left, label_freqs, 1, color=bar_color)
plt.xticks(np.arange(len(doc_labels)))
plt.xlabel('Label')
plt.ylabel('Frequency')
plt.legend(['Before Split','Train', 'Test'], loc='best')
plt.grid()
plt.show()

<IPython.core.display.Javascript object>

## Feature Extraction
### Bag of Words Representation
Initialize a vectorizer that counts word instances and apply Tfidf (Term-Frequency * Inverse-Document-Frequency) to them

In [20]:
## saving English stop words to disk
# from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# stop_words = ENGLISH_STOP_WORDS
# fp = 'stop_words.txt'
# with open(fp, 'w') as f:
#     words = u'\n'.join(sorted(stop_words))
#     f.write(words.encode('utf-8'))
# print len(stop_words)

318


In [15]:
## reading stop words from disk
fp = 'stop_words_unique.txt'
with open(fp, 'r') as f:
    stop_words = unicode(f.read(), 'utf-8')
stop_words = stop_words.split(u'\n')
print 'Stop Words:', len(stop_words)

Stop Words: 928


In [28]:
%%time
# Bag of words: raw term counts (binary=False), no vocabulary size cap (max_features=None),
# with the stop words loaded above removed. The vocabulary is learned from the training
# documents only; the test documents are merely transformed.
counter = CountVectorizer(encoding=u'utf-8', stop_words=stop_words, binary=False, max_features=None)
X_train_count = counter.fit_transform(X_train)
X_test_count = counter.transform(X_test)

# Re-weight the counts with TF-IDF, again fitted on the training part only.
tfidf = TfidfTransformer()
X_train_vectorized = tfidf.fit_transform(X_train_count)
X_test_vectorized = tfidf.transform(X_test_count)

Wall time: 2min 5s


**Save extracted feature names to disk**

In [30]:
%%time
# Write the vectorizer's vocabulary to a UTF-8 text file, one feature name per line.
feature_names = counter.get_feature_names()
fn = 'feature_names.txt'
with open(fn, 'w') as f:
    f.write(u'\n'.join(feature_names).encode('utf8'))
print 'Check file %s to see all extracted feature names' % fn
print 'Total names:', len(feature_names)

Check file feature_names.txt to see all extracted feature names
Total names: 410640
Wall time: 1.08 s


**Vectorized Dataset Statistics**

In [31]:
# Shape (documents x vocabulary size) and type of the TF-IDF training matrix,
# followed by a printout of its stored entries.
print 'Train Shape:', X_train_vectorized.shape
print 'Sample content of type %s:' % type(X_train_vectorized)
print X_train_vectorized

Train Shape: (2039, 410640)
Sample content of type <class 'scipy.sparse.csr.csr_matrix'>:
  (0, 344157)	0.000350485658761
  (0, 272686)	0.000260691259815
  (0, 344378)	0.000501738353321
  (0, 384604)	0.000321028805851
  (0, 182675)	0.000235626575489
  (0, 402596)	0.000140068282982
  (0, 391189)	0.000393929829697
  (0, 401384)	0.000366521382331
  (0, 402408)	0.000344297171214
  (0, 305647)	0.000702014519535
  (0, 402502)	0.0002010295581
  (0, 382250)	0.000477822866432
  (0, 406328)	0.000412309198642
  (0, 399715)	0.000501738353321
  (0, 385389)	0.000243417713819
  (0, 380984)	0.000581576436882
  (0, 408669)	0.000615069472268
  (0, 385237)	0.000362277477372
  (0, 291479)	0.000526899083619
  (0, 385348)	0.000615069472268
  (0, 402173)	0.000412305501467
  (0, 390128)	0.000370300493514
  (0, 389558)	0.000568131696824
  (0, 180044)	0.000335637778354
  (0, 400423)	0.000504000245778
  :	:
  (2038, 391905)	0.0393234432518
  (2038, 391210)	0.00260639712271
  (2038, 408653)	0.00235101735293
  (20

## Feature Selection

Recursive feature elimination using weights of Linear Kernel Support Vector Machine

In [32]:
%%time
rfe = RFE(SGDClassifier(loss='hinge'), n_features_to_select=20000, step=0.2)
X_train_selected = rfe.fit_transform(X_train_vectorized, y_train)
X_test_selected = rfe.transform(X_test_vectorized)
print X_train_selected.shape, X_test_selected.shape

(2039, 20000) (510, 20000)
Wall time: 10.4 s


**Save top feature names to file**

In [33]:
%%time
# Keep the names of the features that RFE retained (rfe.support_ is the boolean mask over
# feature_names) and write them to a UTF-8 text file, one per line.
top_features = [feature for feature, support in zip(feature_names, rfe.support_) if support]
file_name = 'feature_names_top.txt'
with open(file_name, 'w') as f:
    f.write(u'\n'.join(top_features).encode('utf8'))
print 'Go check file %s' % file_name

Go check file feature_names_top.txt
Wall time: 249 ms


**Save model weights to disk**

In [34]:
%%time
coef = rfe.estimator_.coef_.T
m, n = coef.shape
with open('coef.txt', 'w') as f:
    for i in xrange(m):
        line = []
        for j in xrange(n):
            line.append(str(coef[i,j]))
        f.write('\t'.join(line) + '\n')

print m, n

20000 20
Wall time: 770 ms


**Save word count per label to disk**

In [35]:
%%time
X_train_count_support = X_train_count[:,rfe.support_].T
m, n = X_train_count_support.shape[0], len(doc_labels)
freqs = np.empty((m, n), np.int32)

for label in xrange(n):
    freqs[:,label] = X_train_count_support[:,y_train==label].sum(axis=1).flatten()

with open('word_count_per_label.txt', 'w') as f:
    for i in xrange(m):
        line = []
        for j in xrange(n):
            line.append(str(freqs[i,j]))
        f.write('\t'.join(line) + '\n')

print freqs.shape

(20000L, 20L)
Wall time: 1.3 s


## Dimensionality Reduction

#### Unsupervised non-linear dimension reduction

Pre-training with Bernoulli Restricted Boltzmann Machine

In [91]:
# %%time
# rbm = BernoulliRBM(n_components=50, learning_rate=0.2, batch_size=20, n_iter=20, random_state=42, verbose=1)
# X_train_rbm = rbm.fit_transform(X_train_selected)
# X_test_rbm = rbm.transform(X_test_selected)
# print X_train_rbm.shape

[BernoulliRBM] Iteration 1, pseudo-likelihood = -452.91, time = 8.04s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -35.33, time = 8.45s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -19.68, time = 8.61s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -16.75, time = 8.38s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -14.76, time = 8.89s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -15.02, time = 9.84s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -14.44, time = 7.73s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -13.56, time = 7.60s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -12.23, time = 7.61s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -12.87, time = 6.79s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -12.21, time = 8.57s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -12.50, time = 9.43s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -12.68, time = 7.47s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -12.86, time = 10.05s
[BernoulliRBM] Iteration 15

In [92]:
# print rbm.components_.shape
# print rbm.components_ # weight of each edge, components_[i,j] = weight of edge from hidden node i to visible node j

(50L, 20000L)
[[-0.03572855 -0.05813121 -0.0562077  ..., -0.13155343 -0.07418611
  -0.14079532]
 [-0.03441731 -0.02968999 -0.05920262 ..., -0.09267987 -0.06091394
  -0.10048683]
 [-0.04704162 -0.04297848 -0.06149489 ..., -0.1244551  -0.11109585
  -0.12007697]
 ..., 
 [-0.01286344 -0.03939054 -0.05986788 ..., -0.10667528 -0.06860802
  -0.1033964 ]
 [-0.04138484 -0.05222536 -0.05048636 ..., -0.12131422 -0.07424998
  -0.1004438 ]
 [-0.04486198 -0.02612365 -0.04348389 ..., -0.10224552 -0.0587047
  -0.10635221]]


Truncated SVD (Singular Value Decomposition) is known as Latent Semantic Analysis (LSA) in the text-analysis context

In [345]:
# %%time
# svd = TruncatedSVD(n_components=200) # works on sparse data
# X_train_reduced = svd.fit_transform(X_train_selected)
# X_test_reduced = svd.transform(X_test_selected)
# print 'Train Shape:', X_train_reduced.shape
# print 'Explained Variance Ratio Sum:', svd.explained_variance_ratio_.sum()
# print 'Top 5 Explained Variance Ratio:', svd.explained_variance_ratio_[:5]

Train Shape: (620L, 200L)
Explained Variance Ratio Sum: 0.680931519226
Top 5 Explained Variance Ratio: [ 0.08757407  0.0295904   0.01745794  0.01065465  0.01066833]
Wall time: 1.93 s


## Training models

In [62]:
# Feature matrices used by the training and scoring cells below; currently the
# RFE-selected features.
X_train_final, X_test_final = X_train_selected, X_test_selected

In [63]:
%%time
models = [LogisticRegression(), MultinomialNB(), DecisionTreeClassifier(max_depth=15),
          SGDClassifier(n_iter=20), KNeighborsClassifier(n_neighbors=2), Perceptron(), RandomForestClassifier()]
for clf in models:
    print 'Training', type(clf).__name__
    %time clf.fit(X_train_final, y_train)

Training LogisticRegression
Wall time: 10.6 s
Training MultinomialNB
Wall time: 183 ms
Training DecisionTreeClassifier
Wall time: 7.38 s
Training SGDClassifier
Wall time: 2.53 s
Training KNeighborsClassifier
Wall time: 23 ms
Training Perceptron
Wall time: 647 ms
Training RandomForestClassifier
Wall time: 1.26 s
Wall time: 22.7 s


## Models Scoring
Evaluate on both train and test set

In [64]:
# For every fitted model, report accuracy and the weighted precision/recall/F-score
# on the training set and on the held-out test set.
for clf in models:
    print type(clf).__name__
    for X,y,t in [(X_train_final, y_train, 'Train'), (X_test_final, y_test, 'Test')]:
        pred = clf.predict(X) # change to reduced or selected for 2 ways of reducing dimensions
        print t, 'dataset'
        print 'Accuracy Score:', accuracy_score(y, pred)
        # With average='weighted' this prints a (precision, recall, f-score, support) tuple.
        print 'Precision Recall F-Score:\n', precision_recall_fscore_support(y, pred, average='weighted')
    print

LogisticRegression
Train dataset
Accuracy Score: 0.806420233463
Precision Recall F-Score:


  'precision', 'predicted', average, warn_for)


(0.79599966733855565, 0.80642023346303504, 0.77638237932273713, None)
Test dataset
Accuracy Score: 0.692007797271
Precision Recall F-Score:
(0.63837501098366189, 0.69200779727095518, 0.65113499451924461, None)

MultinomialNB
Train dataset
Accuracy Score: 0.515564202335
Precision Recall F-Score:
(0.4837880182046515, 0.51556420233463029, 0.41678322023504949, None)
Test dataset
Accuracy Score: 0.473684210526
Precision Recall F-Score:
(0.32840103507623586, 0.47368421052631576, 0.35446726461192962, None)

DecisionTreeClassifier
Train dataset
Accuracy Score: 0.923638132296
Precision Recall F-Score:
(0.93917095077909207, 0.92363813229571989, 0.92529578773175225, None)
Test dataset
Accuracy Score: 0.686159844055
Precision Recall F-Score:
(0.69076399921037401, 0.68615984405458086, 0.68408806655763432, None)

SGDClassifier
Train dataset
Accuracy Score: 0.992217898833
Precision Recall F-Score:
(0.9923317755925587, 0.99221789883268485, 0.99209588842239582, None)
Test dataset
Accuracy Score: 0.7914

## Train a Model with Cross-Validation Set
**Support Vector Machine implemented using Stochastic Gradient Descent**

Tune the model's hyper-parameters to give high K-Fold CV score

In [65]:
%%time
# Grid search over the SGD regularisation strength (alpha) and the number of passes
# (n_iter): 4 x 2 combinations, each scored by weighted F1 with 3-fold cross-validation
# on the training set.
params = {'alpha':[1e-3, 1e-4, 1e-5, 1e-6], 'n_iter':[20, 100]}
gs = GridSearchCV(SGDClassifier(random_state=42), params, scoring='f1_weighted', cv=3)
gs.fit(X_train_final, y_train)

  'precision', 'predicted', average, warn_for)


Wall time: 2min 20s


In [66]:
# Winner of the grid search: the estimator, its parameters and its cross-validation score.
print gs.best_estimator_
print gs.best_params_
print gs.best_score_

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=100, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)
{'alpha': 0.0001, 'n_iter': 100}
0.745437468079


In [67]:
# Score the best estimator found by the grid search on the training set and on the
# held-out test set, using the same metrics as for the candidate models above.
clf = gs.best_estimator_
print type(clf).__name__
for X,y,t in [(X_train_final, y_train, 'Train'), (X_test_final, y_test, 'Test')]:
    pred = clf.predict(X) # change to reduced or selected to change ways of reducing dimensions
    print '=>', t, 'dataset'
    print 'Accuracy Score:', accuracy_score(y, pred)
    print 'Precision Recall F-Score:\n', precision_recall_fscore_support(y, pred, average='weighted')
    print
# Reference point: expected accuracy of guessing uniformly among the n_labels classes.
print 'Baseline score by chance:', 1.0 / n_labels, '(assume that an algorithm randomly guesses the label)'

SGDClassifier
=> Train dataset
Accuracy Score: 0.992704280156
Precision Recall F-Score:
(0.99280545285178523, 0.99270428015564205, 0.99258131078977474, None)

=> Test dataset
Accuracy Score: 0.791423001949
Precision Recall F-Score:
(0.77300020530350033, 0.79142300194931769, 0.77773579705118112, None)

Baseline score by chance: 0.05 (assume that an algorithm randomly guesses the label)


## Model Evaluation Metrics
Visualize confusion matrix and show classification report

In [68]:
# Ground truth and predictions of the tuned classifier on the test set, shared by the
# confusion-matrix and classification-report cells below.
y_true, y_pred = y_test, clf.predict(X_test_final)

### Confusion Matrix
Visualize true positives and false positives

In [69]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a heat map on the current matplotlib figure.

    :param cm: square 2-D array; rows are true labels, columns are predicted labels
    :param title: title shown above the plot
    :param cmap: matplotlib colormap used for the cells
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # One tick per class actually present in the matrix. Using the matrix size instead of
    # the notebook-level n_labels keeps the ticks correct when `cm` covers fewer classes.
    tick_marks = np.arange(len(cm))
    plt.xticks(tick_marks, rotation=0)
    plt.yticks(tick_marks)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run tight_layout last so the axis labels are taken into account and not clipped.
    plt.tight_layout()

In [70]:
cm = confusion_matrix(y_true, y_pred)
print 'Confusion matrix, without normalization'
print cm
plt.figure()
plot_confusion_matrix(cm)

# Normalize the confusion matrix by row (i.e by the number of samples in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print 'Normalized confusion matrix (Had to scale by 99 not 100 because the matrix will be too big and wrap lines)'
print (cm_normalized * 99).astype('int')
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')

plt.show()

Confusion matrix, without normalization
[[16  0  0  0  0  0  1  1  0  0  0  0  0  0  0  0  0  2  0  0]
 [ 0 10  0  0  0  1  0  1  0  1  0  0  0  0  0  1  1  0  0  0]
 [ 1  0 21  0  1  0  0  0  0  1  0  0  1  0  0  0  0  0  1  0]
 [ 0  0  1 19  0  0  0  2  0  0  0  0  0  0  1  0  0  1  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  1  0 31  0  7  0  0  0  0  0  0  2  0  5  0  0  3]
 [ 0  0  0  0  0  0  6  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  0 91  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  2  0  0  0  2  0  0  0  0  0  0  0]
 [ 1  0  0  1  0  0  0  0  0 61  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  2  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  2  1  0  0  0  1  0  3  0  0 15  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  2]
 [ 0  1  0  2  0  0  0  3  0  0  0  0  0  0  4  0  0  2  0  0]
 [ 0  0  0  0  

<IPython.core.display.Javascript object>

Normalized confusion matrix (Had to scale by 99 not 100 because the matrix will be too big and wrap lines)
[[79  0  0  0  0  0  4  4  0  0  0  0  0  0  0  0  0  9  0  0]
 [ 0 66  0  0  0  6  0  6  0  6  0  0  0  0  0  6  6  0  0  0]
 [ 3  0 79  0  3  0  0  0  0  3  0  0  3  0  0  0  0  0  3  0]
 [ 0  0  4 78  0  0  0  8  0  0  0  0  0  0  4  0  0  4  0  0]
 [ 0  0  0  0 99  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  1  0 61  0 13  0  0  0  0  0  0  3  0  9  0  0  5]
 [ 0  0  0  0  0  0 99  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  0 94  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 19  0  0 39  0  0  0 39  0  0  0  0  0  0  0]
 [ 1  0  0  1  0  0  0  0  0 95  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 99  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 66  0  0  0 33  0  0  0  0  0  0  0  0]
 [ 0  0  9  4  0  0  0  4  0 13  0  0 67  0  0  0  0  0  0  0]
 [ 0 24  0  0  0 24  0  0  0  0  0  0  0  0  0  0  0  0  0 49]
 [ 0  8  0 

<IPython.core.display.Javascript object>

### Classification Report
Show scoring like precision, recall, f1 and their average for each label

In [71]:
# Per-label precision, recall, F1 and support on the test set; with target_names=None
# the rows are identified by numeric label id.
print classification_report(y_true, y_pred, target_names=None)

             precision    recall  f1-score   support

          0       0.80      0.80      0.80        20
          1       0.71      0.67      0.69        15
          2       0.88      0.81      0.84        26
          3       0.70      0.79      0.75        24
          4       0.50      1.00      0.67         1
          5       0.69      0.62      0.65        50
          6       0.75      1.00      0.86         6
          7       0.81      0.96      0.88        95
          8       0.67      0.40      0.50         5
          9       0.90      0.97      0.93        63
         10       0.00      0.00      0.00         1
         11       0.50      0.33      0.40         3
         12       0.79      0.68      0.73        22
         13       0.00      0.00      0.00         4
         14       0.57      0.33      0.42        12
         15       0.86      0.76      0.81        41
         16       0.84      0.88      0.86        65
         17       0.80      0.88      0.84   

## Learning Curves
Watch the performance of our chosen model as the training size increases, and check whether it suffers from high variance, high bias, or something in between

In [74]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 10)):
    """Plot training and cross-validation weighted-F1 scores against training set size.

    :param estimator: the estimator passed on to `learning_curve`
    :param title: title of the figure
    :param X: feature matrix
    :param y: target labels
    :param ylim: optional (ymin, ymax) pair for the y axis
    :param cv: cross-validation setting passed on to `learning_curve`; a fold count,
               None, or a cross-validation iterator
    :param n_jobs: number of parallel jobs passed on to `learning_curve`
    :param train_sizes: training set sizes passed on to `learning_curve`
    :return: the `matplotlib.pyplot` module, so the caller can go on customising the figure
    """
    # Build the legend text first: `cv` is only a number when a fold count was given.
    # Formatting None (the default) or an iterator with %d raises TypeError, which used to
    # abort the function only after the expensive learning_curve computation had finished.
    try:
        cv_label = "%d-Fold Cross-validation score" % cv
    except TypeError:
        cv_label = "Cross-validation score"

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("F1-Score Weighted of CVs")
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, scoring='f1_weighted',
                                                            n_jobs=n_jobs, train_sizes=train_sizes, verbose=1)
    # Mean and spread across the CV folds, for every training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded bands: mean +/- one standard deviation.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label=cv_label)

    plt.legend(loc="best")
    return plt

In [75]:
%%time
# cv = ShuffleSplit(X_train_selected.shape[0], n_iter=5, test_size=0.2, random_state=42)
title = 'Learning Curves (SVM)'
# Learning curve of the tuned classifier with 5-fold CV over ALL documents: the train and
# test feature matrices (and their labels) are stacked back together here.
# NOTE(review): this puts the held-out test documents into the cross-validation, and the
# features were selected by RFE fitted on the training part -- confirm this is intended.
# NOTE(review): X and y rebind the names used as loop variables in the scoring cells.
X = vstack((X_train_final, X_test_final))
y = np.concatenate((y_train, y_test))
plot_learning_curve(clf, title, X, y, ylim=(-0.05, 1.05), cv=5, n_jobs=1)
plt.show()

<IPython.core.display.Javascript object>

[learning_curve] Training set sizes: [ 204  409  613  818 1023 1227 1432 1636 1841 2046]


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:  6.4min


Wall time: 6min 36s


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  6.6min finished
