In [1]:
import numpy as np
import os
from os.path import join, isdir, getsize
from nltk.stem.snowball import SnowballStemmer
import json
np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt
%matplotlib notebook

# Load and proprocess documents
**Load document labels**

In [2]:
segmented_path = u'./corpus/segmented-docs' # it will listdir into unicode
doc_labels = [fn for fn in os.listdir(segmented_path) if isdir(join(segmented_path, fn))] # list only folders
doc_labels_idx = {}
n_labels = len(doc_labels)
for i, label in enumerate(doc_labels):
    print i, label
    doc_labels_idx[label] = i
print 'Total Labels:', n_labels

0 ศึกษาศาสตร์และพัฒนศาสตร์
1 สิ่งแวดล้อม
2 วิทยาการจัดการ
3 โครงการสหวิทยาการระดับบัณฑิตศึกษา
4 ศึกษาศาสตร์
5 วิศวกรรมศาสตร์
6 โครงการจัดตั้งวิทยาเขตสุพรรณบุรี
7 วิทยาศาสตร์
8 วิทยาศาสตร์การกีฬา
9 เกษตร
10 อุตสาหกรรมเกษตร
11 วนศาสตร์
12 สังคมศาสตร์
13 สถาปัตยกรรมศาสตร์
14 ประมง
15 เศรษฐศาสตร์
16 สัตวแพทยศาสตร์
17 มนุษยศาสตร์
18 บริหารธุรกิจ
19 ศิลปศาสตร์และวิทยาศาสตร์
Total Labels: 20


** Load dataset **

In [3]:
%%time
dataset_contents, dataset_labels = [], []
for i, label in enumerate(doc_labels):
    curr_dir = join(segmented_path, label)
    fns = os.listdir(curr_dir)
    for fn in fns:
        file_path = join(curr_dir, fn)
        with open(file_path, 'r') as f:
            content = unicode(f.read(), 'utf8')
            dataset_contents.append(content)
            dataset_labels.append(i)
N = len(dataset_contents)
print 'Total Segmented Documents:', N

Total Segmented Documents: 2569
CPU times: user 6.23 s, sys: 1.69 s, total: 7.92 s
Wall time: 7.96 s


**Test English word stemmer from Natural Language Toolkit**

In [5]:
stemmer = SnowballStemmer('english')
test_words = u'reply represent representation representative expression cats feeling นำเสนอนะ'.split()
for word in test_words:
    print word, stemmer.stem(word)

reply repli
represent repres
representation represent
representative repres
expression express
cats cat
feeling feel
นำเสนอนะ นำเสนอนะ


** Define a function that trims and stems words then replace all PIPELINE by space **

In [6]:
def pretty_trim(text):
    words = text.split('|')
    stripped_words_generator = (word.strip() for word in words)
    stemmed_words_generator = (stemmer.stem(word) for word in stripped_words_generator)
    trimmed_words = (word for word in stemmed_words_generator if word) # retains words that are not empty
    return ' '.join(trimmed_words)

**Show sample content**

In [7]:
print 'Content:', dataset_contents[1][:2**8], '...'
print 'Label:', dataset_labels[1]

Content: I|51860237|I|ปรัชญา|ดุษฎีบัณฑิต| | |การพัฒนา|ทรัพยากร|มนุษย์|และ|ชุมชน| |I|ปริญญา|I|สาขา|I|ภาควิชา|I|เรื่อง|I|กระบวนการ|จัดกิจกรรม|การเรียนรู้|นอกหลักสูตร|เพื่อ|ป้องกัน|เอดส์|I|ที่|มี|ประสิทธิผล|สาห|รับ|เยาวชน|ใน|โรงเรียนมัธยม|ศึกษา|Pan| |effective| |proce ...
Label: 0


** Show sample content after pretty_trimmed() **

In [8]:
print 'Content:', pretty_trim(dataset_contents[1][:2**8]), '...'
print 'Label Str:', doc_labels[dataset_labels[1]]

Content: i 51860237 i ปรัชญา ดุษฎีบัณฑิต การพัฒนา ทรัพยากร มนุษย์ และ ชุมชน i ปริญญา i สาขา i ภาควิชา i เรื่อง i กระบวนการ จัดกิจกรรม การเรียนรู้ นอกหลักสูตร เพื่อ ป้องกัน เอดส์ i ที่ มี ประสิทธิผล สาห รับ เยาวชน ใน โรงเรียนมัธยม ศึกษา pan effect proce ...
Label Str: ศึกษาศาสตร์และพัฒนศาสตร์


## Trim and stem all documents

In [9]:
%%time
dataset_contents_trimmed = map(pretty_trim, dataset_contents)

CPU times: user 1h 6min, sys: 46.7 s, total: 1h 6min 47s
Wall time: 1h 6min 19s


In [10]:
%%time
# dumb into a big file for later use because this list is very costful to compute
fp = u'./corpus/dataset_contents_trimmed.json'
with open(fp, 'w') as f:
    json.dump(dataset_contents_trimmed, f, ensure_ascii=True)
print 'Size in GB:', getsize(fp) / 1024.0 / 1024.0 / 1024.0

Size in GB: 1.80476844683
CPU times: user 5.61 s, sys: 2.7 s, total: 8.3 s
Wall time: 22.3 s


In [11]:
del dataset_contents

** Count number of words for each document **

In [12]:
%time dataset_words_count = np.array([len(content.split()) for content in dataset_contents_trimmed])
print 'Words Count Mean: ', np.mean(dataset_words_count)
dataset_words_count[:min(40,N)]

CPU times: user 16.3 s, sys: 248 ms, total: 16.5 s
Wall time: 16.6 s
Words Count Mean:  37835.5130401


array([100447,  81877,  60847,  41019,  44385,  44653,  52990,  34965,
        29143,  23525,  21891,  19635,  28334,  28231,  40053,  34664,
        15793,  23165,  19661,  49713,  21291,  21695,  31365,  42272,
        33852,  36951,  18941,  73725,  24820,  26887,  21518,  16904,
        19162,  28505,  41201,  32327,  21926,  20981,  21084,  41627])

** Show words count histogram **

In [26]:
plt.figure()
plt.hist(dataset_words_count, bins=200)
plt.xlabel('Words Count')
plt.ylabel('Document Frequency')
plt.show()

<IPython.core.display.Javascript object>

# Machine Learning section

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import RFE
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, classification_report
from sklearn.cross_validation import train_test_split, ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.learning_curve import learning_curve
from collections import Counter

## Train/Test Split
Split dataset into 2 parts and leave the test part untouched (not fitting it with any model)

Split using stratified sampling might be useful if you want to test all label including the skewed low frequency label

In [15]:
X_train, X_test, y_train, y_test = train_test_split(dataset_contents_trimmed, dataset_labels,
                                                    test_size=0.2, stratify=dataset_labels, random_state=42)
print 'Train Size:', len(X_train)
print 'Test Size:', len(X_test)
train_counter, test_counter = Counter(y_train), Counter(y_test)
print 'Un-trained label:', list(set(xrange(n_labels)) - set(train_counter))
print 'Un-tested label:', list(set(xrange(n_labels)) - set(test_counter))

Train Size: 2056
Test Size: 513
Un-trained label: []
Un-tested label: []


### Plot bar chart of dataset frequency per label

In [27]:
train_label_freqs = np.zeros(n_labels, np.int32)
test_label_freqs = np.zeros(n_labels, np.int32)
dataset_label_freqs = np.zeros(n_labels, np.int32)
for k,v in train_counter.iteritems():
    train_label_freqs[k] = v
for k,v in test_counter.iteritems():
    test_label_freqs[k] = v
for k,v in Counter(dataset_labels).iteritems():
    dataset_label_freqs[k] = v
plt.figure()
plt.bar(np.arange(n_labels)-0.5, dataset_label_freqs, 1, color='b')
plt.bar(np.arange(n_labels)-0.5, train_label_freqs, 1, color='g')
plt.bar(np.arange(n_labels)-0.5, test_label_freqs, 1, color='r')
plt.xticks(np.arange(len(doc_labels)))
plt.xlabel('Label')
plt.ylabel('Frequency')
plt.legend(['Before Split','Train', 'Test'], loc='best')
plt.grid()
plt.show()

<IPython.core.display.Javascript object>

## Feature Extraction
### Bag of Words Representation
Initialize a vectorizer that counts word instances and apply Tfidf (Term-Frequency * Inverse-Document-Frequency) to them

In [17]:
%%time
tfidf = TfidfVectorizer(encoding=u'utf-8', stop_words='english', binary=False, max_features=None)
X_train_vectorized = tfidf.fit_transform(X_train)
X_test_vectorized = tfidf.transform(X_test)

CPU times: user 3min 28s, sys: 3.9 s, total: 3min 32s
Wall time: 3min 30s


** Save extracted feature names to disk **

In [18]:
%%time
feature_names = tfidf.get_feature_names()
fn = 'feature_names.txt'
with open(fn, 'w') as f:
    f.write(u'\n'.join(feature_names).encode('utf8'))
print 'Check file %s to see all extracted feature names' % fn
print 'Total names:', len(feature_names)

Check file feature_names.txt to see all extracted feature names
Total names: 1000266
CPU times: user 3.55 s, sys: 192 ms, total: 3.74 s
Wall time: 3.64 s


**Vectorized Dataset Statistics**

In [19]:
print 'Train Shape:', X_train_vectorized.shape
print 'Sample content of type %s:' % type(X_train_vectorized)
print X_train_vectorized

Train Shape: (2056, 1000266)
Sample content of type <class 'scipy.sparse.csr.csr_matrix'>:
  (0, 493695)	0.00389808448096
  (0, 885936)	0.00584966115962
  (0, 832179)	0.00616462930738
  (0, 917872)	0.0046530326173
  (0, 829168)	0.00408733182825
  (0, 945691)	0.00499625140684
  (0, 591897)	0.0046530326173
  (0, 945558)	0.00584966115962
  (0, 688694)	0.00625148464417
  (0, 927607)	0.0046530326173
  (0, 832185)	0.00584966115962
  (0, 651373)	0.00247536255636
  (0, 232148)	0.00160422177341
  (0, 569244)	0.00616462930738
  (0, 784244)	0.00616462930738
  (0, 682679)	0.00477277794959
  (0, 954671)	0.00303770780226
  (0, 523591)	0.00616462930738
  (0, 748873)	0.0037460287167
  (0, 547748)	0.00291007603641
  (0, 746901)	0.00450221085971
  (0, 556859)	0.0024296548421
  (0, 533588)	0.00240777150674
  (0, 636191)	0.00411459101229
  (0, 900308)	0.00403552549947
  :	:
  (2055, 644013)	0.000932749892336
  (2055, 849425)	0.00221738161531
  (2055, 802084)	0.000572625379924
  (2055, 590869)	0.0004939576

## Feature Selection

Recursive feature elimination using weights of Linear Kernel Support Vector Machine

In [20]:
%%time
rfe = RFE(SGDClassifier(loss='hinge'), n_features_to_select=40000, step=0.20)
X_train_selected = rfe.fit_transform(X_train_vectorized, y_train)
X_test_selected = rfe.transform(X_test_vectorized)
print X_train_selected.shape, X_test_selected.shape

(2056, 40000) (513, 40000)
CPU times: user 1min 20s, sys: 3.95 s, total: 1min 24s
Wall time: 36.1 s


** Save top feature names to file **

In [21]:
%%time
top_features = [feature for feature, support in zip(feature_names, rfe.support_) if support]
file_name = 'feature_names_top.txt'
with open(file_name, 'w') as f:
    f.write(u'\n'.join(top_features).encode('utf8'))
print 'Go check file %s' % file_name

Go check file feature_names_top.txt
CPU times: user 864 ms, sys: 16 ms, total: 880 ms
Wall time: 918 ms


## Dimensionality Reduction

Truncated SVD (Single Value Decomposition) is called Latent Semantic Analysis (LSA) in text analysis context

In [345]:
# %%time
# svd = TruncatedSVD(n_components=200) # works on sparse data
# X_train_reduced = svd.fit_transform(X_train_selected)
# X_test_reduced = svd.transform(X_test_selected)
# print 'Train Shape:', X_train_reduced.shape
# print 'Explained Variance Ratio Sum:', svd.explained_variance_ratio_.sum()
# print 'Top 5 Explained Variance Ratio:', svd.explained_variance_ratio_[:5]

Train Shape: (620L, 200L)
Explained Variance Ratio Sum: 0.680931519226
Top 5 Explained Variance Ratio: [ 0.08757407  0.0295904   0.01745794  0.01065465  0.01066833]
Wall time: 1.93 s


## Training models

In [22]:
%%time
models = [LogisticRegression(), LinearSVC(), DecisionTreeClassifier(max_depth=15),
          SGDClassifier(), KNeighborsClassifier(n_neighbors=2), Perceptron(), RandomForestClassifier()]
for clf in models:
    print 'Training', type(clf).__name__
    %time clf.fit(X_train_selected, y_train)

Training LogisticRegression
CPU times: user 1min 44s, sys: 5.75 s, total: 1min 50s
Wall time: 41.8 s
Training LinearSVC
CPU times: user 14 s, sys: 68 ms, total: 14 s
Wall time: 13.6 s
Training DecisionTreeClassifier
CPU times: user 14.4 s, sys: 4 ms, total: 14.4 s
Wall time: 14.4 s
Training SGDClassifier
CPU times: user 8.32 s, sys: 468 ms, total: 8.79 s
Wall time: 3.27 s
Training KNeighborsClassifier
CPU times: user 100 ms, sys: 12 ms, total: 112 ms
Wall time: 41.5 ms
Training Perceptron
CPU times: user 8.1 s, sys: 440 ms, total: 8.54 s
Wall time: 3.18 s
Training RandomForestClassifier
CPU times: user 1.92 s, sys: 20 ms, total: 1.94 s
Wall time: 1.86 s
CPU times: user 2min 31s, sys: 6.77 s, total: 2min 38s
Wall time: 1min 18s


## Models Scoring
Evaluate on both train and test set

In [25]:
for clf in models:
    print type(clf).__name__
    for X,y,t in [(X_train_selected, y_train, 'Train'), (X_test_selected, y_test, 'Test')]:
        pred = clf.predict(X) # change to reduced or selected for 2 ways of reducing dimensions
        print t, 'dataset'
        print 'Accuracy Score:', accuracy_score(y, pred)
        print 'Precision Recall F-Score:\n', precision_recall_fscore_support(y, pred, average='weighted')
    print

LogisticRegression
Train dataset
Accuracy Score: 0.795719844358
Precision Recall F-Score:
(0.78746583276847448, 0.7957198443579766, 0.76363064739388919, None)
Test dataset
Accuracy Score: 0.695906432749
Precision Recall F-Score:
(0.66223822722827075, 0.69590643274853803, 0.6551796416116431, None)

LinearSVC
Train dataset
Accuracy Score: 0.989299610895
Precision Recall F-Score:
(0.9895677031951593, 0.98929961089494167, 0.98866489252394996, None)
Test dataset
Accuracy Score: 0.777777777778
Precision Recall F-Score:
(0.7648174662712206, 0.77777777777777779, 0.75688720212788685, None)

DecisionTreeClassifier
Train dataset
Accuracy Score: 0.899805447471
Precision Recall F-Score:
(0.92730849102440549, 0.89980544747081714, 0.9006108159952404, None)
Test dataset
Accuracy Score: 0.678362573099
Precision Recall F-Score:
(0.68179520073204325, 0.67836257309941517, 0.66964337212441483, None)

SGDClassifier
Train dataset
Accuracy Score: 0.98686770428
Precision Recall F-Score:
(0.98702759772922599, 0

## Train a Model with Cross-Validation Set
**Support Vector Machine implemented using Stochastic Gradient Descent**

Tune the model's hyper-parameters to give high K-Fold CV score

In [29]:
%%time
params = {'alpha':[1e-3, 1e-4, 1e-5], 'n_iter':[300]}
gs = GridSearchCV(SGDClassifier(random_state=42), params, scoring='f1_weighted', cv=3, n_jobs=2)
gs.fit(X_train_selected, y_train)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


CPU times: user 2min 12s, sys: 776 ms, total: 2min 13s
Wall time: 9min 56s


In [30]:
print gs.best_estimator_
print gs.best_params_
print gs.best_score_

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=300, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)
{'alpha': 0.0001, 'n_iter': 300}
0.74456811893


From the above result, it looks like increasing the number of iterations improve the performance

In [31]:
clf = gs.best_estimator_
print type(clf).__name__
for X,y,t in [(X_train_selected, y_train, 'Train'), (X_test_selected, y_test, 'Test')]:
    pred = clf.predict(X) # change to reduced or selected to change ways of reducing dimensions
    print '=>', t, 'dataset'
    print 'Accuracy Score:', accuracy_score(y, pred)
    print 'Precision Recall F-Score:\n', precision_recall_fscore_support(y, pred, average='weighted')
    print
print 'Baseline score by chance:', 1.0 / n_labels, '(assume that an algorithm randomly guesses the label)'

SGDClassifier
=> Train dataset
Accuracy Score: 0.997081712062
Precision Recall F-Score:
(0.99712512434204548, 0.99708171206225682, 0.9969525364311449, None)

=> Test dataset
Accuracy Score: 0.777777777778
Precision Recall F-Score:
(0.76181799490489222, 0.77777777777777779, 0.76057450277142535, None)

Baseline score by chance: 0.05 (assume that an algorithm randomly guesses the label)


## Model Evaluation Metrics
Visualize confusion matrix and show classification report

In [32]:
y_true, y_pred = y_test, clf.predict(X_test_selected)

### Confusion Matrix
Visualize true positives and false positives

In [33]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(n_labels)
    plt.xticks(tick_marks, rotation=0)
    plt.yticks(tick_marks)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [34]:
cm = confusion_matrix(y_true, y_pred)
print 'Confusion matrix, without normalization'
print cm
plt.figure()
plot_confusion_matrix(cm)

# Normalize the confusion matrix by row (i.e by the number of samples in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print 'Normalized confusion matrix (Had to scale by 99 not 100 because the matrix will be too big and wrap lines)'
print (cm_normalized * 99).astype('int')
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')

plt.show()

Confusion matrix, without normalization
[[ 0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  5  0  0  0  4  0  0  0  0  0  2  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  0  2  1  1  0  3  0  5  0  2  0  1  0  1  1  0  0  0]
 [ 0  0  0  0 61  1  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 88  0  3  0  0  1  1  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  9  0 28  0  5  2  1  0  0  3  1  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  3  0  0  0  0  0  0  0  0  1  1  0]
 [ 0  1  0  3  1  2  0  3  0 50  2  1  0  0  0  1  0  1  0  0]
 [ 0  0  0  0  0  1  0  1  0  1 36  0  0  0  2  0  0  0  0  0]
 [ 0  0  0  1  0  1  0  0  0  2  0 17  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  2  0  1  0  0  0  0  0 13  0  0  1  0  4  1  0]
 [ 0  0  0  0  0  1  0  0  0  1  0  0  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 14  1  0  0  0  0]
 [ 0  0  0  0  

<IPython.core.display.Javascript object>

Normalized confusion matrix (Had to scale by 99 not 100 because the matrix will be too big and wrap lines)
[[ 0  0  0  0  0  0  0  0  0  0  0  0 99  0  0  0  0  0  0  0]
 [ 0 41  0  0  0 33  0  0  0  0  0 16  0  0  0  8  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 99  0]
 [ 0  0  0 11  5  5  0 17  0 29  0 11  0  5  0  5  5  0  0  0]
 [ 0  0  0  0 95  1  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 91  0  3  0  0  1  1  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  0 99  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0 17  0 55  0  9  3  1  0  0  5  1  0  0  0  0]
 [ 0  0  0  0 16  0  0  0 49  0  0  0  0  0  0  0  0 16 16  0]
 [ 0  1  0  4  1  3  0  4  0 76  3  1  0  0  0  1  0  1  0  0]
 [ 0  0  0  0  0  2  0  2  0  2 86  0  0  0  4  0  0  0  0  0]
 [ 0  0  0  4  0  4  0  0  0  8  0 70  0  0  0 12  0  0  0  0]
 [ 0  0  0  0  9  0  4  0  0  0  0  0 58  0  0  4  0 18  4  0]
 [ 0  0  0  0  0 33  0  0  0 33  0  0  0 33  0  0  0  0  0  0]
 [ 0  0  0 

<IPython.core.display.Javascript object>

### Classification Report
Show scoring like precision, recall, f1 and their average for each label

In [35]:
print classification_report(y_true, y_pred, target_names=None)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       0.83      0.42      0.56        12
          2       0.00      0.00      0.00         1
          3       0.29      0.12      0.17        17
          4       0.90      0.97      0.93        63
          5       0.81      0.93      0.86        95
          6       0.50      1.00      0.67         1
          7       0.70      0.56      0.62        50
          8       0.75      0.50      0.60         6
          9       0.74      0.77      0.75        65
         10       0.80      0.88      0.84        41
         11       0.68      0.71      0.69        24
         12       0.93      0.59      0.72        22
         13       0.50      0.33      0.40         3
         14       0.74      0.93      0.82        15
         15       0.79      0.90      0.84        42
         16       0.00      0.00      0.00         4
         17       0.78      0.96      0.86   

Recall of 9 (ศึกษาศาสตร์) is 98 % ; ค่อนข้างสูงมาก หากเราโยนวิทยานิพนธ์ของศึกษาศาสตร์ให้ระบบไป เราแทบจะมั่นใจได้เลยว่ามันจะบอกคลาสถูก

ส่วนคลาสที่มีจำนวนน้อยๆ f1-weighted จะไม่ค่อยสนใจ เพราะมี weight น้อย ทำให้ทำนายไม่ค่อยถูก

## Learning Curves
Watch the performance of our chosen model as we increase the training size and check if it has variance or bias or somewhere in between

In [36]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 10)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("F1-Score Weighted of CVs")
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, scoring='f1_weighted',
                                                            n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="%d-Fold Cross-validation score" % cv)

    plt.legend(loc="best")
    return plt

In [37]:
%%time
# cv = ShuffleSplit(X_train_selected.shape[0], n_iter=5, test_size=0.2, random_state=42)
title = 'Learning Curves (SVM)'
plot_learning_curve(clf, title, X_train_selected, y_train, ylim=(-0.05, 1.05), cv=5, n_jobs=2)
plt.show()

<IPython.core.display.Javascript object>

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


CPU times: user 8.87 s, sys: 260 ms, total: 9.13 s
Wall time: 27min 42s
