In [133]:
import numpy as np
import os
from os.path import join, isdir
np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt
%matplotlib notebook

# Load and proprocess documents
**Load document labels**

In [253]:
segmented_path = u'./corpus/segmented-docs' # it will listdir into unicode
doc_labels = [fn for fn in os.listdir(segmented_path) if isdir(join(segmented_path, fn))] # list only folders
doc_labels_idx = {}
n_labels = len(doc_labels)
for i, label in enumerate(doc_labels):
    print i, label
    doc_labels_idx[label] = i
print 'Total Labels:', n_labels

0 บริหารธุรกิจ
1 ประมง
2 มนุษยศาสตร์
3 วนศาสตร์
4 วิทยาศาสตร์
5 วิทยาศาสตร์การกีฬา
6 วิศวกรรมศาสตร์
7 วิศวกรรมศาสตร์ กำแพงแสน
8 ศิลปศาสตร์และวิทยาศาสตร์
9 ศึกษาศาสตร์
10 สถาปัตยกรรมศาสตร์
11 สังคมศาสตร์
12 สัตวแพทยศาสตร์
13 สิ่งแวดล้อม
14 อุตสาหกรรมเกษตร
15 เกษตร
16 เกษตร กำแพงแสน
17 เศรษฐศาสตร์
18 เศรษฐศาสตร์ ศรีราชา
19 โครงการสหวิทยาการระดับบัณฑิตศึกษา
Total Labels: 20


** Load dataset **

In [135]:
%%time
dataset_contents, dataset_labels = [], []
for i, label in enumerate(doc_labels):
    curr_dir = join(segmented_path, label)
    fns = os.listdir(curr_dir)
    for fn in fns:
        file_path = join(curr_dir, fn)
        with open(file_path, 'r') as f:
            content = unicode(f.read(), 'utf8')
            dataset_contents.append(content)
            dataset_labels.append(i)
N = len(dataset_contents)
print 'Total Segmented Documents:', N

Total Segmented Documents: 775
Wall time: 12.6 s


** Define a function that trims words and replace all PIPELINE by space **

In [136]:
def pretty_trim(text):
    words = text.split('|')
    stripped_words_generator = (word.strip() for word in words) # generates a stripped word when its next() method is called
    trimmed_words = [word for word in stripped_words_generator if word] # retains words that are not empty
    return ' '.join(trimmed_words)

**Show sample content**

In [137]:
print 'Content:', dataset_contents[1][:2**8], '...'
print 'Label:', dataset_labels[1]

Content: I|50731470|I|0|b| |บท|ที่| |1|I|1|b| |ความ|สาคัญ|ของ|ปัญหา|P|จาก|วิกฤต|เศรษฐกิจ|ใน|ปี| |2550| |ประเทศสหรัฐอเมริกา|ได้|ประสบปัญหา|วิกฤต|เศรษฐกิจ|ที่|มี|P|จุด|กา|เนิด|จาก|หนี้เสีย|ของ|สินเชื่อ|บ้าน|ที่|ปล่อย|กู้|แก่|ผู้|มี|เครดิต|ต่ำกว่า|มาตรฐาน|และ|เกิด|ปัญ ...
Label: 0


** Show sample content after pretty_trimmed() **

In [138]:
print 'Content:', pretty_trim(dataset_contents[1][:2**8]), '...'
print 'Label Str:', doc_labels[dataset_labels[1]]

Content: I 50731470 I 0 b บท ที่ 1 I 1 b ความ สาคัญ ของ ปัญหา P จาก วิกฤต เศรษฐกิจ ใน ปี 2550 ประเทศสหรัฐอเมริกา ได้ ประสบปัญหา วิกฤต เศรษฐกิจ ที่ มี P จุด กา เนิด จาก หนี้เสีย ของ สินเชื่อ บ้าน ที่ ปล่อย กู้ แก่ ผู้ มี เครดิต ต่ำกว่า มาตรฐาน และ เกิด ปัญ ...
Label Str: บริหารธุรกิจ


** Trim all documents **

In [139]:
%%time
dataset_contents_trimmed = [pretty_trim(content) for content in dataset_contents]

Wall time: 12.2 s


** Count number of words for each document **

In [142]:
%time dataset_words_count = np.array([len(content.split()) for content in dataset_contents_trimmed])
print 'Words Count Mean: ', np.mean(dataset_words_count)
dataset_words_count[:min(40,N)]

Wall time: 3.34 s
Words Count Mean:  40804.4154839


array([ 41235,  24706,  41920,  39107,  43159,  53636,  32753,  46280,
        36777,  48854,  33029,  31878,  51160,  63060,  31636,  63538,
        16844,  17461,  33434,  30484,  19858,  92392,  51363,  55388,
        26136,  46479, 196975,  36790,  58717,  67169,  49065,  20123,
        20198,  39262,  42075,  31650,  24313,  17898,  26907,  22453])

** Show words count histogram **

In [147]:
plt.figure()
plt.hist(dataset_words_count, bins=200)
plt.xlabel('Words Count')
plt.ylabel('Document Frequency')
plt.show()

<IPython.core.display.Javascript object>

# Machine Learning section

In [357]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import RFE
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from collections import Counter

## Train/Test Split
Split dataset into 2 parts and leave the test part untouched (not fitting it with any model)

Split using stratified sampling might be useful if you want to test all label including the skewed low frequency label

In [353]:
X_train, X_test, y_train, y_test = train_test_split(dataset_contents_trimmed, dataset_labels,
                                                    test_size=0.2, stratify=None, random_state=42)
print 'Train Size:', len(X_train)
print 'Test Size:', len(X_test)
train_counter, test_counter = Counter(y_train), Counter(y_test)
print 'Un-trained label:', list(set(xrange(n_labels)) - set(train_counter))
print 'Un-tested label:', list(set(xrange(n_labels)) - set(test_counter))

Train Size: 620
Test Size: 155
Un-trained label: []
Un-tested label: [8, 10]


### Plot bar chart of dataset frequency per label

In [343]:
train_label_freqs = np.zeros(n_labels, np.int32)
test_label_freqs = np.zeros(n_labels, np.int32)
for k,v in train_counter.iteritems():
    train_label_freqs[k] = v
for k,v in test_counter.iteritems():
    test_label_freqs[k] = v
plt.figure()
plt.bar(np.arange(n_labels)-0.5, train_label_freqs, 1)
plt.bar(np.arange(n_labels)-0.5, test_label_freqs, 1, color='r')
plt.xticks(np.arange(len(doc_labels)))
plt.xlabel('Label')
plt.ylabel('Frequency')
plt.legend(['Train', 'Test'])
plt.grid()
plt.show()

<IPython.core.display.Javascript object>

## Feature Extraction
### Bag of Words Representation
Initialize a vectorizer that counts word instances and apply Tfidf (Term-Frequency * Inverse-Document-Frequency) to them

In [327]:
%%time
tfidf = TfidfVectorizer(encoding=u'utf-8', stop_words='english', binary=False, max_features=None)
X_train_vectorized = tfidf.fit_transform(X_train)
X_test_vectorized = tfidf.transform(X_test)

Wall time: 37.1 s



** Save extracted feature names to disk **

In [328]:
%%time
feature_names = tfidf.get_feature_names()
fn = 'feature_names.txt'
with open(fn, 'w') as f:
    f.write(u'\n'.join(feature_names).encode('utf8'))
print 'Check file %s to see all extracted feature names' % fn
print 'Total names:', len(feature_names)

Check file feature_names.txt to see all extracted feature names
Total names: 461056
Wall time: 1.12 s


**Vectorized Dataset Statistics**

In [333]:
print 'Train Shape:', X_train_vectorized.shape
print 'Sample content of type %s:' % type(X_train_vectorized)
print X_train_vectorized

Train Shape: (620, 461056)
Sample content of type <class 'scipy.sparse.csr.csr_matrix'>:
  (0, 2448)	0.00308894193547
  (0, 331376)	0.00133380269638
  (0, 263626)	0.00086825267355
  (0, 451702)	0.000559611795728
  (0, 444120)	0.000616376247728
  (0, 453281)	0.000590057737957
  (0, 374817)	0.00095327929427
  (0, 447034)	0.000787387435842
  (0, 441803)	0.00109172416992
  (0, 449227)	0.00108588430097
  (0, 458255)	0.00066699712212
  (0, 444566)	0.00123483861018
  (0, 446760)	0.00137200370889
  (0, 449226)	0.000744107816624
  (0, 451486)	0.00111585729516
  (0, 449124)	0.00141539435965
  (0, 443794)	0.00059104253643
  (0, 457787)	0.000843657008147
  (0, 133680)	0.000927922222027
  (0, 447664)	0.000873699353619
  (0, 447713)	0.000671674953659
  (0, 448836)	0.00113220243141
  (0, 146565)	0.00113482925134
  (0, 239000)	0.00223086380585
  (0, 242649)	0.00277118696991
  :	:
  (619, 444565)	0.00265042639402
  (619, 449954)	0.00192435323585
  (619, 308254)	0.00229496079736
  (619, 453935)	0.000387

## Feature Selection

Recursive feature elimination using weights of Linear Kernel Support Vector Machine

In [335]:
%%time
rfe = RFE(SGDClassifier(loss='hinge'), n_features_to_select=5000, step=0.10)
X_train_selected = rfe.fit_transform(X_train_vectorized, y_train)
X_test_selected = rfe.transform(X_test_vectorized)
print X_train_selected.shape, X_test_selected.shape

(620, 5000) (155, 5000)
Wall time: 6.93 s


** Save top feature names to file **

In [336]:
%%time
top_features = [feature for feature, support in zip(feature_names, rfe.support_) if support]
file_name = 'feature_names_top.txt'
with open(file_name, 'w') as f:
    f.write(u'\n'.join(top_features).encode('utf8'))
print 'Go check file %s' % file_name

Go check file feature_names_top.txt
Wall time: 245 ms


## Dimensionality Reduction

Truncated SVD (Single Value Decomposition) is called Latent Semantic Analysis (LSA) in text analysis context

In [345]:
# %%time
# svd = TruncatedSVD(n_components=200) # works on sparse data
# X_train_reduced = svd.fit_transform(X_train_selected)
# X_test_reduced = svd.transform(X_test_selected)
# print 'Train Shape:', X_train_reduced.shape
# print 'Explained Variance Ratio Sum:', svd.explained_variance_ratio_.sum()
# print 'Top 5 Explained Variance Ratio:', svd.explained_variance_ratio_[:5]

Train Shape: (620L, 200L)
Explained Variance Ratio Sum: 0.680931519226
Top 5 Explained Variance Ratio: [ 0.08757407  0.0295904   0.01745794  0.01065465  0.01066833]
Wall time: 1.93 s


## Training models

In [349]:
%%time
models = [LogisticRegression(), LinearSVC(), DecisionTreeClassifier(max_depth=15),
          SGDClassifier(), KNeighborsClassifier(n_neighbors=2), Perceptron(), RandomForestClassifier()]
for clf in models:
    print 'Training', type(clf).__name__
    %time clf.fit(X_train_selected, y_train)

Training LogisticRegression
Wall time: 1.72 s
Training LinearSVC
Wall time: 1.01 s
Training DecisionTreeClassifier
Wall time: 1.36 s
Training SGDClassifier
Wall time: 139 ms
Training KNeighborsClassifier
Wall time: 4 ms
Training Perceptron
Wall time: 206 ms
Training RandomForestClassifier
Wall time: 294 ms
Wall time: 4.75 s


## Models Evaluation Metrics
Evaluate on both train and test set

In [356]:
for clf in models:
    print type(clf).__name__
    for X,y,t in [(X_train_selected, y_train, 'Train'), (X_test_selected, y_test, 'Test')]:
        pred = clf.predict(X) # change to reduced or selected for 2 ways of reducing dimensions
        print t, 'dataset'
        print 'Accuracy Score:', accuracy_score(y, pred)
        print 'Precision Recall F-Score:\n', precision_recall_fscore_support(y, pred, average='weighted')
    print

LogisticRegression
Train dataset
Accuracy Score: 0.729032258065
Precision Recall F-Score:
(0.72927521958448949, 0.7290322580645161, 0.68117869104177753, None)
Test dataset
Accuracy Score: 0.483870967742
Precision Recall F-Score:
(0.51661642173976141, 0.4838709677419355, 0.42530237041052987, None)

LinearSVC
Train dataset
Accuracy Score: 0.990322580645
Precision Recall F-Score:
(0.99072590720768661, 0.99032258064516132, 0.9901328314025637, None)
Test dataset
Accuracy Score: 0.670967741935
Precision Recall F-Score:
(0.68319869883437245, 0.67096774193548392, 0.64588142346727151, None)

DecisionTreeClassifier
Train dataset
Accuracy Score: 0.938709677419
Precision Recall F-Score:
(0.93773557895186666, 0.93870967741935485, 0.93498074247813789, None)
Test dataset
Accuracy Score: 0.541935483871
Precision Recall F-Score:
(0.58574968937872174, 0.54193548387096779, 0.55117902309193634, None)

SGDClassifier
Train dataset
Accuracy Score: 1.0
Precision Recall F-Score:
(1.0, 1.0, 1.0, None)
Test data