In [1]:
from gensim import corpora
from gensim import models
# NOTE(review): sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn import grid_search
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from classification.tools.facade import MessageManager
from classification.tools.loader import BookManager




In [2]:
# Dataset creation: build the two helpers used to assemble the dataset.
# file_manager.load() is called later to get the files for each dataset id.
file_manager = BookManager()
# parse_manager.extract_message() is called later on each file's raw text;
# it is configured here with the 'cabocha' parser.
parse_manager = MessageManager(parser='cabocha')

In [3]:
# Dataset ids handed to file_manager.load(), and the integer class label
# assigned to every document loaded under each id.
raw_data_list = ['101', '102', '104']
target_mapper = {'101': 1, '102': 2, '103': 3, '104': 4}

# documents[i] is the `.bags` attribute of the parsed message for file i
# (presumably its token list -- confirm against MessageManager);
# labels[i] is the class label of that same file.
documents, labels = [], []
for dataset_id in raw_data_list:
    class_label = target_mapper[dataset_id]
    for path in file_manager.load(dataset_id):
        with open(path, 'rt') as handle:
            text = handle.read()
        documents.append(parse_manager.extract_message(text).bags)
        labels.append(class_label)


In [4]:
# Create the dictionary:
# assign an id to every word that appears anywhere in the documents.
dic = corpora.Dictionary(documents)
# Vocabulary pruning, currently disabled:
#dic.filter_extremes(no_below=20, no_above=0.3)


In [5]:
# Build the corpus:
# for each document, count how many times every dictionary word occurs,
# giving one bag-of-words entry per document.
bow_corpus = list(map(dic.doc2bow, documents))


In [6]:
# TF-IDF:
# fit a TF-IDF model on the bag-of-words corpus, then apply it to that same
# corpus to get the TF-IDF weight of each word in each document.
tfidf_model = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf_model[bow_corpus]


In [7]:
# Dimensionality reduction:
# compress the dictionary-based corpus (roughly 6000 dimensions) down to 200 dimensions.
lsi_model = models.LsiModel(tfidf_corpus, id2word=dic, num_topics=200)
# NOTE(review): `lsi_curpus` looks like a misspelling of `lsi_corpus`; the name is
# kept because the dataset-building cell reads it under this spelling.
lsi_curpus = lsi_model[tfidf_corpus]

In [8]:
# Dataset creation:
# turn every LSI-transformed document into a fixed-length dense feature vector
# and pair it with its class label.
l_novel_dict = {}
l_novel_dict['data'] = []
l_novel_dict['target'] = []
l_novel_dict['target_name'] = []  # created but never filled in this cell

# gensim yields each transformed document as a sparse list of
# (topic_id, weight) pairs and leaves out topics whose weight is (near) zero.
# Keeping only the weights would give rows of different lengths whose columns
# no longer line up (an empty document would become []), so every weight is
# written to the slot given by its topic id and missing topics stay 0.0.
n_features = lsi_model.num_topics
for doc, label in zip(lsi_curpus, labels):
    vecs = [0.0] * n_features
    for topic_id, weight in doc:
        vecs[topic_id] = weight
    l_novel_dict['data'].append(vecs)
    l_novel_dict['target'].append(label)


In [9]:
# Create the training data:
# split features and labels into train and test sets, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(l_novel_dict['data'], l_novel_dict['target'], random_state=50)


In [10]:
# Grid-search an RBF-kernel SVC over C and gamma with cross-validation on the
# training set.
# GridSearchCV is taken from sklearn.model_selection (the module this notebook
# already uses for train_test_split) instead of the old sklearn.grid_search
# module, which was removed in scikit-learn 0.20 together with `grid_scores_`.
svc = SVC()
cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
parameters = {'kernel': ['rbf'], 'C': cs, 'gamma': gammas}
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)

# cv_results_ replaces grid_scores_; print one "mean / std / params" line per
# parameter combination, mirroring what grid_scores_ used to show.
cv_results = clf.cv_results_
for mean, std, params in zip(cv_results['mean_test_score'],
                             cv_results['std_test_score'],
                             cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %r' % (mean, std, params))

[mean: 0.36538, std: 0.00000, params: {'C': 0.001, 'gamma': 0.001, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00000, params: {'C': 0.001, 'gamma': 0.01, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00000, params: {'C': 0.001, 'gamma': 0.1, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00000, params: {'C': 0.001, 'gamma': 1, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00000, params: {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00000, params: {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00000, params: {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00000, params: {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00000, params: {'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00000, params: {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00000, params: {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00000, params: {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}, mean: 0.36538, std: 0.00

In [11]:
# Evaluate the fitted grid-search classifier on the held-out test split and print the score.
score = clf.score(X_test, y_test)
print(score)

0.712328767123
