In [72]:
from pathlib import Path
import pandas as pd
import glob
import os
from tqdm import tqdm_notebook as tqdm
import logging
import numpy as np
from gensim.models import Word2Vec
import MeCab
import time
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import sys
import re
import csv
from IPython.core.display import display

# logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# visualization setting
np.set_printoptions(suppress=True)

p = Path('../../')
ro = p / 'data' / 'ro'
rw = p / 'data' / 'rw'
text = ro / 'text'
m = p / 'model'

In [10]:
[f for f in (p / 'model').glob('*')]

[PosixPath('../../model/200features_20minwords_10context_len2alldata'),
 PosixPath('../../model/size200_min20_window10'),
 PosixPath('../../model/gmm_latestclusmodel_len2alldata.pkl'),
 PosixPath('../../model/TEST_SDV_60cluster_200feature_matrix_gmm_sparse.npy'),
 PosixPath('../../model/SDV_60cluster_200feature_matrix_gmm_sparse.npy'),
 PosixPath('../../model/gmm_prob_latestclusmodel_len2alldata.pkl')]

In [13]:
w2v = Word2Vec.load('../../model/size200_min20_window10')
w2v.wv[]

2019-10-16 23:43:08,556 : INFO : loading Word2Vec object from ../../model/size200_min20_window10
2019-10-16 23:43:08,705 : INFO : loading wv recursively from ../../model/size200_min20_window10.wv.* with mmap=None
2019-10-16 23:43:08,707 : INFO : setting ignored attribute vectors_norm to None
2019-10-16 23:43:08,707 : INFO : loading vocabulary recursively from ../../model/size200_min20_window10.vocabulary.* with mmap=None
2019-10-16 23:43:08,708 : INFO : loading trainables recursively from ../../model/size200_min20_window10.trainables.* with mmap=None
2019-10-16 23:43:08,708 : INFO : setting ignored attribute cum_table to None
2019-10-16 23:43:08,709 : INFO : loaded ../../model/size200_min20_window10


array([ 0.0025083 ,  0.0183469 ,  0.00345732, -0.03295696, -0.04593367,
       -0.10816387,  0.0857932 , -0.12408824,  0.06511008, -0.04497756,
       -0.07030071,  0.06944484, -0.01027484,  0.01026676,  0.06238428,
       -0.04692511,  0.03373602,  0.14480016,  0.03873958, -0.01466305,
       -0.1427166 , -0.07165251,  0.07729675, -0.09725847,  0.13473438,
       -0.11147039, -0.01249613, -0.09645063, -0.06055609, -0.06965324,
        0.0409329 ,  0.01581345, -0.03386689, -0.11278624, -0.07521959,
       -0.00328965, -0.03337483, -0.04025003, -0.0756775 , -0.04741724,
        0.02490379,  0.11853001,  0.01307016,  0.10079149, -0.13430206,
       -0.09880193,  0.02225843, -0.05936074, -0.01643374, -0.0167249 ,
        0.01553877, -0.00309514,  0.04844624,  0.03015225,  0.05496237,
       -0.00642745,  0.01031007, -0.03303425, -0.07633678,  0.19726828,
        0.08345504,  0.00299668, -0.10082375, -0.05465319,  0.14097722,
       -0.03347525,  0.11759061,  0.01626635,  0.04428696,  0.05

In [64]:
# load class and documents
df = pd.read_csv(rw / 'class_and_document.csv')
display(df.head())

# load featurized corpus
with (rw / 'corpus.csv').open('r') as f:
    corpus = [row for row in csv.reader(f)]

Unnamed: 0,class,document
0,dokujo-tsushin,タニタだけじゃない。2008年から社員食堂を一般公開しているのはロート製薬が運営する旬穀旬菜...
1,dokujo-tsushin,東京・渋谷のセンター街でマスク姿の10〜30代の男女100人にアンケート調査を実施したところ...
2,dokujo-tsushin,あなたは親友と呼べる友達がいるだろうか？ 独女たちの間ではよくある話だが、親友が結婚した途端...
3,dokujo-tsushin,互いに好意を持ちながらも、天井裏から覗く、覗かれるという関係しか築けない複雑な男女の愛を、近...
4,dokujo-tsushin,「女性ホルモンで美しく！」なんてよく聞くけれど、それがどんなものか、みんなはイメージできるか...


In [60]:
# encoding
vectorized_corpus = []
for document in corpus:
    v = []
    for token in document:
        try:
            v.append(w2v.wv[token])
        except KeyError as e:
            pass
    vectorized_corpus.append(np.array(v).mean(axis=0))
    
print(len(vectorized_corpus))

7376


In [84]:
np.array(vectorized_corpus)[0:2, 0:2]

array([[-0.02192472,  0.01983469],
       [-0.01185588,  0.00893216]], dtype=float32)

In [86]:
X = np.array(vectorized_corpus)
y = np.array(df['class'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5532, 200)
(1844, 200)
(5532,)
(1844,)


In [88]:
from sklearn import svm
[m for m in dir(svm) if 'SV' in m]

['LinearSVC', 'LinearSVR', 'NuSVC', 'NuSVR', 'OneClassSVM', 'SVC', 'SVR']

In [73]:
svc = SVC()
svc.fit(X_train, y_train)
y_predicted = svc.predict(X_test)



In [85]:
svc.score(X_test, y_test)

0.10303687635574837

In [76]:
np.unique(y_test)

array(['dokujo-tsushin', 'it-life-hack', 'kaden-channel',
       'livedoor-homme', 'movie-enter', 'peachy', 'smax', 'sports-watch',
       'topic-news'], dtype=object)

In [75]:
np.unique(y_predicted)

array(['kaden-channel'], dtype=object)

In [78]:
confusion_matrix(y_test, y_predicted)

array([[  0,   0, 204,   0,   0,   0,   0,   0,   0],
       [  0,   0, 213,   0,   0,   0,   0,   0,   0],
       [  0,   0, 190,   0,   0,   0,   0,   0,   0],
       [  0,   0, 131,   0,   0,   0,   0,   0,   0],
       [  0,   0, 235,   0,   0,   0,   0,   0,   0],
       [  0,   0, 213,   0,   0,   0,   0,   0,   0],
       [  0,   0, 214,   0,   0,   0,   0,   0,   0],
       [  0,   0, 239,   0,   0,   0,   0,   0,   0],
       [  0,   0, 205,   0,   0,   0,   0,   0,   0]])