In [1]:
from gensim.models import doc2vec
from collections import namedtuple
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB

from file_io import *
from words_preprocessing import *

In [2]:
# Labelled training set. Each record is shaped (list containing the raw review text, label).
# NOTE(review): load_pickle comes from the wildcard `file_io` import and presumably
# unpickles the file -- only point it at files you trust.
train_path = '../train_labeled.pickle'
train = load_pickle(train_path)

In [4]:
# Peek at a single record to confirm its layout before tokenizing:
# a one-element list holding the raw review string, paired with an integer label.
train[1]

(['"""잘짜여진 구성, 합리적 추적과정, 정확한 근거제시등 다큐영화로서도 훌륭합니다. 꼭 보세요."""'], 0)

In [5]:
%%time
# Tokenize every review and keep its label next to the tokens.
# Each record stores the raw text as the only element of a list, hence record[0][0].
# `tokenize` is provided by the wildcard `words_preprocessing` import.
train_docs = []
for record in train:
    tokens = tokenize(record[0][0])
    train_docs.append((tokens, record[1]))
len(train_docs)

CPU times: user 3min 12s, sys: 1.33 s, total: 3min 13s
Wall time: 3min 1s


In [6]:
%%time
# Use gensim's own TaggedDocument (a namedtuple with `words` and `tags` fields)
# rather than re-declaring an identical namedtuple by hand. The local name is
# kept so anything else in the notebook that refers to `TaggedDocument` still works.
TaggedDocument = doc2vec.TaggedDocument

# Each document is tagged with its class label, so every document of the same
# class shares a single tag (there is no per-document tag).
tagged_train_docs = [
    TaggedDocument(words=tokens, tags=[label])
    for tokens, label in train_docs
]

CPU times: user 329 ms, sys: 13.9 ms, total: 343 ms
Wall time: 344 ms


In [9]:
# Doc2Vec hyper-parameters gathered in one place so they are easy to tune.
# NOTE(review): alpha == min_alpha, so the learning rate is presumably never
# decayed over the training epochs -- confirm this is intentional.
# NOTE(review): a fixed seed with workers=3 is likely not fully reproducible,
# since multi-threaded training order can vary between runs.
doc2vec_params = dict(
    size=300,
    alpha=0.02,
    min_alpha=0.02,
    workers=3,
    min_count=10,
    iter=100,   # reused as the epoch count when model.train is called
    seed=1234,
)
model = doc2vec.Doc2Vec(**doc2vec_params)

# The vocabulary has to be built from the tagged corpus before training.
model.build_vocab(tagged_train_docs)

In [10]:
%%time
# Train on the full tagged corpus, reusing the epoch count and corpus size
# already stored on the model (set by the constructor and build_vocab).
model.train(
    tagged_train_docs,
    total_examples=model.corpus_count,
    epochs=model.iter,
)

CPU times: user 18min 53s, sys: 4min 30s, total: 23min 23s
Wall time: 12min 41s


240240698

In [11]:
# Persist the expensive artefacts (tokenized corpus and trained model) so a
# later session can reload them instead of recomputing.
train_docs_path = '../train_docs.pickle'
model_path = '../model/doc2vec_alpha002.model'

# NOTE(review): init_sims() is called without replace=True; presumably it only
# precomputes normalised vectors -- confirm it is needed before saving.
model.wv.init_sims()
save_pickle(train_docs_path, train_docs)
# NOTE(review): assumes the ../model directory already exists.
model.save(model_path)

In [13]:
%%time
# Build the classifier inputs: an inferred vector per document (features)
# and the document's single tag, i.e. its class label (target).
X = []
for tagged in tagged_train_docs:
    X.append(model.infer_vector(tagged.words))
y = [tagged.tags[0] for tagged in tagged_train_docs]

# Sanity check: both lengths should match the corpus size.
print(len(X), len(y), sep='\n')

165810
165810
CPU times: user 35.2 s, sys: 264 ms, total: 35.5 s
Wall time: 35.6 s


In [15]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation; no test_size is given, and the
# sizes shown below come out at roughly 75% train / 25% test.
# NOTE(review): the Doc2Vec model was trained on *all* of tagged_train_docs
# (with the class labels as tags) before this split, so the held-out rows have
# already influenced the embedding. The scores further down are likely
# optimistic; consider splitting before fitting Doc2Vec.
# NOTE(review): the split is not stratified although the classes are imbalanced.
split = train_test_split(X, y, random_state=1234)
X_train, X_test, y_train, y_test = split

tuple(len(part) for part in split)

(124357, 41453, 124357, 41453)

In [21]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier on the inferred document vectors, default settings.
# fit() returns the estimator itself, so construction and fitting are chained.
lr_clf = LogisticRegression().fit(X_train, y_train)
lr_clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
# Predicted class labels for the held-out split; reused by the metric cells below.
y_pred = lr_clf.predict(X_test)

In [23]:
from sklearn.metrics import confusion_matrix, classification_report

# Per-class error breakdown on the held-out split
# (in scikit-learn's layout, rows are true labels and columns are predictions).
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[15787  1759   377   465    68]
 [ 4525  4846   414   257    53]
 [ 2635   767  1432   183    41]
 [ 3117   873   183  1002    46]
 [ 1625   518   128   143   209]]


In [24]:
# Precision / recall / F1 per class on the held-out split.
report = classification_report(y_test, y_pred)
print(report)

             precision    recall  f1-score   support

          0       0.57      0.86      0.68     18456
          1       0.55      0.48      0.51     10095
          2       0.57      0.28      0.38      5058
          3       0.49      0.19      0.28      5221
          4       0.50      0.08      0.14      2623

avg / total       0.55      0.56      0.52     41453



In [26]:
# Overall score of the classifier on the held-out split (mean accuracy for
# scikit-learn classifiers). NOTE(review): classes are imbalanced, so read this
# together with the per-class report rather than on its own.
lr_clf.score(X_test, y_test)

0.56150338938074451