# Data Ingestion

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation split (and everything trained on it) is
# identical after Restart Kernel -> Run All.
SEED = 42

# The four 20-newsgroups categories used for this classification experiment.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# Download (or read from the local scikit-learn cache) the official train and
# test subsets restricted to the categories above. Despite the `_df` suffix,
# the code below only relies on the `.data` and `.target` attributes.
train_raw_df = fetch_20newsgroups(subset='train', categories=categories)
test_raw_df = fetch_20newsgroups(subset='test', categories=categories)

# Hold out 10% of the official training subset as a validation split.
# random_state pins the shuffle; without it every run yields a different split.
x_train, x_val, y_train, y_val = train_test_split(
    np.array(train_raw_df.data),
    train_raw_df.target,
    test_size=0.1,
    random_state=SEED,
)
x_test = np.array(test_raw_df.data)
y_test = test_raw_df.target

# Sanity check: number of documents in each split.
print('Train:', len(x_train))
print('Val:', len(x_val))
print('Test:', len(x_test))

Train: 2031
Val: 226
Test: 1502


In [2]:
%reload_ext  autoreload
%autoreload 2

import sys, os
def add_aion(curr_path=None):
    """Put the directory two levels above a base directory on ``sys.path``.

    The notebook calls this before importing ``aion`` — presumably that
    package lives in the grandparent directory (verify for your checkout).

    Args:
        curr_path: Base directory to start from. When ``None`` (the default)
            the current working directory is used. Previously any other value
            was silently ignored and the function did nothing.

    Returns:
        None. The resolved directory is printed, and inserted at the front of
        ``sys.path`` only if it is not already present, so repeated calls do
        not create duplicates.
    """
    # abspath keeps a relative curr_path from collapsing to '' after two dirname() calls.
    base_path = os.getcwd() if curr_path is None else os.path.abspath(curr_path)
    target_path = os.path.dirname(os.path.dirname(base_path))
    print(target_path)
    if target_path not in sys.path:
        print('Added %s into sys.path.' % (target_path))
        sys.path.insert(0, target_path)


add_aion()

/data/jupyter/common
Added /data/jupyter/common into sys.path.


# Model

In [3]:
from aion.embeddings.doc2vec import Doc2VecEmbeddings

In [5]:
# Create the Doc2Vec wrapper from the project-local `aion` package; no
# arguments are passed, so whatever defaults the class defines are used.
doc2vec_embs = Doc2VecEmbeddings()
# build_vocab() receives the raw training documents; its return value is the
# input handed to train() below.
# NOTE(review): the variable name suggests tokenised documents — confirm
# against aion's Doc2VecEmbeddings implementation.
x_train_tokens = doc2vec_embs.build_vocab(documents=x_train)
# Train the embedding model on the training split only (x_val and x_test are
# not passed in here).
doc2vec_embs.train(x_train_tokens)

2018-10-08 22:52:10.269082 start
2018-10-08 22:53:30.387969 end


In [8]:
# Encode the raw train and test documents with the Doc2Vec wrapper trained in
# the earlier cell; the results are the feature inputs for the classifier.
# NOTE(review): presumably encode() returns one fixed-length vector per
# document — confirm against aion's Doc2VecEmbeddings.encode.
x_train_t = doc2vec_embs.encode(documents=x_train)
x_test_t = doc2vec_embs.encode(documents=x_test)

In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier on the encoded training documents
# (newton-cg solver, at most 1000 iterations). fit() hands back the estimator
# itself, so construction and fitting are written as one chained expression.
model = LogisticRegression(solver='newton-cg', max_iter=1000).fit(x_train_t, y_train)

# Predicted class labels for the encoded test documents.
y_pred = model.predict(x_test_t)

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Test-set accuracy, scaled to a percentage with two decimals.
print('Accuracy:%.2f%%' % (100 * accuracy_score(y_test, y_pred)))
# Heading and per-class precision/recall/F1 table, emitted on separate lines.
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')

Accuracy:52.80%
Classification Report:
             precision    recall  f1-score   support

          0       0.56      0.17      0.26       319
          1       0.82      0.63      0.72       389
          2       0.85      0.31      0.45       396
          3       0.38      0.93      0.54       398

avg / total       0.66      0.53      0.50      1502

