<a href="https://colab.research.google.com/github/mesahwi/TextAnlaysis/blob/master/Naver_News_Analysis/Doc2Vec_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install konlpy

In [1]:
!apt-get install python3-dev; pip3 install konlpy

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python3-dev is already the newest version (3.6.7-1~18.04).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.


Mount google drive

In [2]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Import necessary packages

In [0]:
import glob

import numpy as np
import gensim
import sklearn
import nltk
import collections

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import warnings, os
warnings.filterwarnings('ignore')

from konlpy.tag import *

# 아래와 같은 분석기가 있음.
hnm = Hannanum()
kkma = Kkma()
okt = Okt()

base_dir = 'gdrive/Shared drives/텍스트마이닝/News Analysis/'


Prepare data

In [4]:
# prepare corpus  

nrow_per_type = 500
ntype = 2

data, label = np.empty([nrow_per_type * ntype,1], dtype=object), np.zeros((nrow_per_type * ntype, 1))

files_0 = glob.glob(base_dir + 'Chosun/politics/*.txt')
files_1 = glob.glob(base_dir + 'Han/politics/*.txt')

files_total = np.append(files_0[:nrow_per_type], files_1[:nrow_per_type])

print('reading data...')
for i, name in enumerate(files_total):
  with open(name,'r') as handle:
    data[i,0] = handle.read()
    label[i,0] = int(i/nrow_per_type)

reading data...


In [0]:
# split into train/test set

train_percentage = 0.7

data_size = len(label)
shuffled_indeces = np.random.permutation(np.arange(data_size))
shuffled_data = data[shuffled_indeces]
shuffled_label = label[shuffled_indeces]

split_idx = int(data_size*train_percentage)

data_train = shuffled_data[:split_idx]
data_test = shuffled_data[split_idx:]
label_train = shuffled_label[:split_idx]
label_test = shuffled_label[split_idx:]

del shuffled_data, shuffled_label, data, label

Preprocess

In [0]:
#stopwords
f = open(base_dir+'stopwords.txt', 'r')
stopwords = f.read()
stopwords = stopwords.split('\n')

In [0]:
# preprocess, single documents, not whole corpus.
# This function uses nouns. For a more sophisticated analysis, using morphs is recommended
def preprocess_single_doc(text, tokenizer_type = 1):
  
  #choose tokenizer
  if tokenizer_type == 1:
    tokenizer = Hannanum()
  elif tokenizer_type == 2:
    tokenizer = Kkma()
  elif tokenizer_type == 3:
    tokenizer = Okt()
    
  #tokenize
  tokens = tokenizer.nouns(text)
  
  #remove short words, but probably should not apply to Korean
  #tokens = [token for token in tokens if len(token) > 1] 

  #stop words
  my_stopwords = ['조선일보', '조선닷컴', '닷컴', 'Chosun', 'Copyrights', '&', '바로가기', '기자', '구독', '메인' 'ⓒ', '배포', '한겨례', '한겨례신문', '▶', '◀', '네이버', '[', ']', 'co', 'kr', 'hani']
  tokens = [token for token in tokens if token not in stopwords and token not in my_stopwords]
  
  #numbers are already left out, from tokenizer.nouns()
  #tokens = [word for word in tokens if not any(char.isdigit() for char in word)]

  preprocessed = ' '.join(tokens)
  return preprocessed

In [0]:
train_corpus = [TaggedDocument(words = preprocess_single_doc(d[0], 3), tags=[str(i)]) for i, d in enumerate(data_train)] #'TaggedDocument', to be used for doc2vec

Train Doc2Vec model

In [0]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [10]:
#see how much documents are the most similar to themselves, according to our doc2vec model
#(Testing the performance of our doc2vec model)

correct_list = []
wrong_list = []
wrong_id_list = []
for doc_id in range(len(train_corpus)):
  v = model.infer_vector(train_corpus[doc_id].words)
  sims = model.docvecs.most_similar([v])
  dif = int(sims[0][0]) - doc_id
  if dif==0:
    correct_list.append(sims)
  else:
    wrong_list.append(sims)
    wrong_id_list.append(doc_id)

print(len(correct_list) / len(train_corpus), ' correct')


1.0  correct


Evaluate, with logistic regresson and svm

In [0]:
def getDocVec(model):
  totLen = len(model.docvecs)
  X = [model.docvecs[i] for i in range(totLen)]
  return X

In [0]:
#logistic regression, svm model building
X_train = getDocVec(model)
Y_train = label_train[:,0]

lm = LogisticRegression()
lmfit = lm.fit(X_train, Y_train)
y_train_lm = lmfit.predict(X_train)

svm = SVC()
svmfit = svm.fit(X_train, Y_train)
y_train_svm = svmfit.predict(X_train)

In [13]:
col_lm = collections.Counter(y_train_lm - Y_train)
col_svm = collections.Counter(y_train_svm - Y_train)
print('Logistic Regression Performance with Training set : ', col_lm[0]/len(y_train_lm))
print('Support Vector Machine Performance with Training set : ', col_svm[0]/len(y_train_svm))

Logistic Regression Performance with Training set :  0.7228571428571429
Support Vector Machine Performance with Training set :  0.9814285714285714


Now that we have built logistic regerssion and svm models, it's time to test the overall performance, using test set

In [0]:
test_corpus =[TaggedDocument(words = preprocess_single_doc(d[0], 3), tags=[str(i)]) for i, d in enumerate(data_test)]

model2 = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model2.build_vocab(test_corpus)
model2.train(test_corpus, total_examples=model2.corpus_count, epochs=model2.epochs)

In [15]:
#with test set
X_test = getDocVec(model2)
Y_test = label_test[:,0]

X_test = getDocVec(model2)
y_test_lm = lmfit.predict(X_test)
y_test_svm = svmfit.predict(X_test)

col_lm = collections.Counter(y_test_lm - Y_test)
col_svm = collections.Counter(y_test_svm - Y_test)
print('Logistic Regression Performance with Test set : ', col_lm[0]/len(y_test_lm))
print('Support Vector Machine Performance with Test set : ', col_svm[0]/len(y_test_svm))

Logistic Regression Performance with Test set :  0.6
Support Vector Machine Performance with Test set :  0.59


Using Logistic Regression and SVM on the document vectors, around 0.6 of the political documents were classified ('조선일보'/'한겨례')