In [1]:
import csv
import pickle
import gensim

from gensim import models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
dataset_file_name = './dataset/corpus.csv'
model_file_name = 'logistic.pk1'

In [3]:
# Change csv file format

with open(dataset_file_name, 'r') as f:
    csv.field_size_limit(1000000)
    document_sentence_data = list(csv.reader(f))
print(document_sentence_data)

[['humanities', '가볍 공허 소개 할까요 '], ['humanities', '으니까 '], ['humanities', '쑥 스럽 '], ['humanities', '저 메 뚝 씨 '], ['humanities', '규정 어려울 표준 저항 인문 학도 둘 요 '], ['humanities', '근엄 가르치 인문 학자 가볍 진지 자세 배움 임하 20 째 인문 학도 '], ['humanities', '저 옛날 얘기 할게요 '], ['humanities', '중학교 처음 기억나 네요 '], ['humanities', '꽃 희망 굉장히 힘들 '], ['humanities', '어머니 강요 거든요 '], ['humanities', '무지하 졸린 '], ['humanities', '그래도 위 삼 머리맡 놓 잡니다 '], ['humanities', '좋아하 여럿 봅니다 '], ['humanities', 'TV 듯이 봐요 '], ['humanities', '기억 '], ['humanities', '싹 사라져 버립니다 '], ['humanities', '이번 계기 전환 '], ['humanities', '흄 만나 칸트 달라졌 듯 얇 넓 좁 깊 독서 개편 '], ['humanities', '개편 픈 똥 팔 씨 진실 부디 달성 '], ['humanities', '한방 개편 긴 어렵 매일 조금 씩 자극 밤 지새우 본격 이어진 '], ['humanities', '매일 밤 때로 부드럽 상냥 때로 불편 강렬 흔들 줄 철수 만나 요 '], ['humanities', '철수 라고요 철 머리 수요 '], ['humanities', '웃음 철수 놀자 ~’ 예요 '], ['humanities', '당대 풍미 철학자 불러서 한바탕 노 '], ['humanities', '갖 썼 전반 배경 '], ['humanities', '지점 돌아볼 고요 '], ['humanities', '다소 무거워 질 순 토론 '], ['humanities', '수다 합시다 '], ['humanities', '그냥 

In [6]:
texts = []
label_ids = []
id_of_label = {}
IDX_OF_LABEL, IDX_OF_SENTENCES = 0, 1
sum_letters = []

for counter, row in enumerate(document_sentence_data):
    if counter == 0:
        continue
        
    label = row[IDX_OF_LABEL]
    
    if label not in id_of_label:
        # If Label dose not have id (e.g. 1, 2, ...) create id.
        id_of_label[label] = len(id_of_label)
    
    label_ids.append(id_of_label[label])
    word_list = row[IDX_OF_SENTENCES].split(' ')
    texts.append(word_list)
    
    for word in word_list:
        for letter in word:
            sum_letters.append(letter)
            
print('sum_letters: ', len(sum_letters))
label_of_id = { ids: label for label, ids in id_of_label.items() }


sum_letters:  184645


In [7]:
# Split train data and test data.
X_train_texts, X_test_texts, y_train, y_test = \
        train_test_split(
            texts, 
            label_ids, 
            test_size=0.2,
            random_state=42)


In [8]:
# From training text data, create matrix which weighed TF-IDF.

text_data_dic = gensim.corpora.Dictionary(X_train_texts)

# Create corpus Bag of words of dictionary of texts.
corpus = [ text_data_dic.doc2bow(text) for text in X_train_texts ]

# Weight corpus using TF-IDF.
tfidf_model = models.TfidfModel(corpus)
tfidf_corpus = tfidf_model[corpus]

num_words = len(text_data_dic)
X_train_tfidf = gensim.matutils.corpus2dense(
        tfidf_corpus, 
        num_terms=num_words,
        ).T

In [9]:
# From text data, create matrix which weighed by TF-IDF

corpus = [ text_data_dic.doc2bow(text) for text in X_test_texts ]

# Weight corpus using TF-IDF.
tfidf_corpus = tfidf_model[corpus]

num_words = len(text_data_dic)
X_test_tfidf = gensim.matutils.corpus2dense(
        tfidf_corpus, 
        num_terms=num_words,
        ).T

In [11]:
clf = LogisticRegression(C=1)
clf.fit(X_train_tfidf, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [13]:
# Evaluate classifier using test data.
y_pred = clf.predict(X_test_tfidf)
target_names = list(label_of_id.values())

print(classification_report(
    y_test,
    y_pred,
    target_names=target_names))

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

  humanities       0.92      0.87      0.90       460
     science       0.87      0.92      0.89       434

   micro avg       0.89      0.89      0.89       894
   macro avg       0.90      0.90      0.89       894
weighted avg       0.90      0.89      0.89       894

[[402  58]
 [ 36 398]]
