In [1]:
import csv
import pickle
import gensim

from gensim import models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
dataset_file_name = './dataset/corpus.csv'
model_file_name = 'logistic.pk1'

In [3]:
# Change csv file format

with open(dataset_file_name, 'r') as f:
    csv.field_size_limit(1000000)
    document_sentence_data = list(csv.reader(f))

In [4]:
texts = []
label_ids = []
id_of_label = {}
IDX_OF_LABEL, IDX_OF_SENTENCES = 0, 1
sum_letters = []

for counter, row in enumerate(document_sentence_data):
    if counter == 0:
        continue
        
    label = row[IDX_OF_LABEL]
    
    if label not in id_of_label:
        # If Label dose not have id (e.g. 1, 2, ...) create id.
        id_of_label[label] = len(id_of_label)
    
    label_ids.append(id_of_label[label])
    word_list = row[IDX_OF_SENTENCES].split(' ')
    texts.append(word_list)

label_of_id = { ids: label for label, ids in id_of_label.items() }


In [5]:
# Split train data and test data.
X_train_texts, X_test_texts, y_train, y_test = \
        train_test_split(
            texts, 
            label_ids, 
            test_size=0.2,
            random_state=42)


In [6]:
# From training text data, create matrix which weighed TF-IDF.

text_data_dic = gensim.corpora.Dictionary(X_train_texts)

# Create corpus Bag of words of dictionary of texts.
corpus = [ text_data_dic.doc2bow(text) for text in X_train_texts ]

# Weight corpus using TF-IDF.
tfidf_model = models.TfidfModel(corpus)
tfidf_corpus = tfidf_model[corpus]

num_words = len(text_data_dic)
X_train_tfidf = gensim.matutils.corpus2dense(
        tfidf_corpus, 
        num_terms=num_words,
        ).T

In [7]:
# From text data, create matrix which weighed by TF-IDF

corpus = [ text_data_dic.doc2bow(text) for text in X_test_texts ]

# Weight corpus using TF-IDF.
tfidf_corpus = tfidf_model[corpus]

num_words = len(text_data_dic)
X_test_tfidf = gensim.matutils.corpus2dense(
        tfidf_corpus, 
        num_terms=num_words,
        ).T

In [8]:
clf = LogisticRegression(C=1)
clf.fit(X_train_tfidf, y_train)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
# Evaluate classifier using test data.
y_pred = clf.predict(X_test_tfidf)
target_names = list(label_of_id.values())

print(classification_report(
    y_test,
    y_pred,
    target_names=target_names))

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

  humanities       0.92      0.87      0.90       460
     science       0.87      0.92      0.89       434

   micro avg       0.89      0.89      0.89       894
   macro avg       0.90      0.90      0.89       894
weighted avg       0.90      0.89      0.89       894

[[402  58]
 [ 36 398]]
