In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import warnings
warnings.filterwarnings(action='ignore')

from matplotlib import pyplot as plt
from matplotlib import rcParams
%matplotlib inline

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pathlib import Path
from collections import  Counter

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re

In [2]:
trn = pd.read_csv('data/train.csv')
tst = pd.read_csv('data/train.csv')

In [5]:
# 클렌징: 부호 제거, 소문자 변환
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text.lower())

trn['text']=trn['text'].apply(alpha_num)

In [6]:
# 스톱워즈 제거 함수
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)

# 스톱워즈
stopwords = ['the', 'and', 'i', 'to', 'of', 'a', 'in', 'to', 'had', 'he', 'man', 'have', 'are',
             'that', 'you', 'was', 'with', 'form', 'his', 'as', 'odin', 'said', 'one']

trn['text']=trn['text'].apply(remove_stopwords)

In [7]:
# test 세트도 소문자로 변환
trn['text'] = trn['text'].apply(alpha_num).apply(remove_stopwords)

tst['text'] = tst['text'].str.lower()
tst['text'] = tst['text'].apply(alpha_num).apply(remove_stopwords)

In [9]:
# train - test 분리
X_trn = np.array([x for x in trn['text']])
X_tst = np.array([x for x in tst['text']])
y_trn = np.array([x for x in trn['author']])

## NLTK

In [10]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer

In [11]:
s = trn.text[3]
print(s)

captain porch keeping himself carefully out way treacherous shot should any be intended turned spoke us doctors watch on lookout dr take north side if please jim east gray west watch below all hands load muskets lively men careful


In [12]:
tokens = word_tokenize(s)
print(tokens)

['captain', 'porch', 'keeping', 'himself', 'carefully', 'out', 'way', 'treacherous', 'shot', 'should', 'any', 'be', 'intended', 'turned', 'spoke', 'us', 'doctors', 'watch', 'on', 'lookout', 'dr', 'take', 'north', 'side', 'if', 'please', 'jim', 'east', 'gray', 'west', 'watch', 'below', 'all', 'hands', 'load', 'muskets', 'lively', 'men', 'careful']


In [13]:
lemmatizer = WordNetLemmatizer()
[lemmatizer.lemmatize(t) for t in tokens]

['captain',
 'porch',
 'keeping',
 'himself',
 'carefully',
 'out',
 'way',
 'treacherous',
 'shot',
 'should',
 'any',
 'be',
 'intended',
 'turned',
 'spoke',
 'u',
 'doctor',
 'watch',
 'on',
 'lookout',
 'dr',
 'take',
 'north',
 'side',
 'if',
 'please',
 'jim',
 'east',
 'gray',
 'west',
 'watch',
 'below',
 'all',
 'hand',
 'load',
 'musket',
 'lively',
 'men',
 'careful']

In [14]:
stemmer = SnowballStemmer("english")
[stemmer.stem(t) for t in tokens]

['captain',
 'porch',
 'keep',
 'himself',
 'care',
 'out',
 'way',
 'treacher',
 'shot',
 'should',
 'ani',
 'be',
 'intend',
 'turn',
 'spoke',
 'us',
 'doctor',
 'watch',
 'on',
 'lookout',
 'dr',
 'take',
 'north',
 'side',
 'if',
 'pleas',
 'jim',
 'east',
 'gray',
 'west',
 'watch',
 'below',
 'all',
 'hand',
 'load',
 'musket',
 'live',
 'men',
 'care']

## Bag Of Words

In [15]:
vec = CountVectorizer(alpha_num, tokenizer=word_tokenize, stop_words=['the', 'and', 'i', 'to', 'of', 'a', 'in', 'to', 'had', 'he', 'man', 'have', 'are',
             'that', 'you', 'was', 'with', 'form', 'his', 'as', 'odin', 'said', 'one'], ngram_range=(1, 2), min_df=100)
X_cnt = vec.fit_transform(trn['text'])
print(X_cnt.shape)

(54879, 2426)


In [16]:
X_cnt[0, :50].todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [17]:
vec = TfidfVectorizer(alpha_num, tokenizer=word_tokenize, stop_words=['the', 'and', 'i', 'to', 'of', 'a', 'in', 'to', 'had', 'he', 'man', 'have', 'are',
             'that', 'you', 'was', 'with', 'form', 'his', 'as', 'odin', 'said', 'one'], ngram_range=(1, 3), min_df=50)
X = vec.fit_transform(trn['text'])
X_tst = vec.transform(tst['text'])
print(X.shape, X_tst.shape)

(54879, 5121) (54879, 5121)


In [18]:
X[0, :50].todense()

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])

## 로지스틱 회귀모델

In [19]:
target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [20]:
y = trn.author.values
y.shape

(54879,)

In [21]:
p = np.zeros((X.shape[0], n_class))
p_tst = np.zeros((X_tst.shape[0], n_class))
for i_cv, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
    clf = LogisticRegression()
    clf.fit(X[i_trn], y[i_trn])
    p[i_val, :] = clf.predict_proba(X[i_val])
    p_tst += clf.predict_proba(X_tst) / n_class

In [22]:
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p):8.4f}')

Accuracy (CV):  69.4710%
Log Loss (CV):   0.8421


In [23]:
np.savetxt(p_val_file, p, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

NameError: name 'p_val_file' is not defined

## 제출 파일 생성

In [24]:
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

NameError: name 'sample_file' is not defined

In [25]:
sub[sub.columns] = p_tst
sub.head()

NameError: name 'sub' is not defined

In [None]:
sub.to_csv(submission_ml)