In [1]:
import os
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re

## 1. 데이터 load  

In [29]:
train = pd.read_csv("open/train.csv", encoding = 'utf-8', index_col=0)
test = pd.read_csv("open/test_x.csv", encoding = 'utf-8', index_col=0)
sample_submission = pd.read_csv("open/sample_submission.csv", encoding = 'utf-8')

In [30]:
display(train.head(3))
display(test.head(3))
display(sample_submission.head(3))

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...


Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0


## 2. EDA & 전처리

In [4]:
train.shape, test.shape, sample_submission.shape

((54879, 2), (19617, 1), (19617, 5))

### 작가 확인 

In [5]:
train.author.unique() # 작가(5)일 확률을 submission 에 제출  

array([3, 2, 1, 4, 0], dtype=int64)

### 전처리 - NLP  

In [6]:
#부호 제거  
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

train['text']=train['text'].apply(alpha_num)
test['text']=test['text'].apply(alpha_num)

display(train.head(3))
display(test.head(3))

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,He was almost choking There was so much so muc...,3
1,Your sister asked for it I suppose,2
2,She was engaged one day as she walked in peru...,1


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,Not at all I think she is one of the most char...
1,No replied he with sudden consciousness not to...
2,As the lady had stated her intention of scream...


In [7]:
# 불용어

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)


train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords)

display(train.head(3))
display(test.head(3))

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,almost choking much much wanted say strange ex...,3
1,sister asked suppose,2
2,engaged one day walked perusing janes last let...,1


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,not think one charming young ladies ever met m...
1,no replied sudden consciousness not find canno...
2,lady stated intention screaming course screame...


## 3. 모델링

### train_test_split

In [8]:
# train_test_split
X_train = np.array([x for x in train['text']])
X_test = np.array([x for x in test['text']])
y_train = np.array([x for x in train['author']])

In [9]:
#### 파라미터 설정

In [10]:
# 파라미터 설정
vocab_size = 20000
embedding_dim = 16
max_length = 500
padding_type='post'

In [11]:
### Tokenizer 

In [12]:
X_train

array(['almost choking much much wanted say strange exclamations came lips pole gazed fixedly bundle notes hand looked odin evident perplexity',
       'sister asked suppose',
       'engaged one day walked perusing janes last letter dwelling passages proved jane not written spirits instead surprised mr odin saw looking odin meeting putting away letter immediately forcing smile said',
       ..., 'sincere wellwisher friend sister lucy odin',
       'wanted lend money', 'certainly not occurred said yes like'],
      dtype='<U1433')

In [13]:
# tokenizer에 fit
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [14]:
X_train

array(['almost choking much much wanted say strange exclamations came lips pole gazed fixedly bundle notes hand looked odin evident perplexity',
       'sister asked suppose',
       'engaged one day walked perusing janes last letter dwelling passages proved jane not written spirits instead surprised mr odin saw looking odin meeting putting away letter immediately forcing smile said',
       ..., 'sincere wellwisher friend sister lucy odin',
       'wanted lend money', 'certainly not occurred said yes like'],
      dtype='<U1433')

In [15]:
display(list(word_index.items())[:5])
display(list(word_index.items())[-3:])

[('odin', 1), ('not', 2), ('said', 3), ('no', 4), ('one', 5)]

[('happyyour', 47134), ('tremblethe', 47135), ('treesapple', 47136)]

In [16]:
# 데이터를 sequence로 변환해주고 padding 해줍니다.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

In [17]:
np.array(train_sequences).shape

(54879,)

In [18]:
train_padded.shape

(54879, 500)

In [19]:
#가벼운 NLP모델 생성
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [20]:
# compile model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 16)           320000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 125       
Total params: 320,533
Trainable params: 320,533
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
%%time
# fit model
num_epochs = 20
history = model.fit(train_padded, y_train, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2)

Epoch 1/20
1372/1372 - 7s - loss: 1.5653 - accuracy: 0.2780 - val_loss: 1.5515 - val_accuracy: 0.3174
Epoch 2/20
1372/1372 - 6s - loss: 1.4284 - accuracy: 0.3956 - val_loss: 1.2862 - val_accuracy: 0.4926
Epoch 3/20
1372/1372 - 7s - loss: 1.1778 - accuracy: 0.5291 - val_loss: 1.1205 - val_accuracy: 0.5509
Epoch 4/20
1372/1372 - 7s - loss: 1.0335 - accuracy: 0.5939 - val_loss: 1.0224 - val_accuracy: 0.6115
Epoch 5/20
1372/1372 - 8s - loss: 0.9281 - accuracy: 0.6436 - val_loss: 0.9404 - val_accuracy: 0.6409
Epoch 6/20
1372/1372 - 7s - loss: 0.8467 - accuracy: 0.6788 - val_loss: 0.8914 - val_accuracy: 0.6574
Epoch 7/20
1372/1372 - 7s - loss: 0.7833 - accuracy: 0.7074 - val_loss: 0.8600 - val_accuracy: 0.6767
Epoch 8/20
1372/1372 - 7s - loss: 0.7344 - accuracy: 0.7248 - val_loss: 0.8343 - val_accuracy: 0.6799
Epoch 9/20
1372/1372 - 7s - loss: 0.6890 - accuracy: 0.7471 - val_loss: 0.8045 - val_accuracy: 0.6964
Epoch 10/20
1372/1372 - 7s - loss: 0.6548 - accuracy: 0.7600 - val_loss: 0.7862 - 

## 4. 예측

In [22]:
# predict values
pred = model.predict_proba(test_padded)

In [23]:
pred

array([[4.83274271e-05, 7.67029464e-01, 8.14175978e-02, 1.50292084e-01,
        1.21237780e-03],
       [1.52880237e-01, 6.11197829e-01, 7.09753633e-02, 1.56752858e-02,
        1.49271324e-01],
       [9.99140382e-01, 2.57441570e-04, 4.01694820e-07, 2.15506368e-08,
        6.01836189e-04],
       ...,
       [1.00664247e-03, 9.98941243e-01, 2.22256872e-07, 6.74877447e-06,
        4.51480919e-05],
       [2.68513919e-04, 9.99678493e-01, 5.22380446e-07, 4.71231851e-06,
        4.77443864e-05],
       [9.99272168e-01, 1.15657485e-05, 7.44099134e-06, 3.48753417e-07,
        7.08565349e-04]], dtype=float32)

In [24]:
## 5. 제출 

In [31]:
# submission
sample_submission[['0','1','2','3','4']] = pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,4.832743e-05,7.670295e-01,8.141760e-02,1.502921e-01,1.212378e-03
1,1,1.528802e-01,6.111978e-01,7.097536e-02,1.567529e-02,1.492713e-01
2,2,9.991404e-01,2.574416e-04,4.016948e-07,2.155064e-08,6.018362e-04
3,3,9.250784e-06,1.809201e-09,9.908127e-01,3.740388e-07,9.177648e-03
4,4,9.765337e-01,1.921502e-03,2.542602e-04,2.077566e-02,5.149257e-04
...,...,...,...,...,...,...
19612,19612,8.760786e-07,9.999992e-01,5.640542e-14,6.852649e-09,1.847872e-10
19613,19613,1.618986e-03,4.234270e-04,1.011457e-04,8.784949e-12,9.978564e-01
19614,19614,1.006642e-03,9.989412e-01,2.222569e-07,6.748774e-06,4.514809e-05
19615,19615,2.685139e-04,9.996785e-01,5.223804e-07,4.712319e-06,4.774439e-05


In [32]:
sample_submission.to_csv('submission/20210221-0.csv', index = False, encoding = 'utf-8')