# **LSTM + 전처리**

# 라이브러리 및 데이터 불러오기

In [1]:
#import packages

import pandas as pd
import numpy as np
import re

import nltk
import nltk.data
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn import metrics, preprocessing, pipeline, model_selection, naive_bayes
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
import xgboost as xgb

import time

from keras import backend as K
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import GlobalAveragePooling1D, Conv1D, MaxPooling1D, Flatten
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

In [2]:
#import datasets

train = pd.read_csv('G:/From 2021/2021/2021-Pre/학회_시즌2/딥러닝 분반/소대회/train.csv', encoding = "cp949")
test = pd.read_csv('G:/From 2021/2021/2021-Pre/학회_시즌2/딥러닝 분반/소대회/test_x.csv')
sample = pd.read_csv('G:/From 2021/2021/2021-Pre/학회_시즌2/딥러닝 분반/소대회/sample_submission.csv')

In [3]:
#Check if samples are properly imported
sample.head()

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0
4,4,0,0,0,0,0


In [4]:
#Checking a distribution
train['author'].value_counts()

3    15063
0    13235
2    11554
4     7805
1     7222
Name: author, dtype: int64

In [5]:
test.shape

(19617, 2)

# Feature Engineering 데이터 전처리

In [6]:
# Importing the libraries
import numpy as np
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [7]:
# Importing stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\playp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
## Number of words in the text 
train["num_words"] = train["text"].apply(lambda x: len(str(x).split()))
test["num_words"] = test["text"].apply(lambda x: len(str(x).split()))

## Number of unique words in the text ##
train["num_unique_words"] = train["text"].apply(lambda x: len(set(str(x).split())))
test["num_unique_words"] = test["text"].apply(lambda x: len(set(str(x).split())))

## Number of characters in the text ##
train["num_chars"] = train["text"].apply(lambda x: len(str(x)))
test["num_chars"] = test["text"].apply(lambda x: len(str(x)))

## Number of stopwords in the text ##
eng_stopwords = nltk.corpus.stopwords.words('english')
train["num_stopwords"] = train["text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
test["num_stopwords"] = test["text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))


In [9]:
## Number of punctuations in the text ##
import string
train["num_punctuations"] =train['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )
test["num_punctuations"] =test['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

## Number of title case words in the text ##
train["num_words_upper"] = train["text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test["num_words_upper"] = test["text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

## Number of title case words in the text ##
train["num_words_title"] = train["text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
test["num_words_title"] = test["text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

## Average length of the words in the text ##
train["mean_word_len"] = train["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test["mean_word_len"] = test["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [10]:
# Clean text
def clean_text(text):
    return re.sub('[^a-zA-Z]', ' ', text)

In [11]:
train['text_cleaned'] = train['text'].apply(lambda x: clean_text(x))
test['text_cleaned'] = test['text'].apply(lambda x: clean_text(x))

In [12]:
#문장부호 등 확인

def extract_features(df):
    df['len'] = df['text'].apply(lambda x: len(x))
    df['n_words'] = df['text'].apply(lambda x: len(x.split(' ')))
    df['n_.'] = df['text'].str.count('\.')
    df['n_...'] = df['text'].str.count('\...')
    df['n_,'] = df['text'].str.count('\,')
    df['n_:'] = df['text'].str.count('\:')
    df['n_;'] = df['text'].str.count('\;')
    df['n_-'] = df['text'].str.count('\-')
    df['n_?'] = df['text'].str.count('\?')
    df['n_!'] = df['text'].str.count('\!')
    df['n_\''] = df['text'].str.count('\'')
    df['n_"'] = df['text'].str.count('\"')
    df["n_“"] = df['text'].str.count('“')
    df["n_”"] = df['text'].str.count('”')

In [13]:
print('Processing train...')
extract_features(train)
print('Processing test...')
extract_features(test)

Processing train...
Processing test...


In [14]:
#merging similar features into a feature
def selecting_features(df):
    df["n_quotes"]=df['n_"']+df['n_“']+df['n_”']
    df.drop(['n_"'], axis=1, inplace=True)
    df.drop(['n_“'], axis=1, inplace=True)
    df.drop(['n_”'], axis=1, inplace=True)
    df.drop(['num_words_title'], axis=1, inplace=True)

selecting_features(train)
selecting_features(test)

In [15]:
train_nums = train.copy()
test_nums = test.copy()

In [16]:
def nums_only(df):
    df.drop(['text'],axis=1,inplace=True)
    df.drop(['text_cleaned'],axis=1,inplace=True)

In [17]:
nums_only(train_nums)
nums_only(test_nums)

In [18]:
train_nums.head()

Unnamed: 0,index,author,num_words,num_unique_words,num_chars,num_stopwords,num_punctuations,num_words_upper,mean_word_len,len,...,n_.,n_...,"n_,",n_:,n_;,n_-,n_?,n_!,n_',n_quotes
0,0,3,46,39,240,25,8,0,4.23913,240,...,3,2,4,0,1,0,0,0,0,0
1,1,2,7,7,38,2,2,1,4.571429,38,...,0,0,1,0,0,0,1,0,0,2
2,2,1,57,50,320,26,9,0,4.614035,320,...,2,2,6,1,0,0,0,0,0,0
3,3,4,58,49,319,26,18,0,4.517241,319,...,6,5,9,0,2,0,0,0,1,2
4,4,3,39,36,228,16,13,0,4.871795,228,...,6,4,4,0,1,0,0,2,0,4


In [19]:
#Exporting Features(only quants) for later use

train_nums.to_csv('G:/From 2021/2021/2021-Pre/학회_시즌2/딥러닝 분반/소대회/train_nums.csv',index=False)
test_nums.to_csv('G:/From 2021/2021/2021-Pre/학회_시즌2/딥러닝 분반/소대회/test_nums.csv',index=False)

### 전처리 2차

In [20]:
## add tfidf and svd 
# max_df option 어떻게 결정해야 할지 고민 !
tfidf_vec = TfidfVectorizer(ngram_range=(1,3), min_df =0,max_df=0.9,lowercase=False, use_idf=True)
train_tfidf = tfidf_vec.fit_transform(train['text'].values.tolist())
test_tfidf = tfidf_vec.fit_transform(test['text'].values.tolist())
print('Tfidf_train:',train_tfidf.shape)
print('Tfidf_test:',test_tfidf.shape)

Tfidf_train: (54879, 2137863)
Tfidf_test: (19617, 1777117)


# Modeling

In [21]:
# add cnn feat
from keras.layers import Embedding, GRU, Dense, Flatten, Dropout
from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from sklearn.metrics import log_loss
import gc
print('import keras done')

import keras done


In [22]:
# add naive feature
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

In [23]:
# 여기서부터 1-D CNN
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model, to_categorical

In [24]:
#Check if it is OK.

X_train = train['text'].values
X_test = test['text'].values
y = train['author'].values
print(X_train.shape, X_test.shape, y.shape)

(54879,) (19617,) (54879,)


In [25]:
X_train[:3]

array(['He was almost choking. There was so much, so much he wanted to say, but strange exclamations were all that came from his lips. The Pole gazed fixedly at him, at the bundle of notes in his hand; looked at odin, and was in evident perplexity.',
       '“Your sister asked for it, I suppose?”',
       ' She was engaged one day as she walked, in perusing Jane’s last letter, and dwelling on some passages which proved that Jane had not written in spirits, when, instead of being again surprised by Mr. odin, she saw on looking up that odin was meeting her. Putting away the letter immediately and forcing a smile, she said:'],
      dtype=object)

In [26]:
#Setting Hyperparameters

vocab_size = 20000
embedding_dim = 64
max_length = 500
padding_type='post'

In [27]:
#Tokenizing & Padding

tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(X_train)
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)
word_index = tokenizer.word_index

In [28]:
X_train = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)
X_test = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)
print(X_train.shape, X_test.shape)

(54879, 500) (19617, 500)


In [29]:
#Check if it is all right

def below_threshold_len(max_len, nested_list):
    cnt = 0
    for s in nested_list:
        if len(s) <= max_len:
            cnt = cnt + 1 
    print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, (cnt / len(nested_list))*100))

max_len = 500
below_threshold_len(max_len,X_train)

전체 샘플 중 길이가 500 이하인 샘플의 비율: 100.0


In [30]:
#Import utilities

from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
n_fold = 5
n_class = 5
seed = 42 
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [31]:
### 양방향 LSTM 모형 설정

from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional
import tensorflow as tf

model2 = tf.keras.Sequential() 
model2.add(Embedding(vocab_size, 100))
model2.add(Bidirectional(LSTM(100)))
model2.add(Dense(5, activation='softmax'))
model2.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy']) 

In [32]:
y_train = np.array([x for x in train['author']]) 

In [33]:
##모형 돌릴 준비!
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.utils import to_categorical
import os
MODEL_SAVE_FOLDER_PATH = './model16_4/' 
if not os.path.exists(MODEL_SAVE_FOLDER_PATH): 
    os.mkdir(MODEL_SAVE_FOLDER_PATH)
model_path = MODEL_SAVE_FOLDER_PATH + '{epoch:02d}-{val_loss:.4f}.hdf5'
cb_checkpoint = ModelCheckpoint(filepath=model_path, monitor='val_loss', verbose=1, save_best_only=True)

##parameter 추가 세팅
n_class = 5 
embedding_dim = 100
max_length = max_len 

##0벡터 및 Early Stopping 설정
p_val = np.zeros((X_train.shape[0], n_class))
p_tst = np.zeros((X_test.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(X_train, y_train), 1):
    print(f'training model for CV #{i}') 
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3, verbose=1,
                       mode='min', baseline=None, restore_best_weights=True)
    ###모형 적용 
    clf2 = model2
    clf2.fit(X_train[i_trn], 
            to_categorical(y_train[i_trn]), 
            validation_data=(X_train[i_val], to_categorical(y_train[i_val])), 
            epochs=7, batch_size=256, 
            callbacks=[es, cb_checkpoint])
    p_val[i_val, :] = clf2.predict(X_train[i_val])
    p_tst += clf2.predict(X_test) / n_fold

training model for CV #1
Epoch 1/7

Epoch 00001: val_loss improved from inf to 0.86597, saving model to ./model16_4\01-0.8660.hdf5
Epoch 2/7

Epoch 00002: val_loss improved from 0.86597 to 0.65783, saving model to ./model16_4\02-0.6578.hdf5
Epoch 3/7

Epoch 00003: val_loss improved from 0.65783 to 0.65388, saving model to ./model16_4\03-0.6539.hdf5
Epoch 4/7

Epoch 00004: val_loss did not improve from 0.65388
Epoch 5/7

Epoch 00005: val_loss did not improve from 0.65388
Epoch 6/7
Restoring model weights from the end of the best epoch.

Epoch 00006: val_loss did not improve from 0.65388
Epoch 00006: early stopping
training model for CV #2
Epoch 1/7

Epoch 00001: val_loss improved from 0.65388 to 0.37036, saving model to ./model16_4\01-0.3704.hdf5
Epoch 2/7

Epoch 00002: val_loss did not improve from 0.37036
Epoch 3/7

Epoch 00003: val_loss did not improve from 0.37036
Epoch 4/7
Restoring model weights from the end of the best epoch.

Epoch 00004: val_loss did not improve from 0.37036
Ep

In [34]:
#print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')

Log Loss (CV):   0.3712


In [36]:
print(clf2.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         2000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               160800    
_________________________________________________________________
dense (Dense)                (None, 5)                 1005      
Total params: 2,161,805
Trainable params: 2,161,805
Non-trainable params: 0
_________________________________________________________________
None


In [37]:
##Exporting the output

sample_file = "G:/From 2021/2021/2021-Pre/학회_시즌2/딥러닝 분반/소대회/sample_submission.csv" 
sub = pd.read_csv(sample_file) 
print(sub.shape) 

sub[['0', '1', '2', '3', '4']] = p_tst 
sub.to_csv("G:/From 2021/2021/2021-Pre/학회_시즌2/딥러닝 분반/소대회/submission_LSTM.csv", index=False) 

(19617, 6)
