ref: https://www.kaggle.com/hamishdickson/bidirectional-lstm-in-keras-with-glove-embeddings

ref: https://lovit.github.io/nlp/representation/2018/09/05/glove/

In [48]:
import pandas as pd
import time
import os, gc
import numpy as np
from tqdm import tqdm
import random
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import *
from keras.utils.np_utils import to_categorical
from keras.initializers import Constant
import re

import matplotlib.pyplot as plt
%matplotlib inline

import torch

In [49]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(42)

In [50]:
%%time
#train_df=pd.read_csv("../KB_NLP/morphs/komo_morphs_train.csv")
#test_df=pd.read_csv("../KB_NLP/morphs/komo_morphs_test.csv")
train_df=pd.read_csv("../KB_NLP/raw_data/train.csv")
test_df=pd.read_csv("../KB_NLP/raw_data/public_test.csv")

CPU times: user 923 ms, sys: 44 ms, total: 967 ms
Wall time: 967 ms


In [51]:
pd.set_option('display.max_colwidth',-1)
train_df.head()

Unnamed: 0,id,year_month,text,smishing
0,0,2017-01,XXX은행성산XXX팀장입니다.행복한주말되세요,0
1,1,2017-01,오늘도많이웃으시는하루시작하세요XXX은행 진월동VIP라운지 XXX올림,0
2,2,2017-01,안녕하십니까 고객님. XXX은행입니다.금일 납부하셔야 할 금액은 153600원 입니다.감사합니다. 새해 복 많이 받으십시오.XXX은행옥포XXX올림,0
3,4,2017-01,XXX 고객님안녕하세요XXX은행 XXX지점입니다지난 한 해 동안 저희 XXX지점에 보내주신 성원에 감사드립니다. 설렘으로 시작한 2017년소망하시는 일 모두 이XXX 고객님의 가정에 늘 건강과 행복이 함께하길 기원하겠습니다. 사랑하는 가족과 함께 정을 나누는 행복한 설 명절 보내세요 XXX은행 XXX지점직원일동,0
4,5,2017-01,1월은 새로움이 가득XXX입니다.올 한해 더 많이행복한 한해되시길바랍니다,0


In [52]:
from soynlp.hangle import decompose

doublespace_pattern = re.compile('\s+')

def jamo_sentence(sent):

    def transform(char):
        if char == ' ':
            return char
        cjj = decompose(char)
        try:
            len(cjj)
        except:
            return ' '
        if len(cjj) == 1:
            return cjj
        cjj_ = ''.join(c if c != ' ' else '' for c in cjj)
        return cjj_

    sent_ = ''.join(transform(char) for char in sent)
    sent_ = doublespace_pattern.sub(' ', sent_)
    return sent_
# 'ㅇㅓ-ㅇㅣ-ㄱㅗ- ㅋㅔㄱㅋㅔㄱ ㅇㅏ-ㅇㅣ-ㄱㅗ-ㅇㅗ-'

jamo_sentence(train_df.loc[0, 'text'])

' ㅇㅡㄴㅎㅐㅇㅅㅓㅇㅅㅏㄴ ㅌㅣㅁㅈㅏㅇㅇㅣㅂㄴㅣㄷㅏ ㅎㅐㅇㅂㅗㄱㅎㅏㄴㅈㅜㅁㅏㄹㄷㅚㅅㅔㅇㅛ'

In [53]:
%%time
test_df['jamo'] = test_df['text'].apply(lambda x: jamo_sentence(x))
train_df['jamo'] = train_df['text'].apply(lambda x: jamo_sentence(x))

CPU times: user 1min 1s, sys: 144 ms, total: 1min 1s
Wall time: 1min 1s


In [54]:
%%time
train_df.to_csv("../KB_NLP/jamo_data/jamo_train.csv", index=False)
test_df.to_csv("../KB_NLP/jamo_data/jamo_test.csv", index=False)

CPU times: user 3.08 s, sys: 240 ms, total: 3.32 s
Wall time: 5.91 s


In [55]:
train_df['jamo'][0]

' ㅇㅡㄴㅎㅐㅇㅅㅓㅇㅅㅏㄴ ㅌㅣㅁㅈㅏㅇㅇㅣㅂㄴㅣㄷㅏ ㅎㅐㅇㅂㅗㄱㅎㅏㄴㅈㅜㅁㅏㄹㄷㅚㅅㅔㅇㅛ'

In [56]:
raw_corpus_fname = '\n'.join(pd.concat([train_df['jamo'], test_df['jamo']]))
file=open('../KB_NLP/jamo_text.txt','w')
file.write(raw_corpus_fname)
file.close()

In [57]:
jamo_jumo = pd.concat([train_df['jamo'], test_df['jamo']])

In [58]:
input_sentences = list(jamo_jumo)

In [59]:
# Make Co-occurence Matrix by using soynlp
from soynlp.utils import DoublespaceLineCorpus
from soynlp.vectorizer import sent_to_word_contexts_matrix

#corpus_path = '2016-10-20_article_all_normed_ltokenize.txt'
#corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

x, idx2vocab = sent_to_word_contexts_matrix(
    input_sentences,
    windows=1,
    min_tf=10,
    tokenizer=lambda x:x.split(), # (default) lambda x:x.split(),
    dynamic_weight=True,
    verbose=True)

print(x.shape) # (36002, 36002)

Create (word, contexts) matrix
  - counting word frequency from 297570 sents, mem=1.833 Gb
  - scanning (word, context) pairs from 297570 sents, mem=1.635 Gb
  - (word, context) matrix was constructed. shape = (38890, 38890)                    
  - done
(38890, 38890)


---
# make every 10 th embeddings with 200dim

In [60]:
from glove import Corpus, Glove

In [61]:
glove = Glove(no_components=200, learning_rate=0.01, random_state=42,)
glove.fit(x.tocoo(), epochs=2, no_threads=4, verbose=False)
dictionary = {vocab:idx for idx, vocab in enumerate(idx2vocab)}
glove.add_dictionary(dictionary)

In [62]:
glove.dictionary

{'ㅇㅗㄹㄹㅣㅁ': 0,
 'ㅇㅡㄴㅎㅐㅇ': 1,
 'ㄱㅗㄱㅐㄱㄴㅣㅁ': 2,
 'ㄱㅏㅁㅅㅏㅎㅏㅂㄴㅣㄷㅏ': 3,
 'ㅇㅝㄹ': 4,
 'ㅂㅏㄹㅏㅂㄴㅣㄷㅏ': 5,
 'ㅅㅜ': 6,
 'ㅎㅐㅇㅂㅗㄱㅎㅏㄴ': 7,
 'ㅇㅣㅆㅅㅡㅂㄴㅣㄷㅏ': 8,
 'ㄱㅘㅇㄱㅗ': 9,
 'ㅈㅓㅎㅢ': 10,
 'ㅁㅣㅊ': 11,
 'ㄱㅗㄱㅐㄱㄴㅣㅁㅇㅢ': 12,
 'ㅎㅏㅇㅅㅏㅇ': 13,
 'ㅂㅗㄴㅐㅅㅔㅇㅛ': 14,
 'ㄷㅡㄹㅣㅂㄴㅣㄷㅏ': 15,
 'ㅎㅏㄹㅜ': 16,
 'ㄲㅗㄱ': 17,
 'ㅊㅚㄷㅐ': 18,
 'ㅇㅣㅂㄴㅣㄷㅏ': 19,
 'ㄴㅕㄴ': 20,
 'ㄱㅏㅁㅅㅏㄷㅡㄹㅣㅂㄴㅣㄷㅏ': 21,
 'ㅈㅜㅅㅕㅅㅓ': 22,
 'ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ': 23,
 'ㅈㅜㅁㅏㄹ': 24,
 'ㅇㅣㄹ': 25,
 'ㅂㅜㅌㅏㄱㄷㅡㄹㅣㅂㄴㅣㄷㅏ': 26,
 'ㄸㅏㄹㅏ': 27,
 'ㅇㅣㅆㄴㅡㄴ': 28,
 'ㅈㅣㅈㅓㅁ': 29,
 'ㅈㅜㅅㅣㅁㅕㄴ': 30,
 'ㅈㅡㄹㄱㅓㅇㅜㄴ': 31,
 'ㄱㅏㄴㅡㅇ': 32,
 'ㅁㅜㄹㅛㅅㅜㅅㅣㄴㄱㅓㅂㅜ': 33,
 'ㅎㅏㄴㄷㅗ': 34,
 'ㄷㅚㅅㅔㅇㅛ': 35,
 'ㅌㅣㅁㅈㅏㅇ': 36,
 'ㄱㅕㅇㅇㅜ': 37,
 'ㄲㅏㅈㅣ': 38,
 'ㅈㅗㅎㅇㅡㄴ': 39,
 'ㄷㅓ': 40,
 'ㅁㅏㄴㅇㅝㄴ': 41,
 'ㅇㅕㄴ': 42,
 'ㅇㅗㄴㅡㄹㄷㅗ': 43,
 'ㅎㅏㄴ': 44,
 'ㄸㅗㄴㅡㄴ': 45,
 'ㅈㅓㅁ': 46,
 'ㅎㅏㅂㄴㅣㄷㅏ': 47,
 'ㅅㅏㅇㄷㅏㅁ': 48,
 'ㅅㅏㅇㅍㅜㅁ': 49,
 'ㅂㅜㄴ': 50,
 'ㄷㅡㅇ': 51,
 'ㅅㅗㅈㅜㅇㅎㅏㄴ': 52,
 'ㄱㅡㅁㄹㅣ': 53,
 'ㄱㅗㄱㅐㄱㄴㅣㅁㄲㅔ': 54,
 'ㅁㅏㄶㅇㅣ': 55,
 'ㅁㅐㅇㅜ': 56,
 'ㅊㅚㅅㅓㄴㅇㅡㄹ': 57,
 'ㅇㅗㄴㅡㄹ': 58,
 'ㄱㅏㄷㅡㄱㅎㅏㄴ': 59,
 'ㄷㅏㅂㅈㅏㅇㅇㅡㄹ': 60,
 'ㅡ': 61,
 'ㄷㅡㄹㅣㅁ': 62,
 'ㅅㅣ': 63,
 'ㅊㅚㅈㅓ': 64,
 'ㅁㅏㄴ': 65,
 'ㄷㅐㅎㅏㄴ': 66,
 'ㅂㅏㄹㅗ

In [63]:
glove.word_vectors

array([[-0.19787655, -0.13440955,  0.13852456, ...,  0.08905442,
        -0.14362293,  0.27114329],
       [-0.22072931, -0.31352288,  0.10031367, ...,  0.00255756,
        -0.20289128,  0.25142753],
       [-0.17239438, -0.1748533 ,  0.09027506, ..., -0.02894472,
        -0.12553198,  0.13983379],
       ...,
       [ 0.00169862, -0.002237  ,  0.00033272, ...,  0.00039494,
         0.00142482, -0.00108642],
       [-0.00049516,  0.00111817, -0.00157066, ..., -0.00090297,
        -0.00087071, -0.00111987],
       [-0.00117491,  0.00144927,  0.00257118, ..., -0.00188591,
         0.00131372,  0.00049556]])

%%time
for k in range(10,210,10):
    glove = Glove(no_components=200, learning_rate=0.01, random_state=42)
    glove.fit(x.tocoo(), epochs=k, no_threads=4, verbose=False)
    dictionary = {vocab:idx for idx, vocab in enumerate(idx2vocab)}
    glove.add_dictionary(dictionary)
    
    with open("../KB_NLP/glove_txt/glove.200D.{}E.txt".format(k),'w') as f:
        for word in glove.dictionary:
            f.write(word)
            f.write(" ")
            for i in range(0, 200):
                f.write(str(glove.word_vectors[glove.dictionary[word]][i]))
                f.write(" ")
            f.write("\n")
    
    print("{}th embedding DONE".format(k))

In [69]:
%%time
for k in range(10,210,10):
    glove = Glove(no_components=200, learning_rate=0.01, random_state=42)
    glove.fit(x.tocoo(), epochs=k, no_threads=4, verbose=False)
    dictionary = {vocab:idx for idx, vocab in enumerate(idx2vocab)}
    glove.add_dictionary(dictionary)
    
    key_list = []
    for key in glove.dictionary.keys():
        key_list.append(key)
    
    dicts = {}
    for word, vector in zip(glove.dictionary.keys(), glove.word_vectors):
        dicts[word] = vector
    #keys = key_list
    #values = glove.word_vectors
    #for i in key_list:
    #    for j in values:
    #        dicts[i] = j
    with open("../KB_NLP/glove_pkl/glove.200D.{}.pkl".format(k),'wb') as f:
        pickle.dump(dicts,f)
    
    print("{}th embedding DONE".format(k))

10th embedding DONE
20th embedding DONE
30th embedding DONE
40th embedding DONE
50th embedding DONE
60th embedding DONE
70th embedding DONE
80th embedding DONE
90th embedding DONE
100th embedding DONE
110th embedding DONE
120th embedding DONE
130th embedding DONE
140th embedding DONE
150th embedding DONE
160th embedding DONE
170th embedding DONE
180th embedding DONE
190th embedding DONE
200th embedding DONE
CPU times: user 3h 5min 52s, sys: 1.15 s, total: 3h 5min 53s
Wall time: 1h 47min 31s
