In [31]:
import os
import gc
import time
import math
import random
import pickle
import operator 
import numpy as np 
import pandas as pd 
from tqdm import tqdm_notebook as tqdm

from keras.preprocessing import text, sequence
import torch
from torch import nn, cuda
from torch.nn import functional as F
from torch.utils.data import TensorDataset, Subset, DataLoader
from torch.optim import Adam, Optimizer
from torch.optim.lr_scheduler import _LRScheduler, LambdaLR, ReduceLROnPlateau

import re
import gensim
from gensim.models.wrappers import FastText

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [32]:
use_cuda = cuda.is_available()
use_cuda

True

In [33]:
Glove_200_PATH = '../KB_NLP/glove_txt/glove.200D.10E.txt'
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 220

In [34]:
def seed_everything(seed=123):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [35]:
class SequenceBucketCollator():
    def __init__(self, choose_length, sequence_index, length_index, label_index=None):
        self.choose_length = choose_length
        self.sequence_index = sequence_index
        self.length_index = length_index
        self.label_index = label_index
        
    def __call__(self, batch):
        batch = [torch.stack(x) for x in list(zip(*batch))]
        
        sequences = batch[self.sequence_index]
        lengths = batch[self.length_index]
        
        length = self.choose_length(lengths)
        mask = torch.arange(start=maxlen, end=0, step=-1) < length
        padded_sequences = sequences[:, mask]
        
        batch[self.sequence_index] = padded_sequences
        
        if self.label_index is not None:
            return [x for i, x in enumerate(batch) if i != self.label_index], batch[self.label_index]
    
        return batch

In [36]:
def preprocess_text(text):
    
#     stopwords = ['XXX', '.', '을', '를', '이', '가', '-', '(', ')', ':', '!', '?', ')-', '.-', 'ㅡ', 'XXXXXX', '..', '.(', '은', '는']
#     text = re.sub(".", " ", text)
    text = re.sub("XXX", " ", text)
    text = re.sub("XXXXXX", " ", text)
    text = re.sub("[^ .?!/@$%~|0-9|ㄱ-ㅣ가-힣]+", "", text) # 한글과 띄어쓰기, 특수기호 일부를 제외한 모든 글자
#     text = re.sub("[\s]+", "", text.strip()) # white space duplicate
#     text = re.sub("[\.]+", "", text.strip()) # full stop duplicate
    
    text = re.sub("(?s)<ref>.+?</ref>", "", text) # remove reference links
    text = re.sub("(?s)<[^>]+>", "", text) # remove html tags
    text = re.sub("&[a-z]+;", "", text) # remove html entities
    text = re.sub("(?s){{.+?}}", "", text) # remove markup tags
    text = re.sub("(?s){.+?}", "", text) # remove markup tags
    text = re.sub("(?s)\[\[([^]]+\|)", "", text) # remove link target strings
    text = re.sub("(?s)\[\[([^]]+\:.+?]])", "", text) # remove media links
    
    text = re.sub("[']{5}", "", text) # remove italic+bold symbols
    text = re.sub("[']{3}", "", text) # remove bold symbols
    text = re.sub("[']{2}", "", text) # remove italic symbols
    text = re.sub(r'\d+', ' ', text) # clean numbers
    
#     text = re.sub(r"[^ \r\n\p{Hangul}.?!]", " ", text) # Replace unacceptable characters with a space.
#     text = re.sub("[ ]{2,}", " ", text) # Squeeze spaces.
    
    return text

In [37]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class NeuralNet(nn.Module):

    def __init__(self, embedding_matrix):
        super(NeuralNet, self).__init__()

        lstm_hidden_size = 120
        gru_hidden_size = 60
        self.gru_hidden_size = gru_hidden_size

        self.embedding = nn.Embedding(*embedding_matrix.shape)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = nn.Dropout2d(0.2)

        self.lstm = nn.LSTM(embedding_matrix.shape[1], lstm_hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(lstm_hidden_size * 2, gru_hidden_size, bidirectional=True, batch_first=True)

        self.linear = nn.Linear(gru_hidden_size * 6, 20)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(20, 1)
        
    def apply_spatial_dropout(self, h_embedding):
        h_embedding = h_embedding.transpose(1, 2).unsqueeze(2)
        h_embedding = self.embedding_dropout(h_embedding).squeeze(2).transpose(1, 2)
        return h_embedding

    def forward(self, x, normal_feats, lengths=None):
        h_embedding = self.embedding(x.long())
        h_embedding = self.apply_spatial_dropout(h_embedding)

        h_lstm, _ = self.lstm(h_embedding)
        h_gru, hh_gru = self.gru(h_lstm)

        hh_gru = hh_gru.view(-1, self.gru_hidden_size * 2)

        avg_pool = torch.mean(h_gru, 1)
        max_pool, _ = torch.max(h_gru, 1)

#         normal_linear  = F.relu(self.normal_linear(normal_feats.float()))

        conc = torch.cat((hh_gru, avg_pool, max_pool), 1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = self.out(conc)

        return out

In [38]:
def train_model(n_epochs=4, accumulation_step=2, **kwargs):
    
    optimizer = Adam(model.parameters(), lr=0.001)
    scheduler = LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    
    best_epoch = -1
    best_valid_score = 0.
    best_valid_loss = 1.
    all_train_loss = []
    all_valid_loss = []
    total_preds = []
    
    for epoch in range(n_epochs):
        
        start_time = time.time()

        train_loss = train_one_epoch(model, criterion, train_loader, optimizer, accumulation_step)
        val_loss, val_score = validation(model, criterion, valid_loader)
    
#         if val_score > best_valid_score:
#             best_valid_score = val_score
#             torch.save(model.state_dict(), 'best_score{}.pt'.format(fold))
    
        elapsed = time.time() - start_time
        
        lr = [_['lr'] for _ in optimizer.param_groups]
        print("Epoch {} - train_loss: {:.6f}  val_loss: {:.6f}  val_score: {:.6f}  lr: {:.5f}  time: {:.0f}s".format(
                epoch+1, train_loss, val_loss, val_score, lr[0], elapsed))

        # inference
        test_preds = inference_test(model, test_loader)
        total_preds.append(test_preds)
        
        # scheduler update
        scheduler.step()
    
    total_preds = np.average(total_preds, weights=checkpoint_weights, axis=0)

    return total_preds, val_score, val_loss

In [39]:
def inference_test(model, test_loader):
    model.eval()

    test_preds = np.zeros((len(test_dataset), 1))

    with torch.no_grad():
        for i, inputs in enumerate(test_loader):
            if use_cuda:
                inputs[0] = inputs[0].cuda()
                inputs[1] = inputs[1].cuda()
#                 inputs[2] = inputs[2].cuda()

            outputs = model(inputs[0], inputs[1])
#             outputs = model(inputs[0], inputs[1], inputs[2])
            test_preds[i * batch_size:(i+1) * batch_size] = sigmoid(outputs.cpu().numpy())
    
    return test_preds

In [40]:
def train_one_epoch(model, criterion, train_loader, optimizer, accumulation_step=2):
    
    model.train()
    train_loss = 0.
    
    optimizer.zero_grad()

#     for i, (inputs, targets) in tqdm(enumerate(train_loader), desc='train', total=len(train_loader)):
    for i, (inputs, targets) in enumerate(train_loader):

        if use_cuda:
            inputs[0] = inputs[0].cuda()
            inputs[1] = inputs[1].cuda()
#             inputs[2] = inputs[2].cuda()
            targets = targets.cuda()    
            
        preds = model(inputs[0], inputs[1])
        preds = model(inputs[0], inputs[1])
        loss = criterion(preds, targets)

        loss.backward()
        
        if accumulation_step:
            if (i+1) % accumulation_step == 0:  
                optimizer.step()
                optimizer.zero_grad()
        else:
            optimizer.step()
            optimizer.zero_grad()

        train_loss += loss.item() / len(train_loader)
        
    return train_loss


def validation(model, criterion, valid_loader):
    
    model.eval()
    valid_preds = np.zeros((len(valid_dataset), 1))
    valid_targets = np.zeros((len(valid_dataset), 1))
    val_loss = 0.
    
    with torch.no_grad():
#         for i, (inputs, targets) in tqdm(enumerate(valid_loader), desc='valid', total=len(valid_loader)):
        for i, (inputs, targets) in enumerate(valid_loader):
            
            valid_targets[i * batch_size: (i+1) * batch_size] = targets.numpy().copy()
            
            if use_cuda:
                inputs[0] = inputs[0].cuda()
                inputs[1] = inputs[1].cuda()
#                 inputs[2] = inputs[2].cuda()
                targets = targets.cuda()   
            
            outputs = model(inputs[0], inputs[1])
#             outputs = model(inputs[0], inputs[1], inputs[2])
            loss = criterion(outputs, targets)
            
            valid_preds[i * batch_size: (i+1) * batch_size] = sigmoid(outputs.detach().cpu().numpy())
            
            val_loss += loss.item() / len(valid_loader)
    
    val_score = roc_auc_score(valid_targets, valid_preds)
#     valid_preds = np.where(valid_preds>=0.1, 1, 0)
#     val_score = f1_score(valid_targets, valid_preds)
    
    
    return val_loss, val_score   

In [41]:
def build_vocab(sentences, verbose=True):
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def check_coverage(vocab, embedding_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embedding_index[word]
            k += vocab[word]
        except:
            oov[word] = vocab[word]
            i += vocab[word]
            pass
    
    print("Found embeddings for {:.2%} of vocab".format(len(a) / len(vocab)))
    print("Found embedding for {:.2%} of all text".format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x


def build_matrix(word_index, word2vec_vocab, vector_size=200):
    embedding_matrix = np.zeros((max_features + 1, vector_size))
    unknown_words = []
    
    for word, i in word_index.items():
        if i <= max_features:
            try:
                embedding_matrix[i] = word2vec_vocab[word]
            except:
                unknown_words.append(word)
                
    return embedding_matrix, unknown_words

In [42]:
def count_regexp_occ(regexp="", text=None):
    """ Simple way to get the number of occurence of a regex"""
    # reference: https://www.kaggle.com/coolcoder22/lightgbm-fast-compact-solution
    return len(re.findall(regexp, text))

In [43]:
def normal_feature_engineering(text_list, month_infos):
    add_feats_matrix = np.zeros((len(text_list), 4))

    for idx in range(len(text_list)):
        target_text = text_list[idx]    
        add_feats_matrix[idx, 0] = len(target_text.split(' '))
        add_feats_matrix[idx, 1] = len(set(target_text.split(' ')))
        add_feats_matrix[idx, 2] = add_feats_matrix[idx, 1] / add_feats_matrix[idx, 0] 

    for idx, month in enumerate(month_infos):
        add_feats_matrix[idx, 3]  = month

    add_feats_matrix = add_feats_matrix.astype('int')
    return add_feats_matrix


In [44]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index)+1,200))
    unknown_words = []
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

def sigmoid(x):
    return 1/(1+np.exp(-x))

In [45]:
def train_model(model, train, test, loss_fn, output_dim, lr=0.001,
                batch_size=16, n_epochs=4, enable_checkpoint_ensemble=True):
    param_lrs = [{'params':param,'lr':lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6**epoch)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=True)
    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]

    best_epoch = -1
    best_valid_score = 0
    best_valid_loss = 1
    all_train_loss = []
    all_valid_loss = []
    total_preds = []

    for epoch in range(n_epochs):
        t1 = time.time()

        ###### Train ##############################################
        model.train()
        train_loss = 0

        optimizer.zero_grad()
        for i, (inputs, targets) in enumerate(train_loader):
            inputs[0] = inputs[0].cuda()
            inputs[1] = inputs[1].cuda()
            targets = targets.cuda()

            preds = model(inputs[0], inputs[1])
            preds = model(inputs[0], inputs[1])
            loss = criterion(preds, targets)

            loss.backward()
            if accumulation_step:
                if (i+1) % accumulation_step ==0:
                    optimizer.step()
                    optimizer.zero_grad()
            else:
                optimizer.step()
                optimizer.zero_grad()

            train_loss += loss.item()/len(train_loader)

        ###### Validation ########################################
        model.eval()
        valid_preds = np.zeros((len(valid_dataset), 1))
        valid_targets = np.zeros((len(valid_dataset), 1))
        val_loss = 0.

        with torch.no_grad():
    #         for i, (inputs, targets) in tqdm(enumerate(valid_loader), desc='valid', total=len(valid_loader)):
            for i, (inputs, targets) in enumerate(valid_loader):

                valid_targets[i * batch_size: (i+1) * batch_size] = targets.numpy().copy()

                if use_cuda:
                    inputs[0] = inputs[0].cuda()
                    inputs[1] = inputs[1].cuda()
    #                 inputs[2] = inputs[2].cuda()
                    targets = targets.cuda()   

                outputs = model(inputs[0], inputs[1])
    #             outputs = model(inputs[0], inputs[1], inputs[2])
                loss = criterion(outputs, targets)

                valid_preds[i * batch_size: (i+1) * batch_size] = sigmoid(outputs.detach().cpu().numpy())

                val_loss += loss.item() / len(valid_loader)

        val_score = roc_auc_score(valid_targets, valid_preds)
    #     valid_preds = np.where(valid_preds>=0.1, 1, 0)
    #     val_score = f1_score(valid_targets, valid_preds)
        elapsed = time.time()-t1
        lr = [_['lr'] for _ in optimizer.param_groups]
        print("Epoch {} - train_loss: {:.6f} - val_loss: {:.6f} - val_score: {:.6f} lr: {:.5f} time: {:.0f}s").format(
        epoch+1, train_loss, val_loss, val_score, lr[0], elapsed)
        ####### Prediction ####################################
        model.eval()
        test_preds = np.zeros((len(test_dataset),1))
        with torch.no_grad():
            for i, inputs in enumerate(test_loader):
                inputs[0] = inputs[0].cuda()
                inputs[1] = inputs[1].cuda()
                outputs = model(inputs[0], inputs[1])
                test_preds[i*batch_size:(i+1)*batch_size] = sigmoid(outputs.cpu().numpy())
        
        total_preds.append(test_preds)
        
        scheduler.step()
    
    total_preds = np.average(total_preds, weights=checkpoint_weights, axis=0)







## Preprocessing

In [46]:
os.listdir('../KB_NLP/jamo_data')

['jamo_train.csv', 'jamo_test.csv']

In [47]:
train_df = pd.read_csv('../KB_NLP/jamo_data/jamo_train.csv')
test_df = pd.read_csv('../KB_NLP/jamo_data/jamo_test.csv')

In [48]:
train_text_list = train_df['jamo'].values
test_text_list = test_df['jamo'].values

train_month_infos = train_df['year_month'].apply(lambda x: int(x[-2:])).values
test_month_infos = test_df['year_month'].apply(lambda x: int(x[-2:])).values

train_add_feats_matrix = normal_feature_engineering(train_text_list, train_month_infos)
test_add_feats_matrix = normal_feature_engineering(test_text_list, test_month_infos)

In [50]:
x_train = train_df['jamo']
y_train = train_df['smishing']
x_test = test_df['jamo']

In [51]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(train_df['jamo']) + list(test_df['jamo']))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [52]:
max_features = None
max_features = max_features or len(tokenizer.word_index) + 1
max_features

330467

In [53]:
glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, Glove_200_PATH)
print('n unknown words : ',len(unknown_words_glove))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


n unknown words :  291576


In [54]:
glove_matrix.shape

(330467, 200)

In [55]:
x_train_torch = torch.tensor(x_train, dtype=torch.long).cuda()
x_test_torch = torch.tensor(x_test, dtype=torch.long).cuda()
y_train_torch = torch.tensor(y_train).float().unsqueeze(1).cuda()

In [57]:
train_dataset = TensorDataset(x_train_torch, y_train_torch)
test_dataset = TensorDataset(x_test_torch)

all_test_preds = []

seed_everything(42)

model = NeuralNet(glove_matrix)
model.cuda()

test_preds = train_model(model, train_dataset, test_dataset, output_dim=y_train_torch.shape[-1], 
                         loss_fn=nn.BCEWithLogitsLoss(reduction='mean'))
all_test_preds.append(test_preds)

IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [None]:
x_train = train_df['jamo']
y_train = train_df['smishing']
x_test = test_df['jamo']

In [None]:
max_features = None

In [None]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train)+list(x_test))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [None]:
train_text_list = train_df['jamo'].values
test_text_list = test_df['jamo'].values

train_month_infos = train_df['year_month'].apply(lambda x: int(x[-2:])).values
test_month_infos = test_df['year_month'].apply(lambda x: int(x[-2:])).values

In [None]:
train_add_feats_matrix = normal_feature_engineering(train_text_list, train_month_infos)
test_add_feats_matrix = normal_feature_engineering(test_text_list, test_month_infos)