In [0]:
# Mount Google Drive (Colab only); the data and model paths used by this
# notebook are given relative to 'drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')
# Debug helper: uncomment to list the contents of the mounted Drive.
# !ls drive/My\ Drive

In [0]:
# Install the janome tokenizer and the `overrides` package, then clone and
# install ELMoForManyLangs from source.
# NOTE(review): package versions and the git revision are not pinned, so a
# rerun may pull different code than the original run.
!pip install janome
!pip install overrides
!git clone https://github.com/HIT-SCIR/ELMoForManyLangs.git
!sudo python 'ELMoForManyLangs/setup.py' install

In [0]:
# coding:utf-8
import re
import time

import h5py
import janome
import numpy as np
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from ELMoForManyLangs.elmoformanylangs import Embedder
from gensim.models import KeyedVectors
from janome.tokenizer import Tokenizer
from keras.preprocessing import text, sequence
import matplotlib.pyplot as plt
%matplotlib inline

from overrides import overrides
from pathlib import Path
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from torch.utils.data import TensorDataset, Dataset
from tqdm import tqdm
tqdm.pandas()


# Root folder (relative to the mounted Drive) holding the dataset and the pretrained word vectors.
DATA_ROOT = 'drive/My Drive/ELMo_allenlp_tutorial/'
# Directories of the two pretrained Japanese ELMo models: word-level and character+word-level.
word_model_path = "drive/My Drive/ELMo(MeCab+NEologd,大規模日本語ビジネスニュースコーパス)/単語単位埋め込みモデル"
char_model_path = "drive/My Drive/ELMo(MeCab+NEologd,大規模日本語ビジネスニュースコーパス)/文字単位・単語単位埋め込みモデル"
# NOTE(review): the name says "fasttext", but the file is loaded below with the
# word2vec binary loader — confirm which kind of model this actually is.
fasttext_embedding_path = DATA_ROOT + 'entity_vector/entity_vector.model.bin'
# Loaded eagerly when this cell runs; used later to fill the embedding matrix.
embedding_model = KeyedVectors.load_word2vec_format(fasttext_embedding_path, binary=True)
# Selects which ELMo model the classifier is built on (True -> char_model_path).
is_char = True

In [0]:
class ELMoNet(nn.Module):
    """Two-layer BiLSTM classifier over concatenated ELMo and pretrained word vectors.

    Each token is represented by its contextual ELMo vector (``ELMO_DIM`` wide)
    concatenated with a frozen pretrained embedding looked up from
    ``embedding_matrix``. The sequence is encoded by two stacked bidirectional
    LSTMs, max- and mean-pooled over time, and classified by two dense layers.

    Args:
        elmo_model: Object exposing ``sents2elmo(list_of_token_lists)`` that
            returns one ``(n_tokens, ELMO_DIM)`` array per sentence.
        embedding_matrix: 2-D array ``(vocab_size, embed_size)``; row ``i`` is
            the vector of token id ``i`` (row 0 is the padding id).
        OUTPUT_DIM: Number of target classes.
    """

    # Width of the vectors returned by the ELMo embedder.
    ELMO_DIM = 1024

    def __init__(self, elmo_model, embedding_matrix, OUTPUT_DIM):
        super(ELMoNet, self).__init__()

        embed_size = embedding_matrix.shape[1]
        # Build the frozen lookup table directly from the matrix so its size
        # always matches the weights (no dependency on a notebook-level global).
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32), freeze=True)
        self.embedding_dropout = nn.Dropout(0.1)

        self.LSTM_UNITS = 128
        # max-pool + mean-pool over a bidirectional LSTM -> 4 * LSTM_UNITS features
        self.DENSE_HIDDEN_UNITS = self.LSTM_UNITS * 4

        self.elmo_embedder = elmo_model

        self.lstm1 = nn.LSTM(self.ELMO_DIM + embed_size, self.LSTM_UNITS,
                             bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(self.LSTM_UNITS * 2, self.LSTM_UNITS,
                             bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(self.DENSE_HIDDEN_UNITS, self.DENSE_HIDDEN_UNITS)
        self.dropout = nn.Dropout(0.2)
        self.linear2 = nn.Linear(self.DENSE_HIDDEN_UNITS, OUTPUT_DIM)

    def _elmo_features(self, x):
        """Return ELMo vectors aligned position-by-position with the token ids in ``x``.

        Padding positions (id 0) stay all-zero. Vectors are written through the
        non-padding mask, so they line up with the embedding lookup whether the
        batch was padded at the front or at the back.
        """
        ids = x.detach().cpu().numpy()
        batch_size, seq_len = ids.shape
        token_mask = ids != 0
        features = np.zeros((batch_size, seq_len, self.ELMO_DIM), dtype=np.float32)

        # Only rows that contain at least one real token are sent to the
        # embedder, so it never receives an empty sentence.
        rows = [row for row in range(batch_size) if token_mask[row].any()]
        if rows:
            # Strip the zero padding and map ids back to surface tokens with the
            # notebook-level sequence_to_text helper.
            sentences = [sequence_to_text(ids[row][token_mask[row]].tolist()) for row in rows]
            vectors = self.elmo_embedder.sents2elmo(sentences)
            for row, sentence_vectors in zip(rows, vectors):
                features[row, token_mask[row]] = np.asarray(sentence_vectors, dtype=np.float32)

        # Follow the device of the input instead of assuming CUDA is available.
        return torch.from_numpy(features).to(x.device)

    def forward(self, x):
        """Compute per-class log-probabilities for a batch of padded token ids.

        Args:
            x: LongTensor ``(batch, seq_len)`` of token ids, 0 meaning padding.

        Returns:
            FloatTensor ``(batch, OUTPUT_DIM)`` of log-softmax scores.
        """
        h_embedding = self.embedding_dropout(self.embedding(x))
        h_elmo = self._elmo_features(x)

        # Concatenate the ELMo vectors and the pretrained word vectors per token.
        h_embcat = torch.cat([h_elmo, h_embedding], 2)

        h_lstm1, _ = self.lstm1(h_embcat)
        h_lstm2, _ = self.lstm2(h_lstm1)

        # global average pooling
        avg_pool = torch.mean(h_lstm2, 1)
        # global max pooling
        max_pool, _ = torch.max(h_lstm2, 1)

        h_conc = torch.cat((max_pool, avg_pool), 1)
        out = F.relu(self.linear1(h_conc))
        out = self.linear2(self.dropout(out))

        # NOTE(review): the output is log-probabilities, so it must be paired
        # with an NLL-style loss rather than a loss that expects raw logits.
        return F.log_softmax(out, dim=1)

In [0]:
class TextDataset(Dataset):
    """Index-addressable wrapper around feature rows and optional labels."""

    def __init__(self, X, y=None):
        # Keep references only; nothing is copied or converted here.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[features, label]`` when labels were given, else just the features."""
        sample = self.X[idx]
        if self.y is None:
            return sample
        return [sample, self.y[idx]]

In [0]:
def prepare_labels(y):
    """One-hot encode class labels.

    Adapted from https://www.kaggle.com/pestipeti/keras-cnn-starter

    Args:
        y: Sequence (list, array or Series) of class labels.

    Returns:
        (onehot, label_encoder): ``onehot`` is a float64 array of shape
        ``(n_samples, n_classes)`` whose columns follow
        ``label_encoder.classes_``; ``label_encoder`` is the fitted
        ``LabelEncoder`` for mapping indices back to labels.
    """
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(np.array(y))

    # Index an identity matrix with the integer codes. This yields the same
    # dense one-hot matrix as OneHotEncoder(sparse=False) without depending on
    # that keyword, which newer scikit-learn releases renamed and then removed.
    onehot_encoded = np.eye(len(label_encoder.classes_))[integer_encoded]
    return onehot_encoded, label_encoder

In [0]:
def tokenizer(text):
    """Split ``text`` into a list of surface tokens with the notebook-level janome tokenizer ``j_t``."""
    # wakati=True makes tokenize() yield plain token strings.
    return list(j_t.tokenize(text, wakati=True))
  

def clean_text(x):
    """Normalize a raw text snippet before tokenization.

    Steps, in order: cast to ``str``; drop newlines, tabs and the Japanese
    date/time unit characters; replace every ASCII punctuation character with
    a space; separate digits from a directly following Latin letter; re-attach
    English ordinal suffixes; collapse runs of spaces.

    Args:
        x: Any object; it is converted with ``str``.

    Returns:
        The cleaned string.
    """
    x = str(x)
    # Remove line breaks, tabs and the year / month / day / hour characters.
    for unwanted in ('\n', '\t', '年', '月', '日', '時'):
        x = x.replace(unwanted, '')
    # Replace every ASCII punctuation / symbol character with a space.
    x = re.sub(r'[!-\/:-@[-`{-~]', ' ', x)
    # NOTE(review): the next three substitutions can never match, because '[',
    # ']', '/' and '\' were already turned into spaces by the line above. They
    # are kept so behavior is unchanged; move them above the punctuation step
    # if the LaTeX markers are actually wanted.
    x = re.sub(r'\[math\]', ' LaTex math ', x)
    x = re.sub(r'\[\/math\]', ' LaTex math ', x)
    x = re.sub(r'\\', ' LaTex ', x)
    # Insert a space between a number and a directly following Latin letter ("10kg" -> "10 kg").
    # Raw strings avoid the invalid escape sequence "\g" in the replacement.
    x = re.sub(r'(\d+)([a-zA-Z])', r'\g<1> \g<2>', x)
    # Re-attach ordinal suffixes split by the previous rule ("3 rd " -> "3rd ").
    x = re.sub(r'(\d+) (th|st|nd|rd) ', r'\g<1>\g<2> ', x)
    # Join thousands groups ("1,000" -> "1000").
    # NOTE(review): also unreachable here — commas were replaced by spaces above.
    x = re.sub(r'(\d+),(\d+)', r'\g<1>\g<2>', x)
    # Collapse consecutive spaces into one.
    x = re.sub(' +', ' ', x)
    return x
  

def rm_puncts(text):
    """Delete every character of the punctuation/symbol set below from ``text``."""
    puncts = r',.":)・《》「」『』！(-!?|;\'$&/[]>%=#*+\\•~@£·_{}©^®`<→°€™›♥←×§″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║―¥▓—‹─▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸¾Ã⋅‘∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤ïØ¹≤‡√。【】〜'
    # A deletion table removes all listed characters in one pass.
    deletion_table = str.maketrans('', '', puncts)
    return text.translate(deletion_table)
  
  
def rm_spaces(text):
    """Replace each of the special whitespace / control characters listed below with a plain space."""
    spaces = ['\u200b', '\u200e', '\u202a', '\u2009', '\u2028', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\u3000', '\x10', '\x7f', '\x9d', '\xad',
              '\x97', '\x9c', '\x8b', '\x81', '\x80', '\x8c', '\x85', '\x92', '\x88', '\x8d', '\x80', '\x8e', '\x9a', '\x94', '\xa0',
              '\x8f', '\x82', '\x8a', '\x93', '\x90', '\x83', '\x96', '\x9b', '\x9e', '\x99', '\x87', '\x84', '\x9f',
             ]
    # Map every listed code point to ' ' and substitute in a single pass.
    to_plain_space = {ord(char): ' ' for char in spaces}
    return text.translate(to_plain_space)
  

def replace_num(text):
    """Mask runs of two or more digits with '#' placeholders.

    Runs of five or more digits become '#####'; runs of four, three and two
    digits become '####', '###' and '##'. ASCII digits are handled first, then
    full-width digits. Single digits are left as they are.
    """
    masking_rules = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
        ('[０-９]{5,}', '#####'),
        ('[０-９]{4}', '####'),
        ('[０-９]{3}', '###'),
        ('[０-９]{2}', '##'),
    )
    # Rules are applied longest-first so shorter patterns only see what is left.
    for pattern, mask in masking_rules:
        text = re.sub(pattern, mask, text)
    return text
  
  
def preprocess(df_col):
    """Run the text-cleaning pipeline over a pandas column and tokenize it.

    Each stage casts the column to ``str`` and applies one helper with a
    progress bar. The stages run in this order: punctuation removal,
    special-space normalization, text cleaning, number masking, tokenization.
    The last stage turns every entry into a list of tokens.
    """
    stages = (rm_puncts, rm_spaces, clean_text, replace_num, tokenizer)
    for stage in stages:
        df_col = df_col.astype(str).progress_apply(stage)
    return df_col

In [0]:
# Load the news corpus from the data folder.
train = pd.read_csv(Path(DATA_ROOT) / "livedoor_news_text.csv")
print(train.shape)

# janome tokenizer instance; tokenizer() reaches it through the module-level name `j_t`.
j_t = Tokenizer()

# Clean + tokenize the article text and one-hot encode the class column.
x_train = preprocess(train['news'])
y_train, le = prepare_labels(train['class'])

# Build the vocabulary from the tokenized articles and map tokens to integer ids.
tokenizer_text = text.Tokenizer()
tokenizer_text.fit_on_texts(x_train.tolist())
x_train_seq = tokenizer_text.texts_to_sequences(x_train)

# Bring every article to a fixed length of 200 ids.
x_train_padded = sequence.pad_sequences(x_train_seq, maxlen=200)

# Vocabulary size plus one, because id 0 is reserved for padding.
max_features = len(tokenizer_text.word_index) + 1
print(max_features)

In [0]:
# Build the pretrained embedding matrix: row i holds the vector of the word
# whose tokenizer id is i; words missing from the pretrained model (and the
# padding row 0) stay all-zero.
word_index = tokenizer_text.word_index
num_words = len(word_index)

# Take the width from the loaded model instead of hard-coding it.
embedding_matrix = np.zeros((num_words + 1, embedding_model.vector_size))
for word, i in tqdm(word_index.items()):
    # Membership test on the KeyedVectors object itself is a hash lookup;
    # scanning the `index2word` list is linear per word and the attribute no
    # longer exists in newer gensim releases.
    if word in embedding_model:
        embedding_matrix[i] = embedding_model[word]

print(embedding_matrix.shape)

In [0]:
# Inverse of the tokenizer vocabulary: integer id -> word
reverse_word_map = {index: word for word, index in tokenizer_text.word_index.items()}

def sequence_to_text(list_of_indices):
    """Map a sequence of token ids back to words via ``reverse_word_map``.

    Ids that are not in the map come back as ``None``.
    """
    return list(map(reverse_word_map.get, list_of_indices))

In [0]:
# Build the classifier on top of the ELMo model selected by `is_char`.
# (The original `if is_char = True:` was an assignment inside a condition and
# did not even parse.)
# NOTE(review): 9 is the number of output classes — presumably the number of
# distinct values in the label column; confirm against the label encoder.
if is_char:
    char_e = Embedder(char_model_path)
    model = ELMoNet(char_e, embedding_matrix, 9)
else:
    word_e = Embedder(word_model_path)
    model = ELMoNet(word_e, embedding_matrix, 9)

In [0]:
def train_model(index, model, train_dataset, valid_dataset, batchsize):
    """Train ``model`` with Adam and early stopping on the validation loss.

    Args:
        index: Identifier used in the checkpoint file names
            (``model{index}.pt`` and ``model{index}.bin``).
        model: Network whose forward pass returns per-class log-probabilities
            of shape ``(batch, n_classes)`` (e.g. a ``log_softmax`` output). It
            is trained on whatever device its parameters already live on.
        train_dataset: Dataset yielding ``(x, y)`` pairs where ``y`` is a
            one-hot float vector.
        valid_dataset: Dataset in the same format, used for validation.
        batchsize: Mini-batch size for both data loaders.

    Returns:
        ``(valid_preds, training_losses, valid_losses)``: the validation
        predictions of the last executed epoch as a
        ``(len(valid_dataset), n_classes)`` array, and the per-epoch mean
        training / validation losses.

    Side effects:
        Writes ``model{index}.pt`` every time the validation loss reaches a new
        minimum and ``model{index}.bin`` (weights after the final epoch) when
        training ends.
    """
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batchsize, shuffle=True, num_workers=4)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batchsize, shuffle=False, num_workers=4)

    no_of_epochs = 50

    valid_loss_min = np.inf
    patience = 4
    # number of consecutive epochs in which the validation loss did not improve
    p = 0

    # Run on the device the model already lives on instead of relying on a
    # notebook-level `device` variable.
    device = next(model.parameters()).device

    since = time.time()
    # The model emits log-probabilities, so the matching objective is the
    # negative log-likelihood over class indices. (A logits-based BCE loss
    # would squash the log-probabilities through a sigmoid a second time.)
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    training_losses = []
    valid_losses = []
    valid_preds = None
    for epoch in tqdm(range(no_of_epochs)):
        print('Epoch {}/{}'.format(epoch, no_of_epochs - 1))
        print('-' * 10)
        model.train()
        running_loss = 0.0
        tk0 = tqdm(train_loader, total=int(len(train_loader)))
        for x_batch, y_batch in tk0:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            # Forward Pass (one-hot targets -> class indices for NLLLoss)
            preds = model(x_batch)
            loss = criterion(preds, y_batch.argmax(dim=1))
            running_loss += loss.item()

            # Backward Pass and Optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        epoch_loss = running_loss / len(train_loader)
        training_losses.append(epoch_loss)
        print('Training Loss: {:.4f}'.format(epoch_loss))

        model.eval()
        valid_preds = None
        valid_loss = 0.0
        offset = 0
        tk1 = tqdm(valid_loader, total=int(len(valid_loader)))
        with torch.no_grad():
            for x_batch, y_batch in tk1:
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)

                # Forward Pass
                preds = model(x_batch)
                loss = criterion(preds, y_batch.argmax(dim=1))
                valid_loss += loss.item()

                batch_preds = preds.cpu().numpy()
                if valid_preds is None:
                    # Size the buffer from the model output rather than a
                    # hard-coded class count.
                    valid_preds = np.zeros((len(valid_dataset), batch_preds.shape[1]))
                valid_preds[offset:offset + len(batch_preds), :] = batch_preds
                offset += len(batch_preds)
        epoch_valid_loss = valid_loss / len(valid_loader)
        valid_losses.append(epoch_valid_loss)

        if epoch_valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.10f} --> {:.10f}).  Saving model ... to model{}.pt'.format(
                valid_loss_min,
                epoch_valid_loss,
                index))
            torch.save(model.state_dict(), f'model{index}.pt')
            valid_loss_min = epoch_valid_loss
            p = 0
        else:
            # validation loss did not improve in this epoch
            p += 1
            print(f'{p} epochs of increasing val loss')
            if p > patience:
                print('Stopping training')
                break

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    # NOTE: this stores the weights of the last epoch; the best checkpoint by
    # validation loss is the model{index}.pt file written above.
    print(f'save model => model{index}.bin')
    torch.save(model.state_dict(), f'model{index}.bin')
    return valid_preds, training_losses, valid_losses

In [0]:
def get_score(y_true, y_pred):
    """Print the accuracy and the support-weighted F1 score of ``y_pred`` against ``y_true``.

    Nothing is returned; both metrics are written to stdout.
    """
    accuracy = accuracy_score(y_true, y_pred)
    print("acc : {}".format(accuracy))
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    print("f1-score: {}".format(weighted_f1))


def run_model(index, model, train_index, val_index):
    """Train ``model`` on one train/validation split of the notebook-level tensors.

    The full dataset is built from ``x_train_torch`` / ``y_train_torch`` and
    sliced by the given index lists. Training uses a batch size of 64, and the
    result of ``train_model`` is passed through unchanged.
    """
    dataset = TextDataset(X=x_train_torch, y=y_train_torch)
    train_part, valid_part = (
        torch.utils.data.Subset(dataset, indices) for indices in (train_index, val_index)
    )
    return train_model(index, model, train_part, valid_part, 64)

In [0]:
# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Padded token ids and one-hot labels as tensors for the dataset wrapper.
x_train_torch = torch.tensor(x_train_padded, dtype=torch.long)
y_train_torch = torch.tensor(y_train, dtype=torch.float32)

# Stratified 80/20 hold-out split over the row labels of the training frame.
train_index, val_index = train_test_split(
    list(x_train.index),
    test_size=0.2,
    shuffle=True,
    random_state=1129,
    stratify=y_train,
)
# One-hot labels of the validation rows, kept for scoring afterwards.
y_val = y_train[val_index]

model.to(device)
fold = 0
print(f"fold{fold} start")
valid_preds, training_losses, valid_losses = run_model(fold, model, train_index, val_index)

In [0]:
# get_score expects (y_true, y_pred). The weighted F1 weights classes by their
# support in y_true, so the ground truth has to come first. get_score prints
# its results itself, so it is not wrapped in print() (that only added "None").
get_score(np.argmax(y_val, 1), np.argmax(valid_preds, 1))

# Loss curves per epoch.
fig, ax = plt.subplots()
ax.plot(training_losses, label='train')
ax.plot(valid_losses, label='valid')
ax.set(xlabel='epoch', ylabel='loss', title='Training vs. validation loss')
ax.legend()
plt.show()