In [1]:
%load_ext watermark
%watermark -a Chan -d -p numpy,pandas,konlpy,torch,keras

Using TensorFlow backend.


Chan 2019-08-26 

numpy 1.16.4
pandas 0.24.2
konlpy 0.5.1
torch 1.0.1
keras 2.2.4


In [2]:
import pandas as pd
import numpy as np
import konlpy
from utils.bp_processing import bp_tokenize

# Load datasets 

In [3]:
human = pd.read_csv('../../data/train.csv', encoding='utf-16')[['comment', 'label']]
human.head()

Unnamed: 0,comment,label
0,ㅜㅜ,0
1,ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ,0
2,헐,0
3,제시,0
4,이거인 듯,0


In [4]:
human.groupby('label').count()

Unnamed: 0_level_0,comment
label,Unnamed: 1_level_1
0,39904
1,2096


In [5]:
human_X, human_y = human[['comment']], human.label

In [6]:
auto = pd.read_csv('../../data/auto_labeled.csv', encoding='utf-16', sep='\t')
auto.dropna(inplace=True)
auto.label = auto.label.astype('int')
auto.head()

Unnamed: 0,comment,label
0,ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ자낳괴,1
1,ㅇ,0
2,ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ,0
3,실토하심ㅋㅋㅋㅋㅋㅋㅋ,0
4,돌려 돌려 돌림판~,0


In [7]:
auto.groupby('label').count()

Unnamed: 0_level_0,comment
label,Unnamed: 1_level_1
0,1938856
1,61137


In [8]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from imblearn.under_sampling import RandomUnderSampler
import torch.utils.data as data_utils
import torch

In [9]:
auto_X, auto_y = RandomUnderSampler().fit_sample(auto[['comment']], auto.label)

In [10]:
auto_y.sum(), len(auto_y)

(61137, 122274)

In [28]:
new_X = np.concatenate([human_X.values, auto_X])
new_y = np.concatenate([human_y.values, auto_y])

In [29]:
new_X.shape, new_y.shape

((164274, 1), (164274,))

In [30]:
new_y.sum() / len(new_y)

0.3849239684916664

In [31]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(new_X,new_y)

In [33]:
X_train = [_[0] for _ in X_train]
X_test = [_[0] for _ in X_test]

In [34]:
# bpe tokenizing
tokenized_train = bp_tokenize(X_train)
tokenized_test = bp_tokenize(X_test)

100%|██████████████████████████████████████████████████████████████████████| 123205/123205 [00:00<00:00, 560345.11it/s]
100%|████████████████████████████████████████████████████████████████████████| 41069/41069 [00:00<00:00, 570738.97it/s]


In [35]:
tokenized_train.shape, tokenized_test.shape

((123205,), (41069,))

In [36]:
# padding
MAX_LEN = 50
x_padded_train = pad_sequences(tokenized_train, maxlen=MAX_LEN)
x_padded_test = pad_sequences(tokenized_test, maxlen=MAX_LEN)

In [37]:
x_padded_train.shape, x_padded_test.shape

((123205, 50), (41069, 50))

In [38]:
train_data = data_utils.TensorDataset(torch.from_numpy(x_padded_train).type(torch.LongTensor),torch.from_numpy(y_train).type(torch.DoubleTensor))
BATCH_SIZE = 32
train_loader = data_utils.DataLoader(train_data, batch_size=BATCH_SIZE, drop_last=True)
# return train_loader,x_test_pad,y_test

# Model

In [39]:
import torch,keras
import numpy as np
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as data_utils
 
EMB_DIM = 10
VOCAB_SIZE = 260
class StructuredSelfAttention(torch.nn.Module):
    """
    The class is an implementation of the paper A Structured Self-Attentive Sentence Embedding including regularization
    and without pruning. Slight modifications have been done for speedup
    """
   
    def __init__(self, batch_size, lstm_hid_dim, d_a, r, max_len, emb_dim=EMB_DIM, vocab_size=VOCAB_SIZE):
        """
        Initializes parameters suggested in paper
 
        Args:
            batch_size  : {int} batch_size used for training
            lstm_hid_dim: {int} hidden dimension for lstm
            d_a         : {int} hidden dimension for the dense layer
            r           : {int} attention-hops or attention heads
            max_len     : {int} number of lstm timesteps
            emb_dim     : {int} embeddings dimension
            vocab_size  : {int} size of the vocabulary
 
        Returns:
            self
 
        Raises:
            Exception
        """
        super(StructuredSelfAttention,self).__init__()
       
        self.embeddings = torch.nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = torch.nn.LSTM(emb_dim, lstm_hid_dim, 1, batch_first=True)
        self.linear_first = torch.nn.Linear(lstm_hid_dim, d_a)
        self.linear_first.bias.data.fill_(0)
        self.linear_second = torch.nn.Linear(d_a, r)
        self.linear_second.bias.data.fill_(0)
        self.n_classes = 1
        self.linear_final = torch.nn.Linear(lstm_hid_dim, self.n_classes)
        self.batch_size = batch_size       
        self.max_len = max_len
        self.lstm_hid_dim = lstm_hid_dim
        self.hidden_state = self.init_hidden()
        self.r = r
        
    def softmax(self,input, axis=1):
        """
        Softmax applied to axis=n
 
        Args:
           input: {Tensor,Variable} input on which softmax is to be applied
           axis : {int} axis on which softmax is to be applied
 
        Returns:
            softmaxed tensors
 
       
        """
 
        input_size = input.size()
        trans_input = input.transpose(axis, len(input_size)-1)
        trans_size = trans_input.size()
        input_2d = trans_input.contiguous().view(-1, trans_size[-1])
        soft_max_2d = F.softmax(input_2d)
        soft_max_nd = soft_max_2d.view(*trans_size)
        return soft_max_nd.transpose(axis, len(input_size)-1)
       
        
    def init_hidden(self):
        return (Variable(torch.zeros(1,self.batch_size,self.lstm_hid_dim)),Variable(torch.zeros(1,self.batch_size,self.lstm_hid_dim)))
       
        
    def forward(self,x):
        embeddings = self.embeddings(x)
        outputs, self.hidden_state = self.lstm(embeddings.view(self.batch_size,self.max_len,-1),self.hidden_state)       
        x = F.tanh(self.linear_first(outputs))       
        x = self.linear_second(x)       
        x = self.softmax(x,1)       
        attention = x.transpose(1,2)       
        sentence_embeddings = attention@outputs       
        avg_sentence_embeddings = torch.sum(sentence_embeddings,1)/self.r
        
        output = F.sigmoid(self.linear_final(avg_sentence_embeddings))
        return output,attention

       
    #Regularization
    def l2_matrix_norm(self,m):
        """
        Frobenius norm calculation
 
        Args:
           m: {Variable} ||AAT - I||
 
        Returns:
            regularized value
 
       
        """
        return torch.sum(torch.sum(torch.sum(m**2,1),1)**0.5).type(torch.DoubleTensor)

# Train

In [40]:
import torch
from torch.autograd import Variable
from sklearn.metrics import accuracy_score, f1_score, classification_report

def train(attention_model, train_loader, criterion, optimizer, epochs = 5, use_regularization = False , C=0, clip=False):
    """
        Training code
 
        Args:
            attention_model : {object} model
            train_loader    : {DataLoader} training data loaded into a dataloader
            optimizer       :  optimizer
            criterion       :  loss function. Must be BCELoss for binary_classification and NLLLoss for multiclass
            epochs          : {int} number of epochs
            use_regularizer : {bool} use penalization or not
            C               : {int} penalization coeff
            clip            : {bool} use gradient clipping or not
       
        Returns:
            accuracy and losses of the model
 
      
        """
    losses = []
    accuracy = []
    for i in range(epochs):
        print("Running EPOCH",i+1)
        total_loss = 0
        n_batches = 0
        correct = 0
       
        for batch_idx,train in enumerate(train_loader):
 
            attention_model.hidden_state = attention_model.init_hidden()
            x,y = Variable(train[0]),Variable(train[1])
            y_pred,att = attention_model(x)
           
            #penalization AAT - I
            if use_regularization:
                attT = att.transpose(1,2)
                identity = torch.eye(att.size(1))
                identity = Variable(identity.unsqueeze(0).expand(train_loader.batch_size,att.size(1),att.size(1)))
                penal = attention_model.l2_matrix_norm(att@attT - identity)

            #binary classification
            #Adding a very small value to prevent BCELoss from outputting NaN's
            correct+=torch.eq(torch.round(y_pred.type(torch.DoubleTensor).squeeze(1)),y).data.sum()
            if use_regularization:
                try:
                    loss = criterion(y_pred.type(torch.DoubleTensor).squeeze(1)+1e-8,y) + C * penal/train_loader.batch_size

                except RuntimeError:
                    raise Exception("BCELoss gets nan values on regularization. Either remove regularization or add very small values")
            else:
                loss = criterion(y_pred.type(torch.DoubleTensor).squeeze(1),y)
               
 
            total_loss+=loss.data
            optimizer.zero_grad()
            loss.backward()
           
            #gradient clipping
            if clip:
                torch.nn.utils.clip_grad_norm(attention_model.parameters(),0.5)
            optimizer.step()
            n_batches+=1
           
        print("avg_loss is",total_loss/n_batches)
        print("Accuracy of the model",int(correct)/(n_batches*train_loader.batch_size))
        losses.append(total_loss/n_batches)
        accuracy.append(int(correct)/(n_batches*train_loader.batch_size))
        
        
    return losses, accuracy
 
def evaluate(attention_model, x_test, y_test):
    """
        cv results
 
        Args:
            attention_model : {object} model
            x_test          : {nplist} x_test
            y_test          : {nplist} y_test
       
        Returns:
            cv-accuracy
 
      
    """
   
    attention_model.batch_size = x_test.shape[0]
    attention_model.hidden_state = attention_model.init_hidden()
    x_test_var = Variable(torch.from_numpy(x_test).type(torch.LongTensor))
    y_test_pred,_ = attention_model(x_test_var)
    
    y_preds = torch.round(y_test_pred.type(torch.DoubleTensor).squeeze(1))
    y_test_var = Variable(torch.from_numpy(y_test).type(torch.DoubleTensor))
    
    print(classification_report(y_test, y_preds.detach().numpy()))
    return int(torch.eq(y_preds,y_test_var).data.sum())/x_test_var.size(0)
 
def get_activation_wts(attention_model,x):
    """
        Get r attention heads
 
        Args:
            attention_model : {object} model
            x               : {torch.Variable} input whose weights we want
       
        Returns:
            r different attention weights
 
      
    """
    attention_model.batch_size = x.size(0)
    attention_model.hidden_state = attention_model.init_hidden()
    _,wts = attention_model(x)
    return wts

In [43]:
attention_model = StructuredSelfAttention(batch_size=train_loader.batch_size,
                                          lstm_hid_dim=10,
                                          d_a=100,
                                          r=1,
                                          vocab_size=VOCAB_SIZE, 
                                          max_len=MAX_LEN,
                                          )

In [44]:
loss = torch.nn.BCELoss()
optimizer = torch.optim.Adam(attention_model.parameters())

losses, accuracy = train(attention_model, train_loader, loss, optimizer, epochs=5,
                         use_regularization=False, C=0.03, clip=True)


Running EPOCH 1




avg_loss is tensor(0.4316, dtype=torch.float64)
Accuracy of the model 0.7960146103896104
Running EPOCH 2
avg_loss is tensor(0.2894, dtype=torch.float64)
Accuracy of the model 0.889172077922078
Running EPOCH 3
avg_loss is tensor(0.2584, dtype=torch.float64)
Accuracy of the model 0.9025649350649351
Running EPOCH 4
avg_loss is tensor(0.2403, dtype=torch.float64)
Accuracy of the model 0.9108603896103896
Running EPOCH 5
avg_loss is tensor(0.2273, dtype=torch.float64)
Accuracy of the model 0.9172077922077922


# Evaluate 

In [45]:
print(evaluate(attention_model, x_padded_test, y_test))



              precision    recall  f1-score   support

           0       0.91      0.95      0.93     25115
           1       0.92      0.85      0.88     15954

    accuracy                           0.91     41069
   macro avg       0.91      0.90      0.91     41069
weighted avg       0.91      0.91      0.91     41069

0.9131218193771458


# Visualize

In [51]:
def visualize_attention(wts, X_test, filename):
    wts_add = torch.sum(wts, 1)
    wts_add_np = wts_add.data.numpy()
    wts_add_list = wts_add_np.tolist()
    text = []
    for test in X_test:
        text.append(" ".join(test))
    createHTML(text, wts_add_list, filename)
    print("Attention visualization created for {} samples".format(len(X_test)))
    return

In [52]:
attention_model.batch_size = np.array(x_padded_test).shape[0]
attention_model.hidden_state = attention_model.init_hidden()
x_test_var = Variable(torch.from_numpy(x_padded_test).type(torch.LongTensor))
y_test_pred,_ = attention_model(x_test_var)
y_preds = torch.round(y_test_pred.type(torch.DoubleTensor).squeeze(1))



In [54]:
from utils.visualize_attention import chat_to_byteLength

In [60]:
chat_to_bytelengths = list(map(chat_to_byteLength, X_test))
chat_to_bytelength_lamb = lambda x: '_'*(50-len(x)) + x
atttxt = list(map(chat_to_bytelength_lamb, chat_to_bytelengths))

In [67]:
test_last_idx = -500
wts = get_activation_wts(attention_model,
                         Variable(torch.from_numpy(x_padded_test[y_preds.data.numpy()==1][test_last_idx:]).type(torch.LongTensor)))
# print(torch.sum(wts,1).data.numpy().tolist())
visualize_attention(wts, np.array(atttxt)[y_preds.data.numpy()==1][test_last_idx:], filename='bpe_attention.html')



Attention visualization created for 500 samples


In [50]:
#Credits to Lin Zhouhan(@hantek) for the complete visualization code
import random, os, numpy, scipy
from codecs import open
def createHTML(texts, weights, fileName):
    """
    Creates a html file with text heat.
	weights: attention weights for visualizing
	texts: text on which attention weights are to be visualized
    """
    fileName = "visualization/"+fileName
    fOut = open(fileName, "w", encoding="utf-8")
    part1 = """
    <html lang="en">
    <head>
    <meta http-equiv="content-type" content="text/html; charset=utf-8">
    <style>
    body {
    font-family: Sans-Serif;
    }
    </style>
    </head>
    <body>
    <h3>
    Heatmaps
    </h3>
    </body>
    <script>
    """
    part2 = """
    var color = "255,0,0";
    var ngram_length = 9;
    var half_ngram = 3;
    for (var k=0; k < any_text.length; k++) {
    var tokens = any_text[k].split(" ");
    var intensity = new Array(tokens.length);
    var max_intensity = Number.MIN_SAFE_INTEGER;
    var min_intensity = Number.MAX_SAFE_INTEGER;
    for (var i = 0; i < intensity.length; i++) {
    intensity[i] = 0.0;
    for (var j = -half_ngram; j < ngram_length-half_ngram; j++) {
    if (i+j < intensity.length && i+j > -1) {
    intensity[i] += trigram_weights[k][i + j];
    }
    }
    if (i == 0 || i == intensity.length-1) {
    intensity[i] /= 6.0;
    } else {
    intensity[i] /= 9.0;
    }
    if (intensity[i] > max_intensity) {
    max_intensity = intensity[i];
    }
    if (intensity[i] < min_intensity) {
    min_intensity = intensity[i];
    }
    }
    var denominator = max_intensity - min_intensity;
    for (var i = 0; i < intensity.length; i++) {
    intensity[i] = (intensity[i] - min_intensity) / denominator;
    }
    if (k%2 == 0) {
    var heat_text = "<p><br><b>Example:</b><br>";
    } else {
    var heat_text = "<b>Example:</b><br>";
    }
    var space = "";
    for (var i = 0; i < tokens.length; i++) {
    heat_text += "<span style='background-color:rgba(" + color + "," + intensity[i] + ")'>" + space + tokens[i] + "</span>";
    if (space == "") {
    space = " ";
    }
    }
    //heat_text += "<p>";
    document.body.innerHTML += heat_text;
    }
    </script>
    </html>"""
    putQuote = lambda x: "\"%s\""%x
    textsString = "var any_text = [%s];\n"%(",".join(map(putQuote, texts)))
    weightsString = "var trigram_weights = [%s];\n"%(",".join(map(str,weights)))
    fOut.write(part1)
    fOut.write(textsString)
    fOut.write(weightsString)
    fOut.write(part2)
    fOut.close()
  
    return


# Model Save

In [71]:
torch.save(attention_model, '../model/self_attention_bp_mixed.pt')

  "type " + obj.__name__ + ". It won't be checked "
