In [None]:
#Install Keras & Tensorflow
!pip install --upgrade tensorflow
!pip install Keras
!pip install keras-tqdm

### Download & unzip fasttext word embeddings

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!gzip -d cc.en.300.vec.gz

In [None]:
import pickle
import numpy as np

idx = 0
vocab = {}
with open("cc.en.300.vec", 'r', encoding="utf-8", newline='\n',errors='ignore') as f:
    for l in f:
        line = l.rstrip().split(' ')
        if idx == 0:
            vocab_size = int(line[0]) + 2
            dim = int(line[1])
            vecs = np.zeros(vocab_size*dim).reshape(vocab_size,dim)
            vocab["__PADDING__"] = 0
            vocab["__UNK__"] = 1
            idx = 2
        else:
            vocab[line[0]] = idx
            emb = np.array(line[1:]).astype(np.float)
            if (emb.shape[0] == dim):
                vecs[idx,:] = emb
                idx+=1
            else:
                continue

    pickle.dump(vocab,open("fasttext_voc",'wb'))
    np.save("fasttext.npy",vecs)



In [1]:
import pickle
import numpy as np
fasttext_embed = np.load("fasttext.npy")
fasttext_word_to_index = pickle.load(open("fasttext_voc", 'rb'))

In [2]:
def recall(y_true, y_pred):
    
    """
    Recall metric.
    Only computes a batch-wise average of recall.
    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def precision(y_true, y_pred):
    
    """
    Precision metric.
    Only computes a batch-wise average of precision.
    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    Source
    ------
    https://github.com/fchollet/keras/issues/5400#issuecomment-314747992
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def f1(y_true, y_pred):
    
    """Calculate the F1 score."""
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * ((p * r) / (p + r))


def accuracy(y_true, y_pred):
    return K.mean(K.equal(y_true, K.round(y_pred)), axis=1)

In [3]:
import pandas as pd
import numpy as np

In [5]:
data = pd.read_csv('stack-overflow-data.csv')
data['tags'].head(10)

0               c#
1          asp.net
2      objective-c
3             .net
4           python
5          asp.net
6        angularjs
7           iphone
8    ruby-on-rails
9        angularjs
Name: tags, dtype: object

In [6]:
data.tags.value_counts()

javascript       2000
sql              2000
jquery           2000
php              2000
html             2000
c                2000
iphone           2000
c#               2000
asp.net          2000
.net             2000
ios              2000
ruby-on-rails    2000
android          2000
css              2000
objective-c      2000
c++              2000
python           2000
mysql            2000
angularjs        2000
java             2000
Name: tags, dtype: int64

In [46]:
from sklearn.preprocessing import MultiLabelBinarizer 
import itertools
from sklearn.model_selection import train_test_split

full_train, test = train_test_split(data,
                                    test_size=0.3,
                                    random_state=1596,
                                    stratify=data['tags'])
train, train_dev = train_test_split(full_train,
                                    test_size=0.2,
                                    random_state=1596,
                                    stratify=full_train['tags'])
mlb = MultiLabelBinarizer()

X_train =train['post']
X_test = test['post']
X_train_dev = train_dev['post']

y_train = mlb.fit_transform(train['tags'])
y_test = mlb.transform(test['tags'])
y_train_dev = mlb.transform(train_dev['tags'])


In [47]:
y_train.shape

(22400, 25)

In [48]:
# Convert texts to sequence of indexes and PADDING

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_WORDS =20000
MAX_SEQUENCE_LENGTH = 1000
EMBEDDING_DIM = fasttext_embed.shape[1]

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='__UNK__')
tokenizer.fit_on_texts(X_train)
train_seqs = tokenizer.texts_to_sequences(X_train)
test_seqs = tokenizer.texts_to_sequences(X_test)
dev_seqs = tokenizer.texts_to_sequences(X_train_dev)
train_data = pad_sequences(train_seqs, maxlen=MAX_SEQUENCE_LENGTH,padding='post')
test_data = pad_sequences(test_seqs, maxlen=MAX_SEQUENCE_LENGTH,padding='post')
dev_data = pad_sequences(dev_seqs, maxlen=MAX_SEQUENCE_LENGTH,padding='post')

In [49]:
y_test.shape

(12000, 25)

In [13]:
print(train_data[345])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

[  114   389     7  3903   395    31  1053   395    10    24   317     3
    48   142     7   114     2     4 15410     4    14     4  1053     4
   395   137   389    30   587   389     7     2     4  3903     4   395
   200   703     2   419   389    14     2     4  1053     4   395   145
    53  1511     3    48   453   449    25     3   506     3    48   350
   220   328    10    24     4    89  3658   614   419   174    10     2
     4  1053     4   395     9     4  1053     6     1  3903     6     1
     4     9     3   221     2   247     4     9     4    18    35 14030
    15 14030     5     1 15410   183 14030     1   389   114     1 15410
 14030    49     1 15410 14030   402     4     9   162  1265     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [14]:
# Define model's embedding matrix
embedding_matrix = np.zeros((MAX_WORDS+2, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_WORDS:
            continue
    try:
        embedding_vector = fasttext_embed[fasttext_word_to_index[word],:]
        embedding_matrix[i] = embedding_vector
    except:
        pass

In [15]:
embedding_matrix.shape

(20002, 300)

In [None]:
#Create and train a BiGRU (RNN) model with an MLP on top of it 

import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
from keras.callbacks import ModelCheckpoint
from keras.layers.embeddings import Embedding
from keras_tqdm import TQDMNotebookCallback
from keras.models import Sequential
from keras.layers import Dense, Dropout, Bidirectional
from keras.layers.recurrent import GRU
from keras.optimizers import Adam
from keras import backend as K


GRU_SIZE = 100
DENSE = 200
N_CLASSES = 25

# create empty sequential model
model = Sequential()
# add an embedding layer
model.add(Embedding(MAX_WORDS+2, EMBEDDING_DIM, weights=[embedding_matrix], 
                    input_length=MAX_SEQUENCE_LENGTH,mask_zero=True, trainable=False))
# Use 0.2 dropout probabillity
model.add(Dropout(0.2))
# add a bidirectional gru layer with 0.2 variational (recurrent) dropout 
model.add(Bidirectional(GRU(GRU_SIZE, return_sequences=False, recurrent_dropout = 0.2)))
# add a hidden MLP layer
model.add(Dropout(0.2))
model.add(Dense( DENSE, activation='relu' ))
# add the output MLP layer
model.add(Dense( N_CLASSES, activation='sigmoid' ))

print(model.summary())
model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001),
                  metrics=[precision, recall, f1, accuracy])

checkpoint = ModelCheckpoint('keras_BiGRU_model', monitor='val_f1', verbose=1, save_best_only=True, mode='max')



history = model.fit(train_data, y_train,
              batch_size=32,
              epochs=5,
              verbose = 0,
              callbacks=[checkpoint,TQDMNotebookCallback()],
              validation_data=(dev_data, y_test),
              shuffle=True)

### Vizualize Model's training history

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt


# summarize history for f1
plt.plot(history.history['f1'])
plt.plot(history.history['val_f1'])
plt.title('Model f1')
plt.ylabel('f1-score')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper right')
plt.show()


### Evaluate performance on dev set

In [None]:
from sklearn import metrics

y_prob = model.predict(dev_data, batch_size=32, verbose=0)
print(metrics.classification_report(y_test, (y_prob > 0.5).astype('int32')))

### Custom keras layer for linear and deep self-attention over RNNs output states

In [22]:
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.layers.core import Layer
import numpy as np


def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)


class LinearAttention(Layer):
    def __init__(self,
                 kernel_regularizer=None, bias_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(kernel_regularizer)
        self.b_regularizer = regularizers.get(bias_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.return_attention = return_attention
        super(LinearAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight((1,),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, inputs, mask=None):
        # do not pass the mask to the next layers
        if self.return_attention:
            return [None, None]
        return None

    def call(self, x, mask=None):
        
        # eij = Wx + b
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        # Apply mask
        if mask is not None:
            eij *= K.cast(mask, K.floatx())

        # a = softmax(eij)
        a = K.expand_dims(K.softmax(eij, axis=-1))
        weighted_input = x * a
        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]


class DeepAttention(Layer):
    def __init__(self,
                 kernel_regularizer=None, u_regularizer=None, bias_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(kernel_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b1_regularizer = regularizers.get(bias_regularizer)
        self.b2_regularizer = regularizers.get(bias_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b1_constraint = constraints.get(b_constraint)
        self.b2_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.return_attention = return_attention
        super(DeepAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b1 = self.add_weight((input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b1'.format(self.name),
                                     regularizer=self.b1_regularizer,
                                     constraint=self.b1_constraint)
            self.b2 = self.add_weight((1,),
                                     initializer='zero',
                                     name='{}_b2'.format(self.name),
                                     regularizer=self.b2_regularizer,
                                     constraint=self.b2_constraint)
        else:
            self.b1 = None
            self.b2 = None

        self.u = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        self.built = True


    def compute_mask(self, inputs, mask=None):
        # do not pass the mask to the next layers
        if self.return_attention:
            return [None, None]
        return None

    def call(self, x, mask=None):
        # uit = tanh(Wx + b)
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b1

        uit = K.tanh(uit)

        # ait = softmax(Ueij)
        eij = dot_product(uit, self.u)
        if self.bias:
            eij += self.b2

        # Apply mask
        if mask is not None:
            eij *= K.cast(mask, K.floatx())

        a = K.expand_dims(K.softmax(eij, axis=-1))
        
        weighted_input = x * a
        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

### Create and train a Bi-LSTM + deep self-attention + MLP model

In [50]:
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
from keras.callbacks import ModelCheckpoint
from keras.layers.embeddings import Embedding
from keras_tqdm import TQDMNotebookCallback
from keras.models import Model
from keras.layers import Dense, Dropout, Bidirectional, Input
from keras.layers.recurrent import LSTM
from keras.optimizers import Adam
from keras import backend as K

LSTM_SIZE = 100
DENSE = 200
N_CLASSES = 25

inputs = Input((MAX_SEQUENCE_LENGTH,))
embeddings = Embedding(MAX_WORDS+2,EMBEDDING_DIM, weights=[embedding_matrix], 
                    input_length=MAX_SEQUENCE_LENGTH, mask_zero=True, trainable=False)(inputs)
drop_emb = Dropout(0.2)(embeddings)
bilstm = Bidirectional(LSTM(units=LSTM_SIZE, return_sequences=True,recurrent_dropout = 0.2))(drop_emb)
#x, attn = LinearAttention(return_attention=True)(bilstm)
x, attn = DeepAttention(return_attention=True)(bilstm)
out = Dense(units=DENSE, activation="relu")(x)
out = Dense(units=N_CLASSES, activation="sigmoid")(out)
model2 = Model(inputs, out)

print(model2.summary())
model2.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001),
                  metrics=[precision, recall, f1, accuracy])

checkpoint = ModelCheckpoint('keras_BiLSTM+attn_model', monitor='val_f1', verbose=1, save_best_only=True, mode='max')

history2 = model2.fit(train_data, y_train,
              batch_size=32,
              epochs=5,
              verbose = 0,
              callbacks=[checkpoint,TQDMNotebookCallback()],
              validation_data=(dev_data, y_train_dev),
              shuffle=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 1000)              0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 1000, 300)         6000600   
_________________________________________________________________
dropout_10 (Dropout)         (None, 1000, 300)         0         
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 1000, 200)         320800    
_________________________________________________________________
deep_attention_9 (DeepAttent [(None, 200), (None, 1000 40401     
_________________________________________________________________
dense_17 (Dense)             (None, 200)               40200     
_________________________________________________________________
dense_18 (Dense)             (None, 25)                5025      
Total para

HBox(children=(IntProgress(value=0, description='Training', max=5, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=22400, style=ProgressStyle(description_width='i…

KeyboardInterrupt: 

In [38]:
inputs.shape

TensorShape([Dimension(None), Dimension(1000)])

In [31]:
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
from keras.callbacks import ModelCheckpoint
from keras.layers.embeddings import Embedding
from keras_tqdm import TQDMNotebookCallback
from keras.models import Model
from keras.layers import Dense, Dropout, Bidirectional, Input
from keras.layers.recurrent import LSTM
from keras.optimizers import Adam
from keras import backend as K

LSTM_SIZE = 100
DENSE = 200
N_CLASSES = 25


def load_biLSTM_model(x_train, y_train, x_train_dev, y_train_dev, params):
    inputs = Input((MAX_SEQUENCE_LENGTH,))
    embeddings = Embedding(MAX_WORDS+2,EMBEDDING_DIM, weights=[embedding_matrix], 
                        input_length=MAX_SEQUENCE_LENGTH, mask_zero=True, trainable=False)(inputs)
    drop_emb = Dropout(0.2)(embeddings)
    bilstm = Bidirectional(LSTM(units=LSTM_SIZE, return_sequences=True,recurrent_dropout = 0.2))(drop_emb)
    #x, attn = LinearAttention(return_attention=True)(bilstm)
    x, attn = DeepAttention(return_attention=True)(bilstm)
    out = Dense(units=DENSE, activation=params['activation1'])(x) 
    out = Dense(units=N_CLASSES, activation=params['activation2'])(out) 
    model2 = Model(inputs, out)

    print(model2.summary())
    model2.compile(loss='binary_crossentropy',
                      optimizer=params['optimizer'], 
                      metrics=[precision, recall, f1, accuracy])

    checkpoint = ModelCheckpoint('keras_BiLSTM+attn_model', monitor='val_f1', verbose=1, save_best_only=True, mode='max')

    history2 = model2.fit(x_train, y_train,
                  batch_size=params['batch_size'],  
                  epochs=params['epochs'],   
                  verbose = 0,
                  callbacks=[checkpoint,TQDMNotebookCallback()],
                  validation_data=(x_train_dev, y_train_dev),
                  shuffle=True)
    
    return history2,model2

In [None]:
# params epoch, batch_size, activation1, activation2, loss?, optimizer
# todo ftia3e synarthsh gia to modelo
import talos
import os

DATA_DIR = '../data'
TALOS_DIR = os.path.join(DATA_DIR, 'talos_logs')
TALOS_TF_LOG_FILENAME = 'talos_tf_log'
talos_tf_log_pathname = os.path.join(TALOS_DIR, TALOS_TF_LOG_FILENAME)


p = {'activation1':['relu', 'elu'],
     'activation2':['sigmoid', 'tanh'],
     'optimizer': ['Nadam', 'Adam'],
     'batch_size': [20,30,40],
     'epochs': [3]}

talos.Scan(train_data, y_train, 
           x_val=dev_data, 
           y_val=y_train_dev, 
           model=load_biLSTM_model, 
           params=p,grid_downsample=0.1,
           print_params=True,
           seed=123,
           last_epoch_value=True,
           dataset_name=talos_tf_log_pathname)











  0%|          | 0/4 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A

{'activation1': 'relu', 'activation2': 'tanh', 'optimizer': 'Nadam', 'batch_size': 40, 'epochs': 5}
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        (None, 1000)              0         
_________________________________________________________________
embedding_12 (Embedding)     (None, 1000, 300)         6000600   
_________________________________________________________________
dropout_12 (Dropout)         (None, 1000, 300)         0         
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 1000, 200)         320800    
_________________________________________________________________
deep_attention_11 (DeepAtten [(None, 200), (None, 1000 40401     
_________________________________________________________________
dense_21 (Dense)             (None, 200)               40200     
__________________________________________

HBox(children=(IntProgress(value=0, description='Training', max=5, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=5600, style=ProgressStyle(description_width='in…


Epoch 00001: val_f1 did not improve from -inf


HBox(children=(IntProgress(value=0, description='Epoch 1', max=5600, style=ProgressStyle(description_width='in…


Epoch 00002: val_f1 did not improve from -inf


HBox(children=(IntProgress(value=0, description='Epoch 2', max=5600, style=ProgressStyle(description_width='in…

### Vizualize Model's training history

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt


# summarize history for f1
plt.plot(history2.history['f1'])
plt.plot(history2.history['val_f1'])
plt.title('Model f1')
plt.ylabel('f1-score')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history2.history['loss'])
plt.plot(history2.history['val_loss'])
plt.title('Model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper right')
plt.show()