In [1]:
import os
import pandas as pd
import numpy as np
import pickle

from sklearn.utils.class_weight import compute_class_weight
from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import *

from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt
plt.style.use('seaborn')

Using TensorFlow backend.


In [2]:
EMBEDD_FILE = os.path.join("gensim", "embedd_weights.npy")
X_FILE_BIN =  os.path.join("gensim", "embedded_X.npy")
Y_FILE_BIN =  os.path.join("gensim", "encoded_Y.npy")

In [3]:
embedd_weights = np.matrix(np.load(EMBEDD_FILE))

In [4]:
embedd_weights.dtype

dtype('float32')

In [5]:
X = np.load(X_FILE_BIN)
Y = np.load(Y_FILE_BIN)

In [6]:
X.dtype

dtype('int32')

In [7]:
embedd_weights.shape, X.shape, Y.shape

((59717, 300), (838804, 679), (838804,))

In [8]:
NUM_WORDS = embedd_weights.shape[0]
EMBEDD_DIM = embedd_weights.shape[1]
PADDED_LEN = X.shape[1]

In [9]:
n_total = X.shape[0]
n_skip = 450_000 # 0
n_train = 500_000
n_val = 10_000 # 200_000
n_test = n_total - n_train - n_val

X_train = X[n_skip:n_train]
Y_train = to_categorical(Y[n_skip:n_train])

X_val = X[n_train:n_train+n_val]
Y_val = to_categorical(Y[n_train:n_train+n_val])

X_test = X[-n_test:]
Y_test = to_categorical(Y[-n_test:])

In [10]:
# compute class weights for keras (obtained from the whole of train sample)

classes = np.unique(Y[:n_train])
n_classes = len(classes)

class_weights = compute_class_weight('balanced', classes, Y[:n_train])  # ~ 1 / np.unique(Y, return_counts=True)[1]
class_weights

array([ 0.27602776,  0.72736371, 13.23801959,  9.49559404, 30.29201502,
        1.26825655])

In [11]:
from keras import backend as K

# Custom metrics.
# Note that any metric is computed per batch during training
# (hence one needs large batches for it to make sense).
# Also, there is the implicit 0.5 treshold in the K.round function below

def macroPrec(y_true, y_pred):
    
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    positives = K.sum(K.round(y_pred), axis=0)
    precision = true_positives / (positives + K.epsilon())

    macroPrec = K.mean( precision )
    
    return macroPrec


def macroRecall(y_true, y_pred):
    
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    actual_ones = K.sum(K.round(K.clip(y_true, 0, 1)), axis=0)
    recall = true_positives / (actual_ones + K.epsilon())

    macroRecall = K.mean( recall )
    
    return macroRecall


def macroF1(y_true, y_pred):
    
    def recall(y_true, y_pred):

        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
        actual_ones = K.sum(K.round(K.clip(y_true, 0, 1)), axis=0)
        recall = true_positives / (actual_ones + K.epsilon())
        return recall

    
    def precision(y_true, y_pred):
        
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
        positives = K.sum(K.round(K.clip(y_pred, 0, 1)), axis=0)
        precision = true_positives / (positives + K.epsilon())
        return precision
    
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    macroF1 = K.mean( 2*((precision*recall)/(precision+recall+K.epsilon())) )
    
    return macroF1



# Custom loss functions
# No clippping or rounding as those are not differentiable
# These functions are already taking into account class imbalances, so remeber
# not to set the class_weight argument in fitting method when using them as the loss 

def fuzzy_macroF1_flip(y_true, y_pred):
    
    def recall(y_true, y_pred):

        true_positives = K.sum(y_true * y_pred, axis=0)
        actual_ones = K.sum(y_true, axis=0)
        recall = true_positives / (actual_ones + K.epsilon())
        return recall

    
    def precision(y_true, y_pred):
        
        true_positives = K.sum(y_true * y_pred, axis=0)
        positives = K.sum(y_pred, axis=0)
        precision = true_positives / (positives + K.epsilon())
        return precision
    
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    macroF1 = K.mean( 2*((precision*recall)/(precision+recall+K.epsilon())) ) 
    
    return 1-macroF1

def w_cat_crossE(y_true, y_pred):
    
    class_entropy = K.sum(y_true * K.log(y_pred+K.epsilon()), axis=0)
    weighted_entropy = class_weights * class_entropy
    return -K.mean(weighted_entropy)/100

In [19]:
# some other global learning settings

batch_size = 2000
patience = 4
optimizer = 'adam' # 'rmsprop'
my_loss = fuzzy_macroF1_flip
my_metrics = ['categorical_accuracy', macroF1, w_cat_crossE] #, macroPrec, macroRecall]

In [20]:
# auxiliary functions
    
# to plot the learning history,
# i.e. loss and metrics on each train_batch and validation_batch
def plot_history(history):
    
    plt.figure(figsize=(18, 5))
    
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(loss) + 1)
    
    plt.subplot(1, 3, 1)    
    plt.plot(x, loss, 'ob', label='Training loss')
    plt.plot(x, val_loss, '-b', label='Validation loss')
    plt.title('Training and val losses') 
    plt.legend()
    
    all_metrics = np.array(list(set(history.history.keys()) - set(['loss', 'val_loss', 'lr'])))
    metrics = all_metrics[[not metric.startswith('val_') for metric in all_metrics]]
    plt.subplot(1, 3, 2)
    markers = ['b', 'r', 'g', 'y', 'c', 'm', 'k']
    marker = zip(markers, markers)
    for metric in metrics:
        results = history.history[metric]
        val_results = history.history['val_'+metric]
        mark1, mark2 = next(marker)
        plt.plot(x, results, 'o'+mark1, label=metric)
        plt.plot(x, val_results, '-'+mark2, label='Validation '+metric)
        plt.title('Training and val metrics')
    plt.legend()
    
    plt.subplot(1, 3, 3)
    lr = history.history['lr'] # learning rate
    plt.plot(x, lr, 'k', label='Learning rate')
    plt.title('Learning rate')  
    plt.legend()
    
    
# choose callback functions to be given in model.fit(...)
def callbacks(LRfactor=0.8):
    return [ReduceLROnPlateau(factor=LRfactor, patience = patience//2),
            EarlyStopping(monitor='val_macroF1', mode='max', patience=patience)
           ]

---

In [45]:
model = Sequential()
model.add(Embedding(input_dim=NUM_WORDS, output_dim=EMBEDD_DIM, input_length=PADDED_LEN))
model.add(Conv1D(48, 3))
model.add(MaxPooling1D(8, 4))
model.add(Conv1D(12, 4))
model.add(Flatten())
model.add(Dense(n_classes, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 679, 300)          17915100  
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 677, 48)           43248     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 168, 48)           0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 165, 12)           2316      
_________________________________________________________________
flatten_6 (Flatten)          (None, 1980)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 11886     
Total params: 17,972,550
Trainable params: 17,972,550
Non-trainable params: 0
________________________________________________________________

In [49]:
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=my_metrics)

In [50]:
history = model.fit(X_train,Y_train,
                    class_weight=class_weights,
                    epochs=30,
                    batch_size=batch_size,
                    validation_data=(X_val, Y_val),
                    callbacks=callbacks())

Train on 50000 samples, validate on 50000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30


---

In [24]:
model2 = Sequential()
model2.add(Embedding(input_dim=NUM_WORDS, output_dim=EMBEDD_DIM, input_length=PADDED_LEN,\
                     weights=[embedd_weights], trainable=False))
model2.add(Conv1D(48, 3))
model2.add(MaxPooling1D(8, 4))
model2.add(Conv1D(12, 4))
model2.add(Flatten())
model2.add(Dense(n_classes, activation='softmax'))
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 679, 300)          17915100  
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 677, 48)           43248     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 168, 48)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 165, 12)           2316      
_________________________________________________________________
flatten_4 (Flatten)          (None, 1980)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 11886     
Total params: 17,972,550
Trainable params: 57,450
Non-trainable params: 17,915,100
___________________________________________________________

In [25]:
model2.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=my_metrics)

In [26]:
model2.fit(X_train, Y_train,\
           class_weight=class_weights,
           epochs=30,
           batch_size=batch_size,
           validation_data=(X_val, Y_val),
           callbacks=callbacks())

Train on 50000 samples, validate on 10000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30


<keras.callbacks.History at 0x21285e0ff98>

---

In [27]:
model3 = Sequential()
model3.add(Embedding(input_dim=NUM_WORDS, output_dim=EMBEDD_DIM, input_length=PADDED_LEN,\
                     weights=[embedd_weights], trainable=True))
model3.add(Conv1D(48, 3))
model3.add(MaxPooling1D(8, 4))
model3.add(Conv1D(12, 4))
model3.add(Flatten())
model3.add(Dense(n_classes, activation='softmax'))
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 679, 300)          17915100  
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 677, 48)           43248     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 168, 48)           0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 165, 12)           2316      
_________________________________________________________________
flatten_5 (Flatten)          (None, 1980)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 6)                 11886     
Total params: 17,972,550
Trainable params: 17,972,550
Non-trainable params: 0
________________________________________________________________

In [28]:
model3.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=my_metrics)

In [29]:
model3.fit(X_train, Y_train,\
           class_weight=class_weights,
           epochs=30,
           batch_size=batch_size,
           validation_data=(X_val, Y_val),
           callbacks=callbacks())

Train on 50000 samples, validate on 10000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30

KeyboardInterrupt: 

---

In [26]:
# the workhorse

class BlackBox():
    
    def __init__(self, layers, loss, metrics, class_weight=None, callbacks=callbacks(), embedd_weights = None, embedd_dims=[None]*3):
        
        self.layers = layers
        self.loss = loss
        self.metrics = metrics
        self.class_weight = class_weight
        self.callbacks = callbacks
        self.history = None
        self.results = None
        
        self.model = Sequential()
        if embedd_weights.any():
            self.input_dim = embedd_dims[0] # num_words
            self.output_dim = embedd_dims[1] # dimension of the embedding space
            self.input_length = embedd_dims[2] # padded_length
            self.embedd_weights = embedd_weights
            
            self.model.add(Embedding(input_dim=self.input_dim,\
                                     output_dim=self.output_dim,\
                                     input_length=self.input_length,\
                                     weights=self.embedd_weights,\
                                     trainable=False))
            self.model.layers[0].trainable = False  # szczeżonego panbug szczeże
        for layer in layers:
            self.model.add(layer)
            
        self.model.add(Dense(n_classes, activation='softmax'))
        self.model.compile(loss=self.loss, optimizer=optimizer, metrics=self.metrics)
    
    def summary(self):
        return self.model.summary()
    
    def fit(self, epochs=200):
        self.history = self.model.fit(
            X_train, Y_train,
            class_weight=self.class_weight,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val, Y_val),
            callbacks=self.callbacks,
            verbose = 1        
            )
        
    def reset(self):
        
        self.history = None
        self.results = None
        self.model.compile(loss=self.loss, optimizer=optimizer, metrics=self.metrics)
        
    def evaluate(self):
        print(f'Loss function: {self.loss.__name__ if callable(self.loss) else self.loss}. Metrics: {[metric.__name__ if callable(metric) else metric for metric in self.metrics]}')
        print("Predicting for X_test, comparing with y_test")
        return self.results
    
    def plot(self):
        plot_history(self.history)
        
    def Ksave(self, file):
        self.model.save(file)
        
    def save(self, file):
        pickle.dump(self, file)
 


In [31]:
# training an embedding, test run

layers1 = [GlobalAveragePooling1D()]

model1 = BlackBox(layers=layers1, loss=my_loss, metrics=my_metrics,\
                  embedd_weights=embedd_weights, embedd_dims=[NUM_WORDS, EMBEDD_DIM, PADDED_LEN])
model1.summary()

ValueError: Argument must be a dense tensor: (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([-1.60898035e-03, -1.52038794e-03, -1.10933091e-03, -7.67398276e-04,
       -4.76854068e-04, -1.32066687e-03, -3.64607768e-05,  5.24082046e-04,
       -1.09348889e-03, -9.52688977e-04,  1.58644223e-03, -1.47640193e-03,
        1.24946039e-03, -1.15392741e-03, -5.77126455e-04,  1.30668608e-03,
        3.98015429e-04, -5.98374987e-04,  1.15968799e-03,  8.16885731e-04,
        2.60857923e-04, -1.12693349e-03, -9.21725314e-06, -1.60185300e-04,
       -1.37256618e-04,  1.12894434e-03, -1.31035340e-03,  1.24460098e-03,
        1.53846527e-03,  6.33214833e-04, -2.74155667e-04,  5.95811696e-04,
        8.74387100e-04,  1.27034681e-03, -7.89425219e-04,  3.71693575e-04,
        1.53230067e-05,  1.63376739e-03,  5.56558662e-04,  1.04503555e-03,
        1.44374999e-03, -3.97537282e-04,  5.85352245e-04, -1.07527315e-03,
        8.13228311e-04,  3.20632316e-05, -5.96775615e-04,  2.12454688e-04,
        6.15049794e-04, -1.23639533e-03,  3.54901167e-05, -1.58044032e-03,
       -2.35013489e-04,  7.77747191e-04, -7.51162763e-04,  9.12950782e-04,
       -9.24831664e-04,  7.94099877e-04, -3.39813036e-04, -1.55711500e-03,
       -1.25299237e-04,  1.31499697e-03, -1.07990555e-03, -1.57173641e-03,
       -9.95508977e-04, -1.31331116e-03, -9.99436830e-04,  1.32224686e-03,
       -1.19107752e-03,  1.31757720e-03,  1.52445887e-03, -5.07724646e-04,
        1.08146470e-03,  1.12869765e-03,  4.67235717e-04, -5.31636528e-04,
       -2.43571150e-04, -5.67581214e-04, -1.61261210e-04,  1.60701585e-03,
       -1.17927731e-03, -1.29447016e-03,  1.45637791e-03,  9.64233986e-05,
       -5.74845471e-04,  1.23399356e-03, -9.68343054e-04, -9.39524500e-04,
       -7.02000223e-04,  5.65679395e-04,  7.15141243e-04,  4.03189733e-05,
       -1.60276250e-03,  1.11598161e-03, -1.03874772e-03, -6.64652034e-04,
        1.46818999e-03,  1.42809120e-03, -5.59733831e-04, -3.90343834e-04,
       -7.05113634e-04, -7.53021232e-05,  2.74322720e-05, -4.92932333e-04,
       -9.55337717e-04,  6.34255062e-04,  1.29280938e-03, -4.85283497e-04,
       -1.32880232e-03, -1.41044951e-03, -8.20751186e-04,  4.61015634e-06,
       -2.71514873e-04, -1.10504997e-03, -9.55768483e-05,  1.12787506e-03,
        1.45905965e-03,  1.58085104e-03, -1.59685698e-03,  2.18621295e-04,
        5.09664475e-04, -1.55338945e-04,  1.23669463e-03,  1.96945868e-04,
        1.41947967e-04,  1.48042850e-03,  1.10943872e-03, -1.14344317e-03,
        1.65881950e-03,  1.45826279e-03, -9.74185823e-04,  3.52423551e-04,
       -1.22553622e-03,  1.47468600e-04,  1.35395167e-04,  1.41580682e-03,
        9.10676317e-04, -1.42582797e-03, -2.87161209e-04,  1.38473010e-03,
        3.19217012e-04, -4.35483817e-04,  5.82421431e-04, -1.33084517e-03,
        9.93358481e-05,  1.03049830e-03, -9.15225828e-04,  1.50536152e-03,
        1.28896895e-03,  1.61419809e-03,  2.01449468e-04,  1.00132369e-03,
       -1.56998483e-03, -1.34477916e-03,  1.60844123e-04, -1.41050108e-03,
        3.68407287e-04,  1.24141679e-03,  3.35246179e-04, -1.39729716e-04,
        1.37440732e-03,  1.54752249e-03,  1.63330382e-03, -4.19510994e-04,
        5.78496023e-04,  8.90669704e-04,  1.51883753e-03,  1.25596579e-03,
        1.13689899e-03, -5.06002805e-04,  7.15256494e-04, -1.56985957e-03,
       -7.52614360e-05,  1.91148574e-04, -1.02225202e-03, -8.63118330e-04,
       -1.58842769e-03,  9.16173856e-04,  1.45495043e-03,  1.33247185e-03,
        5.65683295e-04,  5.17761400e-05,  9.08959075e-04,  3.02332483e-04,
        4.46042046e-04,  1.40330428e-03,  1.17137958e-03, -1.14644936e-04,
        1.33080955e-03, -1.63942843e-03,  6.22208827e-05,  5.81298431e-04,
        1.05937535e-03,  1.39693893e-03, -1.09336921e-03, -1.07880204e-03,
        9.15478508e-04, -6.62800274e-04, -5.66827250e-04, -1.41169119e-03,
       -1.34438265e-03, -1.08651689e-03,  3.55042575e-04, -1.54243060e-03,
       -1.80042058e-04, -1.40939653e-03, -1.25757302e-03, -4.48599749e-04,
       -1.22532609e-03, -1.02149497e-03, -4.14674403e-04,  9.73140763e-04,
        1.08551187e-03,  7.12205889e-04, -1.13845104e-03,  4.30394313e-04,
        9.50701593e-04,  1.87309139e-04, -1.30101480e-03, -3.38170968e-04,
       -2.68863951e-04, -8.46220530e-04, -8.21234207e-05, -7.17866467e-04,
       -3.67205444e-04, -8.77176935e-04, -6.89649547e-04, -5.99470513e-04,
        1.36283552e-03, -5.49837714e-04,  3.01873515e-04,  4.51580767e-04,
       -3.82866536e-04, -7.43020337e-07,  1.23502966e-03, -1.44779088e-03,
       -6.00819243e-04, -1.02760526e-03,  9.09426366e-04,  1.52316946e-03,
        8.94674566e-04, -3.95224284e-04,  1.29098748e-03, -9.82875586e-04,
       -9.83973150e-04, -7.16527691e-04,  1.11102662e-03,  1.64950092e-03,
        1.60315062e-03,  1.17079879e-03,  1.09600471e-04,  3.74121708e-04,
       -1.38131436e-03, -2.97997729e-04,  7.60201656e-04, -9.09025621e-05,
       -1.37156830e-03,  8.39024258e-04,  1.26102613e-03,  2.54095154e-04,
       -9.35553922e-04,  2.74013903e-04,  2.38458888e-04, -1.39605277e-03,
        1.41464977e-03, -1.45770190e-03, -5.67707000e-04,  4.64327459e-04,
        1.45531620e-03,  3.89221183e-04, -1.22344226e-03,  1.62306242e-03,
        1.16158847e-03,  4.26869810e-04,  7.20058415e-06, -1.42903591e-03,
        6.68151130e-04, -1.43818895e-03, -9.88423009e-04, -1.17211835e-03,
       -7.01633689e-04, -1.00185243e-04, -4.02247941e-04,  9.33111587e-04,
       -4.34284557e-05,  8.71165539e-05, -1.57590001e-03,  1.97194560e-04,
        9.03506880e-04,  1.19144947e-03,  7.62167387e-04,  6.39694335e-05,
       -6.20790932e-04,  8.66841365e-05,  6.34230848e-04,  2.49058765e-04,
       -2.09642138e-04, -1.16257218e-03, -2.82113848e-04,  9.52833099e-04])) - got shape [2, 300], but wanted [2].