In [14]:
import os
import pandas as pd
import numpy as np
import pickle

from sklearn.utils.class_weight import compute_class_weight
from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import *

from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, so use whichever name this
# installation actually provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [2]:
# Locations of the three .npy inputs, all kept in the "gensim" directory:
# embedding weights, the X array and the Y array.
EMBEDD_FILE, X_FILE_BIN, Y_FILE_BIN = (
    os.path.join("gensim", file_name)
    for file_name in ("embedd_weights.npy", "embedded_X.npy", "encoded_Y.npy")
)

In [3]:
# Load the embedding weight matrix stored at EMBEDD_FILE; the recorded cell
# output further down reports its shape as (299305, 300).
embedd_weights = np.load(EMBEDD_FILE)

In [4]:
# Read the input array and the label array from disk, in that order.
X, Y = (np.load(array_path) for array_path in (X_FILE_BIN, Y_FILE_BIN))

In [7]:
# Sanity check: display the shapes of the three loaded arrays as one tuple.
tuple(loaded.shape for loaded in (embedd_weights, X, Y))

((299305, 300), (838804, 679), (838804,))

In [None]:
# Dimensions taken from the array *shapes*. Indexing embedd_weights[0] / [1]
# directly would return whole embedding rows instead of integer sizes.
NUM_WORDS, EMBEDD_DIM = embedd_weights.shape  # rows, columns of the embedding matrix
PADDED_LEN = X.shape[1]                       # length of the second axis of X

In [15]:
# Sequential (unshuffled) split: the first n_train rows are used for training,
# the next n_val rows for validation and everything that remains for testing.
n_total = X.shape[0]
n_train = 500_000
n_val = 200_000
n_test = n_total - n_train - n_val
assert n_test >= 0, "n_train + n_val exceeds the number of available samples"

# Fix the one-hot width from the full label array so that all three splits get
# the same number of columns even if the highest label is missing from one.
n_onehot = int(Y.max()) + 1

# First row index after the validation block.
val_end = n_train + n_val

X_train = X[:n_train]
Y_train = to_categorical(Y[:n_train], num_classes=n_onehot)

X_val = X[n_train:val_end]
Y_val = to_categorical(Y[n_train:val_end], num_classes=n_onehot)

# Slice from an explicit start index: X[-n_test:] silently returns the WHOLE
# array when n_test == 0, which would leak train/val rows into the test set.
X_test = X[val_end:]
Y_test = to_categorical(Y[val_end:], num_classes=n_onehot)

In [16]:
# Compute class weights for Keras from the training split only.
# 'balanced' yields n_samples / (n_classes * count_of_class), i.e. weights
# inversely proportional to how often each class occurs.

classes = np.unique(Y[:n_train])
n_classes = len(classes)

# Keyword arguments are required: scikit-learn >= 1.0 rejects the positional form.
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=Y[:n_train])
class_weights

array([ 0.27602776,  0.72736371, 13.23801959,  9.49559404, 30.29201502,
        1.26825655])

In [None]:
from keras import backend as K

# Custom metrics.
# Note that any metric is computed per batch during training
# (hence one needs large batches for it to make sense).
# Also, there is an implicit 0.5 threshold in the K.round calls below.

def macroPrec(y_true, y_pred):
    """Macro-averaged precision of one batch, usable as a Keras metric.

    Predictions are hardened with ``K.round`` (an implicit 0.5 cut-off),
    precision is computed per column after summing over axis 0, and the
    per-column values are averaged with equal weight.

    Args:
        y_true: ground-truth tensor; presumably one-hot with shape
            (batch, n_classes) -- confirm against the caller.
        y_pred: predicted scores with the same shape as ``y_true``.

    Returns:
        Scalar tensor holding the mean of the per-column precisions.
    """
    hits_per_class = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    predicted_per_class = K.sum(K.round(y_pred), axis=0)
    # K.epsilon() keeps the ratio finite for columns that were never predicted.
    per_class_precision = hits_per_class / (predicted_per_class + K.epsilon())
    return K.mean(per_class_precision)


def macroRecall(y_true, y_pred):
    """Macro-averaged recall of one batch, usable as a Keras metric.

    True positives are counted per column from the rounded product
    ``y_true * y_pred`` (an implicit 0.5 cut-off), divided by the number of
    actual positives in that column, and the per-column recalls are averaged
    with equal weight.

    Args:
        y_true: ground-truth tensor; presumably one-hot with shape
            (batch, n_classes) -- confirm against the caller.
        y_pred: predicted scores with the same shape as ``y_true``.

    Returns:
        Scalar tensor holding the mean of the per-column recalls.
    """
    hits_per_class = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    actual_per_class = K.sum(K.round(K.clip(y_true, 0, 1)), axis=0)
    # K.epsilon() keeps the ratio finite for columns with no positives in the batch.
    per_class_recall = hits_per_class / (actual_per_class + K.epsilon())
    return K.mean(per_class_recall)


def macroF1(y_true, y_pred):
    """Macro-averaged F1 score of one batch, usable as a Keras metric.

    Per-column precision and recall are computed from rounded (0.5 cut-off)
    values summed over axis 0, combined into a per-column F1 and then
    averaged with equal weight.

    Args:
        y_true: ground-truth tensor; presumably one-hot with shape
            (batch, n_classes) -- confirm against the caller.
        y_pred: predicted scores with the same shape as ``y_true``.

    Returns:
        Scalar tensor holding the mean of the per-column F1 scores.
    """
    # Per-column counts: correct hits, predicted positives, actual positives.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    predicted = K.sum(K.round(K.clip(y_pred, 0, 1)), axis=0)
    actual = K.sum(K.round(K.clip(y_true, 0, 1)), axis=0)

    # K.epsilon() guards every division against a zero denominator.
    per_class_precision = hits / (predicted + K.epsilon())
    per_class_recall = hits / (actual + K.epsilon())

    per_class_f1 = 2*((per_class_precision*per_class_recall)
                      / (per_class_precision+per_class_recall+K.epsilon()))
    return K.mean(per_class_f1)



# Custom loss functions.
# No clipping or rounding, as those are not differentiable.

def fuzzy_macroF1_flip(y_true, y_pred):
    """Soft macro-F1 turned into a loss: ``1 - macroF1``.

    Same construction as a macro-averaged F1, but the raw predicted scores
    are used instead of rounded ones, so no hard threshold is applied.
    Counts are summed over axis 0 and the per-column F1 values are averaged
    with equal weight.

    Args:
        y_true: ground-truth tensor; presumably one-hot with shape
            (batch, n_classes) -- confirm against the caller.
        y_pred: predicted scores with the same shape as ``y_true``.

    Returns:
        Scalar tensor equal to one minus the soft macro-F1.
    """
    # "Soft" per-column counts built directly from the predicted scores.
    soft_hits = K.sum(y_true * y_pred, axis=0)
    soft_predicted = K.sum(y_pred, axis=0)
    actual = K.sum(y_true, axis=0)

    # K.epsilon() guards every division against a zero denominator.
    soft_precision = soft_hits / (soft_predicted + K.epsilon())
    soft_recall = soft_hits / (actual + K.epsilon())

    soft_f1 = K.mean(2*((soft_precision*soft_recall)
                        / (soft_precision+soft_recall+K.epsilon())))
    return 1-soft_f1

def weighted_cat_acc(y_true, y_pred):
    """Class-weighted log-likelihood of the true labels.

    For every column, ``y_true * log(y_pred)`` is summed over axis 0, scaled
    by the module-level ``class_weights`` array and the result is averaged
    over the columns.

    Args:
        y_true: ground-truth tensor; presumably one-hot with shape
            (batch, n_classes) -- confirm against the caller.
        y_pred: predicted probabilities with the same shape as ``y_true``.

    Returns:
        Scalar tensor holding the mean of the weighted per-column sums.

    NOTE(review): the returned value is a log-likelihood (<= 0, larger is
    better). If it is handed to Keras as a *loss* to minimise, the sign looks
    inverted -- confirm against the caller.
    """
    # Keep predictions strictly inside (0, 1): log(0) is -inf and 0 * -inf is
    # NaN, which would poison the whole batch once a probability saturates.
    # Values already inside [eps, 1 - eps] pass through unchanged.
    eps = K.epsilon()
    safe_pred = K.clip(y_pred, eps, 1 - eps)

    acc = K.sum(y_true * K.log(safe_pred), axis=0)
    weighted_acc = class_weights * acc
    return K.mean(weighted_acc)