In [1]:
%load_ext autoreload
%autoreload 2
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, AdaBoostClassifier, AdaBoostRegressor, ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.metrics import roc_curve, auc

from matplotlib import pyplot as plt
%matplotlib inline  

from data import Data

import pandas as pd

import numpy as np

In [2]:
# Tab-separated input files, one per source (News, WikiNews, Wikipedia),
# for the training split and the development split respectively.
training_data = [
    'datasets/english/News_Train.tsv',
    'datasets/english/WikiNews_Train.tsv',
    'datasets/english/Wikipedia_Train.tsv',
]
dev_data = [
    'datasets/english/News_Dev.tsv',
    'datasets/english/WikiNews_Dev.tsv',
    'datasets/english/Wikipedia_Dev.tsv',
]

# Each split gets its own Data container, filled from its list of files.
data_train = Data()
data_train.load_data(training_data)

data_dev = Data()
data_dev.load_data(dev_data)

In [3]:
import gensim

# Word vectors stored in word2vec binary format; the file name suggests
# 100-dimensional GloVe vectors. `model` is used below as the embedding lookup.
model = gensim.models.KeyedVectors.load_word2vec_format('./glove.100d.bin', binary=True)

In [19]:
from sklearn.metrics import roc_auc_score
import keras.backend as K
import tensorflow as tf

# AUC for a binary classifier
def auc(y_true, y_pred):
    """Approximate the area under the ROC curve as a Keras-compatible metric.

    The true-alert rate (``binary_PTA``) and false-alert rate (``binary_PFA``)
    are evaluated at 1000 evenly spaced thresholds in [0, 1]. The area is then
    a Riemann sum: each true-alert rate is weighted by the drop in false-alert
    rate between consecutive thresholds.

    NOTE: this name shadows ``auc`` imported from ``sklearn.metrics`` in the
    first cell; after this cell runs, ``auc`` refers to this function.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        Tensor holding the summed area estimate.
    """
    thresholds = np.linspace(0, 1, 1000)
    true_alert_rates = tf.stack(
        [binary_PTA(y_true, y_pred, t) for t in thresholds], axis=0)
    false_alert_rates = tf.stack(
        [binary_PFA(y_true, y_pred, t) for t in thresholds], axis=0)

    # Prepend 1.0 so the first bin starts from a false-alert rate of 1.
    padded_rates = tf.concat([tf.ones((1,)), false_alert_rates], axis=0)
    # Bin width = decrease in false-alert rate from one threshold to the next.
    bin_widths = -(padded_rates[1:] - padded_rates[:-1])

    weighted = true_alert_rates * bin_widths
    return K.sum(weighted, axis=0)

#-----------------------------------------------------------------------------------------------------------------------------------------------------
# PFA, prob false alert for binary classifier
def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Probability of false alert (FP / N) at a single decision threshold.

    Args:
        y_true: tensor of ground-truth labels; the arithmetic below treats
            1 as the positive class and 0 as the negative class.
        y_pred: tensor of predicted scores; values ``>= threshold`` count as
            alerts.
        threshold: decision threshold applied to ``y_pred``.

    Returns:
        Scalar tensor: false alerts divided by the number of negative labels.
        When there are no negative labels the result is 0 rather than NaN.
    """
    # Binarise the scores: 1.0 where an alert is raised, 0.0 otherwise.
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # N = total number of negative labels
    N = K.sum(1 - y_true)
    # FP = total number of false alerts, alerts from the negative class labels
    FP = K.sum(y_pred - y_pred * y_true)
    # A batch without any negative label would give 0/0 = NaN and poison the
    # averaged metric; K.epsilon() keeps the ratio finite.
    return FP / (N + K.epsilon())
#-----------------------------------------------------------------------------------------------------------------------------------------------------
# P_TA prob true alerts for binary classifier
def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Probability of true alert (TP / P) at a single decision threshold.

    Args:
        y_true: tensor of ground-truth labels; the arithmetic below treats
            1 as the positive class.
        y_pred: tensor of predicted scores; values ``>= threshold`` count as
            alerts.
        threshold: decision threshold applied to ``y_pred``.

    Returns:
        Scalar tensor: correct alerts divided by the number of positive labels.
        When there are no positive labels the result is 0 rather than NaN.
    """
    # Binarise the scores: 1.0 where an alert is raised, 0.0 otherwise.
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # P = total number of positive labels
    P = K.sum(y_true)
    # TP = total number of correct alerts, alerts from the positive class labels
    TP = K.sum(y_pred * y_true)
    # A batch without any positive label would give 0/0 = NaN and poison the
    # averaged metric; K.epsilon() keeps the ratio finite.
    return TP / (P + K.epsilon())

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.utils import get_custom_objects


def swish(x):
    """Swish activation, ``x * sigmoid(x)``.

    NOTE(review): the original notebook referenced ``swish`` without defining
    it anywhere visible; the standard definition is assumed here - confirm.
    """
    return K.sigmoid(x) * x


# Register the activation under the name 'swish' in Keras' custom-object
# registry. Without the import and the definition above, this line raised
# NameError on a fresh kernel.
get_custom_objects().update({'swish': Activation(swish)})

# Two hidden ReLU layers of 100 units over a 100-dimensional input, followed
# by a single sigmoid unit for binary classification.
model_nn = Sequential()
model_nn.add(Dense(100, input_shape=(100,), activation='relu'))
# input_shape is only needed on the first layer of a Sequential model.
model_nn.add(Dense(100, activation='relu'))
model_nn.add(Dense(1, activation='sigmoid'))
model_nn.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=[auc])

In [5]:
def get_data(data, embeddings):
    """Build one averaged embedding vector per instance.

    For each instance in ``data.instances``, the vectors of its target tokens
    (``instance.tokens[i]`` for every ``i`` in ``instance.target``) that are
    present in ``embeddings`` are averaged element-wise. If none of the target
    tokens is present, ``embeddings['unk']`` is used instead.

    Prints the number of instances and, when there is at least one, the length
    of the first averaged vector.

    Args:
        data: object exposing ``instances``, each with ``tokens`` and
            ``target`` (indices into ``tokens``).
        embeddings: mapping from token to vector that supports ``in`` and
            ``[]`` lookups and contains the key ``'unk'``.

    Returns:
        ``np.ndarray`` with one row per instance (empty when ``data`` has no
        instances).
    """
    data_embeddings = []
    for instance in data.instances:
        # Keep only the target tokens that have a known embedding.
        words = [
            embeddings[instance.tokens[i]]
            for i in instance.target
            if instance.tokens[i] in embeddings
        ]
        if not words:
            # No target token is known: fall back to the 'unk' vector.
            words.append(embeddings['unk'])
        data_embeddings.append(np.average(words, axis=0))
    print(len(data_embeddings))
    # Guard the dimensionality report: indexing [0] on an empty dataset used
    # to raise IndexError.
    if data_embeddings:
        print(len(data_embeddings[0]))
    return np.asarray(data_embeddings)

In [6]:
# Features: one averaged target-word embedding per instance, for each split.
average_embedding_train = get_data(data_train, model)
average_embedding_dev = get_data(data_dev, model)

# Labels: the first element of every instance's `label`.
y_train = np.array([inst.label[0] for inst in data_train.instances])
y_dev = np.array([inst.label[0] for inst in data_dev.instances])

27299
100
3328
100


In [18]:
# Train on the training split and evaluate on the dev split after every epoch.
model_nn.fit(
    x=average_embedding_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(average_embedding_dev, y_dev),
)

Train on 27299 samples, validate on 3328 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1f30ab8da0>

In [20]:
# Persist the trained network to an HDF5 file.
model_nn.save(filepath='./model_2relu_sigmoid.h5')