In [1]:
import time
import pickle
from math import ceil

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_curve, roc_auc_score, roc_curve

random = np.random.RandomState(12)

Using TensorFlow backend.


In [2]:
SIZE = 1000000
TIME_GAP = 5
SEQUENCE = (3, 1)
VALUES = range(1, 6)
WINDOW = 10

In [3]:
def takespread(sequence, num):
    length = float(len(sequence))
    
    if num > length:
        num = int(length)
        
    return [sequence[int(ceil(i * length / num))] for i in range(num)]

In [4]:
def create_data(values, window, size, sequence, time_gap, dummy=False):
    data = random.randint(min(values), max(values), size=(size, ))
    df = pd.DataFrame({"x": data})
    
    def match(array):
        if (array == sequence).all():
            return 1
        else:
            return 0
        
    matches = df["x"].rolling(len(sequence)).apply(match)
    df["y"] = matches.shift(time_gap)
    
    if dummy:
        dummies = pd.get_dummies(df["x"])
        dummies["y"] = df["y"]
        df = dummies
    
    x_val = df.iloc[:, :-1].values
    y_val = df.iloc[:, -1].values
    
    X = []
    y = []
    
    for idx in range(df.shape[0]-window):
        X.append(x_val[idx:idx+window, :].flatten())
        y.append(y_val[idx+window-1])
        
    return train_test_split(np.array(X), np.array(y))

In [5]:
def L15_L10_L5(input_dim=1):
    model = Sequential()

    model.add(LSTM(units=15, return_sequences=True, input_shape=(None, input_dim)))
    model.add(Dropout(0.2))
    
    model.add(LSTM(units=10, return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(units=5, return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(units=1))
    model.add(Activation("sigmoid"))

    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model


def L30_L20_L10(input_dim=1):
    model = Sequential()

    model.add(LSTM(units=30, return_sequences=True, input_shape=(None, input_dim)))
    model.add(Dropout(0.2))
    
    model.add(LSTM(units=20, return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(units=10, return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(units=1))
    model.add(Activation("sigmoid"))

    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model


def L30_L15(input_dim=1):
    model = Sequential()

    model.add(LSTM(units=30, return_sequences=True, input_shape=(None, input_dim)))
    model.add(Dropout(0.2))

    model.add(LSTM(units=15, return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(units=1))
    model.add(Activation("sigmoid"))

    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model

def L10_L5(input_dim=1):
    model = Sequential()

    model.add(LSTM(units=10, return_sequences=True, input_shape=(None, input_dim)))
    model.add(Dropout(0.2))

    model.add(LSTM(units=5, return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(units=1))
    model.add(Activation("sigmoid"))

    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model

def L5(input_dim=1):
    model = Sequential()

    model.add(LSTM(units=5, return_sequences=False, input_shape=(None, input_dim)))
    model.add(Dropout(0.2))

    model.add(Dense(units=1))
    model.add(Activation("sigmoid"))

    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model

In [6]:
def fit_model(X_train, y_train, model_builder=None, model=None, epochs=1):
    if not model:
        model = model_builder()
        
    X_reshaped = np.reshape(X_train, (X_train.shape[0], WINDOW, int(X_train.shape[1] / WINDOW)))
    
    try:
        model.fit(X_reshaped, y_train, batch_size=512, epochs=epochs, validation_split=0.05)
    except KeyboardInterrupt:
        return model

    return model

In [7]:
def prediction_results(X, y, model):
    predicted = model.predict(np.reshape(X, (X.shape[0], WINDOW, int(X.shape[1] / WINDOW))))
    predicted = np.reshape(predicted, (predicted.size,))

    precision, recall, thresholds = precision_recall_curve(y, predicted)
    f1 = 2 * (precision * recall) / (precision + recall)
    
    fpr, tpr, thresholds_roc = roc_curve(y, predicted) 
    auc = roc_auc_score(y, predicted)
    
    return {"Precision": takespread(precision, 100), 
            "Recall": takespread(recall, 100), 
            "F1": takespread(f1, 100),
            "F1 Max": np.nanmax(f1),
            "PR thresholds": takespread(thresholds, 100), 
            "FPR": takespread(fpr, 100), 
            "TPR": takespread(tpr, 100), 
            "ROC thresholds": takespread(thresholds_roc, 100), 
            "AUC": auc}

In [8]:
results = []

verbosity_string = "Finished in {:.02f} seconds with Test/Train AUC {:.04f}/{:.04f} and max F1 {:.04f}"
start_string = "\nTrain {} with {} samples and sequence {} for {} epochs"
SIZES = ((10**5, "10^5"), (5*10**5, "5*10^5"), (10**6, "10^6"))

try:
    for size, size_label in SIZES:
        for sequence in ((3, 1), (4, 1, 3)):
            X_train, X_test, y_train, y_test = create_data(VALUES, WINDOW, size, sequence, TIME_GAP, dummy=True)

            for model_builder in (L5, L10_L5, L30_L15, L15_L10_L5, L30_L20_L10):
                for epochs in (1, 2, 3, 4, 5):
                    print(start_string.format(model_builder.__name__,
                                              size_label, sequence, epochs))
                    start_time = time.time()
                    
                    if epochs == 1:
                        input_dim = int(X_train.shape[1] / WINDOW)
                        model = model_builder(input_dim)
                        
                    model = fit_model(X_train, y_train, model=model, epochs=1)
                    
                    train_res = prediction_results(X_train, y_train, model)
                    test_res = prediction_results(X_test, y_test, model)
                    
                    result = {"model": model_builder.__name__,
                              "size": size_label,
                              "sequence": sequence,
                              "epochs": epochs,
                              "train_res": train_res,
                              "test_res": test_res,
                              "duration": time.time() - start_time}
                    
                    print(verbosity_string.format(result["duration"],
                                                  result["test_res"]["AUC"],
                                                  result["train_res"]["AUC"],
                                                  result["test_res"]["F1 Max"]))
                    
                    results.append(result)                 
                    
                    
except Exception as e:
    print("Error Raised- Save Results")
    print(e)
finally:
    pickle.dump(results, open('BinaryF.p', 'wb'))


Train L5 with 10^5 samples and sequence (3, 1) for 1 epochs
Train on 71242 samples, validate on 3750 samples
Epoch 1/1




Finished in 9.34 seconds with Test/Train AUC 0.4540/0.4490 and max F1 0.1171

Train L5 with 10^5 samples and sequence (3, 1) for 2 epochs
Train on 71242 samples, validate on 3750 samples
Epoch 1/1
Finished in 7.73 seconds with Test/Train AUC 0.5136/0.5168 and max F1 0.1190

Train L5 with 10^5 samples and sequence (3, 1) for 3 epochs
Train on 71242 samples, validate on 3750 samples
Epoch 1/1
Finished in 7.88 seconds with Test/Train AUC 0.5713/0.5784 and max F1 0.1386

Train L5 with 10^5 samples and sequence (3, 1) for 4 epochs
Train on 71242 samples, validate on 3750 samples
Epoch 1/1
Finished in 7.75 seconds with Test/Train AUC 0.7183/0.7250 and max F1 0.2126

Train L5 with 10^5 samples and sequence (3, 1) for 5 epochs
Train on 71242 samples, validate on 3750 samples
Epoch 1/1
Finished in 8.12 seconds with Test/Train AUC 0.7463/0.7529 and max F1 0.2339

Train L10_L5 with 10^5 samples and sequence (3, 1) for 1 epochs
Train on 71242 samples, validate on 3750 samples
Epoch 1/1
Finished in

In [3]:
results = pickle.load(open("BinaryF.p", "rb"))

In [11]:
results[0].keys()

dict_keys(['epochs', 'test_res', 'size', 'sequence', 'model', 'duration', 'train_res'])

In [18]:
def extract_scores(result):
    extracted = {}
    for kind in ("train_", "test_"):
        for score in ("AUC", "F1 Max"):
            extracted[kind + score] = result[kind + "res"][score]

    extracted.update({k:v for k, v in result.items() if not k.endswith("res")})
    return extracted                     

In [19]:
parsed = [extract_scores(x) for x in results]

In [23]:
df = pd.DataFrame(parsed)
df.head()

Unnamed: 0,duration,epochs,model,sequence,size,test_AUC,test_F1 Max,train_AUC,train_F1 Max
0,9.335835,1,L5,"(3, 1)",10^5,0.453976,0.117137,0.449026,0.115465
1,7.733341,2,L5,"(3, 1)",10^5,0.513589,0.118988,0.51681,0.118044
2,7.882221,3,L5,"(3, 1)",10^5,0.571297,0.138604,0.578429,0.135557
3,7.754082,4,L5,"(3, 1)",10^5,0.718298,0.212551,0.724985,0.21171
4,8.123513,5,L5,"(3, 1)",10^5,0.746254,0.233888,0.752928,0.231837


In [24]:
df.groupby("size").mean()

Unnamed: 0_level_0,duration,epochs,test_AUC,test_F1 Max,train_AUC,train_F1 Max
size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10^5,22.881739,3,0.648394,0.200117,0.646781,0.198101
10^6,229.281549,3,0.904657,0.783093,0.905073,0.782867
5*10^5,112.210241,3,0.816076,0.555605,0.816722,0.557018
