In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


from tensorflow.keras.layers import Input, Dense, concatenate, LSTM, Bidirectional, Dropout, GRU
from tensorflow.keras.models import Model
import tensorflow as tf
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# at start-up. Iterating over the device list (rather than indexing [0]) keeps
# the notebook runnable on CPU-only machines, where the list is empty, and
# applies the same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)


# Modeling using both types of data
As said before, the data presented have a static dimension as well as a continuous/dynamic dimension. To use all of the information we need both a recurrent network and a deep (fully connected) network. After each type of data goes through a couple of layers of its own, the two outputs are concatenated and passed through a dense layer with a sigmoid function. Backpropagation adjusts the weights of both networks jointly to minimise the loss.

In [2]:
# Load the pre-processed static and dynamic tables; the first CSV column of
# each file is used as the DataFrame index.
df_static, df_dynamic = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../treated_data/df_static.csv', '../treated_data/df_dynamic.csv')
)

  mask |= (ar1 == a)


# Train test split
The split is done on the SK_ID_CURR index only; the resulting IDs are later used with `.loc[]` to select the matching rows of the other features.

In [3]:
# Split the row labels (not the feature values) together with the target.
# shuffle=False keeps the rows in file order: the first 67% go to train and
# the remaining 33% to test.
X_train, X_test, y_train, y_test = train_test_split(
    df_static.index,
    df_static['TARGET'],
    test_size=0.33,
    random_state=42,
    shuffle=False,
)

In [4]:
# Impute missing static values with the per-column means of the TRAINING rows.
# The means are captured while the data is still a DataFrame: once
# X_static_train has been converted to a NumPy array, `.mean()` collapses to a
# single scalar over all features, which would fill every gap in the test set
# with the same number.
X_static_train_frame = df_static.loc[X_train]
static_fill_values = X_static_train_frame.mean()

# [:, 1:] drops the first column (presumably TARGET -- TODO confirm against
# the column order of df_static.csv).
X_static_train = X_static_train_frame.fillna(static_fill_values).values[:, 1:]

X_static_test = df_static.loc[X_test]
X_static_test = X_static_test.fillna(static_fill_values).values[:, 1:]

Dynamic data needs to be pivoted on SK_ID_CURR so that each observation (one row per SK_ID_CURR) holds 97 months for each of the 57 features.

In [5]:
# Long -> wide: one row per SK_ID_CURR, one column per (feature, MONTHS_BALANCE).
dynamic_train_long = df_dynamic.loc[X_train].reset_index()

# Per-column means of the TRAINING rows, kept as a Series so they can be reused
# for the test rows. (After the pivot X_dynamic_train is a NumPy array whose
# `.mean()` is one scalar, so it cannot be used for column-wise imputation.)
dynamic_fill_values = dynamic_train_long.mean()

# Every column except the identifiers / time key is a value to pivot.
# Sorted so the list is built deterministically.
dynamic_value_columns = sorted(
    set(dynamic_train_long.columns) - set(['SK_ID_PREV', 'MONTHS_BALANCE', 'SK_ID_CURR'])
)

dynamic_train_wide = dynamic_train_long.fillna(dynamic_fill_values).pivot_table(
    values=dynamic_value_columns,
    index='SK_ID_CURR',
    columns='MONTHS_BALANCE',
    aggfunc='sum',
    fill_value=0,
)
# pivot_table orders its rows by SK_ID_CURR; put them back in the order of the
# split so row k lines up with row k of X_static_train and y_train.
dynamic_train_wide = dynamic_train_wide.reindex(X_train, fill_value=0)
X_dynamic_train = dynamic_train_wide.values


dynamic_test_long = df_dynamic.loc[X_test].reset_index()
dynamic_test_wide = dynamic_test_long.fillna(dynamic_fill_values).pivot_table(
    values=dynamic_value_columns,
    index='SK_ID_CURR',
    columns='MONTHS_BALANCE',
    aggfunc='sum',
    fill_value=0,
)
# Same row alignment as above, and force the test matrix onto the training
# (feature, month) columns so both matrices have identical width and layout
# even if some month never occurs in the test rows.
dynamic_test_wide = dynamic_test_wide.reindex(
    index=X_test, columns=dynamic_train_wide.columns, fill_value=0
)
X_dynamic_test = dynamic_test_wide.values



## Reshape
Dynamic data has to be reshaped into a 3-D array: for each SK_ID_CURR, months run along one axis and features along the other, giving the shape (samples, months, features).

In [6]:
N_MONTHS = 97     # time steps per SK_ID_CURR in the pivoted matrix
N_FEATURES = 57   # dynamic features per time step


def pivot_to_sequences(flat, n_months=N_MONTHS, n_features=N_FEATURES):
    """Turn a pivoted 2-D matrix into a (samples, months, features) float array.

    The pivoted matrix is feature-major: column ``i * n_months + o`` holds
    feature ``i`` at month position ``o``. The result satisfies
    ``out[l, o, i] == flat[l, i * n_months + o]``.

    Parameters
    ----------
    flat : array-like of shape (n_samples, >= n_features * n_months)
        Pivoted matrix; columns beyond ``n_features * n_months`` are ignored.
    n_months, n_features : int
        Number of time steps and of features encoded in the columns.

    Returns
    -------
    numpy.ndarray
        New, writable float64 array of shape (n_samples, n_months, n_features).
    """
    flat = np.asarray(flat)
    n_samples = flat.shape[0]
    used = flat[:, :n_features * n_months]
    # (samples, features, months) -> (samples, months, features); astype makes
    # an independent C-ordered copy so later in-place scaling is safe.
    return (
        used.reshape(n_samples, n_features, n_months)
        .transpose(0, 2, 1)
        .astype(float, order='C')
    )


# The row count is taken from the data instead of being hard-coded, so the cell
# keeps working when the split size or the input files change.
X_dynamic_train_reshaped = pivot_to_sequences(X_dynamic_train)
X_dynamic_test_reshaped = pivot_to_sequences(X_dynamic_test)

# Scaling
Dynamic data needs special treatment: each feature has to be scaled across every time step and for every SK_ID_CURR.

In [7]:
# Standardise the static features. The scaler learns mean/variance from the
# training rows only and the same statistics are then applied to both splits.
scaler = StandardScaler().fit(X_static_train)
X_static_train = scaler.transform(X_static_train)
X_static_test = scaler.transform(X_static_test)

In [8]:
# One MinMaxScaler per dynamic feature, fitted on the training data only.
# Each scaler sees the (samples, months) slice of its feature, so min/max are
# learned per month column of that feature.
# NOTE: this cell rescales the arrays in place -- re-run the reshape cell
# before running it a second time.
scalers = {}
for i in range(X_dynamic_train_reshaped.shape[2]):
    scalers[i] = MinMaxScaler()
    X_dynamic_train_reshaped[:, :, i] = scalers[i].fit_transform(X_dynamic_train_reshaped[:, :, i])

# Apply each fitted scaler to the SAME feature slice of the test data.
# (Writing to a fixed slice such as [:, :, 1] would overwrite feature 1 on
# every iteration and leave all other test features unscaled.)
for i in range(X_dynamic_test_reshaped.shape[2]):
    X_dynamic_test_reshaped[:, :, i] = scalers[i].transform(X_dynamic_test_reshaped[:, :, i])

# Model
The model has two inputs: `dy`, with shape (time steps, number of features), and `st`, with shape (number of features).
`dy` goes through a GRU layer that outputs 84 units from its last time step, followed by dropout of 0.6 and a dense layer that outputs 46 units.
`st` goes through a dense ReLU layer that outputs 100 units, dropout of 0.6, and another dense ReLU layer that outputs 50 units.
The outputs of the two branches are concatenated into a vector of 96 values.
A dense ReLU layer with 24 units and dropout of 0.8 is applied before the final sigmoid output layer.

In [9]:
# Sanity check: (n_samples, n_static_features). The second value has to match
# the size of the static Input layer of the model defined below.
X_static_train.shape

(33265, 186)

In [17]:
# Two-branch network: a recurrent branch for the monthly (dynamic) data and a
# dense branch for the static data, merged before the sigmoid output.

# Inputs. `shape` must be a tuple: `(186)` is just the integer 186, which is
# only tolerated by some Keras versions, so the one-element tuple `(186,)` is
# used. 36 time steps x 57 features for the dynamic branch, 186 static features.
x_in_dy = Input(shape=(36, 57))
x_in_st = Input(shape=(186,))

# Dynamic branch: the GRU returns only its final state (return_sequences=False),
# which is regularised with dropout and projected to 46 units (linear Dense).
x_dy = GRU(84, return_sequences=False)(x_in_dy)
x_dy = Dropout(0.6)(x_dy)
x_dy = Dense(46)(x_dy)

# Static branch: two ReLU dense layers with dropout in between.
x_st = Dense(100, activation="relu")(x_in_st)
x_st = Dropout(0.6)(x_st)
x_st = Dense(50, activation="relu")(x_st)

# Merge both branches: 46 + 50 = 96 features.
z = concatenate([x_dy, x_st])

# Classification head: one hidden ReLU layer, heavy dropout, sigmoid output.
out = Dense(24, activation='relu')(z)
out = Dropout(0.8)(out)
out = Dense(1, activation='sigmoid')(out)


model1 = Model(inputs=[x_in_dy, x_in_st], outputs=out)

In [11]:
# model1.compile(optimizer='rmsprop', 
#               loss = 'binary_crossentropy',
#               metrics=['accuracy'])
# history = model1.fit([X_dynamic_train, X_static_train], y_train, epochs=100, batch_size= 1000, validation_data=([X_dynamic_test, X_static_test], y_test)  )

In [18]:
# Plain SGD. `learning_rate` is the supported argument name; `lr` is a
# deprecated alias that newer Keras versions no longer accept.
opt = tf.keras.optimizers.SGD(learning_rate=0.003)

# AUC approximated with 100 thresholds. The explicit name pins the keys in
# `history.history` to 'auc' / 'val_auc'; without it Keras appends a counter
# (auc_1, auc_2, ...) every time this cell is re-run in the same kernel.
m = tf.keras.metrics.AUC(num_thresholds=100, name='auc')

In [19]:
# Compile with the optimizer and metric configured above.
model1.compile(optimizer=opt, loss='binary_crossentropy', metrics=[m])

# Only the last 36 time steps of the dynamic tensor are fed to the recurrent
# branch; the test split is passed as validation data for every epoch.
train_inputs = [X_dynamic_train_reshaped[:, -36:, :], X_static_train]
validation_inputs = [X_dynamic_test_reshaped[:, -36:, :], X_static_test]

history = model1.fit(
    train_inputs,
    y_train,
    epochs=400,
    batch_size=500,
    validation_data=(validation_inputs, y_test),
)



Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400

KeyboardInterrupt: 

In [None]:
# Learning curves of the AUC metric.
# The metric key is looked up instead of hard-coded ('auc', 'auc_1', ...), and
# the x axis is derived from the number of recorded epochs, so the cell works
# on a fresh kernel and after an interrupted or shortened training run.
# (The variables keep their historical `loss_*` names but hold AUC values.)
auc_key = next(key for key in history.history if key.startswith('auc'))
loss_train = history.history[auc_key]
loss_val = history.history['val_' + auc_key]
epochs = range(len(loss_train))
plt.plot(epochs, loss_train, 'g', label='Training AUC')
plt.plot(epochs, loss_val, 'b', label='Validation AUC')
plt.title('Training and Validation AUC')
plt.xlabel('Epochs')
plt.ylabel('AUC')
plt.legend()
plt.show()

In [None]:
# Learning curves of the AUC metric.
# The metric key is looked up instead of hard-coded ('auc', 'auc_2', ...), and
# the x axis is derived from the number of recorded epochs, so the cell works
# on a fresh kernel and after an interrupted or shortened training run.
# (The variables keep their historical `loss_*` names but hold AUC values.)
auc_key = next(key for key in history.history if key.startswith('auc'))
loss_train = history.history[auc_key]
loss_val = history.history['val_' + auc_key]
epochs = range(len(loss_train))
plt.plot(epochs, loss_train, 'g', label='Training AUC')
plt.plot(epochs, loss_val, 'b', label='Validation AUC')
plt.title('Training and Validation AUC')
plt.xlabel('Epochs')
plt.ylabel('AUC')
plt.legend()
plt.show()

In [None]:

# Learning curves of the AUC metric.
# The metric key is looked up (Keras may suffix it with a counter when the
# metric is re-created in the same kernel) and the x axis is derived from the
# number of recorded epochs instead of a hard-coded 500, which would make
# plt.plot fail whenever training ran for a different number of epochs.
# (The variables keep their historical `loss_*` names but hold AUC values.)
auc_key = next(key for key in history.history if key.startswith('auc'))
loss_train = history.history[auc_key]
loss_val = history.history['val_' + auc_key]
epochs = range(len(loss_train))
plt.plot(epochs, loss_train, 'g', label='Training AUC')
plt.plot(epochs, loss_val, 'b', label='Validation AUC')
plt.title('Training and Validation AUC')
plt.xlabel('Epochs')
plt.ylabel('AUC')
plt.legend()
plt.show()

In [None]:
# Learning curves of the AUC metric.
# The metric key is looked up instead of hard-coded ('auc', 'auc_9', ...), and
# the x axis is derived from the number of recorded epochs, so the cell works
# on a fresh kernel and after an interrupted or shortened training run.
# (The variables keep their historical `loss_*` names but hold AUC values.)
auc_key = next(key for key in history.history if key.startswith('auc'))
loss_train = history.history[auc_key]
loss_val = history.history['val_' + auc_key]
epochs = range(len(loss_train))
plt.plot(epochs, loss_train, 'g', label='Training AUC')
plt.plot(epochs, loss_val, 'b', label='Validation AUC')
plt.title('Training and Validation AUC')
plt.xlabel('Epochs')
plt.ylabel('AUC')
plt.legend()
plt.show()

In [None]:

# Learning curves of the AUC metric.
# The metric key is looked up (Keras may suffix it with a counter when the
# metric is re-created in the same kernel) and the x axis is derived from the
# number of recorded epochs instead of a hard-coded 500, which would make
# plt.plot fail whenever training ran for a different number of epochs.
# (The variables keep their historical `loss_*` names but hold AUC values.)
auc_key = next(key for key in history.history if key.startswith('auc'))
loss_train = history.history[auc_key]
loss_val = history.history['val_' + auc_key]
epochs = range(len(loss_train))
plt.plot(epochs, loss_train, 'g', label='Training AUC')
plt.plot(epochs, loss_val, 'b', label='Validation AUC')
plt.title('Training and Validation AUC')
plt.xlabel('Epochs')
plt.ylabel('AUC')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_curve

# Predict on exactly the inputs the model was trained and validated with: the
# scaled (samples, months, features) tensor restricted to its last 36 time
# steps. The 2-D pivot matrix X_dynamic_test does not match the (36, 57) input
# shape of the recurrent branch.
y_pred = model1.predict([X_dynamic_test_reshaped[:, -36:, :], X_static_test]).ravel()

# False/true positive rates and the score thresholds that produce them.
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_pred)

In [None]:
from sklearn.metrics import auc

# Area under the ROC curve, computed from the (fpr, tpr) points above.
auc_keras = auc(x=fpr_keras, y=tpr_keras)

In [None]:
# ROC curve of the Keras model, drawn against the chance diagonal.
roc_label = 'Keras (area = {:.3f})'.format(auc_keras)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')  # chance level
plt.plot(fpr_keras, tpr_keras, label=roc_label)
# plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
plt.title('ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='best')
plt.show()

In [None]:
# Score cut-offs returned by roc_curve: one threshold per (fpr, tpr) point of
# the ROC curve computed earlier.
print(thresholds_keras)