In [4]:
from collections import defaultdict

import pandas as pd
import numpy as np
import copy
import datetime

import tensorflow as tf

from tensorflow.python.keras.layers import Input, Dense, Activation, Flatten, Dropout, Embedding, concatenate,multiply,add
from tensorflow.python.keras.models import Model, Sequential
from tensorflow.python.keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l1
from keras.initializers import RandomNormal
from keras.constraints import unitnorm

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split

The below data comes from the Kaggle IEEE fraud detection competition

In [2]:
df = pd.read_csv('/Users/michaelspillane/Downloads/ieee-fraud-detection/train_transaction.csv')
df_id = pd.read_csv('/Users/michaelspillane/Downloads/ieee-fraud-detection/train_identity.csv')

I merge the two data frames on the TransactionID using a left joing because not all entries in df and in df_id.  Next I convert the DateTime to several catagorical variables that may be of use when modeling.  The label is stored in y.

In [43]:
DF = pd.merge(df, df_id,how='left', on='TransactionID')

START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
DF['TransactionDT'] = DF['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))

DF['dow'] = DF['TransactionDT'].dt.dayofweek
DF['dow']= DF['dow'].apply(str)
DF['hour'] = DF['TransactionDT'].dt.hour
DF['hour']= DF['hour'].apply(str)
DF['day'] = DF['TransactionDT'].dt.day
DF['day']= DF['day'].apply(str)

dtyp = DF.dtypes

y = df['isFraud']

I also extract the cents and tenths of cents from transaction amount as that too coud be useful.  Some of the variables distributions are not gaussian and by taking their log the distributions become more Gaussian. 

In [44]:
DF['digit1'] = np.floor((DF['TransactionAmt'].round(decimals=3)%1)*10).astype(int)
DF['digit2'] = np.floor(((10*DF['TransactionAmt'].round(decimals=3))%1)*10).astype(int)
DF['digit3'] = (np.floor(((100*DF['TransactionAmt'].round(decimals=3))%1)*10).astype(int)>.5)
DF['logTransactionAmt'] = np.log(DF['TransactionAmt'])
DF['logC1'] = np.log(DF['C1']+1)
DF['logC2'] = np.log(DF['C2']+1)
DF['logC3'] = np.log(DF['C3']+1)
DF['logC4'] = np.log(DF['C4']+1)
DF['logC5'] = np.log(DF['C5']+1)
DF['logC6'] = np.log(DF['C6']+1)
DF['logD1']  = np.log(DF['D1']+1)
DF['logD10']  = np.log(DF['D10']+1)

I perform a train test split on the data.  

In [45]:
DF_train, DF_test, y_train, y_test = train_test_split(DF, y, test_size=0.05)

From the Kaggle website we can find a list of which features are catagorical in nature. 

In [46]:
l = list(DF)[4:13]+list(DF)[15:17]+list(DF)[46:55]+list(DF)[405:]  #catagorical columns

I will use an embedding layer from keras for the catagorical variables and so I seperate them into their own dateframe here.  Some of the numerical features only have a couple of values and so I add them as potential catagorical features though I do not remove them from the numerical features.

In [47]:
DF_train_time = DF_train['TransactionDT']
cat_var_train = DF_train[l]
DF_train = DF_train.drop(l,axis = 1)
DF_train = DF_train.drop(['TransactionID','isFraud','TransactionDT'],axis = 1)

nunq = DF_train.nunique(axis=0)

l2 = list(nunq[(nunq<15).values].index)

emb_train = pd.concat([cat_var_train,DF_train[l2]],axis=1)



I impute the missing values as the median of the column and store the imputer for use on the testing set.  I then scale the columns using a standard scaler and finally scale the time.

In [48]:
col = DF_train.columns

imp = SimpleImputer(missing_values=np.nan, strategy='median')

DF_train = imp.fit_transform(DF_train)

scaler = StandardScaler()
X_scaled_train  = scaler.fit_transform(DF_train)


In [49]:
scaler_t = StandardScaler()
X_scaled_time_train  = scaler_t.fit_transform(DF_train_time.values.reshape(-1, 1))

emb_train = emb_train.applymap(str)

emb_train[pd.isnull(emb_train)]  = 'NaN'



Next we use the above preprocessing on the test set.

In [50]:
DF_test_time = DF_test['TransactionDT']

cat_var_test = DF_test[l]
DF_test = DF_test.drop(l,axis = 1)
DF_test = DF_test.drop(['TransactionID','isFraud','TransactionDT'],axis = 1)

emb_test = cat_var_test.loc[:,list(emb_train)]


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Checking to make sure that all the columns are the same

In [51]:
col2 = DF_test.columns
list(set(col)-set(col2))

[]

Performing the same imput and scaling used on the training set.

In [53]:
DF_test = imp.transform(DF_test)

DF_scaled_test  = scaler.transform(DF_test)

X_scaled_time_test  = scaler_t.transform(DF_test_time.values.reshape((-1,1)))

emb_test = emb_test.applymap(str)



I encode the catagorical variables into integers for use by keras enbedding layer.

In [55]:
d = defaultdict(preprocessing.LabelEncoder)

fit = emb_train.apply(lambda x: d[x.name].fit_transform(x))

And then use the same mapping for the test set

In [56]:
fit_max = fit.max(axis=0)
fit_test = copy.deepcopy(emb_test)

for i in list(emb_test):
    fit_test[i] = emb_test[i].apply(lambda x: x if x in set(d[i].classes_) else 'unknown')
    d[i].classes_ = np.append(d[i].classes_, 'unknown')
    
fit_test = fit_test.apply(lambda x: d[x.name].transform(x))

The input to the keras model come in three parts the numerical columns, the catagorical columns and the time column.
The time is treated seperately because the distribution (mean,std) of the variables changes with time. 
To account for this a time dependent linear offset is added.

In [67]:
data = Input(shape = (DF_train.shape[1],),name = 'data')
tm = Input(shape = (1,),name = 'tm')

emb_input = [Input(shape = (1,)) for i in range(fit.shape[1])]  #The catagorical variables to be used in embeddings

in_size = (fit.max(axis=0)+2).values.astype(int)  #The number of unique values in the catogorical columns
out_size = np.maximum(np.minimum(np.rint(np.log(in_size)),7),2).astype(int)  # the output dimension of the embedding layers

emb_list = [Embedding(output_dim=out_size[i], input_dim=in_size[i], input_length=1,embeddings_constraint=unitnorm(axis = 1))(emb_input[i]) for i in range(fit.shape[1])]

emb = concatenate(emb_list)

emb = Flatten()(emb)
            
x = concatenate([emb,data,tm])

x = Flatten()(x)

y1 = Dense(out_size.sum()+DF_train.shape[1]+1,activation = 'linear')(tm)

x = add([y1,x])  #This adds the linear offset to account for the changing distribution over time
x = concatenate([x,tm])
x = Dense(200, activation='relu')(x)
x = Dropout(.8)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(inputs = [emb_input[i] for i in range(fit.shape[1])]+[data]+[tm],outputs = [x])

optimizer = Adam(beta_1=0.9, beta_2=0.999, epsilon=1e-8)

class_weight = {0: 1.,1: 10.}
model.compile(optimizer=optimizer, loss=['binary_crossentropy'])

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [None]:
model.fit([fit[i] for i in list(fit)]+[X_scaled_train.reshape((X_scaled_train.shape[0],X_scaled_train.shape[1],))]+[X_scaled_time_train],
                    y_train,epochs=10,batch_size = 5000,class_weight = class_weight,shuffle=True)
Y_pred = model.predict([fit[i] for i in list(fit)]+[X_scaled_train.reshape((X_scaled_train.shape[0],X_scaled_train.shape[1],))]+[X_scaled_time_train])
y_pred = model.predict([fit_test[i] for i in list(fit_test)]+[X_scaled_test.reshape((X_scaled_test.shape[0],X_scaled_test.shape[1],))]+[X_scaled_time_test])

Instructions for updating:
Use tf.cast instead.
Epoch 1/40


We can finally test the model

In [1]:
print(roc_auc_score(y_train, Y_pred))
print(roc_auc_score(y_test, y_pred))

NameError: name 'roc_auc_score' is not defined