In [677]:
import numpy as np
import pandas as pd
import pickle
import os, glob
import logging
import seaborn as sns

In [312]:
from keras import optimizers
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers.recurrent import LSTM

In [314]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
import joblib

In [315]:
from tqdm import tqdm
import matplotlib.pyplot as plt
import tabulate

In [679]:
path = os.getcwd()
path

'/home/m.shah/projects/models/kaggle-models'

In [680]:
logging.basicConfig(filename='/home/m.shah/projects/models/kaggle-models/qapp.log', 
                    filemode='a', format='%(name)s - %(levelname)s - %(message)s')

## Loading Data

In [17]:
dfs = []

for filename in glob.glob(os.path.join(path, "../../data/simulated-data-raw/", "data", "*.pkl")):
    with open(filename, 'rb') as f:
        temp = pd.read_pickle(f)
        dfs.append(temp)
df = pd.DataFrame()
df = df.append(dfs)

In [316]:
df = df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 8, 7]]

In [844]:
#Feature Define
if "WEEK_DAY" not in df.columns:
    df.insert(7, "WEEK_DAY", df["TX_DATETIME"].apply(lambda x : x.weekday()))

#Feature Selection
selected_features = ["CUSTOMER_ID",
                     "TERMINAL_ID",
                     'TX_AMOUNT', 
                     'TX_TIME_SECONDS', 
                     'TX_TIME_DAYS', 
                     "WEEK_DAY", 
                     "TX_FRAUD_SCENARIO"]

In [867]:
class Preprocessor:    
    def __init__(self, input_data, logger):
        self.logger = logger
        self.data = input_data
    def pre_process(self, 
                    feature_columns,
                    label_columns,
                    window_size = 128,
                    numericals = None,
                    categoricals = None,
                    test_train_split = 0.7,
                    roll_base = 'time',
                    drop_rollbase = True,
                    imbalanced = False):
        # ---------- 
        self.features = feature_columns
        self.labels = label_columns
        self.window_size = window_size
        self.drop_rollbase = drop_rollbase
        self.test_train_split = test_train_split
        if 'TX_DATETIME' in feature_columns:
            self.data['TX_DATETIME'] = self.data['TX_DATETIME'].values.astype(float) 
        # -----------
        self.roll_base = roll_base
        self.customers = np.array([])
        # ---------- Standardization
        self.scaler(numericals)
        # ---------- Categorical features, OneHotEncoding
        if categoricals is not None:
            self.logger.info(str(("Raw data shape:", self.data.shape)))
            self.encode(categoricals)
            self.logger.info(str(("categorized data shape:", self.data.shape)))
        # ---------- Rolling X and Y Tensors
        self.logger.info( "======== Making X and Y Tensors ==========")
        self.x_tensor, self.y_tensor = self.roll(self.window_size)
        self.logger.info( "Input X by shape:", self.data.shape,"is rolled to X Tensor: ", self.x_tensor.shape)
        self.logger.info( "X and Y tensors are Rolled \n")

        self.train_idx = np.random.choice(self.x_tensor.shape[0], 
                                      int(self.x_tensor.shape[0]*self.test_train_split), replace=False)                      
        self.X_train = self.x_tensor[self.train_idx, :, :]
        self.X_test = np.delete(self.x_tensor, self.train_idx, axis=0)
        self.y_train = self.y_tensor[self.train_idx, ]
        self.y_test = np.delete(self.y_tensor, self.train_idx, axis=0)
        self.logger.info( "Preprocessing Done! \n")
        return self.X_train, self.X_test, self.y_train, self.y_test
                
    def encode(self, categoricals):
        if categoricals[0] is not None:
            self.data = pd.get_dummies(self.data, columns = categoricals[0])
        if categoricals[1] is not None:
            self.data = pd.get_dummies(self.data, columns = categoricals[1])
    
    def scaler(self, numericals):
        scaler = MinMaxScaler()
        self.data[numericals[0]] = scaler.fit_transform(self.data[numericals[0]])
        self.data[numericals[1]] = scaler.fit_transform(self.data[numericals[1]])

    def roll(self, window_size):
        x_filter = [col for col in self.data.columns if col.startswith(tuple(self.features))]
        y_filter = [col for col in self.data.columns if col.startswith(tuple(self.labels))]
        ix_tensor = np.zeros([(self.data.shape[0] - (window_size)) * window_size, len(x_filter)], dtype = 'float32')
        iy_tensor = np.zeros((0, len(y_filter)), dtype = 'float32')

        if self.roll_base == 'time':
            self.dg_x = self.data[x_filter]
            self.dg_y = self.data[y_filter]
            # - - - - - Check Drive files - - - - 
            if os.path.isfile('ix_tensor.pickle'):
                print("- - - -  X Tensor founded on Local Drive - - - - ")
                print("- - - -  Reading X TENSOR - - - - ")
                with open('ix_tensor.pickle', 'rb') as file:
                    ix_tensor = joblib.load(file)
                with open('iy_tensor.pickle', 'rb') as file:
                    iy_tensor = joblib.load(file)
            else:
                # Rolling Loop for making ix_tensor
                for i in tqdm(range(self.dg_x.shape[0]-(window_size))):
                    s = np.array(self.dg_x[i:i+window_size], dtype='float32')
                    ix_tensor[(window_size*i):(window_size*(i+1)), :] = s
                    iy_tensor = np.vstack((iy_tensor, self.dg_y.iloc[i+window_size, ]))
                ix_tensor = ix_tensor.reshape(-1, window_size, np.shape(ix_tensor)[1])
                self.logger.info( "- - - - Writing X Tensor on Drive- - - -")
                with open('ix_tensor.pickle', 'wb') as file:
                    joblib.dump(ix_tensor, file)
                with open('iy_tensor.pickle', 'wb') as file:
                    joblib.dump(iy_tensor, file) 
        else:
            if os.path.isfile('ix_tensor.pickle'):
                print("- - - -  X Tensor founded on Local Drive - - - - ")
                print("- - - -  Reading X TENSOR - - - - ")
                with open('ix_tensor.pickle', 'rb') as file:
                    ix_tensor = joblib.load(file)
                with open('iy_tensor.pickle', 'rb') as file:
                    iy_tensor = joblib.load(file)
            else:
                dg = self.data.sort_values(by = self.roll_base)
                self.dg_x = dg[x_filter]
                self.dg_y = dg[y_filter]
                print(self.dg_x.columns)
                del(dg)
                # ------- Rolling Looop -----
                i = 0
                j = 0
                while (i < self.dg_x.shape[0]-(window_size)):
                    if(self.dg_x[self.roll_base[0]].iloc[i + window_size] == self.dg_x[self.roll_base[0]].iloc[i + window_size - 1]):
                        s = np.array(self.dg_x[i:i + window_size], dtype='float32')
                        ix_tensor[(window_size*j):(window_size*(j+1)), :] = s
                        iy_tensor = np.vstack((iy_tensor, self.dg_y.iloc[i+window_size,:]))
                        self.customers = np.append(self.customers, self.dg_x[self.roll_base[0]][i + window_size])
                        j += 1
                        i += 1
                    else:
                        i += window_size
                        continue
                # -------- Drop RoleBase Column -----
                if self.drop_rollbase:
                    ix_tensor = np.delete(ix_tensor, self.dg_x.columns.get_loc(self.roll_base[0]), axis = 1)
                ix_tensor = ix_tensor[:len(iy_tensor)*window_size ,:].reshape(-1, window_size, ix_tensor.shape[1])
                iy_tensor = iy_tensor.reshape(-1, 1)
                # ------- Dump Tensors on Drive ------
                with open('ix_tensor.pickle', 'wb') as file:
                    joblib.dump(ix_tensor, file)
                with open('iy_tensor.pickle', 'wb') as file:
                    joblib.dump(iy_tensor, file) 
        return ix_tensor, iy_tensor

## Transaction Prediction

In [910]:
class LSTM_REGRESSION:
    def __init__(self,
                 input_shape,
                 lstm_units = 50,
                 n_outputs = 1,
                 optimizer = 'adam',
                loss = 'mse',
                metrics = 'mae'):
        #Model Layers
        self.model = Sequential()
        self.model.add(LSTM(lstm_units, input_shape=input_shape, return_sequences = True))#, return_sequences=True))            
        self.model.add(Dropout(0.2))
        self.model.add(LSTM(lstm_units, return_sequences = True))            
        self.model.add(Dropout(0.2))
        self.model.add(LSTM(lstm_units))            
        self.model.add(Dropout(0.2))
        self.model.add(Dense(n_outputs))
        
        self.model.compile(loss=loss, 
                           optimizer=optimizer, 
                           metrics=metrics) #optimizer='rmsprop'
        self.model.build()
        print("Model Summary: \n\n", self.model.summary())
        self.model
        
    def train(self, train_x, train_y, batch_size = 100, epochs  =1):
        csv_logger = CSVLogger('/home/m.shah/projects/models/kaggle-models/training.log', append=True, separator=';')
        result = self.model.fit(train_x,
                                train_y, 
                                batch_size = batch_size,
                                validation_split = 0.2,
                                epochs = epochs,
                               callbacks=[csv_logger])
        return result
    def predict(self, test_x):
        return self.model.predict(test_x)

In [901]:
pp = Preprocessor(df.iloc[:50000, :], logging.getLogger('pre-processor'))

In [902]:
train_x, test_x, train_y, test_y = pp.pre_process(selected_features, 
                               ['TX_AMOUNT'], 
                               numericals = [["TX_AMOUNT", "TX_TIME_SECONDS",'TX_TIME_DAYS'],["TX_AMOUNT"]],
                               categoricals = [["TERMINAL_ID", "WEEK_DAY", "TX_FRAUD_SCENARIO"],None], 
                               window_size = 8,
                               roll_base = ["CUSTOMER_ID", "TX_TIME_SECONDS"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[numericals[0]] = scaler.fit_transform(self.data[numericals[0]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[numericals[1]] = scaler.fit_transform(self.data[numericals[1]]

- - - -  X Tensor founded on Local Drive - - - - 
- - - -  Reading X TENSOR - - - - 


In [909]:
model = LSTM_REGRESSION(train_x.shape[1:], n_outputs = train_y.shape[1], lstm_units = train_x.shape[1])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 8, 8)              317120    
_________________________________________________________________
dropout (Dropout)            (None, 8, 8)              0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 8, 8)              544       
_________________________________________________________________
dropout_1 (Dropout)          (None, 8, 8)              0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 8)                 544       
_________________________________________________________________
dropout_2 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 1)                 9

In [911]:
history = model.train(train_x, train_y, batch_size = 128, epochs = 2)

Epoch 1/2
Epoch 2/2


In [913]:
with open('model_history.pickle', 'wb') as file:
    joblib.dump(history.history, file)
