# Final Model V2 (for deployment)

In [1]:
# importing useful libraries

import numpy as np
import pandas as pd

import gzip
import dill

from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from xgboost import XGBClassifier

from sklearn.calibration import CalibratedClassifierCV

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Load the credit card transactions from the local input directory.
credit_card_df = pd.read_csv('../input/creditcardfraud/creditcard.csv') 

# Alternatively, if the file is not downloaded locally, load it from the project repository:
# credit_card_df = pd.read_csv('https://raw.github.com/HamoyeHQ/g01-fraud-detection/master/data/credit_card_dataset.zip')

# Sanity check: print the (rows, columns) shape and preview the first rows.
print(credit_card_df.shape)
credit_card_df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# 27 most important features according to our EDA: V1..V28 with V25 left out
cols = ['V{}'.format(idx) for idx in list(range(1, 25)) + list(range(26, 29))]
print(cols)

['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V26', 'V27', 'V28']


In [4]:
# work on a copy so the frame read from disk stays unmodified
df = credit_card_df.copy()

In [5]:
# selecting the features as X and target as y
# NOTE: pop() removes 'Class' from df in place, so X is the very same object as df (all remaining columns).
# Re-running this cell without first re-creating df raises KeyError because 'Class' is already gone.
y = df.pop('Class')
X = df

In [6]:
# flat cost charged per false positive in cost_saving(); also used as the training sample weight of non-fraud rows
admin_cost = 2.5

In [7]:
# defining a function to calculate cost savings
def cost_saving(ytrue, ypred, amount, threshold=0.5, epsilon=1e-7, fp_cost=None):
    """
    Fraction of the total fraud amount that a set of predictions saves.

    The cost of the predictions is ``fp_cost`` for every false positive plus the
    full transaction amount of every missed fraud (false negative). That cost is
    compared with the summed amount of all true frauds:

        savings = 1 - cost / (max_cost + epsilon)

    Inputs are compared position by position; any pandas index is ignored.

    Args:
        ytrue: array-like of true labels (1 = fraud, 0 = not fraud).
        ypred: array-like of hard 0/1 predictions, one per element of ytrue.
        amount: array-like of transaction amounts, one per element of ytrue.
        threshold: not used by the computation; kept so existing calls keep working.
        epsilon: small constant added to the denominator so that the division is
            defined when ytrue contains no fraud.
        fp_cost: cost charged per false positive. When None (default) the
            notebook-level ``admin_cost`` is used.

    Returns:
        The savings ratio as a float. It is negative when the cost of the
        predictions exceeds the summed amount of the true frauds.

    Raises:
        ValueError: if ytrue, ypred and amount do not hold the same number of elements.
    """
    if fp_cost is None:
        fp_cost = admin_cost

    # Plain 1-D arrays: accepts lists / Series / column vectors and avoids label-based alignment.
    labels = np.asarray(ytrue).ravel()
    preds = np.asarray(ypred).ravel()
    amounts = np.asarray(amount, dtype=float).ravel()

    if not (labels.size == preds.size == amounts.size):
        raise ValueError(
            'ytrue, ypred and amount must have the same number of elements but got {}, {} and {}'.format(
                labels.size, preds.size, amounts.size))

    false_positives = np.sum((labels == 0) & (preds == 1))
    missed_fraud = (labels == 1) & (preds == 0)

    cost = false_positives * fp_cost + np.sum(amounts[missed_fraud])
    max_cost = np.sum(amounts[labels == 1])

    return 1 - (cost / (max_cost + epsilon))

In [8]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that keeps only the configured feature columns and returns them as a numpy array.

    Supported inputs for ``transform``:
      * pd.DataFrame - columns are picked by name.
      * pd.Series    - entries are picked by label and returned as a single row.
      * np.ndarray   - a column named 'V<k>' is read from position k, so the array
        must be laid out such that this holds (in this notebook position 0 is 'Time').
        A 1-D array is treated as a single row.

    Args:
        cols: list of column names to keep; defaults to the notebook-level ``cols``.
    """
    def __init__(self, cols=cols):
        self.cols = cols

    def fit(self, X, y=None):
        # nothing is learned from the data
        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            return np.array(X[self.cols])

        if isinstance(X, pd.Series):
            picked = np.array(X[self.cols])
            return picked.reshape(1, -1)

        if isinstance(X, np.ndarray):
            # numeric suffix of every name, e.g. 'V7' -> 7, used as positional index
            self.cols_ind = [int(name[1:]) for name in self.cols]
            if X.ndim == 1: # single sample
                return X[self.cols_ind].reshape(1, -1)
            return X[:, self.cols_ind]

        raise TypeError('expected input type to be any of pd.Series, pd.DataFrame or np.ndarray but got {}'.format(type(X)))

## Serializing model

In [9]:
# data preparation pipeline: keep the selected feature columns, then standardise them
cols_select = ColumnSelector()
scaler = StandardScaler()

data_prep = Pipeline(
    steps=[
        ('columns', cols_select),
        ('scaler', scaler),
    ]
)

# fit the pipeline on the full dataset and keep the transformed features for training
X_prep = data_prep.fit_transform(X, y)

In [10]:
# saving the data prep object: the fitted pipeline is serialised with dill into a gzip-compressed file,
# which get_predictions() reads back from the same relative path
with gzip.open('data_prep_pipe.gz.dill', 'wb') as f:
    dill.dump(data_prep, f)

In [11]:
model = XGBClassifier(random_state=1)

# Cost-sensitive sample weights: a fraud row is weighted by its transaction amount,
# every other row by the flat admin cost. Computed in one vectorised step instead of
# a per-row Python loop over the whole dataset.
# Note: a fraud row whose Amount is 0 therefore gets a weight of 0.
sample_weights = np.where(y.values.astype(bool), X['Amount'].values, admin_cost)

model.fit(X_prep, y, sample_weight=sample_weights);

In [12]:
# Wrap the already-fitted model (cv='prefit') in an isotonic probability calibrator.
# NOTE(review): the calibrator is fitted on the same X_prep / y the model was trained on and
# without the sample weights used for training - confirm this is intended for the deployed artefact.
calibration = CalibratedClassifierCV(model, method='isotonic', cv='prefit')
calibration.fit(X_prep, y);

In [13]:
# saving the calibration object (it wraps the fitted model) with dill into a gzip-compressed file,
# which get_predictions() reads back from the same relative path
with gzip.open('calibration.gz.dill', 'wb') as f:
    dill.dump(calibration, f)

## Prediction

In [14]:
# defining function to get predictions
def _load_artifact(path):
    """Load one gzip-compressed, dill-serialised object from ``path``."""
    # NOTE: dill.load, like pickle, can execute arbitrary code - only load files you trust.
    with gzip.open(path, 'rb') as f:
        return dill.load(f)


def get_predictions(X, proba=False, data_prep_path='data_prep_pipe.gz.dill',
                    calibration_path='calibration.gz.dill'):
    """
    Predict fraud for the given input using the serialised pipeline and calibrated model.

    Both artefacts are read from disk on every call.

    Args:
        X: the raw input, in any form the saved data preparation pipeline accepts.
        proba: if True return the probability of the positive (fraud) class,
            otherwise return the predicted class labels.
        data_prep_path: path of the saved data preparation pipeline.
        calibration_path: path of the saved calibrated classifier.

    Returns:
        A 1-D numpy array with one entry per input row: probabilities when
        ``proba`` is True, class labels otherwise.
    """
    data_prep = _load_artifact(data_prep_path)
    calibration = _load_artifact(calibration_path)

    Xt = data_prep.transform(X) # prepare (preprocess) the user's input

    if proba:
        # predict_proba returns an (n_samples, n_classes) array, also for a single row,
        # so column 1 is always the probability of the positive class
        return calibration.predict_proba(Xt)[:, 1]

    return calibration.predict(Xt) # hard class predictions

In [15]:
# Stratified 75/25 split, used here only to get sample inputs for exercising the prediction helpers.
# The model above was fitted on the full X, so X_test is NOT unseen data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=1)

In [16]:
def prediction_summary(user_input, ytrue=None):
    """
    Get predictions for user_input and, when the true labels are known, print test metrics.

    Args:
        user_input:
            numpy array, pandas Series or pandas DataFrame. Either a single
            transaction (1-D array / Series) or several (2-D array / DataFrame).
            It is expected to hold every feature except 'Class' (30 values per
            transaction) arranged as in our dataset.

        ytrue:
            optional numpy array or pandas Series with the true labels for
            user_input. Labels are matched to the rows by position.

    Return:
        a dataframe of 'Class' and the probability of 'Class' being fraud. A 'Class'
        of 1 means fraud, while 0 means not fraud. If ytrue is given, f1_score and
        cost saving are also printed out.

    Note:
        For the cost saving the transaction amount is read from 'Amount' when
        user_input is a pandas object that has it, otherwise from the last position.
    """
    proba = get_predictions(user_input, proba=True)
    pred = get_predictions(user_input)
    # the column name 'Fraud_Probabilty' is kept as is: code that consumes the result selects it by this name
    pred_df = pd.DataFrame({'Class': pred, 'Fraud_Probabilty': proba})

    if ytrue is not None: # true labels known: we are testing the model, so print the metrics
        labels = np.asarray(ytrue).ravel()

        if isinstance(user_input, pd.DataFrame):
            amount = user_input['Amount'] if 'Amount' in user_input.columns else user_input.iloc[:, -1]
        elif isinstance(user_input, pd.Series):
            # a Series must be read by label or .iloc; a plain [-1] would be a label lookup
            amount = user_input['Amount'] if 'Amount' in user_input.index else user_input.iloc[-1]
        else:
            values = np.asarray(user_input)
            amount = values[:, -1] if values.ndim > 1 else values[-1]

        # one amount per row as a plain 1-D array (a single transaction becomes length 1)
        amount = np.asarray(amount, dtype=float).reshape(-1)

        print('f1_score is {}'.format(f1_score(labels, pred)))
        print('cost saving is {}'.format(cost_saving(labels, pred, amount)))

    return pred_df # in any case, finally return the dataframe of predictions.

In [17]:
# run the full prediction path on X_test (rows the model was also fitted on) and preview the result
result = prediction_summary(X_test, y_test)
result.head()

f1_score is 1.0
cost saving is 1.0


Unnamed: 0,Class,Fraud_Probabilty
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,7e-06


In [18]:
# rows whose fraud probability lies strictly between 0 and 1
result[(result['Fraud_Probabilty'] > 0) & (result['Fraud_Probabilty'] < 1)].head()

Unnamed: 0,Class,Fraud_Probabilty
4,0,7e-06
5,0,7e-06
7,0,7e-06
10,0,7e-06
11,0,7e-06


In [19]:
# single-input check: pass the first test row as a 1-D numpy array together with its actual
# label (instead of assuming that the row is not fraud)
prediction_summary(X_test.iloc[0].values, np.array([y_test.iloc[0]]))

f1_score is 0.0
cost saving is 1.0


Unnamed: 0,Class,Fraud_Probabilty
0,0,0.0


## The exceptionally good results are due to the fact that we trained on our whole dataset, which includes the rows used in the checks above. This notebook is for deployment: the checks only confirm that the saved model produces predictions and are not a model evaluation. So, don't bother about the perfect scores.