## Baseline model (Logistic Regression)

In [1]:
# importing useful libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Load the credit-card fraud dataset from the relative input directory,
# then print its dimensions and preview the first rows as a sanity check.
df = pd.read_csv('../input/creditcardfraud/creditcard.csv')
print(df.shape)
df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# target variable: the `Class` column
y = df['Class']

# the 19 features our EDA flagged as the most important
X = df[[
    'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7',
    'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14',
    'V16', 'V17', 'V21', 'V23', 'V27',
]]

# standardise the selected features
# NOTE(review): the scaler is fitted on the full dataset before cross-validation,
# so every validation fold contributes to the scaling statistics.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

In [4]:
# Count the rows of each class and invert the counts, so that the less
# frequent class receives the larger weight.
val_count = df['Class'].value_counts()
weights = dict(1 / val_count) # to be used as class weights
weights

{0: 3.51722561243691e-06, 1: 0.0020325203252032522}

## <center>Cost matrix for fraud detection</center>

||Actual Positive|Actual Negative|
|:-:|:-:|:-:|
|**Predicted Positive**|administrative cost|administrative cost|
|**Predicted Negative**|transactional cost|0|

### *see [this article](https://towardsdatascience.com/fraud-detection-with-cost-sensitive-machine-learning-24b8760d35d9) for more about cost sensitive machine learning for fraud detection*

In [5]:
# fixed administrative cost (in euros) incurred for every transaction flagged as fraud
admin_cost = 2.5

### We use 2.5 euros as the administrative cost because it appears to be the most suitable value for our problem, as suggested by [this paper](https://www.researchgate.net/publication/262390835_Cost_Sensitive_Credit_Card_Fraud_Detection_Using_Bayes_Minimum_Risk)

## Also, implementing cost-sensitive models with sklearn's classifiers is challenging; it is almost as much work as creating a new model from scratch. For simplicity, we will not penalize our model for True Positives (although the cost matrix suggests we should). This means that we are only penalizing MISCLASSIFICATIONS (trying to minimize the cost due to misclassification, not the total cost).

In [6]:
# defining a function to calculate cost savings
def cost_saving(ytrue, ypred, amount, fp_cost=None):
    """Return the fraction of the worst-case fraud loss avoided by the predictions.

    The cost of a set of predictions is ``fp_cost`` for every false positive
    plus the transaction amount of every false negative (missed fraud).
    The worst case (``max_cost``) is the total amount of all fraudulent
    transactions. The result is ``1 - cost / max_cost``.

    Parameters
    ----------
    ytrue : array-like of 0/1
        True labels (1 = fraud).
    ypred : array-like of 0/1
        Predicted labels, in the same order as ``ytrue``.
    amount : array-like of float
        Transaction amounts, in the same order as ``ytrue``.
    fp_cost : float, optional
        Cost charged per false positive. Defaults to the notebook-level
        ``admin_cost``.

    Returns
    -------
    float
        Cost savings; ``nan`` when the fraud rows carry no amount at all,
        because the ratio is undefined in that case.
    """
    if fp_cost is None:
        fp_cost = admin_cost  # notebook-level administrative cost

    # Plain arrays keep every comparison and mask positional, so a pandas
    # index on any of the inputs can never trigger label alignment.
    ytrue = np.asarray(ytrue)
    ypred = np.asarray(ypred)
    amount = np.asarray(amount)

    false_pos = (ytrue == 0) & (ypred == 1)
    false_neg = (ytrue == 1) & (ypred == 0)

    cost = false_pos.sum() * fp_cost + amount[false_neg].sum()
    max_cost = amount[ytrue == 1].sum()

    if max_cost == 0:
        # no fraud amount to save: the ratio is undefined
        return float('nan')

    return 1 - (cost / max_cost)

In [7]:
# Four stratified folds, without shuffling, so repeated calls to `cv.split`
# always return the same folds. `random_state` only has an effect together
# with `shuffle=True`, and recent scikit-learn versions raise a ValueError
# when it is set without shuffling, so it is left out.
cv = StratifiedKFold(n_splits=4)

In [8]:
# defining a function to calculate cost saving per fold (splits) of our cv
def cost_saving_per_split(scores, x, y, cv_object):
    """Compute the cost savings of each fold's fitted estimator on its test rows.

    Parameters
    ----------
    scores : dict
        Result of ``cross_validate(..., return_estimator=True)``; the fitted
        estimators are read from ``scores['estimator']``.
    x : numpy.ndarray
        Feature matrix that was passed to ``cross_validate``.
    y : array-like
        Labels that were passed to ``cross_validate``.
    cv_object : cross-validation splitter
        The splitter used for ``cross_validate``. It must yield the same
        folds on every call to ``split`` for the estimators to line up.

    Returns
    -------
    list of float
        One cost-savings value per fold, in fold order.
    """
    # The fold indices are row positions, not index labels, so the labels
    # and amounts are both indexed positionally.
    labels = np.asarray(y)
    amounts = df['Amount'].values

    results = []
    for i, (_, test_ind) in enumerate(cv_object.split(x, y)):
        ypred = scores['estimator'][i].predict(x[test_ind])
        results.append(cost_saving(labels[test_ind], ypred, amounts[test_ind]))

    return results

In [9]:
# defining a function to return a dataframe of metrics results for each fold in our cv
def get_metric_scores(scores, x, y=y, cv_object=cv):
    """Collect the per-fold F1, AUC-PR and cost-savings values in one DataFrame.

    ``scores`` is the dict returned by ``cross_validate`` with the ``f1`` and
    ``average_precision`` scorers; ``x``, ``y`` and ``cv_object`` are forwarded
    to ``cost_saving_per_split``. Rows are labelled ``split_1`` ... ``split_n``.
    """
    fold_labels = ['split_{}'.format(k + 1) for k in range(cv_object.n_splits)]

    columns = {
        'f1_score': scores['test_f1'],
        'auc_pr': scores['test_average_precision'],
        'cost_savings': cost_saving_per_split(scores, x, y, cv_object),
    }

    return pd.DataFrame(columns, index=fold_labels)

## Fraud Sensitive model (Not cost sensitive)

In [10]:
# logistic regression whose classes are re-weighted with the inverse class counts
fraud_sensitive_model = LogisticRegression(class_weight=weights)

# cross-validate it, keeping the fitted estimator of every fold
fraud_sensitive_scores = cross_validate(
    fraud_sensitive_model,
    scaled_X,
    y,
    scoring=['f1', 'average_precision'],
    cv=cv,
    n_jobs=4,
    return_estimator=True,
)

### Note: we used cross_validate because it accepts more than one scoring metric and can also return the fitted model for each fold.

In [11]:
# per-fold F1, AUC-PR and cost savings of the class-weighted model
fraud_sensitive_results = get_metric_scores(fraud_sensitive_scores, scaled_X)
fraud_sensitive_results

Unnamed: 0,f1_score,auc_pr,cost_savings
split_1,0.647564,0.731073,0.850228
split_2,0.806584,0.785895,0.852801
split_3,0.670769,0.851064,0.570675
split_4,0.687943,0.789623,0.720068


In [12]:
# average of each metric over the folds
fraud_sensitive_results.mean()

f1_score        0.703215
auc_pr          0.789414
cost_savings    0.748443
dtype: float64

## Cost sensitive model

### We will use sample weights to penalize our model accordingly: the admin cost for FP and the transactional cost (amount) for FN.

In [13]:
# Per-row sample weight: the transaction amount for fraud rows (the cost of
# missing them) and the fixed administrative cost for all other rows.
# Vectorised and positional, instead of a Python loop with one label-based
# lookup per row.
# NOTE(review): a fraud row whose Amount is 0 ends up with a weight of 0.
sample_weights = np.where(y.astype(bool), df['Amount'].values, admin_cost)

In [14]:
# plain logistic regression; the cost information enters through the sample weights
cost_sensitive_model = LogisticRegression()

# NOTE(review): newer scikit-learn releases deprecate `fit_params` in favour of
# `params` -- confirm against the installed version.
cost_sensitive_scores = cross_validate(
    cost_sensitive_model,
    scaled_X,
    y,
    scoring=['f1', 'average_precision'],
    cv=cv,
    n_jobs=4,
    return_estimator=True,
    fit_params={'sample_weight': sample_weights},
)

In [15]:
# per-fold F1, AUC-PR and cost savings of the sample-weighted model
cost_sensitive_results = get_metric_scores(cost_sensitive_scores, scaled_X)
cost_sensitive_results

Unnamed: 0,f1_score,auc_pr,cost_savings
split_1,0.384615,0.606198,0.764231
split_2,0.545977,0.717824,0.818455
split_3,0.435897,0.76718,0.701623
split_4,0.454333,0.60342,0.699351


In [16]:
# average of each metric over the folds
cost_sensitive_results.mean()

f1_score        0.455206
auc_pr          0.673656
cost_savings    0.745915
dtype: float64

## Bayes Minimum Risk (BMR)

### Note: cost-dependent classification is also called Bayes Minimum Risk.
***see more about BMR [here](https://link.springer.com/article/10.1007/s42452-020-03375-w)***

In [17]:
# unweighted logistic regression; only its predicted probabilities are used for BMR
bmr_model = LogisticRegression()

# no scorers are requested here: only the fitted estimator of each fold is needed
bmr_scores = cross_validate(
    bmr_model,
    scaled_X,
    y,
    cv=cv,
    n_jobs=4,
    return_estimator=True,
)

In [18]:
# defining a function that predicts the class with the lower expected cost
def bmr_predict(model, x, trans_cost, fp_cost=None):
    """Predict the label that minimises the expected cost (Bayes Minimum Risk).

    For each row, with ``p`` the predicted fraud probability:

    * predicting 0 has expected cost ``p * trans_cost`` (a missed fraud);
    * predicting 1 has expected cost ``(1 - p) * fp_cost`` (a false alarm;
      true positives are not penalised).

    A row is flagged (1) only when flagging is strictly cheaper, so ties
    resolve to 0.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
        Column 1 of its output is used as the fraud probability.
    x : array-like
        Feature rows to classify.
    trans_cost : array-like of float
        Transaction amount of each row in ``x``, in the same order.
    fp_cost : float, optional
        Cost of a false positive. Defaults to the notebook-level
        ``admin_cost``.

    Returns
    -------
    numpy.ndarray of int
        0/1 prediction for each row.
    """
    if fp_cost is None:
        fp_cost = admin_cost  # notebook-level administrative cost

    prob = model.predict_proba(x)[:, 1]
    # plain array: keeps the arithmetic positional and the result an ndarray
    trans_cost = np.asarray(trans_cost)

    expected_cost_0 = prob * trans_cost
    expected_cost_1 = (1 - prob) * fp_cost

    pred = (expected_cost_1 < expected_cost_0).astype(int)
    return pred

### Calculating the AUC-PR is not possible for BMR because its predictions are based on the minimum expected cost rather than on a probability score. For this reason, we will calculate only the f1_score and cost savings for this model.

In [19]:
def get_bmr_metric_scores(scores, x, y=y, cv_object=cv):
    """Return per-fold F1 and cost savings for Bayes-Minimum-Risk predictions.

    For every fold, the estimator fitted on that fold (``scores['estimator'][i]``)
    is applied to the fold's test rows through ``bmr_predict``, using the
    transaction amounts of those rows as the cost of a missed fraud. AUC-PR
    is not reported because the BMR predictions are hard labels.

    Parameters
    ----------
    scores : dict
        Result of ``cross_validate(..., return_estimator=True)``.
    x : numpy.ndarray
        Feature matrix that was passed to ``cross_validate``.
    y : array-like
        Labels that were passed to ``cross_validate``.
    cv_object : cross-validation splitter
        The splitter used for ``cross_validate``; it must yield the same
        folds on every call to ``split``.

    Returns
    -------
    pandas.DataFrame
        Columns ``f1_score`` and ``cost_savings``, one row per fold
        (``split_1`` ... ``split_n``).
    """
    ind = ['split_' + str(n) for n in range(1, cv_object.n_splits + 1)]

    # The fold indices are row positions, not index labels, so the labels
    # and amounts are both indexed positionally.
    labels = np.asarray(y)
    amounts = df['Amount'].values

    f1_results = []
    cs_results = []

    for i, (_, test_ind) in enumerate(cv_object.split(x, y)):
        amount = amounts[test_ind]

        ypred = bmr_predict(scores['estimator'][i], x[test_ind], amount)
        ytrue = labels[test_ind]

        f1_results.append(f1_score(ytrue, ypred))
        cs_results.append(cost_saving(ytrue, ypred, amount))

    return pd.DataFrame(
        {'f1_score': f1_results, 'cost_savings': cs_results},
        index=ind,
    )

In [20]:
# per-fold F1 and cost savings of the BMR predictions
bmr_results = get_bmr_metric_scores(bmr_scores, scaled_X)
bmr_results

Unnamed: 0,f1_score,cost_savings
split_1,0.513447,0.902878
split_2,0.529412,0.802759
split_3,0.423077,0.592869
split_4,0.461017,0.732828


In [21]:
# average of each metric over the folds
bmr_results.mean()

f1_score        0.481738
cost_savings    0.757833
dtype: float64

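# We see that, for Logistic Regression, setting class weights without cost sensitivity seems to be the best approach overall: it has the highest mean F1 score (0.70) and AUC-PR (0.79), while its mean cost savings (0.748) are close to those of the cost-sensitive model (0.746) and BMR (0.758). We will repeat this process for other, more sophisticated models and choose the best one as our final model.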