# Import Modules

In [12]:
import logging
import warnings

import numpy as np
import pandas as pd
from fraud_detection import pipelines, preprocess
from fraud_detection.utils import OUTPUT_PATH, mlflow
from sklearn.base import BaseEstimator
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import ParameterGrid, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

logging.basicConfig(level=logging.WARN)

logger = logging.getLogger(__name__)

warnings.filterwarnings("ignore")

## Load Data

In [13]:
df = pipelines.make_subsample(stratified=True)
X, y = df, df.isFraud
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,259,CASH_OUT,10355.91,C424879898,93838.69,83482.78,C1120137800,655844.96,666200.86,0,0
1,370,CASH_OUT,31369.79,C718359358,8094.0,0.0,C647555714,0.0,31369.79,0,0
2,202,CASH_OUT,276254.46,C180281973,2616.0,0.0,C498203287,0.0,276254.46,0,0
3,16,CASH_OUT,170501.95,C722125644,0.0,0.0,C480876567,1025928.23,1196430.18,0,0
4,656,PAYMENT,1373.53,C1507460935,10428.0,9054.47,M1437153295,0.0,0.0,0,0


# Select Model

## Business Metric

In [14]:
def average_profit(y, y_pred, amount):
    profit = (y_pred * y * amount).sum() * .25
    profit -= (y_pred * (1 - y) * amount).sum() * 0.05
    profit -= ((1 - y_pred) * y * amount).sum()
    total_frauds = (y * amount).sum()
    return (profit / total_frauds) / .25


def average_saving(y, y_pred, amount):
    expected_economy = (y * amount).sum() * .75
    economy = (y_pred * y * amount).sum() * .75
    economy += (y_pred * (1 - y) * amount).sum() * 0.05
    return economy / expected_economy

## Validation Code

In [15]:
def validate(X, y, model: BaseEstimator, fit_params={}):
    score_names = [
        'accuracy', 'precision_macro', 'recall_macro', 'f1_macro',
        'average_profit', 'average_saving'
    ]
    score_metrics = [
        lambda y_true, y_pred, amount: accuracy_score(y_true, y_pred),
        lambda y_true, y_pred, amount: precision_score(
            y_true, y_pred, average='macro'), lambda y_true, y_pred, amount:
        recall_score(y_true, y_pred, average='macro'),
        lambda y_true, y_pred, amount: f1_score(
            y_true, y_pred, average='macro'), average_profit, average_saving
    ]

    score_names = ['test_' + metric for metric in score_names]
    skf = StratifiedKFold(shuffle=True)
    skf.get_n_splits(X, y)
    scores = dict([(metric, []) for metric in score_names])
    for train_index, test_index in skf.split(X, y):
        X_transf = model[:-1].transform(X)
        model[-1].fit(X_transf[train_index], y.iloc[train_index])
        y_pred = model[-1].predict(X_transf[test_index])
        for metric, score in zip(score_names, score_metrics):
            scores[metric].append(
                score(y_pred,
                      y.iloc[test_index],
                      amount=X.iloc[test_index].amount))
    for metric in scores:
        scores[metric] = np.mean(scores[metric])
    return scores

## Set Logging Code

In [16]:
def log(model: BaseEstimator,
        X,
        y,
        model_name,
        parameters: dict = {},
        fit_params={},
        draw=False):
    with mlflow.start_run(run_name=model_name):
        results = validate(X, y, model, fit_params)
        for param, value in parameters.items():
            mlflow.log_param(param, value)
        mlflow.log_param('model', model_name)
        for metric, values in results.items():
            mlflow.log_metric(metric, values.mean())
        mlflow.sklearn.log_model(model, "model")
    if draw:
        pd.DataFrame(results).plot(figsize=(15, 5), kind='box')

## Test Models

In [17]:
def cat2label(df):
    df = df.copy()
    for col in df.columns[df.dtypes == 'category']:
        df[col] = df[col].cat.codes
    return df

In [18]:
preprocess_pipe = preprocess.get_pipe()
same_weight = y.sum() / y.size
fit_params = [
    ('money_swing', dict(clf__sample_weight=(y * 1.2 + .05) * X.amount)),
    ('same',
     dict(clf__sample_weight=(1 - y) * same_weight + y * (1 - same_weight)))
]
models = [
    [('label_encoder', FunctionTransformer(cat2label)),
     ('fillna', FunctionTransformer(lambda df: df.fillna(-1e10).values)),
     ('clf', DecisionTreeClassifier(random_state=0, max_depth=None))],
]

In [19]:
param_grid = [
     dict(numsamples=[50000],
         stratified=[True],
         fit_params=fit_params,
         model=models),
     dict(numsamples=[-1], 
         stratified=[False],
         fit_params=fit_params,
         model=models)]
for params in ParameterGrid(param_grid):
     break
     df = pipelines.make_subsample(params['numsamples'], params['stratified'])
     X, y = df, df.isFraud
     X_preprocess = preprocess_pipe.transform(X)
     pipe = Pipeline(params['model'])
     model_name = pipe[-1].__class__.__name__
     parameters = params.copy()
     parameters['data_weights'] = parameters.pop('fit_params')[0]
     parameters.pop('model')
     log(pipe, X_preprocess, y, model_name, parameters, params['fit_params'][1])

# Results

We choose the decision tree for this problem because of its cabability to detect
condition based classifications.
We tested four diferent configurations of the data:
  - using all the data available;
  - using all the fraudulant cases + the same amount of non fraudulent
  - weighting samples inverse proportional to the frequency;
  - weighting samples proportional to the money at risk;

To verify the performance of the classifier, we messured the expected profit
divided by the maximum possible profit. Profit here is how much money the ensurance
money earns using this classifier. Also, we messure a score on how much money the
back will saved divided by the maximum that can be saved.

Also, to deal with class imbalance, two approaches were tested.
Inverse frequency weights, witch is labeled 'same'.
Weights proportional to the money at risk on the transaction. Those are labeled 'money_swing'.

The results are logged using mlflow, and a table with the results are shown below:

In [21]:
pd.read_csv(OUTPUT_PATH / 'mlflow/runs.csv').drop(
    ['Run ID', 'Source Type', 'Source Name', 'User', 'Status'],
    axis=1).sort_values(['numsamples', 'test_average_profit'])

Unnamed: 0,Name,data_weights,numsamples,stratified,test_average_profit,test_average_saving,test_f1_macro
3,DecisionTreeClassifier,same,-1,False,0.989281,0.997958,0.99735
2,DecisionTreeClassifier,money_swing,-1,False,0.993518,0.998821,0.99744
1,DecisionTreeClassifier,money_swing,50000,True,0.997335,0.999525,0.997626
0,DecisionTreeClassifier,same,50000,True,0.998018,0.999664,0.997747


By training the classifier in all the data available, the decision tree achieved an score of 99.35% of the potential profit beeing earned, and 99.88% of the potential saving beeing achieved by the bank.