In [1]:
import os
from pathlib import Path

import evalml
import matplotlib.pyplot as plt
import pandas as pd
from evalml.automl import AutoMLSearch
from evalml.objectives import FraudCost

# Directory holding the input data. Set the CREDITCARD_DATA_DIR environment
# variable to point elsewhere; the default is the path this notebook was
# originally written with, so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get("CREDITCARD_DATA_DIR", "c:/users/omar/DATA"))

dataframe = pd.read_csv(DATA_DIR / "creditcard.csv")

# Features are every column except the last one; the last column is the target.
x = dataframe.iloc[:, :-1].copy()
y = dataframe.iloc[:, -1].copy()

# The fraud-cost objective configured later in this notebook uses
# amount_col='Amount', so fail early with a clear message if it is missing.
assert "Amount" in x.columns, "expected an 'Amount' column among the features"

# TODO: data wrangling - drop irrelevant features and handle missing values.

In [2]:
# Split off a hold-out set (test_size=0.2, fixed seed) that the search never sees.
x_train, x_hold, y_train, y_hold = evalml.preprocessing.split_data(
    x,
    y,
    problem_type="binary",
    test_size=0.2,
    random_seed=0,
)

# Custom objective the search optimizes; the transaction amount is read from
# the 'Amount' feature column.
fraud_objective = FraudCost(
    retry_percentage=0.5,
    interchange_fee=0.02,
    fraud_payout_percentage=0.75,
    amount_col="Amount",
)

# Run a single batch of pipelines, ranking by the fraud-cost objective and
# additionally reporting AUC, F1 and precision for each pipeline.
automl = AutoMLSearch(
    x_train,
    y_train,
    problem_type="binary",
    objective=fraud_objective,
    additional_objectives=["auc", "f1", "precision"],
    max_batches=1,
    optimize_thresholds=True,
)
automl.search()


Generating pipelines to search over...

*****************************
* Beginning pipeline search *
*****************************

Optimizing for Fraud Cost. 
Lower score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: xgboost, random_forest, lightgbm, catboost, decision_tree, linear_model, extra_trees



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Evaluating Baseline Pipeline: Mode Baseline Binary Classification Pipeline
Mode Baseline Binary Classification Pipeline:
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.002

*****************************
* Evaluating Batch Number 1 *
*****************************

Decision Tree Classifier w/ Imputer:
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.010
Extra Trees Classifier w/ Imputer:
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.010
CatBoost Classifier w/ Imputer:
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.010
Random Forest Classifier w/ Imputer:
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.009
LightGBM Classifier w/ Imputer:
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.002
XGBoost Classifier w/ Imputer:
	Starting cross validation
	Finished cross validation - mean Fraud Cost: 0.006
Elastic Net Classifier w/ Impu

In [3]:
# Display the table of searched pipelines, one row per pipeline with its
# cross-validation score on the search objective.
automl.rankings

Unnamed: 0,id,pipeline_name,mean_cv_score,standard_deviation_cv_score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,0,Mode Baseline Binary Classification Pipeline,0.001606,,0.001606,0.0,False,{'Baseline Classifier': {'strategy': 'mode'}}
1,5,LightGBM Classifier w/ Imputer,0.002039,,0.002039,-0.043318,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,6,XGBoost Classifier w/ Imputer,0.005868,,0.005868,-0.426202,False,{'Imputer': {'categorical_impute_strategy': 'm...
3,8,Logistic Regression Classifier w/ Imputer + St...,0.007311,,0.007311,-0.570534,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,4,Random Forest Classifier w/ Imputer,0.008501,,0.008501,-0.68956,False,{'Imputer': {'categorical_impute_strategy': 'm...
5,2,Extra Trees Classifier w/ Imputer,0.009957,,0.009957,-0.835189,False,{'Imputer': {'categorical_impute_strategy': 'm...
6,3,CatBoost Classifier w/ Imputer,0.009966,,0.009966,-0.836087,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,1,Decision Tree Classifier w/ Imputer,0.009967,,0.009967,-0.836126,False,{'Imputer': {'categorical_impute_strategy': 'm...
8,7,Elastic Net Classifier w/ Imputer + Standard S...,0.009968,,0.009968,-0.836269,False,{'Imputer': {'categorical_impute_strategy': 'm...


In [4]:
# Keep a handle on the pipeline the search selected as best; it is scored on
# the hold-out data further down.
best_pipeline = automl.best_pipeline

# Print the full description of the pipeline in the first row of the rankings
# table. NOTE(review): in the recorded output this is the mode baseline.
top_ranked_id = automl.rankings.iloc[0]["id"]
automl.describe_pipeline(top_ranked_id)


************************************************
* Mode Baseline Binary Classification Pipeline *
************************************************

Problem Type: binary
Model Family: Baseline

Pipeline Steps
1. Baseline Classifier
	 * strategy : mode

Training
Training for binary problems.
Objective to optimize binary classification pipeline thresholds for: <evalml.objectives.fraud_cost.FraudCost object at 0x000002590121A3C8>
Total training time (including CV): 3.6 seconds

Cross Validation
----------------
            Fraud Cost AUC  F1 Precision # Training # Validation
0                0.002 0.5 0.0       0.0        510      170,884
mean             0.002 0.5 0.0       0.0          -            -
std                  -   -   -         -          -            -
coef of var          -   - inf       inf          -            -


In [5]:
# Score the selected pipeline on the hold-out split, reporting both AUC and
# the custom fraud-cost objective.
best_pipeline.score(
    x_hold,
    y_hold,
    objectives=["auc", fraud_objective],
)

OrderedDict([('AUC', 0.5), ('Fraud Cost', 0.0014974103776637443)])