In [8]:
# import models
import pandas as pd
import numpy as np

#pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 

#models
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

**IMPORT DATA**

In [9]:
from project_fraud.data import cleaned_featured_data

In [10]:
df = cleaned_featured_data('~/data/')

In [11]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,hours,cardID,mean,min,max,median,dist_mean,dist_median,dist_mean_rel,dist_median_rel
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,0.0,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0.0,2755404.0150.0mastercard102.0credit,235.020796,10.0,6085.23,115.0,-206.020796,-86.0,-0.876607,-0.747826
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,0.0,4663490.0150.0visa166.0debit,96.791005,12.5,994.0,59.0,-37.791005,0.0,-0.390439,0.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,0.0,18132567.0150.0mastercard117.0debit,123.308485,6.0,3190.0,59.95,-73.308485,-9.95,-0.594513,-0.165972
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,4497514.0150.0mastercard102.0credit,96.972222,20.95,200.0,108.95,-46.972222,-58.95,-0.484388,-0.541074


In [12]:
df.shape

(590540, 239)

**SPLIT DATA**

In [13]:
X = df[['TransactionID','card1','card2','addr1','TransactionAmt','card5','D15','C13','D2','D10','D4','weekday','hours','dist_mean', 'dist_median','dist_mean_rel','dist_median_rel']]
y = df['isFraud']

In [14]:
X.shape

(590540, 17)

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

#X_test_1, X_val, y_test_1, y_val = train_test_split(X_test, y_test, test_size=0.00017, random_state=1)

**SAVE VAL_DATA**

X_val.to_csv('val_data.csv', index =False, na_rep='nan')

**PIPELINES**

In [16]:
n = (X.dtypes != 'object')
num_cols = list(n[n].index)
medium_missing_num_cols = []
low_missing_num_cols =[]
for i in num_cols:
    percentage = df[i].isnull().sum() * 100 / len(df[i])
    if percentage < 15:
        low_missing_num_cols.append(i)
    elif percentage >= 15 and percentage <= 60:
        medium_missing_num_cols.append(i)

In [17]:
# trainer.py

num_transformer_low = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

num_transformer_medium = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('one_hot', OneHotEncoder())
])
    
preprocessor = ColumnTransformer([
    ('low_num_imputer',num_transformer_low, low_missing_num_cols),
    ('medium_num_imputer', num_transformer_medium, medium_missing_num_cols),
    ('cat_transformer', cat_pipeline, ['weekday','hours'])],
    remainder='drop')

**MODEL XGB**

In [11]:
from xgboost.sklearn import XGBClassifier 

In [12]:
pipeline_xgb = Pipeline([
    ("preprocessor", preprocessor),
    ("xgb_classifier", XGBClassifier(categories='auto') ),
     ])

In [13]:
pipeline_xgb.fit(X_train, y_train)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Parameters: { categories } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('low_num_imputer', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
  ...cale_pos_weight=1, subsample=1,
       tree_method='exact', validate_parameters=1, verbosity=None))])

In [14]:
y_pred_xgb = pipeline_xgb.predict(X_test)

In [15]:
from sklearn.metrics import f1_score
f1_xgb = f1_score(y_test,y_pred_xgb)
f1_xgb

0.3997063681409433

In [16]:
from sklearn.metrics import recall_score
recall_xgb =recall_score(y_test,y_pred_xgb)
recall_xgb

0.26342525399129174

**MODEL LOGREG**

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
pipeline_log = Pipeline([
    ("preprocessor", preprocessor),
    ("log_reg", LogisticRegression()),
     ])

In [19]:
pipeline_log.fit(X_train, y_train)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('low_num_imputer', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
  ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [20]:
y_pred_log = pipeline_log.predict(X_test)

In [21]:
from sklearn.metrics import f1_score
f1_log = f1_score(y_test,y_pred_log)
f1_log

0.0

In [22]:
from sklearn.metrics import recall_score
recall_log =recall_score(y_test,y_pred_log)
recall_log

0.0

**MODEL RANDOMFOREST**

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
pipeline_rfc = Pipeline([
    ("preprocessor", preprocessor),
    ("rfc", RandomForestClassifier(n_estimators=1500, max_features='auto', max_depth = 50)),
     ])

In [25]:
pipeline_rfc.fit(X_train, y_train)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('low_num_imputer', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
  ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [26]:
y_pred_rfc = pipeline_rfc.predict(X_test)

In [27]:
from sklearn.metrics import f1_score
f1_rfc = f1_score(y_test,y_pred_rfc)
f1_rfc

0.5130217623974314

In [28]:
from sklearn.metrics import recall_score
recall_rfc =recall_score(y_test,y_pred_rfc)
recall_rfc

0.3478471214320271

**MODEL LGB**

In [18]:
import lightgbm as lgb
lgb = lgb.LGBMClassifier()

In [19]:
pipeline_lgb = Pipeline([
    ("preprocessor", preprocessor),
    ("lgb", lgb),
     ])

In [20]:
pipeline_lgb.fit(X_train, y_train)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('low_num_imputer', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
  ...0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0))])

In [21]:
y_pred_lgb = pipeline_lgb.predict(X_test)

In [22]:
from sklearn.metrics import f1_score
f1_lgb = f1_score(y_test,y_pred_lgb)
f1_lgb

0.28611277146842

In [23]:
from sklearn.metrics import recall_score
recall_lgb =recall_score(y_test,y_pred_lgb)
recall_lgb

0.17368166424770198

**SUMMARY**

In [24]:
print(f'XGB_f1 = {f1_xgb}')
print(f'XGB_recall = {recall_xgb}', '\n')

print(f'LOG_f1 = {f1_log}')
print(f'LOG_recall = {recall_log}', '\n')

print(f'RFC_f1 = {f1_rfc}')
print(f'RFC_recall = {recall_rfc}', '\n')

print(f'LGB_f1 = {f1_lgb}')
print(f'LGB_recall = {recall_lgb}', '\n')

NameError: name 'f1_xgb' is not defined