In [1]:
# import models
import pandas as pd
import numpy as np

#pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 

#models
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

**IMPORT DATA**

In [2]:
from project_fraud.data import cleaned_featured_data

In [3]:
df = cleaned_featured_data('~/data/')

In [4]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,hours,cardID,mean,min,max,median,dist_mean,dist_median,dist_mean_rel,dist_median_rel
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,0.0,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0.0,2755404.0150.0mastercard102.0credit,235.020796,10.0,6085.23,115.0,-206.020796,-86.0,-0.876607,-0.747826
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,0.0,4663490.0150.0visa166.0debit,96.791005,12.5,994.0,59.0,-37.791005,0.0,-0.390439,0.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,0.0,18132567.0150.0mastercard117.0debit,123.308485,6.0,3190.0,59.95,-73.308485,-9.95,-0.594513,-0.165972
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,4497514.0150.0mastercard102.0credit,96.972222,20.95,200.0,108.95,-46.972222,-58.95,-0.484388,-0.541074


In [5]:
df.shape

(590540, 239)

**SPLIT DATA**

In [6]:
X = df[['TransactionID','card1','card2','addr1','TransactionAmt','card5','D15','C13','D2','D10','D4','weekday','hours','dist_mean', 'dist_median','dist_mean_rel','dist_median_rel']]
y = df['isFraud']

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3)

sample_size = 15000

X_small = X_train.sample(sample_size, random_state=0)

y_small = y_train.sample(sample_size, random_state=0)

X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(X_small, y_small, random_state=0)

**SAVE VAL_DATA**

X_val.to_csv('val_data.csv', index =False, na_rep='nan')

**PIPELINES**

In [8]:
n = (X.dtypes != 'object')
num_cols = list(n[n].index)
medium_missing_num_cols = []
low_missing_num_cols =[]
for i in num_cols:
    percentage = df[i].isnull().sum() * 100 / len(df[i])
    if percentage < 15:
        low_missing_num_cols.append(i)
    elif percentage >= 15 and percentage <= 60:
        medium_missing_num_cols.append(i)

In [9]:
# trainer.py

num_transformer_low = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

num_transformer_medium = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('one_hot', OneHotEncoder())
])
    
preprocessor = ColumnTransformer([
    ('low_num_imputer',num_transformer_low, low_missing_num_cols),
    ('medium_num_imputer', num_transformer_medium, medium_missing_num_cols),
    ('cat_transformer', cat_pipeline, ['weekday','hours'])],
    remainder='drop')

**MODEL XGB**

In [10]:
from xgboost.sklearn import XGBClassifier 

In [11]:
pipeline_xgb = Pipeline([
    ("preprocessor", preprocessor),
    ("xgb_classifier", XGBClassifier() ),
     ])

In [12]:
pipeline_xgb.fit(X_train_small, y_train_small)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('low_num_imputer', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
  ...cale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None))])

In [13]:
y_pred_small_xgb = pipeline_xgb.predict(X_test_small)

In [14]:
from sklearn.metrics import f1_score
f1_xgb = f1_score(y_test_small,y_pred_small_xgb)
f1_xgb

0.18478260869565216

In [15]:
from sklearn.metrics import recall_score
recall_xgb =recall_score(y_test_small,y_pred_small_xgb)
recall_xgb

0.10759493670886076

**MODEL LOGREG**

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
pipeline_log = Pipeline([
    ("preprocessor", preprocessor),
    ("log_reg", LogisticRegression()),
     ])

In [18]:
pipeline_log.fit(X_train_small, y_train_small)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('low_num_imputer', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
  ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [19]:
y_pred_small_log = pipeline_log.predict(X_test_small)

In [20]:
from sklearn.metrics import f1_score
f1_log = f1_score(y_test_small,y_pred_small_log)
f1_log

  'precision', 'predicted', average, warn_for)


0.0

In [21]:
from sklearn.metrics import recall_score
recall_log =recall_score(y_test_small,y_pred_small_log)
recall_log

0.0

**MODEL SVC**

In [22]:
from sklearn.svm import SVC

In [23]:
pipeline_svc = Pipeline([
    ("preprocessor", preprocessor),
    ("scv", SVC()),
     ])

In [24]:
pipeline_svc.fit(X_train_small, y_train_small)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('low_num_imputer', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
  ...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [27]:
y_pred_small_svc = pipeline_svc.predict(X_test_small)

In [28]:
from sklearn.metrics import f1_score
f1_svc = f1_score(y_test_small,y_pred_small_svc)
f1_svc

  'precision', 'predicted', average, warn_for)


0.0

In [29]:
from sklearn.metrics import recall_score
recall_svc =recall_score(y_test_small,y_pred_small_svc)
recall_svc

0.0

**MODEL RANDOMFOREST**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
pipeline_rfc = Pipeline([
    ("preprocessor", preprocessor),
    ("rfc", RandomForestClassifier()),
     ])

In [None]:
pipeline_rfc.fit(X_train, y_train)

In [None]:
y_pred_rfc = pipeline_rfc.predict(X_test)

In [None]:
from sklearn.metrics import f1_score
f1_rfc = f1_score(y_test,y_pred_rfc)
f1_rfc

In [None]:
from sklearn.metrics import recall_score
recall_rfc =recall_score(y_test,y_pred_rfc)
recall_rfc

**MODEL LGB**

In [None]:
import lightgbm as lgb
lgb = lgb.LGBMClassifier()

In [None]:
pipeline_lgb = Pipeline([
    ("preprocessor", preprocessor),
    ("rfc", lgb),
     ])

In [None]:
pipeline_lgb.fit(X_train, y_train)

In [None]:
y_pred_lgb = pipeline_lgb.predict(X_test)

In [None]:
from sklearn.metrics import f1_score
f1_lgb = f1_score(y_test,y_pred_lgb)
f1_lgb

In [None]:
from sklearn.metrics import recall_score
recall_lgb =recall_score(y_test,y_pred_lgb)
recall_lgb

**MODEL KNN**

In [None]:
#from sklearn.neighbors import KNeighborsClassifier

In [None]:
#pipeline_knn = Pipeline([
    #("preprocessor", preprocessor),
    #("knn", KNeighborsClassifier()),
     #])

In [None]:
#pipeline_knn.fit(X_train, y_train)

In [None]:
#y_pred_knn = pipeline_knn.predict(X_test)

In [None]:
#from sklearn.metrics import f1_score
#f1_knn = f1_score(y_test,y_pred_knn)
#f1_knn

In [None]:
#from sklearn.metrics import recall_score
#recall_knn =recall_score(y_test,y_pred_knn)
#recall_knn

**SUMMARY**

In [None]:
print(f'XGB_f1 = {f1_xgb}')
print(f'XGB_recall = {recall_xgb}', '\n')

print(f'LOG_f1 = {f1_log}')
print(f'LOG_recall = {recall_log}', '\n')

print(f'SVC_f1 = {f1_svc}')
print(f'SVC_recall = {recall_svc}', '\n')

print(f'RFC_f1 = {f1_rfc}')
print(f'RFC_recall = {recall_rfc}', '\n')

print(f'LGB_f1 = {f1_lgb}')
print(f'LGB_recall = {recall_lgb}', '\n')

#print(f'KNN_f1 = {f1_knn}')
#print(f'KNN_recall = {recall_knn}', '\n')