In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [5]:
!jt -r

Reset css and font defaults in:
/home/joaquin/.jupyter/custom &
/home/joaquin/.local/share/jupyter/nbextensions


In [6]:
!pip install imbalanced-learn
!pip install category_encoders
!pip install optuna



In [7]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from category_encoders.leave_one_out import LeaveOneOutEncoder

from category_encoders.wrapper import NestedCVWrapper

from sklearn.preprocessing import OneHotEncoder

from imblearn.over_sampling import SMOTE
import imblearn

In [8]:
import optuna
from optuna.exceptions import TrialPruned
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [9]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [10]:
from sklearn.metrics import recall_score


In [11]:
!pip install xgboost



In [12]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Levanto DF

In [13]:
train_transactions = pd.read_csv('./train_transaction.csv')
train_identity = pd.read_csv('./train_identity.csv')
#test_transactions = pd.read_csv('test_transaction.csv')
#test_identity = pd.read_csv('test_identity.csv')

In [14]:
df_train = pd.merge(train_transactions,train_identity, how ='inner', on = 'TransactionID')#.sample(frac=0.25)
#df_test = pd.merge(test_transactions,test_identity, how ='inner', on = 'TransactionID')

In [15]:
df_train.shape

(144233, 434)

# Esquema de Columnas

In [16]:
numeric_features = df_train.select_dtypes(include='number').columns
object_features = df_train.select_dtypes(include='object').columns

numeric_features = [ x for x in numeric_features if x not in ['TransactionID','isFraud','TransactionDT']]
object_features = [ x for x in object_features if x not in ['TransactionID','isFraud','TransactionDT']]

In [17]:
print("Cantidad de Columnas Numericas: "+ str(len(numeric_features)))
print("Cantidad de Columnas Categoricas: "+ str(len(object_features)))

Cantidad de Columnas Numericas: 400
Cantidad de Columnas Categoricas: 31


In [18]:
df_train.isFraud.value_counts(normalize=True)

0    0.92153
1    0.07847
Name: isFraud, dtype: float64

In [19]:
df_train.isFraud.isna().sum()

0

# Split Train Test 

In [20]:

df2 = df_train.dropna(subset=['isFraud'])
df2.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,0,86535,15.0,H,2803,100.0,150.0,visa,226.0,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,0,86549,75.887,C,16496,352.0,117.0,mastercard,134.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,0,86555,16.495,C,4461,375.0,185.0,mastercard,224.0,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0,86620,30.0,H,1790,555.0,150.0,visa,226.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [21]:
X, X_val, y, y_val = train_test_split(df2.drop(['isFraud'],axis=1),df2.isFraud, test_size=0.3, random_state=42,stratify=df2.isFraud)


# Transformacion de Variables 

In [22]:
cols_tr =  numeric_features + object_features


In [23]:
fill_nan_defaults = {
    "numerical": -999,
    "categorical": "null",
    'ignored' : "null"
}

In [24]:
#ignored_transformer = Pipeline([('imputer', SimpleImputer(strategy='constant',fill_value=fill_nan_defaults['ignored']))])
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='constant',fill_value=fill_nan_defaults['numerical']))])
categorical_transformer = Pipeline([('imputer', SimpleImputer(strategy='constant',fill_value=fill_nan_defaults['categorical'])),
                            ('loenc', NestedCVWrapper(LeaveOneOutEncoder(random_state=17),cv = 5 , random_state=42))])


preprocessor = ColumnTransformer(
    transformers=[
    #    ("ignored", ignored_transformer, ignored_features),
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, object_features),
    ]
)

In [25]:
X_pre = preprocessor.fit_transform(X,y)
X_val_pre = preprocessor.transform(X_val)

In [26]:
X_pre.shape

(100963, 431)

In [27]:
X_pre_pd = pd.DataFrame(X_pre)
X_pre_pd.columns = cols_tr

In [28]:
X_pre_pd.head()

Unnamed: 0,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,C1,...,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,100.0,15875.0,555.0,150.0,226.0,315.0,87.0,-999.0,-999.0,1.0,...,0.05518,0.052937,0.058298,0.064618,0.045131,0.081638,0.064254,0.097751,0.101652,0.063401
1,43.382,14276.0,177.0,185.0,137.0,-999.0,-999.0,-999.0,-999.0,2.0,...,0.117835,0.091638,0.111753,0.117687,0.12239,0.081638,0.0838,0.097751,0.065296,0.064707
2,200.0,17947.0,371.0,150.0,226.0,299.0,87.0,-999.0,-999.0,1.0,...,0.031033,0.099742,0.02793,0.039809,0.044742,0.081877,0.083433,0.09755,0.064939,0.064425
3,64.77,9300.0,103.0,185.0,138.0,-999.0,-999.0,-999.0,-999.0,1.0,...,0.117835,0.104275,0.111753,0.117687,0.12239,0.081638,0.0838,0.097751,0.065296,0.106581
4,100.0,7262.0,583.0,150.0,135.0,330.0,87.0,-999.0,-999.0,1.0,...,0.030965,0.05744,0.028532,0.038956,0.044717,0.082,0.083767,0.098747,0.06503,0.065319


In [29]:
import gc
gc.collect()

108

# SMOTE

In [30]:
sm = SMOTE(random_state=42,sampling_strategy=0.3)


In [31]:
X_res, y_res = sm.fit_resample(X_pre_pd, y)

In [32]:
X_res.shape

(120952, 431)

In [33]:
len(y_res)-len(y)

19989

# Optuna

In [None]:
import optuna.integration.lightgbm as lgb
def default_objective(trial):

    param = {
                "eval_metric"           : "auc",
                "verbosity"        : 3,
                "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
                "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
                "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
                'max_depth'        :trial.suggest_int('max_depth', 2, 25),
                "n_estimators"     : trial.suggest_int("n_estimators", 50, 10000),
                "learning_rate"   : trial.suggest_loguniform('learning_rate', 1e-8, 0.2),
                "reg_alpha": trial.suggest_loguniform('reg_alpha', 1e-8, 5),
                "reg_lambda": trial.suggest_loguniform('reg_lambda', 1e-8, 5),
                "min_split_gain": trial.suggest_loguniform('min_split_gain', 1e-5, 1e0),
                'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree',0.1,1,0.01),
                'subsample' : trial.suggest_uniform('subsample' ,0.2,0.8)
            }
    
    #model_pipeline  = make_pipeline(pre_pipe, LGBMClassifier(**param, silent=True,
                                                                # metric='auc',
    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42, stratify=y_res)
                                                            
    model_pipeline = make_pipeline(None, LGBMClassifier( **param, n_jobs=-1
                                                                ))                                          
    m = model_pipeline.fit(X_train, y_train)
    
    preds           = m.predict_proba(X_test)[:,1]
   # preds           = m.predict(X_test)
    roc_score       = roc_auc_score(y_test, preds)
   # f1_test= f1_score(y_test,preds)
    preds_train     = m.predict_proba(X_train)[:,1]
   # preds_train     = m.predict(X_train)
  #  f1_train = f1_score(y_train,preds_train)
    roc_score_train = roc_auc_score(y_train, preds_train)
    trial.set_user_attr('overfit', roc_score_train - roc_score)
    if abs(roc_score_train - roc_score) < 0.05:
        return roc_score
    raise TrialPruned()

    if objective  == None:
        objective = default_objective
    else:
        objective = objective

In [None]:
def do_optuna_tuning(minimize_overfit = True):
    study = optuna.create_study(direction = "maximize")
    study.optimize(default_objective, n_trials = 150,n_jobs=-1)

    if minimize_overfit:
        trial      = study.best_trial
        best_trial = sorted([t for t in study.trials if t.value != None and t.value >= trial.value - 0.0005],
                            key = lambda a: a.user_attrs['overfit'])[0]
    else:
        best_trial = study.best_trial
    params         = best_trial.params
    params.update({'random_state': 42}) 
    return study,trial,best_trial

In [None]:
study,trial,best_trial=do_optuna_tuning()

In [None]:
best_trial


In [None]:
optuna.visualization.plot_optimization_history(study)


In [None]:
best_trial.params

## RandomizedSearchCV

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, RandomizedSearchCV, cross_validate


In [None]:
pars = {'n_estimators':range(50, 200, 5),'learning_rate':np.arange(0,1,0.01), 'max_depth':range(2,25,3), 'n_estimators':range(50,500,25),
        'reg_alpha':[0.1,0.01,0.001,1,0.0001],'reg_lambda':[0.1,0.01,0.001,1,0.0001]}
clf = RandomizedSearchCV(XGBClassifier(), pars, n_jobs=-1, scoring='roc_auc',cv=3, n_iter=500)


In [None]:
clf.fit(X_res, y_res)
ab = clf.best_estimator_
print (clf.best_score_, clf.best_params_)
print("ROC training : {:.3f}".format(ab.score(X_res, y_res)))
print("ROC test: {:.3f}".format(ab.score(X_val, y_val)))

# Re entreno mejores hiperparametros

In [34]:
#params = best_trial.params
params = {'booster': 'dart',
 'gamma': 6.809838165457656e-07,
 'grow_policy': 'depthwise',
 'max_depth': 18,
 'n_estimators': 5139,
 'learning_rate': 0.08398279121454409,
 'reg_alpha': 1.1871503314088151e-06,
 'reg_lambda': 2.36401299672874e-05,
 #'min_split_gain': 1.996500311031048e-05,
 'colsample_bytree': 0.8,
 'subsample': 0.6244547423128666,
 'random_state': 42}

In [35]:
XGB_t = XGBClassifier(**params, verbosity=1,tree_method = 'gpu_hist', n_jobs=-1,feval = 'roc_auc_score')

In [36]:
#?LGBM_t.fit()

In [37]:
XGB_t.fit(X_res,y_res)#,eval_metric='recall_score')


Parameters: { "feval" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [38]:
train_out = XGB_t.predict_proba(X_res)
test_out  = XGB_t.predict_proba(X_val_pre)

In [39]:
print("ROC Train: "+str(roc_auc_score(np.array(y_res),train_out[:,1])))
print("ROC Val: "+str(roc_auc_score(np.array(y_val),test_out[:,1])))

ROC Train: 1.0
ROC Val: 0.9786459157621618


In [40]:
train_out = XGB_t.predict(X_res)
test_out  = XGB_t.predict(X_val_pre)

In [41]:

print("F1 Score Train: "+str(f1_score(np.array(y_res),train_out)))
print("F1 Score Val: "+str(f1_score(np.array(y_val),test_out)))

F1 Score Train: 1.0
F1 Score Val: 0.7311192530433754


In [42]:

print("Recal Score Train: "+str(recall_score(np.array(y_res),train_out)))
print("Recall Score Val: "+str(recall_score(np.array(y_val),test_out)))

Recal Score Train: 1.0
Recall Score Val: 0.9110456553755523


In [43]:

print("Accuracy Score Train: "+str(accuracy_score(np.array(y_res),train_out)))
print("Accuracy Score Val: "+str(accuracy_score(np.array(y_val),test_out)))

Accuracy Score Train: 1.0
Accuracy Score Val: 0.9474231569216547


In [44]:
np.histogram(train_out)

(array([93040,     0,     0,     0,     0,     0,     0,     0,     0,
        27912]),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]))

In [45]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp =  confusion_matrix(y_val,test_out).ravel()

In [46]:
(tn, fp, fn, tp)

(37902, 1973, 302, 3093)

In [47]:
import joblib


In [48]:
pipe = make_pipeline(preprocessor,XGB_t)


joblib.dump(pipe, './xgb.pkl')



['./xgb.pkl']