In [9]:
# import models
import pandas as pd
import numpy as np

#pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 

#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [1]:
from project_fraud.data import cleaned_featured_data

**IMPORT DATA**

In [2]:
df = cleaned_featured_data('~/data/')

In [3]:
X = df[['TransactionID','card1','card2','addr1','TransactionAmt','card5','D15','C13','D2','D10','D4','weekday','hours','dist_mean', 'dist_median','dist_mean_rel','dist_median_rel']]
y = df['isFraud']

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

**PIPELINES**

In [18]:
n = (X.dtypes != 'object')
num_cols = list(n[n].index)
medium_missing_num_cols = []
low_missing_num_cols =[]
for i in num_cols:
    percentage = df[i].isnull().sum() * 100 / len(df[i])
    if percentage < 15:
        low_missing_num_cols.append(i)
    elif percentage >= 15 and percentage <= 60:
        medium_missing_num_cols.append(i)

In [19]:
num_transformer_low = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

num_transformer_medium = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('one_hot', OneHotEncoder())
])
    
preprocessor = ColumnTransformer([
    ('low_num_imputer',num_transformer_low, low_missing_num_cols),
    ('medium_num_imputer', num_transformer_medium, medium_missing_num_cols),
    ('cat_transformer', cat_pipeline, ['weekday','hours'])
],remainder='drop')


**MODEL RANDOM FOREST CLASSIFIER**

In [20]:
pipeline_rfc = Pipeline([
    ("preprocessor", preprocessor),
    ("model_rfc", RandomForestClassifier()),
     ])

In [21]:
# Number of trees in random forest
#n_estimators = 
# Number of features to consider at every split
#max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
#max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
#max_depth.append(None)
# Minimum number of samples required to split a node
#min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
#min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
#bootstrap = [True, False]

In [35]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# hyperparameter 

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

random_grid ={
    'model_rfc__n_estimators' : [int(x) for x in np.linspace(start = 0, stop = 2000, num = 100)],
    'model_rfc__max_features': ['auto', 'sqrt'],
    'model_rfc__max_depth': max_depth,
    #'model_rfc__min_samples_split': [2, 5, 10],
    #'model_rfc__min_samples_leaf': [1, 2, 4],
    #'model_rfc__bootstrap': [True, False]
}

scorers = {
    'recall_score': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score)
}

# Random search 
random_search = RandomizedSearchCV(
    pipeline_rfc, 
    random_grid,
    cv=5,
    n_iter = 3,
    scoring = scorers, 
    refit = 'recall_score')

random_search.fit(X_train, y_train)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  'precision', 'predicted', average, warn_for)
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the cate

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('low_num_imputer', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
  ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
          fit_params=None, iid='warn', n_iter=3, n_jobs=None,
          param_distributions={'model_rfc__n_estimators': [0, 20, 40, 60, 80, 101, 121, 141, 161, 181, 202, 222, 242, 262, 282, 303, 323, 343, 363, 383, 404, 424, 444, 464, 484, 505, 525, 545, 565, 585, 606, 626, 646, 666, 686, 707, 727, 747, 767, 787, 808, 828, 848, 868, 888, 909, 929, 949, 969, 989, 1010, 1...es': ['auto', 'sqrt'], 'model_rfc__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]},
          pr

In [36]:
random_search.best_score_

0.3096380617119186

In [37]:
random_search.best_params_

{'model_rfc__n_estimators': 1494,
 'model_rfc__max_features': 'auto',
 'model_rfc__max_depth': 50}

What is the performance of the optimal pipeline? Make sure you cross validate!

In [None]:
from sklearn.model_selection import cross_val_predict

y_pred_2 = cross_val_predict(random_search.best_estimator_, X, y, cv=5)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [None]:
y_pred_2