In [1]:
## Install featuretools for this notebook session through the Databricks library utility.
## NOTE(review): no version is pinned, while later cells call EntitySet.entity_from_dataframe
## and ft.variable_types -- confirm which featuretools release provides them and pin it.
dbutils.library.installPyPI('featuretools')

In [2]:
import pandas as pd
import featuretools as ft
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

## Part I: data load + transformation

## Load the file (JSON Lines: one record per line)
df = pd.read_json("https://github.com/unravelin/code-test-data-science/raw/master/customers.json", lines=True)

## force col ordering
df = df[['customer', 'fraudulent', 'orders', 'paymentMethods', 'transactions']]

## data preview
df.head(10)

## Flatten the nested records into three flat tables:
##   df_target       -> one row per customer record (user_id, fraud_label)
##   df_orders       -> one row per order, tagged with the owning customer
##   df_transactions -> one row per transaction, tagged with the owning customer
## Rows are collected in plain lists and each DataFrame is built once at the end:
## growing a frame with DataFrame.append inside the loop copies it on every
## iteration, and that method no longer exists in pandas >= 2.0.
target_records = []
order_records = []
transaction_records = []

## columns are read by name rather than by integer position inside the row
for customer, user_label, orders, transactions in zip(
        df['customer'], df['fraudulent'], df['orders'], df['transactions']):
    user_id = customer['customerEmail']

    target_records.append({'user_id': user_id, 'fraud_label': user_label})

    ## dict(..., CustomerId=...) builds a copy, so the nested dicts held by df stay untouched
    for order in orders:
        order_records.append(dict(order, CustomerId=user_id))

    for transaction in transactions:
        transaction_records.append(dict(transaction, CustomerId=user_id))

## explicit columns keep the expected schema even when the input is empty
df_target = pd.DataFrame(target_records, columns=['user_id', 'fraud_label'])
df_orders = pd.DataFrame(order_records)
df_transactions = pd.DataFrame(transaction_records)

## The outputs recorded in this notebook show this flag encoded as 0.0 / 1.0
## (e.g. "transactionFailed = 0.0"); cast to float to keep that encoding
## (presumably a side effect of the former row-by-row append -- TODO confirm).
if 'transactionFailed' in df_transactions.columns:
    df_transactions['transactionFailed'] = df_transactions['transactionFailed'].astype(float)

print("="*80)
print("Data chargee")
print("="*80)

In [3]:
## Build the featuretools EntitySet: one entity per pandas dataframe loaded above.

es = ft.EntitySet(id="fraud")

## keep a single row per user_id so it can act as the parent key of the relationships below
df_target = df_target.drop_duplicates(subset='user_id')

es.entity_from_dataframe(entity_id='users', dataframe=df_target, make_index=False)

orders_variable_types = {"orderState": ft.variable_types.Categorical}
es.entity_from_dataframe(entity_id='orders',
                         dataframe=df_orders,
                         index='orderId',
                         make_index=False,
                         variable_types=orders_variable_types)

trx_variable_types = {
    "paymentMethodId": ft.variable_types.Categorical,
    "transactionFailed": ft.variable_types.Categorical,
}
es.entity_from_dataframe(entity_id='trx',
                         dataframe=df_transactions,
                         index='transactionId',
                         make_index=False,
                         variable_types=trx_variable_types)


## Relationships are declared parent -> child (1:n):
## users -> orders, users -> trx, orders -> trx

user_orders = ft.Relationship(es["users"]["user_id"], es["orders"]["CustomerId"])
user_trx = ft.Relationship(es["users"]["user_id"], es["trx"]["CustomerId"])
order_trx = ft.Relationship(es["orders"]["orderId"], es["trx"]["orderId"])

for relationship in (user_orders, user_trx, order_trx):
    es = es.add_relationship(relationship)

es.add_interesting_values()

## Compute the feature matrix with the Deep Feature Synthesis algorithm (DFS).
## Join keys, the shipping address and the fraud label are excluded from feature generation.

ignored_variables = {
    "trx": ["CustomerId", "orderId"],
    "orders": ["CustomerId", "orderShippingAddress"],
    "users": ["fraud_label"],
}

fm_full, feature_defs = ft.dfs(
    entityset=es,
    target_entity="users",
    max_depth=1,
    where_primitives=["count"],
    groupby_trans_primitives=["cum_sum"],
    ignore_variables=ignored_variables,
    verbose=True,
)
## for this example drop categorical (object-dtype) feature columns
categ_cols = fm_full.select_dtypes(include='object').columns.tolist()

fm_full_export = fm_full.drop(categ_cols, axis=1)

## merge with the id + label table; the join key is spelled out instead of being
## inferred from whatever columns the two frames happen to share
fm_full_export = fm_full_export.reset_index()
fm_full_export2 = pd.merge(left=df_target, right=fm_full_export, how='inner', on='user_id')

## print(feature_defs)

## persist to parquet
import os
## os.getcwd() carries no trailing separator, so PATH + 'fm_full.parquet' used to glue
## the file name onto the directory name and write next to the working directory.
## Joining with '' appends the separator, which keeps every existing
## PATH + '<file name>' expression in this notebook valid and inside the cwd.
PATH = os.path.join(os.getcwd(), '')
fm_full_export2.to_parquet(PATH + 'fm_full.parquet', index=True, compression='snappy')

print("="*80)
print("Features engineering complete !")
print("="*80)

In [4]:
## Preview the first rows of the merged table (user id, fraud label, generated features).
fm_full_export2.head(4)

Unnamed: 0,user_id,fraud_label,SUM(orders.orderAmount),STD(orders.orderAmount),MAX(orders.orderAmount),SKEW(orders.orderAmount),MIN(orders.orderAmount),MEAN(orders.orderAmount),COUNT(orders),NUM_UNIQUE(orders.orderState),SUM(orders.trx.transactionAmount),STD(orders.trx.transactionAmount),MAX(orders.trx.transactionAmount),SKEW(orders.trx.transactionAmount),MIN(orders.trx.transactionAmount),MEAN(orders.trx.transactionAmount),COUNT(orders.trx),NUM_UNIQUE(orders.trx.transactionFailed),NUM_UNIQUE(orders.trx.paymentMethodId),MODE(orders.trx.transactionFailed),SUM(trx.transactionAmount),STD(trx.transactionAmount),MAX(trx.transactionAmount),SKEW(trx.transactionAmount),MIN(trx.transactionAmount),MEAN(trx.transactionAmount),COUNT(trx),NUM_UNIQUE(trx.transactionFailed),NUM_UNIQUE(trx.paymentMethodId),MODE(trx.transactionFailed),COUNT(orders WHERE orderState = fulfilled),COUNT(orders WHERE orderState = failed),COUNT(orders WHERE orderState = pending),COUNT(orders.trx WHERE transactionFailed = 0.0),COUNT(orders.trx WHERE transactionFailed = 1.0),COUNT(trx WHERE transactionFailed = 0.0),COUNT(trx WHERE transactionFailed = 1.0),COUNT(orders.trx WHERE orders.orderState = fulfilled),COUNT(orders.trx WHERE orders.orderState = pending),COUNT(orders.trx WHERE orders.orderState = failed),COUNT(trx WHERE orders.orderState = fulfilled),COUNT(trx WHERE orders.orderState = pending),COUNT(trx WHERE orders.orderState = failed)
0,josephhoward@yahoo.com,False,44.0,5.656854,26.0,,18.0,22.0,2.0,2.0,44.0,5.656854,26.0,,18.0,22.0,2.0,1.0,1.0,0.0,44.0,5.656854,26.0,,18.0,22.0,2.0,1.0,1.0,0.0,1.0,0.0,1.0,2.0,0.0,2.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
1,evansjeffery@yahoo.com,True,111.0,12.165525,45.0,-1.679536,23.0,37.0,3.0,1.0,111.0,12.165525,45.0,-1.679536,23.0,37.0,3.0,2.0,2.0,0.0,111.0,12.165525,45.0,-1.679536,23.0,37.0,3.0,2.0,2.0,0.0,3.0,0.0,0.0,2.0,1.0,2.0,1.0,3.0,0.0,0.0,3.0,0.0,0.0
2,andersonwilliam@yahoo.com,False,82.0,4.932883,33.0,1.652317,24.0,27.333333,3.0,2.0,131.0,3.834058,33.0,2.141565,24.0,26.2,5.0,2.0,2.0,0.0,131.0,3.834058,33.0,2.141565,24.0,26.2,5.0,2.0,2.0,0.0,2.0,1.0,0.0,3.0,2.0,3.0,2.0,4.0,0.0,1.0,4.0,0.0,1.0
3,rubenjuarez@yahoo.com,False,85.0,4.932883,34.0,1.652317,25.0,28.333333,3.0,1.0,85.0,4.932883,34.0,1.652317,25.0,28.333333,3.0,1.0,1.0,0.0,85.0,4.932883,34.0,1.652317,25.0,28.333333,3.0,1.0,1.0,0.0,3.0,0.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0


In [5]:
# =============================================================================
# MODELLING: standard Random Forest
# =============================================================================

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

## read features matrix
df_all = pd.read_parquet(PATH + 'fm_full.parquet')

## fill missing values with a sentinel value
## NOTE(review): the sentinel is applied before the correlation filter below, so it
## takes part in the correlations -- confirm this is intended.
df_all = df_all.fillna(-9999)

## encode target values to binary flag 1/0
df_all['target_class'] = df_all['fraud_label'].map({True:1, False:0})
y = df_all['target_class']

## drop id, target
df_all = df_all.drop(columns=['user_id', 'fraud_label'])

## very basic feature selection based on a correlation threshold

# Threshold for removing correlated variables
threshold = 0.95

## the target is left out of the correlation matrix: the filter is about redundancy
## between input features only
corr_matrix = df_all.drop(columns=['target_class']).corr().abs()
## keep the strict upper triangle so each pair is looked at once; the builtin bool
## replaces the np.bool alias that recent NumPy releases no longer provide
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
collinear_features = [column for column in upper.columns if (upper[column] > threshold).any()]

df_nocorr = df_all.drop(columns=collinear_features)


inputs_cols = [x for x in df_nocorr if 'target_class' not in x]

X = df_nocorr[inputs_cols]

## Train / test random split.
## random_state makes the split (and every metric computed from it) reproducible;
## stratify=y keeps the same class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=47, stratify=y)

print('# records training data:', len(X_train))
print('# records test data:',len(X_test))

X.head(4)

Unnamed: 0,SUM(orders.orderAmount),STD(orders.orderAmount),MAX(orders.orderAmount),SKEW(orders.orderAmount),COUNT(orders),STD(orders.trx.transactionAmount),SKEW(orders.trx.transactionAmount),COUNT(orders.trx),COUNT(orders WHERE orderState = failed),COUNT(orders WHERE orderState = pending),COUNT(orders.trx WHERE transactionFailed = 1.0),COUNT(orders.trx WHERE orders.orderState = fulfilled),COUNT(orders.trx WHERE orders.orderState = pending),COUNT(orders.trx WHERE orders.orderState = failed)
0,44.0,5.656854,26.0,-9999.0,2.0,5.656854,-9999.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0
1,111.0,12.165525,45.0,-1.679536,3.0,12.165525,-1.679536,3.0,0.0,0.0,1.0,3.0,0.0,0.0
2,82.0,4.932883,33.0,1.652317,3.0,3.834058,2.141565,5.0,1.0,0.0,2.0,4.0,0.0,1.0
3,85.0,4.932883,34.0,1.652317,3.0,4.932883,1.652317,3.0,0.0,0.0,0.0,3.0,0.0,0.0


In [6]:
### MODEL BUILDING ###

# classifier hyper-parameters
clf = RandomForestClassifier(n_jobs=2, random_state=47, n_estimators=10, max_depth=8, bootstrap=True)

# run model training
clf.fit(X_train, y_train)

## RF feature importances
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

# feature names sorted by decreasing importance
names = [X_train.columns[i] for i in indices]

## hard class predictions
rf_predictions = clf.predict(X_test)

#### Baseline model AUC #####

## ROC AUC measures how well the model ranks positives above negatives, so it is
## computed from the predicted probability of the positive class (label 1) rather
## than from the thresholded 0/1 predictions.
positive_col = list(clf.classes_).index(1)
rf_scores = clf.predict_proba(X_test)[:, positive_col]
baseline_auc = roc_auc_score(y_test, rf_scores)
print("Random Forest AUC : {0:0.3f}".format(baseline_auc))

print('classement des variables importantes')
print('='*80)
print(names[:10])

In [7]:
## global feature importance, most important first
feature_importances = pd.DataFrame( clf.feature_importances_, index = X_train.columns,columns=['importance']).sort_values('importance',ascending=False)
feature_importances.reset_index(inplace=True)

top_features = feature_importances[:5]
## after reset_index the feature names sit in the first column; use it as the x axis
## so the bars are labelled with feature names instead of row positions
name_col = top_features.columns[0]
ax = top_features.plot.bar(x=name_col, y='importance', rot=45)
ax.set_xlabel('feature')
ax.set_ylabel('importance')
display(top_features)

index,importance
MAX(orders.orderAmount),0.2657238558365813
SUM(orders.orderAmount),0.239662797982079
SKEW(orders.orderAmount),0.0828436970166209
STD(orders.orderAmount),0.0811990164341104
COUNT(orders),0.0724150000299249


In [8]:
## Print the main hyper-parameters of the baseline forest, one per line.
baseline_report = (
    ('max_depth : {}', clf.max_depth),
    ('nb_trees : {}', clf.n_estimators),
    ('max_features : {}', clf.max_features),
)
for template, value in baseline_report:
    print(template.format(value))

In [9]:
### tuning RF ####
param_grid = { "n_estimators"  : [100, 200, 300],
           "max_features"      : [0.5, 0.75],
           "max_depth"         : [8, 10, 16],
           "min_samples_split" : [2, 4]
            }

grid_search = GridSearchCV(clf, param_grid, n_jobs=8, cv=5, scoring='roc_auc')
## Fit on the training split only. Fitting on the full X, y would let the tuned model
## see the rows of X_test, and the AUC measured on them below would be inflated.
grid_search.fit(X_train, y_train)

## keep the best parameter combination and the model refitted with it
rf_best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

## score the held-out split with the best model; as ROC AUC is a ranking metric,
## cv_predictions holds the predicted probability of the positive class (label 1)
positive_col = list(best_rf.classes_).index(1)
cv_predictions = grid_search.predict_proba(X_test)[:, positive_col]
best_auc = roc_auc_score(y_test, cv_predictions)
print("Tuned RF CV_predictions : {0:0.3f}".format(best_auc))

print('='*80)

In [10]:
##### ML FLOW ######
## Install mlflow for this notebook session, then import it.
## NOTE(review): the same install + import sequence is repeated at the top of the
## MLflow logging cell further down; one of the two is presumably redundant.
dbutils.library.installPyPI('mlflow')
import os
import mlflow

In [11]:
## Print the main hyper-parameters of the tuned forest, one per line.
tuned_report = (
    ('max_depth : {}', best_rf.max_depth),
    ('nb_trees : {}', best_rf.n_estimators),
    ('max_features : {}', best_rf.max_features),
)
for template, value in tuned_report:
    print(template.format(value))

In [12]:
##### ML FLOW ######
dbutils.library.installPyPI('mlflow')
import os
import mlflow
import mlflow.sklearn

## A single MLflow run records both models: the baseline forest first, then the tuned one.
## NOTE(review): both AUC values go to the same "auc" metric, told apart only by step
## (1 = baseline, 2 = tuned).
with mlflow.start_run():

  ## baseline random forest: parameters, AUC, model artefact
  baseline_params = (
      ("max_depth", clf.max_depth),
      ("nb_trees", clf.n_estimators),
      ("max_features", clf.max_features),
  )
  for param_name, param_value in baseline_params:
    mlflow.log_param(param_name, param_value)

  mlflow.log_metric("auc", baseline_auc, step=1)
  mlflow.sklearn.log_model(clf, "baseline_RF")

  ## tuned random forest: parameters, AUC, model artefact
  tuned_params = (
      ("best_max_depth", best_rf.max_depth),
      ("best_nb_trees", best_rf.n_estimators),
      ("best_max_features", best_rf.max_features),
  )
  for param_name, param_value in tuned_params:
    mlflow.log_param(param_name, param_value)

  mlflow.log_metric("auc", best_auc, step=2)
  mlflow.sklearn.log_model(best_rf, "tuned_RF")

  ## MLflow can also store plain files: attach the feature matrix
  mlflow.log_artifact(PATH + "fm_full.parquet")

In [13]:
##%sh mlflow models serve --model-uri runs:/101/model --port 54321

In [14]:
## Fetch the runs recorded for the experiment.
## search_runs documents experiment_ids as a list of ids, so the single id is wrapped
## in a list instead of being passed as a bare string.
EXPERIMENT_ID = '2561679846085066'
res = mlflow.search_runs([EXPERIMENT_ID])

In [15]:
## Show up to 20 of the runs returned by the search above.
res.head(20)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.auc,params.best_max_depth,params.nb_trees,params.best_max_features,params.max_features,params.best_nb_trees,params.max_depth,tags.mlflow.databricks.notebookID,tags.mlflow.user,tags.mlflow.databricks.notebookRevisionID,tags.mlflow.source.name,tags.mlflow.databricks.notebookPath,tags.mlflow.databricks.webappURL,tags.mlflow.source.type
0,884a4b1a37134c6baca7cf4a347ef3e3,2561679846085066,FINISHED,dbfs:/databricks/mlflow/2561679846085066/884a4...,2019-12-25 23:11:30.565000+00:00,2019-12-25 23:11:34.330000+00:00,0.875,8,100,0.75,auto,300,5,2561679846085066,mejdoubi2005@gmail.com,1577315494412,/Users/mejdoubi2005@gmail.com/test_RF,/Users/mejdoubi2005@gmail.com/test_RF,https://community.cloud.databricks.com,NOTEBOOK


In [16]:
## retrieve the tuned model from the tracking store
if res.empty:
    raise ValueError("no MLflow run found: there is no model to load")

## artifact locations of the runs, printed for reference
print(res['artifact_uri'])

## Build the model URI from the newest run instead of a hard-coded run id, so the cell
## keeps working after the notebook is re-run. The sort is explicit so the choice
## does not depend on the order in which the runs were returned.
latest_run_id = res.sort_values('start_time', ascending=False).iloc[0]['run_id']
model_uri = 'runs:/{}/tuned_RF'.format(latest_run_id)
print(model_uri)

sk_model = mlflow.sklearn.load_model(model_uri)

print(sk_model)