## Imports

In [0]:
!pip install imbalanced-learn

In [0]:
!pip install boruta

In [0]:
!pip install sklearn-genetic

In [0]:
!pip install hyperopt

In [0]:
!pip install xgboost

In [0]:
!pip install lightgbm

In [0]:
!pip install implicit

In [0]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType, DateType, DoubleType

import numpy as np
import pandas as pd
from IPython.display import Image
import warnings
from sklearn.feature_selection import f_classif, mutual_info_classif, SequentialFeatureSelector, SelectKBest
from scipy.stats import chi2_contingency
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, recall_score, precision_score, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import cross_val_predict, LeaveOneOut, cross_val_score, StratifiedKFold, GridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
from boruta import BorutaPy
from genetic_selection import GeneticSelectionCV
from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from functools import partial
import xgboost as xgb
from lightgbm import LGBMClassifier
from scipy.sparse import coo_matrix
import implicit
from implicit.evaluation import ndcg_at_k

In [0]:
# Reuse the active Spark session if one exists, otherwise create one
# registered under the application name "ifood_case".
spark = (
    SparkSession.builder
    .appName("ifood_case")
    .getOrCreate()
)

In [0]:
# DBFS location of the processed Parquet dataset that is loaded into
# `df_features` in the Extract section.
path_features = 'dbfs:/FileStore/ifood_case/data/processed/df_customers_offers.parquet'

## Extract

In [0]:
# Load the processed feature table from Parquet.
df_features = spark.read.parquet(path_features)

# Show how many rows fall in each class of the `offer_accepted` label.
(
    df_features
    .groupBy('offer_accepted')
    .count()
    .display()
)

offer_accepted,count
0,45856
1,24000


In [0]:
# Collect the Spark DataFrame into a pandas DataFrame; the scikit-learn
# pipelines below are fitted on data selected from this frame.
df = df_features.toPandas()

In [0]:
# Preview the first rows of the pandas frame.
df.head()

Unnamed: 0,account_id,split,amount,offer_id,min_value,duration,discount_value,avg_ticket,variability_ticket,offer_acceptance_rate,...,is_female,is_other,credit_card_limit,offer_accepted,is_bogo_offer,is_discount_offer,is_informational_offer,has_web_notification,has_mobile_notification,has_social_notification
0,2c4f8a7e0933415ca5ceda66946c1255,train,0.0,5a8bc65990b245e5a138643cd4eb9837,0,3.0,0,19.543333,1.944154,0.25,...,0,0,79000.0,0,0,0,1,0,1,1
1,27586221a0a5482b960ac41506d2c316,train,0.0,4d5c57ea9a6940dd891ad53e9dbe8da0,10,5.0,10,27.118,8.582239,0.285714,...,0,0,114000.0,0,1,0,0,1,1,1
2,36399e0a45554cafabaab01cdb80c480,validation,0.0,0b1e1539f2cc45b7b9fa7c272da2e1d7,20,10.0,5,28.39,2.941751,0.333333,...,1,0,84000.0,0,0,1,0,1,0,0
3,aeea18cf2e8d455c98453c546292a9f6,train,0.0,4d5c57ea9a6940dd891ad53e9dbe8da0,10,5.0,10,22.278571,6.869938,0.428571,...,1,0,73000.0,0,1,0,0,1,1,1
4,5fc96150a4994e3c982e723d06d35e8b,train,0.0,5a8bc65990b245e5a138643cd4eb9837,0,3.0,0,24.08,4.299209,0.285714,...,1,0,81000.0,0,0,0,1,0,1,1


## ML Tests

### Feature Transformers

In [0]:
# Label column predicted by the models.
target = 'offer_accepted'
# Columns kept out of the model inputs (`split` is used below to partition
# the rows into train / test / validation).
key_feats = ['account_id', 'offer_id', 'split', 'amount']

# Derive the feature lists from `df`, the pandas frame created from
# `df_features`. The previous reference to `df_features_pd` pointed at a name
# that is never defined in this notebook and raised NameError on a fresh kernel.
feats = df.drop(columns=[target] + key_feats).columns
# Object-dtype columns are treated as categorical, everything else as numeric.
num_feats = [feat for feat in feats if df[feat].dtype != 'O']
cat_feats = [feat for feat in feats if feat not in num_feats]

In [0]:
# Numeric branch: two views of the same numeric columns, concatenated side by side.
#   * 'num_pipe' standardises and then imputes with SimpleImputer's default
#     strategy. Scaling runs first; per scikit-learn's documented NaN handling,
#     StandardScaler ignores NaNs when fitting and keeps them when transforming.
#   * 'nan_flag' adds missing-value indicator columns; error_on_new=False
#     tolerates missing values at transform time in columns that had none
#     during fit.
num_transformer = FeatureUnion(
    transformer_list=[
        (
            'num_pipe',
            Pipeline(
                steps=[
                    ('norm', StandardScaler()),
                    ('nan_input', SimpleImputer()),
                ]
            ),
        ),
        ('nan_flag', MissingIndicator(error_on_new=False)),
    ]
)

# Full preprocessing: numeric branch on `num_feats`, one-hot encoding on
# `cat_feats` (categories unseen during fit are ignored at transform time).
# sparse_threshold=0 makes the combined output a dense array.
feat_transformer = ColumnTransformer(
    transformers=[
        ('num_trans', num_transformer, num_feats),
        ('cat_trans', OneHotEncoder(handle_unknown='ignore'), cat_feats),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

In [0]:
# Partition the pandas frame by its `split` column and, for each partition,
# separate the model inputs (`feats`) from the label.
df_train = df.loc[df['split'] == 'train']
X_train, y_train = df_train[feats], df_train["offer_accepted"]

# NOTE(review): "test" is every row whose split is not 'train', so it also
# contains the 'validation' rows selected below — confirm this overlap is
# intended.
df_test = df.loc[df['split'] != 'train']
X_test, y_test = df_test[feats], df_test["offer_accepted"]

df_validation = df.loc[df['split'] == 'validation']
X_validation, y_validation = df_validation[feats], df_validation["offer_accepted"]


### Models

In [0]:
def optimize_hyperparameters(opt_space, pipe, X, y, max_evals=100):
    """Tune ``pipe`` with hyperopt's TPE search, maximising macro-averaged F1.

    Parameters
    ----------
    opt_space : dict
        Search space handed to ``fmin``. Each sampled point is applied to the
        pipeline with ``set_params(**point)``, so keys must be valid
        parameter names of ``pipe``.
    pipe : estimator
        Template estimator. It is cloned for every trial, so the object
        passed in is not fitted or modified here.
    X, y
        Data used for the 3-fold ``cross_val_predict`` of every trial.
    max_evals : int, default 100
        Number of trials run by ``fmin``.

    Returns
    -------
    The value returned by ``fmin`` called with ``return_argmin=False``, i.e.
    the best point expressed as parameter values that can be passed straight
    to ``set_params``.
    """

    def _negative_macro_f1(params):
        # hyperopt minimises, so the F1 score is negated.
        candidate = clone(pipe).set_params(**params)
        oof_preds = cross_val_predict(candidate, X, y, cv=3, n_jobs=-1)
        macro_f1 = f1_score(y, oof_preds, average='macro')
        return -macro_f1

    return fmin(
        _negative_macro_f1,
        space=opt_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        return_argmin=False,
    )

def optimize_als_hyperparameters(opt_space, train_matrix, val_matrix, max_evals=50):
    """Tune an implicit ALS model with hyperopt's TPE search, maximising NDCG@10.

    Parameters
    ----------
    opt_space : dict
        Search space handed to ``fmin``. Every sampled point must provide the
        keys ``'factors'``, ``'regularization'``, ``'iterations'`` and
        ``'alpha'``.
    train_matrix
        Interaction matrix used for fitting. It is multiplied by the sampled
        ``alpha`` and cast to double before ``fit``; the unscaled matrix is
        what is passed to ``ndcg_at_k``.
        (presumably a scipy sparse user-item matrix — confirm against caller)
    val_matrix
        Held-out interactions scored by ``ndcg_at_k``.
    max_evals : int, default 50
        Number of trials run by ``fmin``.

    Returns
    -------
    The value returned by ``fmin`` called with ``return_argmin=False``.
    """

    def _negative_ndcg(params):
        model = implicit.als.AlternatingLeastSquares(
            # hyperopt's quantized samplers (e.g. hp.quniform) yield floats
            # such as 64.0; these two settings are counts, so coerce them.
            factors=int(params['factors']),
            regularization=params['regularization'],
            iterations=int(params['iterations']),
            random_state=42,
        )
        model.fit((train_matrix * params['alpha']).astype('double'))

        # hyperopt minimises, so the ranking score is negated.
        return -ndcg_at_k(model, train_matrix, val_matrix, K=10)

    return fmin(
        _negative_ndcg,
        space=opt_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        return_argmin=False,
    )

### Algorithms Comparison

In [0]:
# Logistic-regression candidate: preprocessing -> SMOTE oversampling -> classifier.
# The resampler is seeded (like the classifier) so that repeated fits with the
# same hyperparameters give the same result.
lr_pipe = Pipeline(
    [
        ('feat_trans', feat_transformer),
        ('over', SMOTE(random_state=0)),
        ('logreg', LogisticRegression(random_state=0)),
    ]
)

# Search space; keys address the 'logreg' step of the pipeline. Plain values
# (not hp.* expressions) are held fixed across all trials.
lr_opt_space = {'logreg__warm_start' : hp.choice('logreg__warm_start', [True, False]),
                'logreg__fit_intercept' : hp.choice('logreg__fit_intercept', [True, False]),
                'logreg__tol' : hp.uniform('logreg__tol', 0.00001, 0.0001),
                'logreg__C' : hp.uniform('logreg__C', 0.05, 3),
                'logreg__solver' : hp.choice('logreg__solver', ['newton-cg', 'lbfgs', 'liblinear']),
                'logreg__multi_class' : 'auto',
                'logreg__class_weight' : 'balanced'}

# Tune on the training split only, then refit the best configuration on it.
best_hypers = optimize_hyperparameters(lr_opt_space, clone(lr_pipe), X_train, y_train)
model = clone(lr_pipe).set_params(**best_hypers).fit(X_train, y_train)

# Evaluate the model that was fitted on the training split. The previous
# version ran cross_val_predict on (X_test, y_test), which re-trained fresh
# clones on folds of the test data and never used `model`, so the report did
# not describe the tuned model's held-out performance. The imblearn Pipeline
# applies the SMOTE step during fit only, so prediction sees un-resampled rows.
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

print(classification_report(y_test, y_pred))

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]  1%|          | 1/100 [00:09<15:53,  9.63s/trial, best loss: -0.5437228672807652]  2%|▏         | 2/100 [00:15<12:32,  7.68s/trial, best loss: -0.5437228672807652]  3%|▎         | 3/100 [00:22<11:48,  7.30s/trial, best loss: -0.5437228672807652]  4%|▍         | 4/100 [00:28<10:55,  6.83s/trial, best loss: -0.5437228672807652]  5%|▌         | 5/100 [00:35<10:32,  6.66s/trial, best loss: -0.5437228672807652]  6%|▌         | 6/100 [00:41<10:10,  6.50s/trial, best loss: -0.5437228672807652]  7%|▋         | 7/100 [00:47<09:55,  6.41s/trial, best loss: -0.5437228672807652]  8%|▊         | 8/100 [00:54<09:50,  6.42s/trial, best loss: -0.5437228672807652]  9%|▉         | 9/100 [01:00<09:37,  6.35s/trial, best loss: -0.5438692218254706] 10%|█         | 10/100 [01:06<09:21,  6.23s/trial, best loss: -0.5441364396906996] 11%|█         | 11/100 [01:12<09:15,  6.24s/trial, best loss: -0.5441364396906996] 12%|█▏        | 12/100 [01:

In [0]:
# Inspect the predicted labels produced by the cell above.
y_pred

Out[56]: array([0, 0, 0, ..., 0, 0, 0])

In [0]:
# XGBoost candidate: preprocessing -> SMOTE oversampling -> classifier.
# Resampler and classifier are seeded so that repeated fits with the same
# hyperparameters give the same result.
xgb_pipe = Pipeline(
    [
        ('feat_trans', feat_transformer),
        ('over', SMOTE(random_state=0)),
        ('xgb', xgb.XGBClassifier(random_state=0)),
    ]
)

# Search space; keys address the 'xgb' step of the pipeline. `n_estimators`
# is a plain value and therefore held fixed across all trials.
xgb_opt_space = {'xgb__max_depth': hp.choice('xgb__max_depth', np.arange(1, 14, dtype=int)),
                 'xgb__gamma': hp.uniform('xgb__gamma', 1, 9),
                 'xgb__reg_alpha': hp.quniform('xgb__reg_alpha', 40, 180, 1),
                 'xgb__reg_lambda': hp.uniform('xgb__reg_lambda', 0, 1),
                 'xgb__colsample_bytree': hp.uniform('xgb__colsample_bytree', 0.5, 1),
                 'xgb__min_child_weight': hp.quniform('xgb__min_child_weight', 0, 10, 1),
                 'xgb__n_estimators': 180}

# Tune on the training split only, then refit the best configuration on it.
best_hypers = optimize_hyperparameters(xgb_opt_space, clone(xgb_pipe), X_train, y_train)
model = clone(xgb_pipe).set_params(**best_hypers).fit(X_train, y_train)

# Evaluate the model that was fitted on the training split. The previous
# version ran cross_val_predict on (X_test, y_test), which re-trained fresh
# clones on folds of the test data and never used `model`, so the report did
# not describe the tuned model's held-out performance. The imblearn Pipeline
# applies the SMOTE step during fit only, so prediction sees un-resampled rows.
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

print(classification_report(y_test, y_pred))

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]  1%|          | 1/100 [00:07<12:01,  7.29s/trial, best loss: -0.5803161714969923]  2%|▏         | 2/100 [00:14<11:34,  7.09s/trial, best loss: -0.5803161714969923]  3%|▎         | 3/100 [00:21<11:20,  7.02s/trial, best loss: -0.5803161714969923]  4%|▍         | 4/100 [00:28<11:19,  7.08s/trial, best loss: -0.5803161714969923]  5%|▌         | 5/100 [00:35<11:12,  7.08s/trial, best loss: -0.5803161714969923]  6%|▌         | 6/100 [00:42<10:58,  7.01s/trial, best loss: -0.5803161714969923]  7%|▋         | 7/100 [00:49<10:46,  6.95s/trial, best loss: -0.5803161714969923]  8%|▊         | 8/100 [00:56<10:37,  6.92s/trial, best loss: -0.5803161714969923]  9%|▉         | 9/100 [01:02<10:31,  6.94s/trial, best loss: -0.5803161714969923] 10%|█         | 10/100 [01:09<10:23,  6.93s/trial, best loss: -0.5803161714969923] 11%|█         | 11/100 [01:16<10:20,  6.97s/trial, best loss: -0.5803161714969923] 12%|█▏        | 12/100 [01: