In [2]:
import pandas as pd
import numpy as np
import  pyspark.sql. functions as F
import pyspark.sql.types as T

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

import plotly.graph_objects as go
from plotly.colors import DEFAULT_PLOTLY_COLORS

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

spark = SparkSession.builder.appName("iFood Case").master("local[*]").getOrCreate()

25/07/11 21:34:47 WARN Utils: Your hostname, MARCO-PC resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/07/11 21:34:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/11 21:34:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
BASE_PATH = '/home/marco/ifood-case'
DATA_PROCESSED_PATH = BASE_PATH + '/data/processed/'

## Preprocessed Data Load

In [4]:
df_profile = spark.read.csv(DATA_PROCESSED_PATH + 'profile_processed.csv', header=True, inferSchema=True)
df_offers = spark.read.csv(DATA_PROCESSED_PATH + 'offers_processed.csv', header=True, inferSchema=True)
df_transactions = spark.read.csv(DATA_PROCESSED_PATH + 'event_transaction_processed.csv', header=True, inferSchema=True)
df_offer_received = spark.read.csv(DATA_PROCESSED_PATH + 'event_offer_received_processed.csv', header=True, inferSchema=True)
df_offer_viewed = spark.read.csv(DATA_PROCESSED_PATH + 'event_offer_viewed_processed.csv', header=True, inferSchema=True)
df_offer_completed = spark.read.csv(DATA_PROCESSED_PATH + 'event_offer_completed_processed.csv', header=True, inferSchema=True)

## Feature Engineering

### Timeline Features

In [5]:
df_offer_transactions_tl = (
    df_transactions
    .join(
        df_offer_completed,
        on=['account_id', 'transaction_time',],
        how='left'
    )
    .withColumn('event', F.when(
        F.col('offer_id').isNotNull(), F.lit('offer completed')).otherwise(F.lit('transaction'))
    )
    .select([
        F.col('account_id'),
        F.col('offer_id'),
        F.col('transaction_time').alias('time'),
        F.col('event'),
        F.col('amount'),
        F.col('reward')
    ])
    .fillna(0, subset=['amount', 'reward'])
)

df_offer_received_tl = (
    df_offer_received
    .select([
        F.col('account_id'),
        F.col('offer_id'),
        F.col('received_time').alias('time'),
        F.lit('offer received').alias('event'),
        F.lit(0).alias('amount'),
        F.lit(0).alias('reward')
    ])
)

df_offer_viewed_tl = (
    df_offer_viewed
    .select([
        F.col('account_id'),
        F.col('offer_id'),
        F.col('viewed_time').alias('time'),
        F.lit('offer viewed').alias('event'),
        F.lit(0).alias('amount'),
        F.lit(0).alias('reward')
    ])
)

In [6]:
df_timeline_features = (
    df_offer_transactions_tl
    .union(df_offer_received_tl)
    .union(df_offer_viewed_tl)
    .orderBy(['account_id', 'time'])
    .join(
        df_offers,
        on='offer_id',
        how='left'
    )
    .withColumn('transactions_count', 
        (F.col('event') == 'transaction').cast(T.IntegerType())
    )
    .withColumn('received_bogo_count', 
        ((F.col('event') == 'offer received') & 
         (F.col('offer_type') == 'bogo')).cast(T.IntegerType())
    )
    .withColumn('received_discount_count', 
        ((F.col('event') == 'offer received') & 
         (F.col('offer_type') == 'discount')).cast(T.IntegerType())
    )
    .withColumn('received_info_count', 
        ((F.col('event') == 'offer received') & 
         (F.col('offer_type') == 'informational')).cast(T.IntegerType())
    )
    .withColumn('viewed_bogo_count', 
        ((F.col('event') == 'offer viewed') & 
         (F.col('offer_type') == 'bogo')).cast(T.IntegerType())
    )
    .withColumn('viewed_discount_count', 
        ((F.col('event') == 'offer viewed') & 
         (F.col('offer_type') == 'discount')).cast(T.IntegerType())
    )
    .withColumn('viewed_info_count', 
        ((F.col('event') == 'offer viewed') & 
         (F.col('offer_type') == 'informational')).cast(T.IntegerType())
    )
    .withColumn('completed_bogo_count', 
        ((F.col('event') == 'offer completed') & 
         (F.col('offer_type') == 'bogo')).cast(T.IntegerType())
    )
    .withColumn('completed_discount_count', 
        ((F.col('event') == 'offer completed') & 
         (F.col('offer_type') == 'discount')).cast(T.IntegerType())
    )
    .withColumn('completed_info_count', 
        ((F.col('event') == 'offer completed') & 
         (F.col('offer_type') == 'informational')).cast(T.IntegerType())
    )
)

w = Window.partitionBy('account_id').orderBy('time').rowsBetween(Window.unboundedPreceding, Window.currentRow)

df_timeline_features = (
    df_timeline_features
    .groupBy(['account_id', 'time']).sum()
    .withColumn('total_amount', F.round(F.sum('sum(amount)').over(w), 2))
    .withColumn('total_reward', F.round(F.sum('sum(reward)').over(w), 2))
    .withColumn('total_transactions', F.sum('sum(transactions_count)').over(w))
    .withColumn('total_received_bogo', F.sum('sum(received_bogo_count)').over(w))
    .withColumn('total_received_discount', F.sum('sum(received_discount_count)').over(w))
    .withColumn('total_received_info', F.sum('sum(received_info_count)').over(w))
    .withColumn('total_viewed_bogo', F.sum('sum(viewed_bogo_count)').over(w))
    .withColumn('total_viewed_discount', F.sum('sum(viewed_discount_count)').over(w))
    .withColumn('total_viewed_info', F.sum('sum(viewed_info_count)').over(w))
    .withColumn('total_completed_bogo', F.sum('sum(completed_bogo_count)').over(w))
    .withColumn('total_completed_discount', F.sum('sum(completed_discount_count)').over(w))
    .withColumn('total_completed_info', F.sum('sum(completed_info_count)').over(w))
)

drop_agg_cols = [col for col in df_timeline_features.columns if col.startswith('sum(')]
df_timeline_features = df_timeline_features.drop(*drop_agg_cols)

### Offer Event Features

In [7]:
df_offers_scope = (
    df_offers
    .filter(F.col('offer_type').isin(['bogo', 'discount']))
    .drop('channels')
    .dropDuplicates()
)

df_offer_event_features = (
    df_transactions
    .join(
        df_offer_completed, 
        on=['account_id', 'transaction_time'], 
        how='left'
    )
    .join(
        df_offer_received.drop('amount', 'reward'), 
        on=['account_id', 'offer_id'], 
        how='outer'
    )
    .join(
        df_offers_scope, 
        on=['offer_id'], 
        how='inner'
    )
    .withColumn('offer_consumed', F.col('transaction_time').isNotNull().cast(T.IntegerType()))
    .drop('transaction_time', 'amount', 'reward')
    .withColumnRenamed('received_time', 'time')
)

### Offer Types Datasets

In [8]:
df_bogo_features = (
    df_offer_event_features
    .filter(F.col('offer_type') == 'bogo')
    .join(
        df_profile, 
        on=['account_id'], 
        how='inner'
    )
    .join(
        df_timeline_features, 
        on=['account_id', 'time'],
        how='left'
    )
    .drop('offer_type')
    .dropDuplicates()
)

df_discount_features = (
    df_offer_event_features
    .filter(F.col('offer_type') == 'discount')
    .join(
        df_profile, 
        on=['account_id'], 
        how='inner'
    )
    .join(
        df_timeline_features, 
        on=['account_id', 'time'],
        how='left'
    )
    .drop('offer_type')
    .dropDuplicates()
)

## Modeling

In [9]:
df_bogo = df_bogo_features.toPandas().set_index(['account_id', 'offer_id']).fillna(0)
df_discount = df_discount_features.toPandas().set_index(['account_id', 'offer_id']).fillna(0)

25/07/11 21:34:54 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

### Model for BOGO Offers

In [10]:
bogo_value_counts = df_bogo['offer_consumed'].value_counts()
scale_pos_weight_b = bogo_value_counts[0] / bogo_value_counts[1]

df_bogo['random_feature'] = np.random.rand(len(df_bogo))

#### DAtaset Split

In [11]:
X_b = df_bogo.drop(columns=['offer_consumed'])
y_b = df_bogo['offer_consumed']

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b, y_b, test_size=0.3, random_state=42, stratify=y_b
)

#### GridSearch

In [12]:
params = {
    'n_estimators': [100, 1000],
    'learning_rate': [0.01, 0.1],
    'max_depth': [4, 6, 8],
    'min_child_weight': [1, 5],
    'gamma': [0, 0.2],
}

xgb = XGBClassifier(
    objective='binary:logistic', 
    scale_pos_weight=scale_pos_weight_b, 
    use_label_encoder=False, 
    eval_metric='logloss'
)

grid = GridSearchCV(
    estimator=xgb, 
    param_grid=params, 
    scoring='roc_auc', 
    cv=3, 
)

grid.fit(X_train_b, y_train_b)
grid.best_params_

{'gamma': 0,
 'learning_rate': 0.01,
 'max_depth': 4,
 'min_child_weight': 5,
 'n_estimators': 1000}

#### Model Training

In [13]:
model_b = XGBClassifier(
    objective='binary:logistic', 
    scale_pos_weight=scale_pos_weight_b, 
    use_label_encoder=False, 
    eval_metric='logloss',
    **grid.best_params_,
    random_state=42
)

model_b.fit(X_train_b, y_train_b)
y_pred_b = model_b.predict(X_test_b)

print(classification_report(y_test_b, y_pred_b))

              precision    recall  f1-score   support

           0       0.68      0.78      0.73      3035
           1       0.85      0.78      0.81      4927

    accuracy                           0.78      7962
   macro avg       0.77      0.78      0.77      7962
weighted avg       0.79      0.78      0.78      7962



#### Feature Importances

In [14]:
importances_b = model_b.feature_importances_
features_names_b = X_b.columns

df_importances_b = pd.DataFrame({
    "Feature": features_names_b,
    "Importância": importances_b
}).sort_values(by="Importância", ascending=False)

fig = go.Figure([
    go.Bar(
        x=df_importances_b["Importância"],
        y=df_importances_b["Feature"],
        orientation='h'
    )
])

fig.update_layout(
    title="Ofertas BOGO - Importância das Features",
    xaxis_title="Importância",
    yaxis_title="Feature",
    yaxis=dict(autorange="reversed"),
    height=600,
)

fig.show()

### Model for Discount Offers

In [15]:
discount_value_counts = df_discount['offer_consumed'].value_counts()
scale_pos_weight_d = discount_value_counts[0] / bogo_value_counts[1]

df_discount['random_feature'] = np.random.rand(len(df_discount))

#### DAtaset Split

In [16]:
X_d = df_discount.drop(columns=['offer_consumed'])
y_d = df_discount['offer_consumed']

X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_d, y_d, test_size=0.3, random_state=42, stratify=y_d
)

#### GridSearch

In [17]:
params = {
    'n_estimators': [100, 1000],
    'learning_rate': [0.01, 0.1],
    'max_depth': [4, 6, 8],
    'min_child_weight': [1, 5],
    'gamma': [0, 0.2],
}

xgb = XGBClassifier(
    objective='binary:logistic', 
    scale_pos_weight=scale_pos_weight_d,
    use_label_encoder=False, 
    eval_metric='logloss'
)

grid = GridSearchCV(
    estimator=xgb, 
    param_grid=params, 
    scoring='roc_auc', 
    cv=3, 
)

grid.fit(X_train_d, y_train_d)
grid.best_params_

{'gamma': 0.2,
 'learning_rate': 0.1,
 'max_depth': 4,
 'min_child_weight': 5,
 'n_estimators': 100}

#### Model Training

In [18]:
model_d = XGBClassifier(
    objective='binary:logistic', 
    scale_pos_weight=scale_pos_weight_d, 
    use_label_encoder=False, 
    eval_metric='logloss',
    **grid.best_params_,
    random_state=42
)

model_d.fit(X_train_d, y_train_d)
y_pred_d = model_d.predict(X_test_d)

print(classification_report(y_test_d, y_pred_d))

              precision    recall  f1-score   support

           0       0.63      0.77      0.69      2541
           1       0.88      0.79      0.83      5459

    accuracy                           0.78      8000
   macro avg       0.75      0.78      0.76      8000
weighted avg       0.80      0.78      0.79      8000



#### Feature Importances

In [19]:
importances_d = model_d.feature_importances_
features_names_d = X_d.columns

df_importances_d = pd.DataFrame({
    "Feature": features_names_d,
    "Importância": importances_d
}).sort_values(by="Importância", ascending=False)

fig = go.Figure([
    go.Bar(
        x=df_importances_d["Importância"],
        y=df_importances_d["Feature"],
        orientation='h'
    )
])

fig.update_layout(
    title="Ofertas BOGO - Importância das Features",
    xaxis_title="Importância",
    yaxis_title="Feature",
    yaxis=dict(autorange="reversed"),
    height=600,
)

fig.show()