In [41]:
import pandas as pd
import numpy as np

import  pyspark.sql. functions as F
import pyspark.sql.types as T

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

spark = SparkSession.builder.appName("iFood Case").master("local[*]").getOrCreate()

In [36]:
BASE_PATH = '/home/marco/ifood-case'
DATA_PROCESSED_PATH = BASE_PATH + '/data/processed/'

In [42]:
df_profile = spark.read.csv(DATA_PROCESSED_PATH + 'profile_processed.csv', header=True, inferSchema=True)
df_offers = spark.read.csv(DATA_PROCESSED_PATH + 'offers_processed.csv', header=True, inferSchema=True)
df_transactions = spark.read.csv(DATA_PROCESSED_PATH + 'event_transaction_processed.csv', header=True, inferSchema=True)
df_offer_received = spark.read.csv(DATA_PROCESSED_PATH + 'event_offer_received_processed.csv', header=True, inferSchema=True)
df_offer_viewed = spark.read.csv(DATA_PROCESSED_PATH + 'event_offer_viewed_processed.csv', header=True, inferSchema=True)
df_offer_completed = spark.read.csv(DATA_PROCESSED_PATH + 'event_offer_completed_processed.csv', header=True, inferSchema=True)

## Feature Engineering

In [43]:
df_offer_transactions_tl = (
    df_transactions
    .join(
        df_offer_completed,
        on=['account_id', 'transaction_time',],
        how='left'
    )
    .withColumn('event', F.when(
        F.col('offer_id').isNotNull(), F.lit('offer completed')).otherwise(F.lit('transaction'))
    )
    .select([
        F.col('account_id'),
        F.col('offer_id'),
        F.col('transaction_time').alias('time'),
        F.col('event'),
        F.col('amount'),
        F.col('reward')
    ])
    .fillna(0, subset=['amount', 'reward'])
)

df_offer_received_tl = (
    df_offer_received
    .select([
        F.col('account_id'),
        F.col('offer_id'),
        F.col('received_time').alias('time'),
        F.lit('offer received').alias('event'),
        F.lit(0).alias('amount'),
        F.lit(0).alias('reward')
    ])
)

df_offer_viewed_tl = (
    df_offer_viewed
    .select([
        F.col('account_id'),
        F.col('offer_id'),
        F.col('viewed_time').alias('time'),
        F.lit('offer viewed').alias('event'),
        F.lit(0).alias('amount'),
        F.lit(0).alias('reward')
    ])
)

In [44]:
df_timeline_features = (
    df_offer_transactions_tl
    .union(df_offer_received_tl)
    .union(df_offer_viewed_tl)
    .orderBy(['account_id', 'time'])
    .join(
        df_offers,
        on='offer_id',
        how='left'
    )
    .withColumn('transactions_count', 
        (F.col('event') == 'transaction').cast(T.IntegerType())
    )
    .withColumn('received_bogo_count', 
        ((F.col('event') == 'offer received') & 
         (F.col('offer_type') == 'bogo')).cast(T.IntegerType())
    )
    .withColumn('received_discount_count', 
        ((F.col('event') == 'offer received') & 
         (F.col('offer_type') == 'discount')).cast(T.IntegerType())
    )
    .withColumn('received_info_count', 
        ((F.col('event') == 'offer received') & 
         (F.col('offer_type') == 'informational')).cast(T.IntegerType())
    )
    .withColumn('viewed_bogo_count', 
        ((F.col('event') == 'offer viewed') & 
         (F.col('offer_type') == 'bogo')).cast(T.IntegerType())
    )
    .withColumn('viewed_discount_count', 
        ((F.col('event') == 'offer viewed') & 
         (F.col('offer_type') == 'discount')).cast(T.IntegerType())
    )
    .withColumn('viewed_info_count', 
        ((F.col('event') == 'offer viewed') & 
         (F.col('offer_type') == 'informational')).cast(T.IntegerType())
    )
    .withColumn('completed_bogo_count', 
        ((F.col('event') == 'offer completed') & 
         (F.col('offer_type') == 'bogo')).cast(T.IntegerType())
    )
    .withColumn('completed_discount_count', 
        ((F.col('event') == 'offer completed') & 
         (F.col('offer_type') == 'discount')).cast(T.IntegerType())
    )
    .withColumn('completed_info_count', 
        ((F.col('event') == 'offer completed') & 
         (F.col('offer_type') == 'informational')).cast(T.IntegerType())
    )
)

w = Window.partitionBy('account_id').orderBy('time').rowsBetween(Window.unboundedPreceding, Window.currentRow)

df_timeline_features = (
    df_timeline_features
    .groupBy(['account_id', 'time']).sum()
    .withColumn('total_amount', F.round(F.sum('sum(amount)').over(w), 2))
    .withColumn('total_reward', F.round(F.sum('sum(reward)').over(w), 2))
    .withColumn('total_transactions', F.sum('sum(transactions_count)').over(w))
    .withColumn('total_received_bogo', F.sum('sum(received_bogo_count)').over(w))
    .withColumn('total_received_discount', F.sum('sum(received_discount_count)').over(w))
    .withColumn('total_received_info', F.sum('sum(received_info_count)').over(w))
    .withColumn('total_viewed_bogo', F.sum('sum(viewed_bogo_count)').over(w))
    .withColumn('total_viewed_discount', F.sum('sum(viewed_discount_count)').over(w))
    .withColumn('total_viewed_info', F.sum('sum(viewed_info_count)').over(w))
    .withColumn('total_completed_bogo', F.sum('sum(completed_bogo_count)').over(w))
    .withColumn('total_completed_discount', F.sum('sum(completed_discount_count)').over(w))
    .withColumn('total_completed_info', F.sum('sum(completed_info_count)').over(w))
)

drop_agg_cols = [col for col in df_timeline_features.columns if col.startswith('sum(')]
df_timeline_features = df_timeline_features.drop(*drop_agg_cols)

In [65]:
df_offers_scope = (
    df_offers
    .filter(F.col('offer_type').isin(['bogo', 'discount']))
    .drop('channels')
    .dropDuplicates()
)

df_offer_event_features = (
    df_transactions
    .join(
        df_offer_completed, 
        on=['account_id', 'transaction_time'], 
        how='left'
    )
    .join(
        df_offer_received.drop('amount', 'reward'), 
        on=['account_id', 'offer_id'], 
        how='outer'
    )
    .join(
        df_offers_scope, 
        on=['offer_id'], 
        how='inner'
    )
    .withColumn('offer_consumed', F.col('transaction_time').isNotNull().cast(T.IntegerType()))
    .drop('transaction_time', 'amount', 'reward')
    .withColumnRenamed('received_time', 'time')
)

In [66]:
df_bogo_features = (
    df_offer_event_features
    .filter(F.col('offer_type') == 'bogo')
    .join(
        df_profile, 
        on=['account_id'], 
        how='inner'
    )
    .join(
        df_timeline_features, 
        on=['account_id', 'time'],
        how='left'
    )
    .drop('offer_type')
    .dropDuplicates()
)

df_discount_features = (
    df_offer_event_features
    .filter(F.col('offer_type') == 'discount')
    .join(
        df_profile, 
        on=['account_id'], 
        how='inner'
    )
    .join(
        df_timeline_features, 
        on=['account_id', 'time'],
        how='left'
    )
    .drop('offer_type')
    .dropDuplicates()
)

# Test Classification

In [67]:
df_transaction_bogo = df_bogo_features.toPandas()
df_transaction_discount = df_discount_features.toPandas()

In [70]:
df_transaction_bogo['offer_consumed'].value_counts(normalize=True)

offer_consumed
1    0.618796
0    0.381204
Name: proportion, dtype: float64

## BOGO Model

In [71]:
df_features_bogo = df_transaction_bogo.set_index(['account_id', 'offer_id']).fillna(0)

X_b = df_features_bogo.drop(columns=['offer_consumed'])
y_b = df_features_bogo['offer_consumed']

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b, y_b, test_size=0.3, random_state=42, stratify=y_b
)

bogo_models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

bogo_results = []
for nome, modelo in bogo_models.items():
    modelo.fit(X_train_b, y_train_b)
    y_pred_b = modelo.predict(X_test_b)
    
    bogo_results.append({
        'Modelo': nome,
        'AUC': roc_auc_score(y_test_b, y_pred_b),
        'Acurácia': accuracy_score(y_test_b, y_pred_b),
    })

df_bogo_results = pd.DataFrame(bogo_results)
df_bogo_results

Unnamed: 0,Modelo,AUC,Acurácia
0,LogisticRegression,0.724048,0.750942
1,RandomForest,0.748015,0.770661
2,XGBoost,0.75838,0.777694


In [72]:
xgb_model = bogo_models["XGBoost"]
importancias = xgb_model.feature_importances_
nomes_features = X_b.columns

importancia_df = pd.DataFrame({
    "Feature": nomes_features,
    "Importância": importancias
}).sort_values(by="Importância", ascending=False)

fig = go.Figure([
    go.Bar(
        x=importancia_df["Importância"],
        y=importancia_df["Feature"],
        orientation='h'
    )
])

fig.update_layout(
    title="Importância das Features - XGBoost - Ofertas BOGO",
    xaxis_title="Importância",
    yaxis_title="Feature",
    yaxis=dict(autorange="reversed"),
    height=600,
)

fig.show()

In [73]:
xgb_model = bogo_models["XGBoost"]
y_pred_b = xgb_model.predict(X_test_b)

X_test_result = X_test_b.copy()
X_test_result['target_true'] = y_test_b
X_test_result['target_pred'] = y_pred_b
X_test_result['not_converted_prob'] = xgb_model.predict_proba(X_test_b)[:, 0].round(3)
X_test_result['converted_prob'] = xgb_model.predict_proba(X_test_b)[:, 1].round(3)

print(classification_report(y_test_b, y_pred_b))

              precision    recall  f1-score   support

           0       0.72      0.68      0.70      3035
           1       0.81      0.84      0.82      4927

    accuracy                           0.78      7962
   macro avg       0.77      0.76      0.76      7962
weighted avg       0.78      0.78      0.78      7962



In [74]:
X_test_result.drop(columns=X_test_b)

Unnamed: 0_level_0,Unnamed: 1_level_0,target_true,target_pred,not_converted_prob,converted_prob
account_id,offer_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
671a142bc07e47389b920cb0c37793bd,4d5c57ea9a6940dd891ad53e9dbe8da0,0,0,0.985,0.015
386d9947e96a428b91132d3455d17976,4d5c57ea9a6940dd891ad53e9dbe8da0,0,0,0.633,0.367
35f39fb3476249988ddc964f24e41edc,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,0,0.817,0.183
2dc57aa3c24c4468841be4b02ac54fe3,4d5c57ea9a6940dd891ad53e9dbe8da0,0,0,0.962,0.038
428111c5138a4a4c8131e5d2f258fa95,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,1,0.052,0.948
...,...,...,...,...,...
9fa8ee92f21e425ca26519b84d513c70,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,0.406,0.594
efb2c8c6351b423b97bc5508ba62dfde,4d5c57ea9a6940dd891ad53e9dbe8da0,0,1,0.140,0.860
104a73b6d8f049f0bdc9388ef4998526,ae264e3637204a6fb9bb56bc8210ddfd,1,1,0.008,0.992
4a0b380c3fc14b30a8cc577eeeaa0c3c,f19421c1d4aa40978ebb69ca19b0e20d,1,1,0.276,0.724


## Discount Model

In [77]:
df_features_discount = df_transaction_discount.set_index(['account_id', 'offer_id']).fillna(0)

X_d = df_features_discount.drop(columns=['offer_consumed'])
y_d = df_features_discount['offer_consumed']

X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_d, y_d, test_size=0.3, random_state=42, stratify=y_d
)

discount_models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

discount_results = []
for nome, modelo in discount_models.items():
    modelo.fit(X_train_d, y_train_d)
    y_pred_d = modelo.predict(X_test_d)
    
    discount_results.append({
        'Modelo': nome,
        'AUC': roc_auc_score(y_test_d, y_pred_d),
        'Acurácia': accuracy_score(y_test_d, y_pred_d),
    })

df_discount_results = pd.DataFrame(discount_results)
df_discount_results

Unnamed: 0,Modelo,AUC,Acurácia
0,LogisticRegression,0.700304,0.768125
1,RandomForest,0.737433,0.7865
2,XGBoost,0.746814,0.792125


In [78]:
xgb_model = discount_models["XGBoost"]
importancias = xgb_model.feature_importances_
nomes_features = X_d.columns

importancia_df = pd.DataFrame({
    "Feature": nomes_features,
    "Importância": importancias
}).sort_values(by="Importância", ascending=False)

fig = go.Figure([
    go.Bar(
        x=importancia_df["Importância"],
        y=importancia_df["Feature"],
        orientation='h'
    )
])

fig.update_layout(
    title="Importância das Features - XGBoost - Ofertas Desconto",
    xaxis_title="Importância",
    yaxis_title="Feature",
    yaxis=dict(autorange="reversed"),
    height=600,
)

fig.show()

In [79]:
xgb_model = discount_models["XGBoost"]
y_pred_d = xgb_model.predict(X_test_d)

X_test_result = X_test_d.copy()
X_test_result['target_true'] = y_test_d
X_test_result['target_pred'] = y_pred_d
X_test_result['not_converted_prob'] = xgb_model.predict_proba(X_test_d)[:, 0].round(3)
X_test_result['converted_prob'] = xgb_model.predict_proba(X_test_d)[:, 1].round(3)

print(classification_report(y_test_d, y_pred_d))

              precision    recall  f1-score   support

           0       0.69      0.62      0.66      2541
           1       0.83      0.87      0.85      5459

    accuracy                           0.79      8000
   macro avg       0.76      0.75      0.75      8000
weighted avg       0.79      0.79      0.79      8000



In [80]:
X_test_result.drop(columns=X_test_b)

Unnamed: 0_level_0,Unnamed: 1_level_0,target_true,target_pred,not_converted_prob,converted_prob
account_id,offer_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
36ccfdf3474f472484327c266ef4fa84,fafdcd668e3743c1bb461111dcafc2a4,1,1,0.054,0.946
07b78a8616714b58ad3a9ed94126c459,2906b810c7d4411798c6938adc9daaa5,1,1,0.413,0.587
65d4a94bef0841cc9b02753119742ccc,2906b810c7d4411798c6938adc9daaa5,1,0,0.741,0.259
f0469fdceb9244078e09592c3e7e4ce8,2298d6c36e964ae4a3e7e9706d1fb8c2,0,1,0.049,0.951
ea80e80e157a410d8fdfc6221f4a785a,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,1,0.064,0.936
...,...,...,...,...,...
93a4aed8d91f4e128f38ba62e2b9d97c,0b1e1539f2cc45b7b9fa7c272da2e1d7,0,0,0.642,0.358
45da79fb1a6040c0864e788ddce7af10,fafdcd668e3743c1bb461111dcafc2a4,1,1,0.003,0.997
083d316ee57640689731e5e7be9d5c20,2298d6c36e964ae4a3e7e9706d1fb8c2,0,0,0.749,0.251
391d8ec364374b38b3b2573d9e6b7d07,fafdcd668e3743c1bb461111dcafc2a4,1,1,0.010,0.990
