In [48]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [3]:
# BASE_PATH = '/Workspace/Users/marcodaniel.ml@hotmail.com/ifood-case'
BASE_PATH = 'D:/Downloads/IFood/ifood-case/'
DATA_RAW_PATH = BASE_PATH + '/data/raw/'
DATA_TEMP_PATH = BASE_PATH + '/data/temp/'

# profile

In [13]:
df_profile = pd.read_json(DATA_RAW_PATH + 'profile.json')

df_profile['age'] = df_profile['age'].astype(int)
df_profile = df_profile[df_profile['age'] <= 101]
df_profile['registered_on'] = pd.to_datetime(df_profile['registered_on'], format='%Y%m%d')
df_profile['gender'] = df_profile['gender'].str.upper().fillna('O')
df_profile['id'] = df_profile['id'].str.strip()
df_profile['credit_card_limit'] = df_profile['credit_card_limit'].astype(float)
df_profile = df_profile.rename(columns={'id': 'account_id'})

df_profile['actual_date'] = pd.to_datetime('2019-01-01')
df_profile['registered_days'] = (df_profile['actual_date'] - df_profile['registered_on']).dt.days
df_profile = df_profile.drop(['registered_on', 'actual_date'], axis=1)

df_profile = pd.get_dummies(df_profile, columns=['gender'])
df_profile['gender_F'] = df_profile['gender_F'].astype(int)
df_profile['gender_M'] = df_profile['gender_M'].astype(int)
df_profile['gender_O'] = df_profile['gender_O'].astype(int)

# offers

In [6]:
df_offers = pd.read_json(DATA_RAW_PATH + 'offers.json')

df_offers['min_value'] = df_offers['min_value'].astype(float)
df_offers['duration'] = df_offers['duration'].astype(int)
df_offers['id'] = df_offers['id'].str.strip()
df_offers['discount_value'] = df_offers['discount_value'].astype(float)

df_offers = df_offers.rename(columns={'id': 'offer_id'})

# transactions

In [7]:
df_transactions = pd.read_json(DATA_RAW_PATH + 'transactions.json')

df_transactions['account_id'] = df_transactions['account_id'].str.strip()
df_transactions['time_since_test_start'] = df_transactions['time_since_test_start'].astype(float)

value_expanded = df_transactions['value'].apply(pd.Series)
df_transactions_s = pd.concat([df_transactions.drop(columns='value'), value_expanded], axis=1)

In [8]:
df_transaction = df_transactions_s[df_transactions_s['event'] == 'transaction']
df_offer_received = df_transactions_s[df_transactions_s['event'] == 'offer received']
df_offer_completed = df_transactions_s[df_transactions_s['event'] == 'offer completed']

df_transaction = df_transaction[['account_id', 'amount', 'time_since_test_start']]
df_offer_received = df_offer_received[['account_id', 'offer id', 'time_since_test_start']]
df_offer_completed = df_offer_completed[['account_id', 'offer_id', 'reward', 'time_since_test_start']]

df_transaction = df_transaction.rename(columns={
    'time_since_test_start': 'transaction_time',
})
df_offer_received = df_offer_received.rename(columns={
    'offer id': 'offer_id',
    'time_since_test_start': 'received_time',
})
df_offer_completed = df_offer_completed.rename(columns={
    'time_since_test_start': 'transaction_time',
})

## Timeline

In [23]:
df_timeline = pd.read_csv(DATA_TEMP_PATH + 'transaction_timeline.csv')

## Merge

In [42]:
df_transaction_m = (
    df_transaction
    .merge(
        df_offer_completed, 
        on=['account_id', 'transaction_time'], 
        how='left'
    )
    .merge(
        df_offer_received, 
        on=['account_id', 'offer_id'], 
        how='outer'
    )
    .merge(
        df_profile,
        on='account_id',
        how='inner'
    )
    .merge(
        df_offers,
        on='offer_id',
        how='inner'
    )
    .merge(
        df_timeline,
        left_on=['account_id', 'received_time'],
        right_on=['account_id', 'time'],
        how='left'
    )
)

df_transaction_m = df_transaction_m[df_transaction_m['offer_type'].isin(['discount', 'bogo'])]
df_transaction_m['offer_type'] = df_transaction_m['offer_type'].fillna('no_offer')

df_transaction_m['random_feature'] = np.random.rand(len(df_transaction_m))

df_transaction_m['converted'] = df_transaction_m['transaction_time'].notnull().astype(int)
# df_transaction_m['target'] = np.where(
#     df_transaction_m['converted'] == 0, 0,
#     np.where(
#         df_transaction_m['offer_type'] == 'bogo', 1, 2
#     )
# )

df_transaction_bogo = df_transaction_m[df_transaction_m['offer_type'] == 'bogo']
df_transaction_discount = df_transaction_m[df_transaction_m['offer_type'] == 'discount']

drop_cols = [
    'amount',
    'transaction_time',
    'received_time',
    'time',
    'channels',
    'reward',
    'offer_type',
]

df_transaction_bogo = df_transaction_bogo.drop(columns=drop_cols)
df_transaction_discount = df_transaction_discount.drop(columns=drop_cols)

# Test Classification

In [44]:
df_transaction_bogo['converted'].value_counts(normalize=True)

converted
1    0.674842
0    0.325158
Name: proportion, dtype: float64

## BOGO Model

In [60]:
df_features_bogo = df_transaction_bogo.set_index(['account_id', 'offer_id']).fillna(0)

X_b = df_features_bogo.drop(columns=['converted'])
y_b = df_features_bogo['converted']

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b, y_b, test_size=0.3, random_state=42, stratify=y_b
)

bogo_models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

bogo_results = []
# Treinar e avaliar cada modelo
for nome, modelo in bogo_models.items():
    modelo.fit(X_train_b, y_train_b)
    y_pred_b = modelo.predict(X_test_b)
    
    bogo_results.append({
        'Modelo': nome,
        'AUC': roc_auc_score(y_test_b, y_pred_b),
        'Acurácia': accuracy_score(y_test_b, y_pred_b),
    })

df_bogo_results = pd.DataFrame(bogo_results)
df_bogo_results

Unnamed: 0,Modelo,AUC,Acurácia
0,LogisticRegression,0.734358,0.790444
1,RandomForest,0.773796,0.819477
2,XGBoost,0.779077,0.819691


In [61]:
xgb_model = bogo_models["XGBoost"]
importancias = xgb_model.feature_importances_
nomes_features = X_b.columns

# Criar DataFrame ordenado
importancia_df = pd.DataFrame({
    "Feature": nomes_features,
    "Importância": importancias
}).sort_values(by="Importância", ascending=False)

# Plot com plotly.graph_objects
fig = go.Figure([
    go.Bar(
        x=importancia_df["Importância"],
        y=importancia_df["Feature"],
        orientation='h'
    )
])

fig.update_layout(
    title="Importância das Features - XGBoost - Ofertas BOGO",
    xaxis_title="Importância",
    yaxis_title="Feature",
    yaxis=dict(autorange="reversed"),
    height=600,
)

fig.show()

In [64]:
xgb_model = bogo_models["XGBoost"]
y_pred_b = xgb_model.predict(X_test_b)

X_test_result = X_test_b.copy()
X_test_result['target_true'] = y_test_b
X_test_result['target_pred'] = y_pred_b
X_test_result['not_converted_prob'] = xgb_model.predict_proba(X_test_b)[:, 0].round(3)
X_test_result['converted_prob'] = xgb_model.predict_proba(X_test_b)[:, 1].round(3)

print(classification_report(y_test_b, y_pred_b))

              precision    recall  f1-score   support

           0       0.75      0.66      0.71      3035
           1       0.85      0.90      0.87      6299

    accuracy                           0.82      9334
   macro avg       0.80      0.78      0.79      9334
weighted avg       0.82      0.82      0.82      9334



In [66]:
X_test_result.drop(columns=X_test_b)

Unnamed: 0_level_0,Unnamed: 1_level_0,target_true,target_pred,not_converted_prob,converted_prob
account_id,offer_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0d1f4bc41883493ba6f72af2485bab65,ae264e3637204a6fb9bb56bc8210ddfd,1,1,0.036,0.964
db71cc5c39584742b495139b180daa20,ae264e3637204a6fb9bb56bc8210ddfd,0,0,0.917,0.083
095bc1a9b5f64d0f88ed616df292a3ec,f19421c1d4aa40978ebb69ca19b0e20d,1,1,0.347,0.653
16fb582d943d42c4a8a9865aaacf51d6,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,0,0.602,0.398
3dfc442e93894be19a3f5e8cf7e9c779,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,0,0.724,0.276
...,...,...,...,...,...
2d567d7e1ca546c69208142a66ae2c2c,f19421c1d4aa40978ebb69ca19b0e20d,0,1,0.377,0.623
d88cdfa2c2784945a1716ea39c03e6ae,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0.353,0.647
ccdf319d46c4438880b52e240c1a4397,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,0,0.860,0.140
316ff0fdde4547b68c6aefe8d362f4f6,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0.143,0.857


## Discount Model

In [62]:
df_features_discount = df_transaction_discount.set_index(['account_id', 'offer_id']).fillna(0)

X_d = df_features_discount.drop(columns=['converted'])
y_d = df_features_discount['converted']

X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_d, y_d, test_size=0.3, random_state=42, stratify=y_d
)

discount_models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

discount_results = []
# Treinar e avaliar cada modelo
for nome, modelo in discount_models.items():
    modelo.fit(X_train_d, y_train_d)
    y_pred_d = modelo.predict(X_test_d)
    
    discount_results.append({
        'Modelo': nome,
        'AUC': roc_auc_score(y_test_d, y_pred_d),
        'Acurácia': accuracy_score(y_test_d, y_pred_d),
    })

df_discount_results = pd.DataFrame(discount_results)
df_discount_results

Unnamed: 0,Modelo,AUC,Acurácia
0,LogisticRegression,0.691175,0.803701
1,RandomForest,0.757795,0.836037
2,XGBoost,0.758656,0.829694


In [63]:
xgb_model = discount_models["XGBoost"]
importancias = xgb_model.feature_importances_
nomes_features = X_d.columns

# Criar DataFrame ordenado
importancia_df = pd.DataFrame({
    "Feature": nomes_features,
    "Importância": importancias
}).sort_values(by="Importância", ascending=False)

# Plot com plotly.graph_objects
fig = go.Figure([
    go.Bar(
        x=importancia_df["Importância"],
        y=importancia_df["Feature"],
        orientation='h'
    )
])

fig.update_layout(
    title="Importância das Features - XGBoost - Ofertas Desconto",
    xaxis_title="Importância",
    yaxis_title="Feature",
    yaxis=dict(autorange="reversed"),
    height=600,
)

fig.show()

In [67]:
xgb_model = discount_models["XGBoost"]
y_pred_d = xgb_model.predict(X_test_d)

X_test_result = X_test_d.copy()
X_test_result['target_true'] = y_test_d
X_test_result['target_pred'] = y_pred_d
X_test_result['not_converted_prob'] = xgb_model.predict_proba(X_test_d)[:, 0].round(3)
X_test_result['converted_prob'] = xgb_model.predict_proba(X_test_d)[:, 1].round(3)

print(classification_report(y_test_d, y_pred_d))

              precision    recall  f1-score   support

           0       0.71      0.61      0.65      2541
           1       0.87      0.91      0.89      7077

    accuracy                           0.83      9618
   macro avg       0.79      0.76      0.77      9618
weighted avg       0.82      0.83      0.83      9618



In [68]:
X_test_result.drop(columns=X_test_b)

Unnamed: 0_level_0,Unnamed: 1_level_0,target_true,target_pred,not_converted_prob,converted_prob
account_id,offer_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
93893ec00b5346228088f867e6164b1e,fafdcd668e3743c1bb461111dcafc2a4,1,1,0.005,0.995
9f9651ad180f4b6291a58455eee9f2ff,fafdcd668e3743c1bb461111dcafc2a4,0,1,0.044,0.956
397e33ebb12e4a3aa1e8ec840c84d104,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,0.004,0.996
91b4ec7019ad47538ba7a3b82f1119af,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,0.022,0.978
8d30c524932649a5b4021e56f6ad9580,2906b810c7d4411798c6938adc9daaa5,0,1,0.046,0.954
...,...,...,...,...,...
6fb5a191d0d6459fbfb659bb14a3101e,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,1,0.043,0.957
8fc40f08bcc7442796802b19ffcc9e04,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,1,0.130,0.870
b5b515ff26234d548bad3558992d1389,fafdcd668e3743c1bb461111dcafc2a4,0,1,0.198,0.802
bfdf513e4761437a920fe955751069d4,2906b810c7d4411798c6938adc9daaa5,1,1,0.148,0.852
