In [94]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import plotly.graph_objects as go
from plotly.colors import DEFAULT_PLOTLY_COLORS

import warnings
# Install a global "ignore" filter with no category/module restriction, so
# warnings are suppressed for the rest of the kernel session.
# NOTE(review): this presumably also hides pandas chained-assignment warnings
# and scikit-learn FutureWarnings raised by later cells — consider narrowing
# the filter to specific categories once those cells are clean.
warnings.filterwarnings('ignore')

In [2]:
# Project root, written WITHOUT a trailing slash: the paths derived from it in
# this notebook all start with '/', so a trailing slash here would produce a
# doubled separator ('.../ifood-case//data/raw/').
# BASE_PATH = '/Workspace/Users/marcodaniel.ml@hotmail.com/ifood-case'
BASE_PATH = 'D:/Downloads/IFood/ifood-case'
# Directory holding the raw JSON inputs; keeps its trailing slash because file
# names are appended to it by plain string concatenation.
DATA_RAW_PATH = BASE_PATH + '/data/raw/'

# profile

In [60]:
# Load the customer profiles and turn them into numeric features keyed by
# account_id.
df_profile = pd.read_json(DATA_RAW_PATH + 'profile.json')

df_profile['age'] = df_profile['age'].astype(int)
# Drop rows whose age is above 101. The explicit .copy() makes the filtered
# frame independent of the original, so the column assignments below modify a
# real frame instead of a slice (no chained-assignment ambiguity).
df_profile = df_profile[df_profile['age'] <= 101].copy()
df_profile['registered_on'] = pd.to_datetime(df_profile['registered_on'], format='%Y%m%d')
# Upper-case the gender code and label missing values as 'O'.
df_profile['gender'] = df_profile['gender'].str.upper().fillna('O')
df_profile['id'] = df_profile['id'].str.strip()
df_profile['credit_card_limit'] = df_profile['credit_card_limit'].astype(float)
df_profile = df_profile.rename(columns={'id': 'account_id'})

# Account tenure in days, measured against a fixed reference date. The
# subtraction is done against a scalar Timestamp, so no temporary column has
# to be added and dropped again.
reference_date = pd.to_datetime('2019-01-01')
df_profile['registered_days'] = (reference_date - df_profile['registered_on']).dt.days
df_profile = df_profile.drop(['registered_on'], axis=1)

# One-hot encode gender straight to integer 0/1 columns (gender_<code>).
df_profile = pd.get_dummies(df_profile, columns=['gender'], dtype=int)

# offers

In [62]:
# Load the offer catalogue, normalise its dtypes and expose the key as
# 'offer_id' so it can be joined with the event tables.
df_offers = (
    pd.read_json(DATA_RAW_PATH + 'offers.json')
    .astype({'min_value': float, 'duration': int, 'discount_value': float})
    .assign(id=lambda offers: offers['id'].str.strip())
    .rename(columns={'id': 'offer_id'})
)

# transactions

In [63]:
# Load the raw event log and flatten its dict-valued 'value' column into
# regular columns.
df_transactions = pd.read_json(DATA_RAW_PATH + 'transactions.json')

df_transactions['account_id'] = df_transactions['account_id'].str.strip()
df_transactions['time_since_test_start'] = df_transactions['time_since_test_start'].astype(float)

# Build the expanded frame in one shot from the list of dicts instead of
# `.apply(pd.Series)`, which creates one Series object per row and is very slow
# on a large event log. Keys missing from a given dict become NaN; the original
# index is reused so the concat below stays row-aligned.
value_expanded = pd.DataFrame(
    df_transactions['value'].tolist(),
    index=df_transactions.index,
)
df_transactions_s = pd.concat([df_transactions.drop(columns='value'), value_expanded], axis=1)

In [64]:
# Split the flattened event log into one frame per event type, keeping only the
# columns each event uses.
event_type = df_transactions_s['event']

# 'offer received' / 'offer viewed' rows carry the key as 'offer id' (with a
# space); it is renamed so all offer frames share the 'offer_id' column name.
offer_columns = ['account_id', 'offer id', 'time_since_test_start']
offer_key_rename = {'offer id': 'offer_id'}

df_transaction = df_transactions_s.loc[
    event_type == 'transaction',
    ['account_id', 'amount', 'time_since_test_start'],
]
df_offer_received = (
    df_transactions_s
    .loc[event_type == 'offer received', offer_columns]
    .rename(columns=offer_key_rename)
)
df_offer_viewed = (
    df_transactions_s
    .loc[event_type == 'offer viewed', offer_columns]
    .rename(columns=offer_key_rename)
)
df_offer_completed = df_transactions_s.loc[
    event_type == 'offer completed',
    ['account_id', 'offer_id', 'reward', 'time_since_test_start'],
]

In [65]:
# Per-account spend summary: total amount and number of transactions, with
# account_id kept as a regular column.
df_transaction_g = (
    df_transaction
    .groupby('account_id')
    .agg(
        total_amount=('amount', 'sum'),
        transactions=('amount', 'count'),
    )
    .reset_index()
)

In [66]:
# Count received offers per account and offer type, then spread the offer types
# into one 'received_*' column each (0 where an account has none of that type).
df_offer_received_m = df_offer_received.merge(df_offers, on='offer_id')
df_offer_received_g = (
    df_offer_received_m
    .groupby(['account_id', 'offer_type'])['offer_id']
    .count()
    .reset_index()
)

received_column_names = {
    'bogo': 'received_bogo',
    'discount': 'received_discount',
    'informational': 'received_info',
}
df_offer_received_p = (
    df_offer_received_g
    .pivot_table(index='account_id', columns='offer_type', values='offer_id', fill_value=0)
    .rename(columns=received_column_names)
    .reset_index()
)

In [67]:
# Count viewed offers per account and offer type, then spread the offer types
# into one 'viewed_*' column each (0 where an account has none of that type).
df_offer_viewed_m = df_offer_viewed.merge(df_offers, on='offer_id')
df_offer_viewed_g = (
    df_offer_viewed_m
    .groupby(['account_id', 'offer_type'])['offer_id']
    .count()
    .reset_index()
)

viewed_column_names = {
    'bogo': 'viewed_bogo',
    'discount': 'viewed_discount',
    'informational': 'viewed_info',
}
df_offer_viewed_p = (
    df_offer_viewed_g
    .pivot_table(index='account_id', columns='offer_type', values='offer_id', fill_value=0)
    .rename(columns=viewed_column_names)
    .reset_index()
)

In [68]:
# Count completed offers per account and offer type, then spread the offer
# types into one 'completed_*' column each (0 where an account has none of that
# type). Only offer types that actually occur in the data become columns.
df_offer_completed_m = df_offer_completed.merge(df_offers, on='offer_id')
df_offer_completed_g = (
    df_offer_completed_m
    .groupby(['account_id', 'offer_type'])['offer_id']
    .count()
    .reset_index()
)

completed_column_names = {
    'bogo': 'completed_bogo',
    'discount': 'completed_discount',
    'informational': 'completed_info',
}
df_offer_completed_p = (
    df_offer_completed_g
    .pivot_table(index='account_id', columns='offer_type', values='offer_id', fill_value=0)
    .rename(columns=completed_column_names)
    .reset_index()
)

In [69]:
# Attach every per-account feature table to the profile frame. Left joins keep
# all profiles; accounts absent from a feature table get NaN there, which the
# final fillna turns into 0.
df_profile_m = df_profile
for account_features in (
    df_transaction_g,
    df_offer_received_p,
    df_offer_viewed_p,
    df_offer_completed_p,
):
    df_profile_m = df_profile_m.merge(account_features, on=['account_id'], how='left')
df_profile_m = df_profile_m.fillna(0)

df_profile_m

Unnamed: 0,age,account_id,credit_card_limit,registered_days,gender_F,gender_M,gender_O,total_amount,transactions,received_bogo,received_discount,received_info,viewed_bogo,viewed_discount,viewed_info,completed_bogo,completed_discount
0,55,0610b486422d4921ae7d2bf64640c50b,112000.0,535,1,0,0,77.01,3.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,602,1,0,0,159.27,7.0,3.0,0.0,1.0,3.0,0.0,1.0,3.0,0.0
2,68,e2127556f4f64592b11af22de27a7932,70000.0,250,0,1,0,57.73,3.0,1.0,2.0,1.0,1.0,2.0,0.0,1.0,1.0
3,65,389bc3fa690240e798340f5a15918d5c,53000.0,326,0,1,0,36.43,3.0,4.0,2.0,0.0,4.0,2.0,0.0,3.0,2.0
4,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,416,0,1,0,15.62,4.0,0.0,2.0,1.0,0.0,2.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14820,45,6d5f3a774f3d4714ab0c092238f3a1d7,54000.0,211,1,0,0,20.03,7.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
14821,61,2cb4f97358b841b9a9773a7aa05a9d77,72000.0,172,0,1,0,25.97,7.0,1.0,0.0,2.0,0.0,0.0,1.0,1.0,0.0
14822,49,01d26f638c274aa0b965d24cefe3183f,73000.0,705,0,1,0,39.74,8.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0
14823,83,9dc1421481194dcd9400aec7c9ae6366,50000.0,1030,1,0,0,189.67,14.0,3.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0


# Clustering

In [76]:
# Clustering input: numeric columns only, indexed by account_id, with any row
# containing a missing value removed.
df_profile_c = (
    df_profile_m
    .set_index('account_id')
    .select_dtypes(include=[np.number])
    .dropna()
)

In [78]:
# Standardise the clustering features.
# A later cell writes the KMeans labels back into df_profile_c as a 'cluster'
# column. Dropping that column here (errors='ignore' makes it a no-op on a
# fresh run) keeps this cell idempotent: re-running it after clustering must
# not feed the previous labels back in as a feature.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(
    df_profile_c.drop(columns=['cluster'], errors='ignore')
)

In [81]:
# Fit KMeans for every candidate k and record the silhouette score of the
# resulting partition; silhouette_scores[i] corresponds to k_range[i].
k_range = range(2, 50)
silhouette_scores = []
for k in tqdm(k_range):
    kmeans = KMeans(n_clusters=k, random_state=42)
    silhouette_scores.append(
        silhouette_score(X_scaled, kmeans.fit_predict(X_scaled))
    )

100%|██████████| 48/48 [01:11<00:00,  1.50s/it]


In [146]:
# Line chart of the silhouette score for each candidate number of clusters.
silhouette_trace = go.Scatter(
    x=list(k_range),
    y=silhouette_scores,
    mode='lines+markers',
    marker=dict(size=10),
    line=dict(width=2),
    name='Silhouette Score',
)

fig = go.Figure()
fig.add_trace(silhouette_trace)
fig.update_layout(
    title='Silhouette Score por Número de Clusters',
    xaxis_title='Número de Clusters (k)',
    yaxis_title='Silhouette Score',
    template='plotly_white',
)
fig.show()

In [104]:
# Choose the k with the highest silhouette score, refit KMeans with it and
# store the resulting label of every account on df_profile_c.
best_position = int(np.argmax(silhouette_scores))
best_k = k_range[best_position]

kmeans = KMeans(n_clusters=best_k, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)
df_profile_c['cluster'] = cluster_labels
best_k

4

In [99]:
# 2D view of the clusters: project the scaled features onto their first two
# principal components and colour each point by its KMeans label.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Dedicated plotting frame, row-aligned with X_scaled. The labels are read from
# df_profile_c, which is where the clustering cell stores them; df_profile_m
# has no 'cluster' column on a fresh run, and it is deliberately left untouched
# so plot-only columns cannot leak into the clustering features on a re-run.
df_pca_2d = pd.DataFrame(X_pca, columns=['pca1', 'pca2'])
df_pca_2d['cluster'] = df_profile_c['cluster'].to_numpy()

# Sorted so the legend order and the colour of each cluster are deterministic.
unique_clusters = np.sort(df_pca_2d['cluster'].unique())

fig = go.Figure()
for i, cluster in enumerate(unique_clusters):
    cluster_data = df_pca_2d[df_pca_2d['cluster'] == cluster]
    fig.add_trace(go.Scatter(
        x=cluster_data['pca1'],
        y=cluster_data['pca2'],
        mode='markers',
        name=f'Cluster {cluster}',
        # Wrap around the palette so more clusters than colours cannot raise
        # an IndexError.
        marker=dict(color=DEFAULT_PLOTLY_COLORS[i % len(DEFAULT_PLOTLY_COLORS)]),
    ))

fig.update_layout(
    title='Clusters com PCA (colorido por grupo)',
    xaxis_title='PCA 1',
    yaxis_title='PCA 2',
    template='plotly_white'
)
fig.show()

In [101]:
# 3D view of the clusters: project the scaled features onto their first three
# principal components and colour each point by its KMeans label.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

# Dedicated plotting frame, row-aligned with X_scaled. The labels are read from
# df_profile_c, which is where the clustering cell stores them; df_profile_m
# has no 'cluster' column on a fresh run, and it is deliberately left untouched
# so plot-only columns cannot leak into the clustering features on a re-run.
df_pca_3d = pd.DataFrame(X_pca, columns=['pca1', 'pca2', 'pca3'])
df_pca_3d['cluster'] = df_profile_c['cluster'].to_numpy()

# Sorted so the legend order and the colour of each cluster are deterministic.
unique_clusters = np.sort(df_pca_3d['cluster'].unique())

# Build the 3D figure, one trace per cluster.
fig = go.Figure()

for i, cluster in enumerate(unique_clusters):
    cluster_data = df_pca_3d[df_pca_3d['cluster'] == cluster]
    fig.add_trace(go.Scatter3d(
        x=cluster_data['pca1'],
        y=cluster_data['pca2'],
        z=cluster_data['pca3'],
        mode='markers',
        name=f'Cluster {cluster}',
        # Wrap around the palette so more clusters than colours cannot raise
        # an IndexError.
        marker=dict(size=5, color=DEFAULT_PLOTLY_COLORS[i % len(DEFAULT_PLOTLY_COLORS)]),
        # Hover label is the row position of the point in the plotting frame.
        hovertext=cluster_data.index.astype(str)
    ))

fig.update_layout(
    title='Visualização 3D dos Clusters com PCA',
    scene=dict(
        xaxis_title='PCA 1',
        yaxis_title='PCA 2',
        zaxis_title='PCA 3'
    ),
    template='plotly_white'
)

fig.show()


In [None]:
# df_profile_c.reset_index().to_csv(BASE_PATH + '/data/temp/profile_clustering.csv', index=False)