In [1]:
import os

import pandas as pd
import numpy as np

import  pyspark.sql. functions as F
import pyspark.sql.types as T

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.colors import DEFAULT_PLOTLY_COLORS

import warnings

# Notebook-wide settings: hide every Python warning in cell output and let
# pandas print all columns of wide frames instead of eliding the middle ones.
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

# Spark session running in local mode; getOrCreate() reuses an already
# running session when the cell is executed again.
spark = (
    SparkSession.builder
    .appName("iFood Case")
    .master("local[*]")
    .getOrCreate()
)

25/07/14 12:34:51 WARN Utils: Your hostname, MARCO-PC resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/07/14 12:34:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/14 12:34:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Root of the project checkout. The default is the location used when the
# notebook was written; set the IFOOD_CASE_PATH environment variable to run
# the notebook from a different machine or directory without editing this cell.
BASE_PATH = os.environ.get('IFOOD_CASE_PATH', '/home/marco/ifood-case')

# Folder with the pre-processed CSV extracts. The trailing slash matters:
# file names are appended to this value by plain string concatenation.
DATA_PROCESSED_PATH = BASE_PATH + '/data/processed/'

In [3]:
def _load_processed(file_name):
    """Read one CSV from DATA_PROCESSED_PATH, using its header row and inferred column types."""
    return spark.read.csv(DATA_PROCESSED_PATH + file_name, header=True, inferSchema=True)


# Reference tables.
profile = _load_processed('profile_processed.csv')
offers = _load_processed('offers_processed.csv')

# Event tables, one file per event kind.
transactions = _load_processed('event_transaction_processed.csv')
offer_received = _load_processed('event_offer_received_processed.csv')
offer_viewed = _load_processed('event_offer_viewed_processed.csv')
offer_completed = _load_processed('event_offer_completed_processed.csv')

In [4]:
# Timeline rows for purchases. A transaction that lines up with an offer
# completion (same account, same transaction_time) is relabelled as
# 'offer completed'; every other transaction keeps the label 'transaction'.
# NOTE(review): offer_id and reward are presumably supplied by
# offer_completed - confirm against the processed CSV schemas.
_matched_completion = F.col('offer_id').isNotNull()

df_offer_transactions_tl = (
    transactions
    .join(offer_completed, on=['account_id', 'transaction_time'], how='left')
    .withColumn(
        'event',
        F.when(_matched_completion, F.lit('offer completed'))
         .otherwise(F.lit('transaction')),
    )
    .select(
        'account_id',
        'offer_id',
        F.col('transaction_time').alias('time'),
        'event',
        'amount',
        'reward',
    )
    # Rows without a value get 0 so the later sums are not affected by nulls.
    .na.fill(0, subset=['amount', 'reward'])
)

def _offer_events_as_timeline(events, time_col, event_name):
    """Project an offer event table onto the shared timeline columns.

    The result has the columns account_id, offer_id, time, event, amount and
    reward, where `time` is taken from `time_col`, `event` is the constant
    `event_name`, and amount/reward are the constant 0.
    """
    return events.select(
        F.col('account_id'),
        F.col('offer_id'),
        F.col(time_col).alias('time'),
        F.lit(event_name).alias('event'),
        F.lit(0).alias('amount'),
        F.lit(0).alias('reward'),
    )


df_offer_received_tl = _offer_events_as_timeline(offer_received, 'received_time', 'offer received')

df_offer_viewed_tl = _offer_events_as_timeline(offer_viewed, 'viewed_time', 'offer viewed')

# Per-row 0/1 indicator columns, later summed into running totals.
# Column names follow '<event>_<type>_count'; the mappings below translate the
# short name fragments into the values stored in `event` and `offer_type`.
_EVENT_LABELS = {
    'received': 'offer received',
    'viewed': 'offer viewed',
    'completed': 'offer completed',
}
_OFFER_TYPE_LABELS = {
    'bogo': 'bogo',
    'discount': 'discount',
    'info': 'informational',
}

_indicator_cols = [
    (F.col('event') == 'transaction').cast(T.IntegerType()).alias('transactions_count')
]
for _event_key, _event_label in _EVENT_LABELS.items():
    for _type_key, _type_label in _OFFER_TYPE_LABELS.items():
        _indicator_cols.append(
            ((F.col('event') == _event_label) & (F.col('offer_type') == _type_label))
            .cast(T.IntegerType())
            .alias(f'{_event_key}_{_type_key}_count')
        )

df_timeline_features = (
    df_offer_transactions_tl
    # Stack the three event sources by column NAME, so the result does not
    # depend on each source listing its columns in the same order.
    .unionByName(df_offer_received_tl)
    .unionByName(df_offer_viewed_tl)
    # No orderBy here: row order is not carried through the join and the
    # aggregation that follow, and the running totals are computed over a
    # window with its own explicit ordering, so a global sort at this point
    # would only add a shuffle.
    .join(
        offers.select('offer_id', 'offer_type'),
        on='offer_id',
        how='left'
    )
    # Append all indicators in a single projection.
    .select('*', *_indicator_cols)
)

# Running frame per account: from the first row up to and including the
# current row, in time order.
w = (
    Window
    .partitionBy('account_id')
    .orderBy('time')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Collapse events that share an account and timestamp into a single row.
# The argument-less sum() produces columns named 'sum(<column>)'.
df_timeline_features = df_timeline_features.groupBy(['account_id', 'time']).sum()

# Monetary running totals, rounded to two decimals.
for _measure in ('amount', 'reward'):
    df_timeline_features = df_timeline_features.withColumn(
        f'total_{_measure}',
        F.round(F.sum(f'sum({_measure})').over(w), 2),
    )

# Running counts of transactions and of each offer event / offer type pair.
for _counter in (
    'transactions',
    'received_bogo', 'received_discount', 'received_info',
    'viewed_bogo', 'viewed_discount', 'viewed_info',
    'completed_bogo', 'completed_discount', 'completed_info',
):
    df_timeline_features = df_timeline_features.withColumn(
        f'total_{_counter}',
        F.sum(f'sum({_counter}_count)').over(w),
    )

# The per-timestamp sums were only inputs to the running totals; remove them.
drop_agg_cols = [name for name in df_timeline_features.columns if name.startswith('sum(')]
df_timeline_features = df_timeline_features.drop(*drop_agg_cols)

# A row counts as a consumed offer when it carries both an offer and a
# transaction time.
_consumed_flag = (
    F.col('offer_id').isNotNull() & F.col('transaction_time').isNotNull()
).cast(T.IntegerType())

# Transactions matched to offer completions, then outer-joined to received
# offers so that offers without a matching transaction are kept as well.
# NOTE(review): the join on (account_id, offer_id) pairs every receipt of an
# offer with every completion of that offer for the account - confirm this
# fan-out is intended when the same offer is sent more than once.
df_offer_event_features = (
    transactions
    .join(offer_completed, on=['account_id', 'transaction_time'], how='left')
    .join(offer_received.drop('amount', 'reward'), on=['account_id', 'offer_id'], how='outer')
    .join(offers.select('offer_id', 'offer_type'), on='offer_id', how='left')
    # Rows with no offer type get an explicit category instead of null.
    .na.fill({'offer_type': 'no_offer'})
    .withColumn('offer_consumed', _consumed_flag)
)

In [5]:
# Collect the Spark results as pandas frames (same order as the assignment
# targets on the left).
df_timeline, df_offer_events, df_profile, df_offers = (
    spark_frame.toPandas()
    for spark_frame in (df_timeline_features, df_offer_event_features, profile, offers)
)

# Round-trip the key columns through the index, which places them first.
_timeline_keys = ['account_id', 'time']
_offer_event_keys = ['account_id', 'offer_id', 'transaction_time']
df_timeline = df_timeline.set_index(_timeline_keys).reset_index()
df_offer_events = df_offer_events.set_index(_offer_event_keys).reset_index()

# One slice of the offer events per offer type.
_offer_type = df_offer_events['offer_type']
df_bogo = df_offer_events.loc[_offer_type.eq('bogo')]
df_discount = df_offer_events.loc[_offer_type.eq('discount')]
df_info = df_offer_events.loc[_offer_type.eq('informational')]

                                                                                

In [6]:
def _share_label(part, total):
    """Format part/total as a whole-number percentage label, e.g. '42%'.

    Returns '0%' when `total` is 0, so an offer type with no rows produces a
    label instead of failing on the division or on converting NaN to int.
    """
    if total == 0:
        return "0%"
    return f"{int(np.round((part / total) * 100, 0))}%"


# Number of rows per offer type.
total_bogo = len(df_bogo)
total_discount = len(df_discount)
total_info = len(df_info)

# Rows flagged as consumed, and the remainder, per offer type.
consumed_bogo = df_bogo['offer_consumed'].sum()
consumed_discount = df_discount['offer_consumed'].sum()
consumed_info = df_info['offer_consumed'].sum()
not_consumed_bogo = total_bogo - consumed_bogo
not_consumed_discount = total_discount - consumed_discount
not_consumed_info = total_info - consumed_info

# Percentage labels drawn on the bars.
consumed_bogo_str = _share_label(consumed_bogo, total_bogo)
consumed_discount_str = _share_label(consumed_discount, total_discount)
consumed_info_str = _share_label(consumed_info, total_info)
not_consumed_bogo_str = _share_label(not_consumed_bogo, total_bogo)
not_consumed_discount_str = _share_label(not_consumed_discount, total_discount)
not_consumed_info_str = _share_label(not_consumed_info, total_info)

_offer_type_axis = ['BOGO', 'Desconto', 'Informação']

# Stacked bars: consumed at the bottom, not consumed on top.
fig = go.Figure([
    go.Bar(
        x=_offer_type_axis,
        y=[consumed_bogo, consumed_discount, consumed_info],
        text=[consumed_bogo_str, consumed_discount_str, consumed_info_str],
        name='Consumidas',
        marker_color="#667DF3",
        textfont=dict(size=14),
        legendrank=1
    ),
    go.Bar(
        x=_offer_type_axis,
        y=[not_consumed_bogo, not_consumed_discount, not_consumed_info],
        text=[not_consumed_bogo_str, not_consumed_discount_str, not_consumed_info_str],
        name='Não Consumidas',
        marker_color="#DB6F7A",
        textfont=dict(size=14),
        legendrank=2
    )
])

fig.update_layout(
    barmode='stack',
    title='Ofertas Recebidas vs. Proporção de Uso',
    yaxis=dict(title='Quantidade de Ofertas'),
    # Horizontal legend centred underneath the plot area.
    legend=dict(
        x=0.5,
        xanchor='center',
        y=-0.1,
        yanchor='top',
        orientation='h',
    ),
    width=500
)

fig.show()

In [None]:
# Restrict to BOGO and discount offers and to the columns used downstream.
_is_bogo_or_discount = df_offer_events['offer_type'].isin(['bogo', 'discount'])
_offer_columns = ['account_id', 'offer_id', 'offer_type', 'received_time', 'offer_consumed']

# Attach the account's running totals at the time the offer was received,
# plus the account profile; then remove exact duplicate rows.
df_offer_events_s = (
    df_offer_events.loc[_is_bogo_or_discount, _offer_columns]
    .merge(
        df_timeline,
        left_on=['account_id', 'received_time'],
        right_on=['account_id', 'time'],
        how='left',
    )
    .merge(df_profile, on=['account_id'], how='left')
    .drop(columns=['received_time'])
    .drop_duplicates()
)

# Sum the numeric columns per (account, offer, time), then sort descending so
# that the first row of every account is the one with the greatest `time`.
df_offer_events_g = (
    df_offer_events_s
    .groupby(['account_id', 'offer_id', 'time'])
    .sum(numeric_only=True)
    .reset_index()
    .sort_values(['account_id', 'time'], ascending=False)
)

# Keep only that first row per account and index by the three keys.
_first_per_account = df_offer_events_g.groupby('account_id').cumcount() == 0
df_offer_events_g = (
    df_offer_events_g[_first_per_account]
    .set_index(['account_id', 'offer_id', 'time'])
)

# Features are every column except the target flag.
y = df_offer_events_g['offer_consumed']
X = df_offer_events_g.drop(columns=['offer_consumed'])

# Standardise the features; the result is rebuilt as a frame with the
# original column names and a fresh 0..n-1 index.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Two-component PCA, fitted before the extra columns below are attached.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Attach both components and the target; y is passed as a plain array so it
# is matched by position rather than by index label.
X_scaled = X_scaled.assign(
    pca1=X_pca[:, 0],
    pca2=X_pca[:, 1],
    offer_consumed=y.values,
)

# Split by target value for plotting.
X_scaled_0 = X_scaled.loc[X_scaled['offer_consumed'].eq(0)]
X_scaled_1 = X_scaled.loc[X_scaled['offer_consumed'].eq(1)]

# One scatter trace per class: (points, marker colour, legend label).
_scatter_traces = [
    (X_scaled_1, '#667DF3', 'Consumidas'),
    (X_scaled_0, '#DB6F7A', 'Não Consumidas'),
]

fig = go.Figure([
    go.Scatter(
        x=_points['pca1'],
        y=_points['pca2'],
        marker_color=_colour,
        mode='markers',
        name=_label,
    )
    for _points, _colour, _label in _scatter_traces
])

fig.update_layout(
    # Centred title, positioned from its top edge.
    title=dict(
        text='Dispersão de Ofertas Consumidas por Usuário',
        x=0.5,
        y=0.85,
        xanchor='center',
        yanchor='top',
    ),
    # Horizontal legend centred underneath the plot area.
    legend={
        'x': 0.5,
        'xanchor': 'center',
        'y': -0.1,
        'yanchor': 'top',
        'orientation': 'h',
    },
    width=700,
)

fig.show()