In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.window import Window

import pyspark.sql.functions as F
import pyspark.sql.types as T

# Get (or reuse) the Spark session for this notebook, running in local mode
# with the "local[*]" master.
spark = (
    SparkSession.builder
    .appName("iFood Case")
    .master("local[*]")
    .getOrCreate()
)

In [31]:
# Root of the project checkout. It can be overridden through the
# IFOOD_CASE_PATH environment variable so the notebook is not tied to one
# machine; when the variable is unset the original location is used.
BASE_PATH = os.environ.get('IFOOD_CASE_PATH', '/home/marco/ifood-case')
# Folder the raw JSON inputs are read from.
DATA_RAW_PATH = BASE_PATH + '/data/raw/'
# Folder the generated feature CSV files are written to.
DATA_PROCESSED_PATH = BASE_PATH + '/data/processed/'

## Profile Features

In [32]:
# Raw profiles, read from profile.json in the raw-data folder.
df_profile = spark.read.json(DATA_RAW_PATH + 'profile.json')

# Profile features: account id, age, days registered as of 2019-01-01 and a
# 0/1 flag per gender code.
df_profile_s = (
    df_profile
    # NOTE(review): 101 looks like an upper bound meant to drop placeholder
    # ages -- confirm against the data dictionary.
    .where(F.col('age') <= 101)
    # A missing gender is treated as the 'O' category.
    .fillna({'gender': 'O'})
    .select(
        F.col('id').alias('account_id'),
        'age',
        # Days from the registration date (stored as yyyyMMdd) to the fixed
        # 2019-01-01 reference date.
        F.datediff(
            F.lit('2019-01-01').cast('date'),
            F.to_date(F.col('registered_on'), 'yyyyMMdd'),
        ).alias('registered_days'),
        *[
            F.when(F.col('gender') == code, 1).otherwise(0).alias(flag_name)
            for code, flag_name in (
                ('F', 'gender_F'),
                ('M', 'gender_M'),
                ('O', 'gender_O'),
            )
        ],
    )
)

In [33]:
# Collect the profile features to the driver as a pandas DataFrame and save
# them as CSV without the index column.
(
    df_profile_s
    .toPandas()
    .to_csv(DATA_PROCESSED_PATH + 'profile_features.csv', index=False)
)

## Transaction Timeline Features

In [34]:
# Raw inputs for the timeline features: the offers file and the event log.
# The two reads are independent of each other.
df_offers = spark.read.json(DATA_RAW_PATH + 'offers.json')
df_transaction_events = spark.read.json(DATA_RAW_PATH + 'transactions.json')

                                                                                

In [35]:
# Slim offer lookup: the offer id (renamed to offer_id so it can serve as a
# join key) and its type.
df_offers_s = df_offers.select(
    df_offers['id'].alias('offer_id'),
    df_offers['offer_type'],
)

In [36]:
# 'transaction' events only, reduced to the account, the amount taken from the
# nested `value` field, and the event time.
df_transaction = (
    df_transaction_events
    .where(F.col('event') == 'transaction')
    .alias('transaction')
    .select(
        'account_id',
        F.col('value.amount').alias('amount'),
        F.col('time_since_test_start').alias('time'),
    )
)

# 'offer completed' events only, with the offer id and reward taken from the
# nested `value` field.
df_offer_completed = (
    df_transaction_events
    .where(F.col('event') == 'offer completed')
    .alias('offer_completed')
    .select(
        'account_id',
        F.col('value.offer_id').alias('offer_id'),
        F.col('value.reward').alias('reward'),
        F.col('time_since_test_start').alias('time'),
    )
)

# Transactions enriched with the offer (if any) that the same account
# completed at the same `time`.
#
# The join key (account_id, time) is not unique on the offer side, so a
# transaction that lines up with several completed offers is repeated once per
# offer. Each repetition used to carry the full `amount`, and any later sum
# over amount then counted the same purchase several times. To keep one row
# per completed offer without inflating spend, every transaction gets a
# temporary id before the join and only the first joined row of each id keeps
# its amount; the remaining rows become 0 through the final fillna.
#
# NOTE(review): `reward` is still repeated when several transactions of one
# account share the same `time` -- confirm whether that can occur in the data.
df_offer_transactions = (
    df_transaction
    .withColumn('_txn_id', F.monotonically_increasing_id())
    .join(
        df_offer_completed,
        on=['account_id', 'time',],
        how='left'
    )
    # Rank the rows produced for each transaction; offer_id makes the ranking
    # deterministic.
    .withColumn(
        '_txn_row',
        F.row_number().over(Window.partitionBy('_txn_id').orderBy('offer_id'))
    )
    # No otherwise(): rows after the first get NULL here and 0 after fillna.
    .withColumn('amount', F.when(F.col('_txn_row') == 1, F.col('amount')))
    # Rows that matched a completed offer are relabelled; the rest stay
    # plain transactions.
    .withColumn('event', F.when(
        F.col('offer_id').isNotNull(), F.lit('offer completed')).otherwise(F.lit('transaction'))
    )
    # The temporary helper columns are dropped by this projection.
    .select([
        F.col('account_id'),
        F.col('offer_id'),
        F.col('time'),
        F.col('event'),
        F.col('amount'),
        F.col('reward')
    ])
    .fillna(0, subset=['amount', 'reward'])
)

def _offer_events(event_name, frame_alias):
    """Select the rows of `df_transaction_events` whose event is `event_name`.

    The result is aliased as `frame_alias` and has the columns account_id,
    offer_id (read from the nested 'offer id' field), time, event, amount and
    reward, where amount and reward are the literal 0.
    """
    return (
        df_transaction_events
        .where(F.col('event') == event_name)
        .alias(frame_alias)
        .select(
            'account_id',
            F.col('value.offer id').alias('offer_id'),
            F.col('time_since_test_start').alias('time'),
            'event',
            F.lit(0).alias('amount'),
            F.lit(0).alias('reward'),
        )
    )


# Offers delivered to an account.
df_offer_received = _offer_events('offer received', 'offer_received')

# Offers an account viewed.
df_offer_viewed = _offer_events('offer viewed', 'offer_viewed')

In [37]:
# Event timeline: transactions (with completed offers), received offers and
# viewed offers stacked together, tagged with the offer type, plus one 0/1
# indicator column per counted event kind.
#
# Changes compared with the previous version of this cell:
#   * the orderBy(['account_id', 'time']) before the join is gone -- Spark
#     gives no ordering guarantee across a join, so the sort only added cost;
#   * the frames are stacked with unionByName, so the result no longer depends
#     on the three inputs listing their columns in the same order;
#   * the nine event x offer-type indicators are generated instead of being
#     written out one by one (same names, same order).
df_timeline = (
    df_offer_transactions
    .unionByName(df_offer_received)
    .unionByName(df_offer_viewed)
    .join(
        df_offers_s,
        on='offer_id',
        how='left'
    )
    .withColumn('transactions_count',
        (F.col('event') == 'transaction').cast(T.IntegerType())
    )
    .select(
        '*',
        *[
            # 1 when the row is this event for this offer type, else 0
            # (NULL when the event matches but offer_type is NULL).
            ((F.col('event') == event_name) & (F.col('offer_type') == offer_type))
            .cast(T.IntegerType())
            .alias(stage + '_' + kind + '_count')
            for event_name, stage in (
                ('offer received', 'received'),
                ('offer viewed', 'viewed'),
                ('offer completed', 'completed'),
            )
            for offer_type, kind in (
                ('bogo', 'bogo'),
                ('discount', 'discount'),
                ('informational', 'info'),
            )
        ],
    )
)

In [38]:
# Running-total window: for each account, every row from the start of the
# partition up to and including the current one, ordered by time.
w = (
    Window
    .partitionBy('account_id')
    .orderBy('time')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Collapse the timeline to one row per (account_id, time) by summing the
# numeric columns, then accumulate those sums over the window above.
df_timeline_g = (
    df_timeline
    .groupBy(['account_id', 'time'])
    .sum()
    .select(
        '*',
        # Monetary totals are rounded to two decimals.
        *[
            F.round(F.sum(summed).over(w), 2).alias(total)
            for total, summed in (
                ('total_amount', 'sum(amount)'),
                ('total_reward', 'sum(reward)'),
            )
        ],
        # Event counters are accumulated as they are.
        *[
            F.sum(summed).over(w).alias(total)
            for total, summed in (
                ('total_transactions', 'sum(transactions_count)'),
                ('total_received_bogo', 'sum(received_bogo_count)'),
                ('total_received_discount', 'sum(received_discount_count)'),
                ('total_received_info', 'sum(received_info_count)'),
                ('total_viewed_bogo', 'sum(viewed_bogo_count)'),
                ('total_viewed_discount', 'sum(viewed_discount_count)'),
                ('total_viewed_info', 'sum(viewed_info_count)'),
                ('total_completed_bogo', 'sum(completed_bogo_count)'),
                ('total_completed_discount', 'sum(completed_discount_count)'),
                ('total_completed_info', 'sum(completed_info_count)'),
            )
        ],
    )
)

# The per-timestamp sums were only inputs to the running totals; remove them.
drop_agg_cols = [name for name in df_timeline_g.columns if name.startswith('sum(')]
df_timeline_g = df_timeline_g.drop(*drop_agg_cols)

In [39]:
# Collect the timeline features to the driver as a pandas DataFrame and save
# them as CSV without the index column.
(
    df_timeline_g
    .toPandas()
    .to_csv(DATA_PROCESSED_PATH + 'transaction_timeline.csv', index=False)
)

                                                                                