In [190]:
import os

from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import pyspark.sql.types as T

# Get (or reuse) a Spark session named "iFood Case" on the local[*] master.
spark = (
    SparkSession.builder
    .appName("iFood Case")
    .master("local[*]")
    .getOrCreate()
)

In [191]:
# Project root. Set the IFOOD_CASE_PATH environment variable to run the
# notebook from another checkout; when it is unset the original location is
# used. A trailing slash is stripped so the concatenations below never produce
# a doubled separator.
BASE_PATH = os.environ.get('IFOOD_CASE_PATH', '/home/marco/ifood-case').rstrip('/')

# Input and output folders; both end with '/' so file names can be appended directly.
DATA_RAW_PATH = BASE_PATH + '/data/raw/'
DATA_PROCESSED_PATH = BASE_PATH + '/data/processed/'

## Profile Preprocessing

In [192]:
# Read profile.json from the raw-data folder into a Spark DataFrame.
df_profile = spark.read.json(f'{DATA_RAW_PATH}profile.json')

In [193]:
# Profile features: account id, age, days between registration and the
# reference date, and one 0/1 flag per gender code. Exact duplicate rows are
# removed at the end.
gender_codes = ('F', 'M', 'O')
reference_date = F.lit('2019-01-01').cast('date')

df_profile_processed = (
    df_profile
    .where(F.col('age') <= 101)
    .na.fill({'gender': 'O'})  # a missing gender is counted as 'O'
    .withColumn('registered_on', F.to_date(F.col('registered_on'), 'yyyyMMdd'))
    .select(
        F.col('id').alias('account_id'),
        F.col('age'),
        F.datediff(reference_date, F.col('registered_on')).alias('registered_days'),
        *[
            F.when(F.col('gender') == code, 1).otherwise(0).alias(f'gender_{code}')
            for code in gender_codes
        ],
    )
    .dropDuplicates()
)

In [194]:
# Collect the processed profiles to pandas and write them as CSV without the index.
profile_processed_pdf = df_profile_processed.toPandas()
profile_processed_pdf.to_csv(f'{DATA_PROCESSED_PATH}profile_processed.csv', index=False)

## Offers Preprocessing

In [195]:
# Read offers.json from the raw-data folder into a Spark DataFrame.
df_offers = spark.read.json(f'{DATA_RAW_PATH}offers.json')

In [196]:
channels_list = ['web', 'email', 'mobile', 'social']

# Add one integer indicator column per channel, derived from membership in the
# `channels` array. NOTE: this rebinds `df_offers`, so from here on the frame
# also carries the channel_* columns.
for channel in channels_list:
    is_in_channel = F.array_contains(F.col('channels'), channel)
    df_offers = df_offers.withColumn(f'channel_{channel}', is_in_channel.cast(T.IntegerType()))

# Offer attributes first, then the channel indicators in `channels_list` order.
offer_columns = [
    F.col('id').alias('offer_id'),
    F.col('offer_type'),
    F.col('discount_value'),
    F.col('min_value'),
    F.col('duration'),
]
channel_columns = [F.col(f'channel_{channel}') for channel in channels_list]

df_offers_processed = df_offers.select(offer_columns + channel_columns).dropDuplicates()

In [197]:
# Collect the processed offers to pandas and write them as CSV without the index.
offers_processed_pdf = df_offers_processed.toPandas()
offers_processed_pdf.to_csv(f'{DATA_PROCESSED_PATH}offers_processed.csv', index=False)

## Transaction Preprocessing

In [198]:
# Read transactions.json from the raw-data folder into a Spark DataFrame.
df_transaction_events = spark.read.json(f'{DATA_RAW_PATH}transactions.json')

                                                                                

In [199]:
def _select_events(events, event_name, columns):
    """Keep the rows of one event type and project them to renamed columns.

    Parameters
    ----------
    events : pyspark.sql.DataFrame
        Frame with an ``event`` column plus the source columns named in
        ``columns``.
    event_name : str
        Value of the ``event`` column to keep.
    columns : dict
        Maps each source column path (nested fields use dots, e.g.
        ``'value.amount'``) to its output column name. Output columns follow
        the dict's insertion order.

    Returns
    -------
    pyspark.sql.DataFrame
        The filtered frame containing only the requested columns.
    """
    return (
        events
        .filter(F.col('event') == event_name)
        .select([F.col(source).alias(target) for source, target in columns.items()])
    )


df_event_transaction_processed = _select_events(
    df_transaction_events,
    'transaction',
    {
        'account_id': 'account_id',
        'value.amount': 'amount',
        'time_since_test_start': 'transaction_time',
    },
)

# The offer key inside `value` is read as 'offer id' (with a space) for the
# received/viewed events and as 'offer_id' for completed events.
# NOTE(review): presumably this mirrors the raw JSON -- confirm against the data.
df_event_offer_received_processed = _select_events(
    df_transaction_events,
    'offer received',
    {
        'account_id': 'account_id',
        'value.offer id': 'offer_id',
        'time_since_test_start': 'received_time',
    },
)

df_event_offer_viewed_processed = _select_events(
    df_transaction_events,
    'offer viewed',
    {
        'account_id': 'account_id',
        'value.offer id': 'offer_id',
        'time_since_test_start': 'viewed_time',
    },
)

# The completion time is exported as 'transaction_time', the same column name
# used for transaction events.
# NOTE(review): looks intentional (e.g. to line up with transactions) -- confirm.
df_event_offer_completed_processed = _select_events(
    df_transaction_events,
    'offer completed',
    {
        'account_id': 'account_id',
        'value.offer_id': 'offer_id',
        'value.reward': 'reward',
        'time_since_test_start': 'transaction_time',
    },
)

In [200]:
# Write each event frame to its own CSV (no index) in the processed-data folder.
event_exports = {
    'event_transaction_processed.csv': df_event_transaction_processed,
    'event_offer_received_processed.csv': df_event_offer_received_processed,
    'event_offer_viewed_processed.csv': df_event_offer_viewed_processed,
    'event_offer_completed_processed.csv': df_event_offer_completed_processed,
}
for file_name, events in event_exports.items():
    events.toPandas().to_csv(DATA_PROCESSED_PATH + file_name, index=False)

                                                                                