In [2]:
import pandas as pd
import numpy as np

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas diagnostics such as
# SettingWithCopyWarning and deprecation notices; consider narrowing it.
warnings.filterwarnings('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [3]:
# Root folder of the project. Alternative workspace location kept for reference:
# BASE_PATH = '/Workspace/Users/marcodaniel.ml@hotmail.com/ifood-case'
BASE_PATH = 'D:/Downloads/IFood/ifood-case/'

# BASE_PATH may or may not end with a separator; strip it before joining so the
# derived folders never contain a doubled slash ('ifood-case//data/...').
# Both derived paths keep their trailing '/', so `PATH + 'file.json'` still works.
_base_dir = BASE_PATH.rstrip('/\\')
DATA_RAW_PATH = _base_dir + '/data/raw/'
DATA_TEMP_PATH = _base_dir + '/data/temp/'

# profile

In [4]:
# Load the customer profiles and normalise their column types.
df_profile = pd.read_json(DATA_RAW_PATH + 'profile.json')

# Rows whose age exceeds this bound are discarded.
# NOTE(review): presumably larger values are placeholders for a missing age — confirm.
MAX_VALID_AGE = 101

df_profile['age'] = df_profile['age'].astype(int)
# .copy() makes the filtered frame independent of the loaded one, so the column
# assignments below write to a real frame instead of a slice of it.
df_profile = df_profile[df_profile['age'] <= MAX_VALID_AGE].copy()
# Registration dates are parsed with the compact YYYYMMDD format.
df_profile['registered_on'] = pd.to_datetime(df_profile['registered_on'], format='%Y%m%d')
# Upper-case the gender code; missing values become 'O'.
df_profile['gender'] = df_profile['gender'].str.upper().fillna('O')
df_profile['id'] = df_profile['id'].str.strip()
df_profile['credit_card_limit'] = df_profile['credit_card_limit'].astype(float)
# Use the same key name as the transactions table.
df_profile = df_profile.rename(columns={'id': 'account_id'})

# offers

In [5]:
# Load the offer catalogue, cast its numeric columns, trim the identifier and
# expose it as `offer_id`.
df_offers = (
    pd.read_json(DATA_RAW_PATH + 'offers.json')
    .astype({'min_value': float, 'duration': int, 'discount_value': float})
    .assign(id=lambda offers: offers['id'].str.strip())
    .rename(columns={'id': 'offer_id'})
)

# transactions

In [6]:
# Load the raw event log.
df_transactions = pd.read_json(DATA_RAW_PATH + 'transactions.json')

df_transactions['account_id'] = df_transactions['account_id'].str.strip()
df_transactions['time_since_test_start'] = df_transactions['time_since_test_start'].astype(float)

# Expand the per-event `value` payload into one column per key.
# Building the frame once from the list of payloads avoids creating a Series
# per row, which is what `.apply(pd.Series)` does and is very slow on large logs.
# Passing the original index keeps the rows aligned for the concat below.
# NOTE(review): assumes every `value` entry is a dict — confirm against the raw file.
value_expanded = pd.DataFrame(
    df_transactions['value'].tolist(),
    index=df_transactions.index,
)
df_transactions_s = pd.concat([df_transactions.drop(columns='value'), value_expanded], axis=1)

In [7]:
def _events_of(kind, columns, renames):
    """Return the rows of `df_transactions_s` whose `event` equals `kind`.

    Only `columns` are kept, and they are renamed according to `renames`.
    """
    is_kind = df_transactions_s['event'] == kind
    return df_transactions_s.loc[is_kind, columns].rename(columns=renames)


# Purchases: amount spent and when.
df_transaction = _events_of(
    'transaction',
    ['account_id', 'amount', 'time_since_test_start'],
    {'time_since_test_start': 'transaction_time'},
)

# Offers received / viewed: the payload key is spelled 'offer id' for these events.
df_offer_received = _events_of(
    'offer received',
    ['account_id', 'offer id', 'time_since_test_start'],
    {'offer id': 'offer_id', 'time_since_test_start': 'received_time'},
)
df_offer_viewed = _events_of(
    'offer viewed',
    ['account_id', 'offer id', 'time_since_test_start'],
    {'offer id': 'offer_id', 'time_since_test_start': 'viewed_time'},
)

# Offers completed: the payload key is already 'offer_id'; the timestamp gets the
# same name as in `df_transaction`.
df_offer_completed = _events_of(
    'offer completed',
    ['account_id', 'offer_id', 'reward', 'time_since_test_start'],
    {'time_since_test_start': 'transaction_time'},
)

## Timeline

In [8]:
# Build a single event timeline per account and turn it into running
# (cumulative) totals per (account_id, time).

# --- purchases, tagged with the offer(s) they completed ----------------------
# Left join: every purchase is kept. A purchase matching k completions at the
# same account and time comes back as k rows, one per completed offer.
df_transaction_tl = (
    df_transaction
    .merge(
        df_offer_completed,
        on=['account_id', 'transaction_time'],
        how='left'
    )
    .rename(columns={'transaction_time': 'time'})
)

# A purchase row that matched a completion is labelled 'offer_completed',
# every other purchase row is labelled 'transaction'.
# NOTE(review): as a consequence the `transactions` counter below only counts
# purchases that did not complete an offer — confirm this is intended.
df_transaction_tl['event'] = np.where(
    df_transaction_tl['offer_id'].notnull(),
    'offer_completed', 'transaction'
)

# --- offers received / viewed -------------------------------------------------
df_offer_received_tl = df_offer_received.rename(columns={'received_time': 'time'})
df_offer_received_tl['event'] = 'offer_received'

df_offer_viewed_tl = df_offer_viewed.rename(columns={'viewed_time': 'time'})
df_offer_viewed_tl['event'] = 'offer_viewed'

# --- stacked timeline -----------------------------------------------------------
df_timeline = pd.concat([df_transaction_tl, df_offer_received_tl, df_offer_viewed_tl])
df_timeline = df_timeline.drop_duplicates()

# A purchase that completed several offers appears once per offer, each row
# carrying the full purchase amount. Keep the amount on the first of those rows
# only, otherwise the sums below count the same purchase several times.
# Plain arrays are used because the concatenated index has repeated labels.
_is_purchase_row = df_timeline['amount'].notna().to_numpy()
_is_repeated_row = df_timeline.duplicated(['account_id', 'time', 'amount']).to_numpy()
df_timeline['amount'] = np.where(
    _is_purchase_row & _is_repeated_row, 0.0, df_timeline['amount']
)

df_timeline = df_timeline.sort_values(['account_id', 'time'])

# Attach the offer type; rows without an offer get the 'no_offer' label.
df_offers_tl = df_offers[['offer_id', 'offer_type']]
df_timeline = df_timeline.merge(df_offers_tl, on='offer_id', how='left')
df_timeline['offer_type'] = df_timeline['offer_type'].fillna('no_offer')
# Round trip through the index only to move the key columns to the front.
df_timeline = df_timeline.set_index(['account_id', 'offer_id', 'time']).reset_index()
# Rows that are not purchases / completions carry no amount / reward.
df_timeline[['amount', 'reward']] = df_timeline[['amount', 'reward']].fillna(0)

df_timeline = df_timeline.sort_values(['account_id', 'time'])

# --- 0/1 indicator columns ------------------------------------------------------
df_timeline['transactions'] = (df_timeline['event'] == 'transaction').astype(int)

# Column name = '<event prefix>_<offer suffix>', e.g. 'viewed_discount'.
_EVENT_PREFIX = {
    'offer_received': 'received',
    'offer_viewed': 'viewed',
    'offer_completed': 'completed',
}
_OFFER_SUFFIX = {
    'bogo': 'bogo',
    'discount': 'discount',
    'informational': 'info',
}

_indicator_columns = []
for _event_name, _prefix in _EVENT_PREFIX.items():
    for _offer_type, _suffix in _OFFER_SUFFIX.items():
        _column = f'{_prefix}_{_suffix}'
        df_timeline[_column] = (
            (df_timeline['event'] == _event_name)
            & (df_timeline['offer_type'] == _offer_type)
        ).astype(int)
        _indicator_columns.append(_column)

# --- cumulative totals per account ----------------------------------------------
# First total everything that happened at the same (account_id, time), then
# accumulate those totals over time within each account. Only the running
# '<name>_sum' columns are kept.
_flow_columns = ['amount', 'reward', 'transactions', *_indicator_columns]

df_timeline_g = (
    df_timeline
    .groupby(['account_id', 'time'])[_flow_columns]
    .sum()
    .groupby(level='account_id')
    .cumsum()
    .add_suffix('_sum')
    .reset_index()
)

In [9]:
# Optional export of the cumulative timeline (disabled).
# DATA_TEMP_PATH already ends with '/', so the file name is appended without a leading slash.
# df_timeline_g.to_csv(DATA_TEMP_PATH + 'transaction_timeline.csv', index=False)

In [11]:
# Sanity check: show the cumulative timeline of one account.
df_timeline_g.loc[df_timeline_g['account_id'].eq('0020ccbbb6d84e358d3414a3ff76cffd')]

Unnamed: 0,account_id,time,amount_sum,reward_sum,transactions_sum,received_bogo_sum,received_discount_sum,received_info_sum,viewed_bogo_sum,viewed_discount_sum,viewed_info_sum,completed_bogo_sum,completed_discount_sum,completed_info_sum
54,0020ccbbb6d84e358d3414a3ff76cffd,1.75,16.27,0.0,1,0,0,0,0,0,0,0,0,0
55,0020ccbbb6d84e358d3414a3ff76cffd,7.0,16.27,0.0,1,0,1,0,0,1,0,0,0,0
56,0020ccbbb6d84e358d3414a3ff76cffd,9.25,27.92,3.0,1,0,1,0,0,1,0,0,1,0
57,0020ccbbb6d84e358d3414a3ff76cffd,10.0,41.78,3.0,2,0,1,0,0,1,0,0,1,0
58,0020ccbbb6d84e358d3414a3ff76cffd,11.5,48.59,3.0,3,0,1,0,0,1,0,0,1,0
59,0020ccbbb6d84e358d3414a3ff76cffd,12.5,62.86,3.0,4,0,1,0,0,1,0,0,1,0
60,0020ccbbb6d84e358d3414a3ff76cffd,14.0,62.86,3.0,4,1,1,0,0,1,0,0,1,0
61,0020ccbbb6d84e358d3414a3ff76cffd,14.5,62.86,3.0,4,1,1,0,1,1,0,0,1,0
62,0020ccbbb6d84e358d3414a3ff76cffd,15.75,77.39,8.0,4,1,1,0,1,1,0,1,1,0
63,0020ccbbb6d84e358d3414a3ff76cffd,17.0,77.39,8.0,4,1,1,1,1,1,1,1,1,0
