In [1]:
import pandas as pd
import numpy as np

from functools import reduce

from sqlalchemy import create_engine

from datetime import date

import json

# Initial Ingestion and Cleaning

In [2]:
offers = pd.read_json('data/portfolio.json', orient='records', lines=True)
users = pd.read_json('data/profile.json', orient='records', lines=True)
interactions = pd.read_json('data/transcript.json', orient='records', lines=True)

offer_col_mapping = {
    'id': 'offer_id', 
    'channels': 'offer_notification_channel', 
    'offer_type': 'offer_type', 
    'difficulty': 'offer_spend_minimum', 
    'reward': 'offer_reward', 
    'duration': 'offer_duration'
}

users_col_mapping = {
    'id': 'user_id', 
    'gender': 'user_gender', 
    'age': 'user_age', 
    'became_member_on': 'user_member_since', 
    'income': 'user_income'
}

interactions_col_mapping = {
    'person': 'user_id', 
    'event': 'intxn_event_type', 
    'value': 'intxn_value', 
    'time': 'intxn_time'
}

# Reordering and renaming columns for offers
offers = offers[ list( offer_col_mapping.keys() ) ]
offers = offers.rename( columns=offer_col_mapping )
offers['offer_duration'] = offers['offer_duration'].apply( lambda x: x * 24 ) # Converting days to hours

# Exploding the offers table so that we have one row per channel per offer
offers = offers.explode('offer_notification_channel')

# Reordering and renaming columns for users
users = users[ list( users_col_mapping.keys() ) ]
users = users.rename( columns=users_col_mapping )

# Users with age 118 are users that do not have an age in their profile
users['user_age'] = users['user_age'].apply( lambda x: np.nan if x == 118 else x )
users['user_member_since'] = pd.to_datetime(users['user_member_since'], format='%Y%m%d', errors='coerce')
users['user_tenure'] = users['user_member_since'].apply( lambda x: ( pd.Timestamp(2019, 1, 1) - x).days if pd.notnull(x) else np.nan )
users['user_tenure'] = users['user_tenure'].apply( lambda x: int(x / 360) )

# Assigning ages to groups
user_age_groupings = {
    '0-18': (0, 18),
    '19-25': (19, 25),
    '26-35': (26, 35),
    '36-45': (36, 45),
    '46-55': (46, 55),
    '56-65': (56, 65),
    '66+': (66, np.inf)
}
users['user_age_group'] = pd.cut( users['user_age'], bins=[0, 18, 25, 35, 45, 55, 65, np.inf], labels=['0-18', '19-25', '26-35', '36-45', '46-55', '56-65', '66+'], right=False )

# Reordering and renaming columns for interactions
interactions = interactions[ list( interactions_col_mapping.keys() ) ]
interactions = interactions.rename( columns=interactions_col_mapping )

# Expanding the interaction value column into separate columns
intxn_value_expanded = pd.json_normalize( interactions['intxn_value'] )

# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.combine_first.html
intxn_value_expanded['offer_id'] = intxn_value_expanded['offer_id'].combine_first( intxn_value_expanded['offer id'] )
intxn_value_expanded = intxn_value_expanded.drop(columns=['offer id'])
intxn_value_expanded = intxn_value_expanded.rename( columns={ col: f'intxn_{col}' for col in intxn_value_expanded.columns.tolist() if col != 'offer_id' } )

# Adding the expanded interaction value columns back into the interactions dataframe
interactions = pd.concat([interactions, intxn_value_expanded], axis=1)
interactions = interactions.drop(columns=['intxn_value'])

In [3]:
offers

Unnamed: 0,offer_id,offer_notification_channel,offer_type,offer_spend_minimum,offer_reward,offer_duration
0,ae264e3637204a6fb9bb56bc8210ddfd,email,bogo,10,10,168
0,ae264e3637204a6fb9bb56bc8210ddfd,mobile,bogo,10,10,168
0,ae264e3637204a6fb9bb56bc8210ddfd,social,bogo,10,10,168
1,4d5c57ea9a6940dd891ad53e9dbe8da0,web,bogo,10,10,120
1,4d5c57ea9a6940dd891ad53e9dbe8da0,email,bogo,10,10,120
1,4d5c57ea9a6940dd891ad53e9dbe8da0,mobile,bogo,10,10,120
1,4d5c57ea9a6940dd891ad53e9dbe8da0,social,bogo,10,10,120
2,3f207df678b143eea3cee63160fa8bed,web,informational,0,0,96
2,3f207df678b143eea3cee63160fa8bed,email,informational,0,0,96
2,3f207df678b143eea3cee63160fa8bed,mobile,informational,0,0,96


In [4]:
users

Unnamed: 0,user_id,user_gender,user_age,user_member_since,user_income,user_tenure,user_age_group
0,68be06ca386d4c31939f3a4f0e3dd783,,,2017-02-12,,1,
1,0610b486422d4921ae7d2bf64640c50b,F,55.0,2017-07-15,112000.0,1,56-65
2,38fe809add3b4fcf9315a9694bb96ff5,,,2018-07-12,,0,
3,78afa995795e4d85b5d9ceeca43f5fef,F,75.0,2017-05-09,100000.0,1,66+
4,a03223e636434f42ac4c3df47e8bac43,,,2017-08-04,,1,
...,...,...,...,...,...,...,...
16995,6d5f3a774f3d4714ab0c092238f3a1d7,F,45.0,2018-06-04,54000.0,0,46-55
16996,2cb4f97358b841b9a9773a7aa05a9d77,M,61.0,2018-07-13,72000.0,0,56-65
16997,01d26f638c274aa0b965d24cefe3183f,M,49.0,2017-01-26,73000.0,1,46-55
16998,9dc1421481194dcd9400aec7c9ae6366,F,83.0,2016-03-07,50000.0,2,66+


In [5]:
interactions

Unnamed: 0,user_id,intxn_event_type,intxn_time,intxn_amount,offer_id,intxn_reward
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,,9b98b8c7a33c4b65b9aebfe6a799e6d9,
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,,0b1e1539f2cc45b7b9fa7c272da2e1d7,
2,e2127556f4f64592b11af22de27a7932,offer received,0,,2906b810c7d4411798c6938adc9daaa5,
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,,fafdcd668e3743c1bb461111dcafc2a4,
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,,4d5c57ea9a6940dd891ad53e9dbe8da0,
...,...,...,...,...,...,...
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,714,1.59,,
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,714,9.53,,
306531,a00058cf10334a308c68e7631c529907,transaction,714,3.61,,
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,714,3.53,,


In [6]:
conn = create_engine('sqlite:///data/raw_starbucks.db')

offers.to_sql('offers', conn, if_exists='replace', index=False)
users.to_sql('users', conn, if_exists='replace', index=False)
interactions.to_sql('interactions', conn, if_exists='replace', index=False)

306534