## Imports

In [0]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType, DateType, DoubleType

In [0]:
spark = SparkSession.builder.appName("ifood_case").getOrCreate()

In [0]:
path_offers = 'dbfs:/FileStore/ifood_case/data/raw/offers.json'
path_profile = 'dbfs:/FileStore/ifood_case/data/raw/profile.json'
path_transactions = 'dbfs:/FileStore/ifood_case/data/raw/transactions.json'

## Extracts

In [0]:
df_offers = spark.read.json(path_offers)
df_offers.display()

channels,discount_value,duration,id,min_value,offer_type
"List(email, mobile, social)",10,7.0,ae264e3637204a6fb9bb56bc8210ddfd,10,bogo
"List(web, email, mobile, social)",10,5.0,4d5c57ea9a6940dd891ad53e9dbe8da0,10,bogo
"List(web, email, mobile)",0,4.0,3f207df678b143eea3cee63160fa8bed,0,informational
"List(web, email, mobile)",5,7.0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,bogo
"List(web, email)",5,10.0,0b1e1539f2cc45b7b9fa7c272da2e1d7,20,discount
"List(web, email, mobile, social)",3,7.0,2298d6c36e964ae4a3e7e9706d1fb8c2,7,discount
"List(web, email, mobile, social)",2,10.0,fafdcd668e3743c1bb461111dcafc2a4,10,discount
"List(email, mobile, social)",0,3.0,5a8bc65990b245e5a138643cd4eb9837,0,informational
"List(web, email, mobile, social)",5,5.0,f19421c1d4aa40978ebb69ca19b0e20d,5,bogo
"List(web, email, mobile)",2,7.0,2906b810c7d4411798c6938adc9daaa5,10,discount


In [0]:
df_profile = spark.read.json(path_profile)
display(df_profile.limit(15))
df_profile.select(F.countDistinct('id')).display()

age,credit_card_limit,gender,id,registered_on
118,,,68be06ca386d4c31939f3a4f0e3dd783,20170212
55,112000.0,F,0610b486422d4921ae7d2bf64640c50b,20170715
118,,,38fe809add3b4fcf9315a9694bb96ff5,20180712
75,100000.0,F,78afa995795e4d85b5d9ceeca43f5fef,20170509
118,,,a03223e636434f42ac4c3df47e8bac43,20170804
68,70000.0,M,e2127556f4f64592b11af22de27a7932,20180426
118,,,8ec6ce2a7e7949b1bf142def7d0e0586,20170925
118,,,68617ca6246f4fbc85e91a2a49552598,20171002
65,53000.0,M,389bc3fa690240e798340f5a15918d5c,20180209
118,,,8974fc5686fe429db53ddde067b88302,20161122


count(DISTINCT id)
17000


In [0]:
df_transactions = spark.read.json(path_transactions)
display(df_transactions.limit(15))
df_transactions.select(F.countDistinct('account_id')).display()

account_id,event,time_since_test_start,value
78afa995795e4d85b5d9ceeca43f5fef,offer received,0.0,"List(null, 9b98b8c7a33c4b65b9aebfe6a799e6d9, null, null)"
a03223e636434f42ac4c3df47e8bac43,offer received,0.0,"List(null, 0b1e1539f2cc45b7b9fa7c272da2e1d7, null, null)"
e2127556f4f64592b11af22de27a7932,offer received,0.0,"List(null, 2906b810c7d4411798c6938adc9daaa5, null, null)"
8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0.0,"List(null, fafdcd668e3743c1bb461111dcafc2a4, null, null)"
68617ca6246f4fbc85e91a2a49552598,offer received,0.0,"List(null, 4d5c57ea9a6940dd891ad53e9dbe8da0, null, null)"
389bc3fa690240e798340f5a15918d5c,offer received,0.0,"List(null, f19421c1d4aa40978ebb69ca19b0e20d, null, null)"
c4863c7985cf408faee930f111475da3,offer received,0.0,"List(null, 2298d6c36e964ae4a3e7e9706d1fb8c2, null, null)"
2eeac8d8feae4a8cad5a6af0499a211d,offer received,0.0,"List(null, 3f207df678b143eea3cee63160fa8bed, null, null)"
aa4862eba776480b8bb9c68455b8c2e1,offer received,0.0,"List(null, 0b1e1539f2cc45b7b9fa7c272da2e1d7, null, null)"
31dda685af34476cad5bc968bdb01c53,offer received,0.0,"List(null, 0b1e1539f2cc45b7b9fa7c272da2e1d7, null, null)"


count(DISTINCT account_id)
17000


In [0]:
df_transactions.dropDuplicates(['event']).display()

account_id,event,time_since_test_start,value
9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,offer completed,0.0,"List(null, null, 2906b810c7d4411798c6938adc9daaa5, 2.0)"
78afa995795e4d85b5d9ceeca43f5fef,offer received,0.0,"List(null, 9b98b8c7a33c4b65b9aebfe6a799e6d9, null, null)"
389bc3fa690240e798340f5a15918d5c,offer viewed,0.0,"List(null, f19421c1d4aa40978ebb69ca19b0e20d, null, null)"
02c083884c7d45b39cc68e1314fec56c,transaction,0.0,"List(0.83, null, null, null)"


In [0]:
df_transactions.count()

Out[8]: 306534

## Transform

In [0]:
df_offers_transformed = (
    df_offers
    .withColumn('offer_id', F.col('id').cast(StringType()))
    .select('offer_id', 'offer_type', 'min_value', 'duration', 'discount_value', 'channels')
    .dropDuplicates()
)
display(df_offers_transformed.limit(15))

offer_id,offer_type,min_value,duration,discount_value,channels
f19421c1d4aa40978ebb69ca19b0e20d,bogo,5,5.0,5,"List(web, email, mobile, social)"
9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5,7.0,5,"List(web, email, mobile)"
4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,5.0,10,"List(web, email, mobile, social)"
ae264e3637204a6fb9bb56bc8210ddfd,bogo,10,7.0,10,"List(email, mobile, social)"
2298d6c36e964ae4a3e7e9706d1fb8c2,discount,7,7.0,3,"List(web, email, mobile, social)"
2906b810c7d4411798c6938adc9daaa5,discount,10,7.0,2,"List(web, email, mobile)"
0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,10.0,5,"List(web, email)"
5a8bc65990b245e5a138643cd4eb9837,informational,0,3.0,0,"List(email, mobile, social)"
3f207df678b143eea3cee63160fa8bed,informational,0,4.0,0,"List(web, email, mobile)"
fafdcd668e3743c1bb461111dcafc2a4,discount,10,10.0,2,"List(web, email, mobile, social)"


In [0]:
df_profile_transformed = (
    df_profile
    .withColumn('account_id', F.col('id').cast(StringType()))
    .withColumn('date_client_was_registered', F.date_format(F.to_date(F.col('registered_on'), 'yyyyMMdd'), 'yyyy-MM-dd').cast(DateType()))
    .withColumn('age_at_registration', F.col('age').cast(IntegerType()))
    .filter(F.col('age_at_registration') <= 100)
    .select('account_id', 'date_client_was_registered', 'age_at_registration', 'gender', 'credit_card_limit')
    .dropDuplicates(['account_id'])
)
display(df_profile_transformed.limit(15))

account_id,date_client_was_registered,age_at_registration,gender,credit_card_limit
0009655768c64bdeb2e877511632db8f,2017-04-21,33,M,72000.0
0011e0d4e6b944f998e987f904e8c1e5,2018-01-09,40,O,57000.0
0020c2b971eb4e9188eac86d93036a77,2016-03-04,59,F,90000.0
0020ccbbb6d84e358d3414a3ff76cffd,2016-11-11,24,F,60000.0
003d66b6608740288d6cc97a6903f4f0,2017-06-21,26,F,73000.0
00426fe3ffde4c6b9cb9ad6d077a13ea,2016-08-09,19,F,65000.0
004b041fbfe44859945daa2c7f79ee64,2018-05-08,55,F,74000.0
004c5799adbf42868b9cff0396190900,2016-03-31,54,M,99000.0
005500a7188546ff8a767329a2f7c76a,2017-12-09,56,M,47000.0
0056df74b63b4298809f0b375a304cf4,2016-08-21,54,M,91000.0


In [0]:
df_transactions_transformed = (
    df_transactions
    .withColumn('account_id', F.col('account_id').cast(StringType()))
    .withColumn('amount', F.col('value.amount'))
    .withColumn('offer id', F.col('value.offer id'))
    .withColumn('offer_id', F.col('value.offer_id'))
    .withColumn('offer_id', F.coalesce('offer_id', 'offer id'))
    .withColumn('reward', F.col('value.reward'))
    .select('account_id', 'offer_id', 'event', 'time_since_test_start', 'amount', 'reward')
    .dropDuplicates()
)
display(df_transactions_transformed.limit(15))

account_id,offer_id,event,time_since_test_start,amount,reward
c4863c7985cf408faee930f111475da3,2298d6c36e964ae4a3e7e9706d1fb8c2,offer received,0.0,,
aa4862eba776480b8bb9c68455b8c2e1,0b1e1539f2cc45b7b9fa7c272da2e1d7,offer received,0.0,,
78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9,offer received,0.0,,
31dda685af34476cad5bc968bdb01c53,0b1e1539f2cc45b7b9fa7c272da2e1d7,offer received,0.0,,
c27e0d6ab72c455a8bb66d980963de60,3f207df678b143eea3cee63160fa8bed,offer received,0.0,,
68617ca6246f4fbc85e91a2a49552598,4d5c57ea9a6940dd891ad53e9dbe8da0,offer received,0.0,,
4b0da7e80e5945209a1fdddfe813dbe0,ae264e3637204a6fb9bb56bc8210ddfd,offer received,0.0,,
e2127556f4f64592b11af22de27a7932,2906b810c7d4411798c6938adc9daaa5,offer received,0.0,,
8ec6ce2a7e7949b1bf142def7d0e0586,fafdcd668e3743c1bb461111dcafc2a4,offer received,0.0,,
3d02345581554e81b7b289ab5e288078,0b1e1539f2cc45b7b9fa7c272da2e1d7,offer received,0.0,,


In [0]:
df_customers_offers_transactions = (
    df_transactions_transformed
    .join(df_profile_transformed, 'account_id')
    .join(df_offers_transformed, 'offer_id', 'left')
)
display(df_customers_offers_transactions.limit(15))

offer_id,account_id,event,time_since_test_start,amount,reward,date_client_was_registered,age_at_registration,gender,credit_card_limit,offer_type,min_value,duration,discount_value,channels
9b98b8c7a33c4b65b9aebfe6a799e6d9,fae5f722dce445c1ae311729464943cf,offer received,0.0,,,2017-09-11,76,F,98000.0,bogo,5,7.0,5,"List(web, email, mobile)"
4d5c57ea9a6940dd891ad53e9dbe8da0,352ccf90feeb4c92a54834c7ad2b66bf,offer received,0.0,,,2016-05-23,52,F,87000.0,bogo,10,5.0,10,"List(web, email, mobile, social)"
5a8bc65990b245e5a138643cd4eb9837,3bcc51fdde354eb1949c813dbc905182,offer received,0.0,,,2016-06-13,54,F,52000.0,informational,0,3.0,0,"List(email, mobile, social)"
2298d6c36e964ae4a3e7e9706d1fb8c2,267e47de94fd46b1afa96dea1c9d3cbf,offer received,0.0,,,2018-07-26,28,M,33000.0,discount,7,7.0,3,"List(web, email, mobile, social)"
fafdcd668e3743c1bb461111dcafc2a4,d80f9e3f974448ec902c42818097ebf3,offer received,0.0,,,2017-08-21,47,F,62000.0,discount,10,10.0,2,"List(web, email, mobile, social)"
0b1e1539f2cc45b7b9fa7c272da2e1d7,74b8a279433a4750b8568ae7478ac2fe,offer received,0.0,,,2017-09-22,89,F,81000.0,discount,20,10.0,5,"List(web, email)"
9b98b8c7a33c4b65b9aebfe6a799e6d9,a104f5f8f470487693d626d36317dcf7,offer received,0.0,,,2016-09-29,94,F,31000.0,bogo,5,7.0,5,"List(web, email, mobile)"
f19421c1d4aa40978ebb69ca19b0e20d,39f0ce98d4ba48e98737b8fa051c5b97,offer received,0.0,,,2017-07-28,51,M,54000.0,bogo,5,5.0,5,"List(web, email, mobile, social)"
fafdcd668e3743c1bb461111dcafc2a4,38a9ec4b59a549b786ae49c5af104c7f,offer received,0.0,,,2016-08-01,48,M,32000.0,discount,10,10.0,2,"List(web, email, mobile, social)"
fafdcd668e3743c1bb461111dcafc2a4,ff9f73ead16a4f9b9e1a53a27280af92,offer received,0.0,,,2015-07-31,43,F,67000.0,discount,10,10.0,2,"List(web, email, mobile, social)"


In [0]:
print(f"Antes do join: {df_transactions_transformed.count()} registros")
print(f"Depois do join: {df_customers_offers_transactions.count()} registros")

Antes do join: 306137 registros
Depois do join: 272290 registros


In [0]:
print("Clientes removidos:", df_profile.count() - df_profile_transformed.count())
print("Transações removidas:", df_transactions_transformed.filter(~F.col('account_id').isin(df_profile_transformed.select('account_id').distinct().rdd.flatMap(lambda x: x).collect())).count())

Clientes removidos: 2180
Transações removidas: 33847


## EDA

In [0]:
df_customers_offers_transactions.describe().display()

summary,offer_id,account_id,event,time_since_test_start,amount,reward,age_at_registration,gender,credit_card_limit,offer_type,min_value,duration,discount_value
count,148380,272290,272290,272290.0,123910.0,32058.0,272290.0,272290,272290.0,148380,148380.0,148380.0,148380.0
mean,,2.565638242424101E31,,15.270470821550552,13.994548220482608,4.947158275625429,53.821590950824486,,64327.2613757391,,7.8831446286561535,6.622273891360021,4.441501550074134
stddev,,0.0,,8.34905678235019,31.75486488734324,2.905916154176298,17.531611551494837,,21241.4222749056,,5.038536111691908,2.1329909644687643,3.3735636186865667
min,0b1e1539f2cc45b7b9fa7c272da2e1d7,0009655768c64bdeb2e877511632db8f,offer completed,0.0,0.05,2.0,18.0,F,30000.0,bogo,0.0,3.0,0.0
max,fafdcd668e3743c1bb461111dcafc2a4,ffff82501cea40309d5fdd7edcca4a07,transaction,29.75,1062.28,10.0,100.0,O,120000.0,informational,20.0,10.0,10.0


In [0]:
df_customers_offers_transactions.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df_customers_offers_transactions.columns]).display()

offer_id,account_id,event,time_since_test_start,amount,reward,date_client_was_registered,age_at_registration,gender,credit_card_limit,offer_type,min_value,duration,discount_value,channels
123910,0,0,0,148380,240232,0,0,0,0,123910,123910,123910,123910,123910


In [0]:
df_customers_offers_transactions.groupBy('event').count().display()

event,count
transaction,123910
offer received,66478
offer completed,32058
offer viewed,49844


In [0]:
# Média de amount por tipo de transação
df_customers_offers_transactions.filter(F.col('event') != 'transaction').groupBy('offer_type').agg({'amount': 'mean'}).show()
df_customers_offers_transactions.filter(F.col('event') == 'transaction').agg({'amount': 'mean'}).show()

# Média de reward por tipo de transação
df_customers_offers_transactions.filter(F.col('event') != 'transaction').groupBy('offer_type').agg({'reward': 'mean'}).show()
df_customers_offers_transactions.filter(F.col('event') == 'transaction').agg({'reward': 'mean'}).show()

+-------------+-----------+
|   offer_type|avg(amount)|
+-------------+-----------+
|     discount|       null|
|informational|       null|
|         bogo|       null|
+-------------+-----------+

+------------------+
|       avg(amount)|
+------------------+
|13.994548220482601|
+------------------+

+-------------+------------------+
|   offer_type|       avg(reward)|
+-------------+------------------+
|     discount|2.8703419811320754|
|informational|              null|
|         bogo| 7.280103324943701|
+-------------+------------------+

+-----------+
|avg(reward)|
+-----------+
|       null|
+-----------+



### Separate transactions from offers

In [0]:
df_offers_transactions = (
    df_transactions_transformed
    .join(df_offers_transformed, 'offer_id', 'left')
)

df_customers_transactions = (
    df_offers_transactions
    .filter(F.col('event') == 'transaction')
    .drop('offer_id', 'reward', 'offer_type', 'min_value', 'duration', 'discount_value', 'channels')
)
print(df_customers_transactions.count())

df_customers_offers = (
    df_offers_transactions
    .filter(F.col('event') != 'transaction')
)
print(df_customers_offers.count())

offer_completed = (F.col('event') == 'offer completed') 
information_viewed = ((F.col('event') == 'offer viewed') & (F.col('offer_type') == 'informational'))
df_customers_offers_informations_completed = (
    df_customers_offers
    .filter(offer_completed | information_viewed)
    .drop('amount')
)
print(df_customers_offers_informations_completed.count())


138953
167184
44013


In [0]:
df_customers_offers_informations_completed.groupBy('offer_type').count().display()

offer_type,count
discount,17681
informational,10831
bogo,15501


In [0]:
display(df_customers_transactions.groupBy('account_id', 'time_since_test_start').count().orderBy(F.col('count').desc()).limit(15))
df_customers_offers_informations_completed.groupBy('account_id', 'time_since_test_start').count().orderBy(F.col('count').desc()).display()

account_id,time_since_test_start,count
abc4359eb34e4e2ca2349da2ddf771b6,3.5,1
2eede32dd09743bdb5e6143dd8db7b10,5.25,1
c0f9e4bb1f3a4d30a34f5a835b2c97ea,4.75,1
3ee1c588015f405eb0cc8aea95fd9daa,12.25,1
e685472e5137400db362c80bc216a915,0.75,1
668a6a51df4b454b93c31c945bc5eb76,14.0,1
3c487d3686d642c687c14eaf3fbd0f15,11.5,1
06186260c2554978a92be071e4fe3c79,29.75,1
b4f14ce6c0d048638741bbc1d3ae0f58,3.0,1
32f0edf2e8c24882a2a7f4de13c55b44,9.0,1


account_id,time_since_test_start,count
75bb371cf36d4a9186397a9866ed2fbe,24.0,4
5fe0c3ba7e3f49d499ddd2734fd6bbee,22.0,3
60b7692b124f4de99279cf046d036092,24.5,3
43ab1831aeb0431f98efd3578f14bf67,23.25,3
609dffa9eb714e3281a9aca1e5ce48f0,21.0,3
528339af535e43aaaa128fc5044fb46d,25.0,3
cb4a0508e95e41b8a57c59d69a20f06d,25.75,3
85d51ac3c81f49b9a179e80cf11d41f1,24.75,3
ff9beda564da4bab81633fca744037dc,25.0,3
7055b5fa5f8647618aa14e220b7c6b5c,24.5,3


In [0]:
df_customers_transactions_consolidated = (
    df_customers_transactions.alias('d')
    .join(df_customers_offers_informations_completed.alias('o'), on=[
        df_customers_transactions.account_id == df_customers_offers_informations_completed.account_id,
        df_customers_transactions.time_since_test_start == df_customers_offers_informations_completed.time_since_test_start], how='left').select('d.*', 'o.offer_id', 'o.reward', 'o.offer_type', 'o.min_value', 'o.duration', 'o.discount_value', 'o.channels')
    .withColumn('event', F.when((F.col('offer_id').isNotNull() & (F.col('offer_type') != 'informational')), 'transaction with offer')
                          .when(F.col('offer_type') == 'informational', 'transaction with information')
                          .otherwise(F.col('event')))
)

df_customers_others_offers = df_customers_offers.join(df_customers_transactions_consolidated.select('account_id', 'time_since_test_start', 'offer_id'), ['account_id', 'time_since_test_start', 'offer_id'], 'left_anti')

df_customers_transactions_consolidated = df_customers_transactions_consolidated.unionByName(df_customers_others_offers, allowMissingColumns=False)

display(df_customers_transactions_consolidated.limit(15))

account_id,event,time_since_test_start,amount,offer_id,reward,offer_type,min_value,duration,discount_value,channels
02c083884c7d45b39cc68e1314fec56c,transaction,0.0,0.83,,,,,,,
54890f68699049c2a04d415abc25e717,transaction,0.0,13.23,,,,,,,
629fc02d56414d91bca360decdfa9288,transaction with offer,0.0,33.9,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,bogo,5.0,7.0,5.0,"List(web, email, mobile)"
9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,transaction with offer,0.0,34.56,2906b810c7d4411798c6938adc9daaa5,2.0,discount,10.0,7.0,2.0,"List(web, email, mobile)"
a04fcfd571034456aaa6d56c0a3fd9b6,transaction,0.0,5.02,,,,,,,
a97e6f33219c432db82acfa0d19c602d,transaction,0.0,18.59,,,,,,,
b432b74402bb4981a4651c8df1670365,transaction,0.0,6.46,,,,,,,
227f2d69e46a4899b70d48182822cff6,transaction with offer,0.0,28.39,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,bogo,10.0,5.0,10.0,"List(web, email, mobile, social)"
4cbe33c601a5407f8202086565c55111,transaction,0.0,36.19,,,,,,,
676506bad68e4161b9bbaffeb039626b,transaction with offer,0.0,18.01,ae264e3637204a6fb9bb56bc8210ddfd,10.0,bogo,10.0,7.0,10.0,"List(email, mobile, social)"


In [0]:
df_customers_transactions_consolidated.count()

Out[37]: 269240

In [0]:
df_customers_transactions_metrics = (
    df_customers_transactions_consolidated
    .groupBy('account_id').agg(
        F.count(F.when(F.col('event').contains('transaction'), F.col('event'))).alias('freq_shopping'),
        F.count(F.when((F.col('event').contains('transaction')) & (F.col('offer_id').isNull()), F.col('event'))).alias('freq_shopping_without_offer'),
        F.avg(F.when(F.col('event').contains('transaction'), F.col('amount'))).alias('avg_ticket'),
        F.stddev(F.when(F.col('event').contains('transaction'), F.col('amount'))).alias('variability_ticket'),

        F.count(F.when(F.col('offer_id').isNotNull(), F.col('offer_id'))).alias('total_offers_received'),
        F.count(F.when((F.col('offer_id').isNotNull()) & (F.col('event').contains('transaction')), F.col('offer_id'))).alias('total_offers_accepted'),

        F.count(F.when((F.col('offer_type') == 'bogo') & (F.col('event').contains('transaction')), F.col('offer_id'))).alias('total_bogo_offers_accepted'),
        F.count(F.when((F.col('offer_type') == 'discount') & (F.col('event').contains('transaction')), F.col('offer_id'))).alias('total_discount_offers_accepted'),
        F.count(F.when((F.col('offer_type') == 'informational') & (F.col('event').contains('transaction')), F.col('offer_id'))).alias('total_informational_offers_accepted'),
    )
    .withColumn('perc_shopping_without_offer', F.col('freq_shopping_without_offer') / F.col('freq_shopping'))
    .withColumn('offer_acceptance_rate', F.col('total_offers_accepted') / F.col('total_offers_received'))
    .withColumn('bogo_offer_acceptance_rate', F.col('total_bogo_offers_accepted') / F.col('total_offers_received'))
    .withColumn('discount_offer_acceptance_rate', F.col('total_discount_offers_accepted') / F.col('total_offers_received'))
    .withColumn('informational_offer_acceptance_rate', F.col('total_informational_offers_accepted') / F.col('total_offers_received'))
)

display(df_customers_transactions_metrics.limit(15))
print(df_customers_transactions_metrics.count())

account_id,freq_shopping,freq_shopping_without_offer,avg_ticket,variability_ticket,total_offers_received,total_offers_accepted,total_bogo_offers_accepted,total_discount_offers_accepted,total_informational_offers_accepted,perc_shopping_without_offer,offer_acceptance_rate,bogo_offer_acceptance_rate,discount_offer_acceptance_rate,informational_offer_acceptance_rate
d3c24fa42d0947a4be104283c35a7983,15,12,13.548666666666668,3.802491413847104,9,3,2,1,0,0.8,0.3333333333333333,0.2222222222222222,0.1111111111111111,0.0
d1c50a0b12b84565a5182ee69f6c773e,4,4,3.1674999999999995,4.839120960119376,3,0,0,0,0,1.0,0.0,0.0,0.0,0.0
907e78791e0a4e58b91eee23eb7048d0,13,9,16.71461538461538,7.13052080307441,12,4,1,3,0,0.6923076923076923,0.3333333333333333,0.0833333333333333,0.25,0.0
5e0cac0673884c67b9eec062a30473f3,8,7,9.6225,21.457603214578413,6,1,0,0,1,0.875,0.1666666666666666,0.0,0.0,0.1666666666666666
dd1069bbc7ef423c9b22bc81aceb6ec1,19,18,3.216842105263158,2.578166990177278,12,1,0,1,0,0.9473684210526316,0.0833333333333333,0.0,0.0833333333333333,0.0
38197aa576534853acc113f11fb86b62,19,16,6.2663157894736825,3.0317692130839227,9,3,2,1,0,0.8421052631578947,0.3333333333333333,0.2222222222222222,0.1111111111111111,0.0
37aa48f7035f4b808d8292f5c95fd55f,11,8,8.826363636363636,3.582399399082332,12,3,1,2,0,0.7272727272727273,0.25,0.0833333333333333,0.1666666666666666,0.0
234f3b42e14245349935d57b937300dd,10,10,2.459,1.4965623571668802,8,0,0,0,0,1.0,0.0,0.0,0.0,0.0
6aea2f1dd8ab4ff68ce6230ca39af67d,4,3,3.015,2.201779583276522,10,1,1,0,0,0.75,0.1,0.1,0.0,0.0
d0a70db109564df1be5f0cfcf3296642,6,4,4.66,3.582446091708848,7,2,1,0,1,0.6666666666666666,0.2857142857142857,0.1428571428571428,0.0,0.1428571428571428


Databricks visualization. Run in Databricks to view.

17000


In [0]:
df_customers_to_maintain = (
    df_customers_transactions_metrics
    .filter(F.col('perc_shopping_without_offer') <= 0.8)
    .select('account_id', 'avg_ticket', 'variability_ticket', 'offer_acceptance_rate', 'bogo_offer_acceptance_rate', 'discount_offer_acceptance_rate', 'informational_offer_acceptance_rate')
)

In [0]:
df_customers_transactions_consolidated_metrics = (
    df_customers_transactions_consolidated
    .join(df_customers_to_maintain, 'account_id', 'inner')
    .join(df_profile_transformed, 'account_id', 'inner')
    .filter(F.col('offer_id').isNotNull())
    .fillna({'reward': 0, 'amount': 0})
    .withColumn('offer_accepted', F.when(F.col('event').contains('transaction with'), 1).otherwise(0))

    .withColumn('is_bogo_offer', F.when(F.col('offer_type') == 'bogo', 1).otherwise(0))
    .withColumn('is_discount_offer', F.when(F.col('offer_type') == 'discount', 1).otherwise(0))
    .withColumn('is_informational_offer', F.when(F.col('offer_type') == 'informational', 1).otherwise(0))

    .withColumn('has_web_notification', F.when(F.array_contains('channels', 'web'), 1).otherwise(0))
    .withColumn('has_mobile_notification', F.when(F.array_contains('channels', 'mobile'), 1).otherwise(0))
    .withColumn('has_social_notification', F.when(F.array_contains('channels', 'social'), 1).otherwise(0))

    .withColumn('is_male', F.when(F.col('gender') == 'M', 1).otherwise(0))
    .withColumn('is_female', F.when(F.col('gender') == 'F', 1).otherwise(0))
    .withColumn('is_other', F.when(F.col('gender') == 'O', 1).otherwise(0))

    .withColumn('customer_tenure', F.datediff(F.date_format(F.current_date(), 'yyyy-MM-dd'), F.col('date_client_was_registered')))
    .drop('event', 'channels', 'offer_type', 'reward', 'gender', 'date_client_was_registered')
)
print(df_customers_transactions_consolidated_metrics.count())
display(df_customers_transactions_consolidated_metrics.limit(15))

100008


account_id,time_since_test_start,amount,offer_id,min_value,duration,discount_value,avg_ticket,variability_ticket,offer_acceptance_rate,bogo_offer_acceptance_rate,discount_offer_acceptance_rate,informational_offer_acceptance_rate,age_at_registration,credit_card_limit,offer_accepted,is_bogo_offer,is_discount_offer,is_informational_offer,has_web_notification,has_mobile_notification,has_social_notification,is_male,is_female,is_other,customer_tenure
5abe3df001c14294a1796c4f78225fa0,25.5,8.9,0b1e1539f2cc45b7b9fa7c272da2e1d7,20,10.0,5,12.836,3.5490975378725977,0.3333333333333333,0.25,0.0833333333333333,0.0,48,57000.0,1,0,1,0,1,0,0,0,1,0,3482
f6c178ca2b1847d6b91fb46123b44981,7.25,27.6,f19421c1d4aa40978ebb69ca19b0e20d,5,5.0,5,21.68142857142857,3.947650898831813,0.3571428571428571,0.2142857142857142,0.1428571428571428,0.0,55,93000.0,1,1,0,0,1,1,1,1,0,0,2470
5465e844845a4a5c82864a608b79e940,15.0,14.19,f19421c1d4aa40978ebb69ca19b0e20d,5,5.0,5,11.488333333333337,4.026007534352943,0.25,0.125,0.125,0.0,65,56000.0,1,1,0,0,1,1,1,1,0,0,2675
72c8dfba6eb44158af1115edbdc3dd8c,24.75,28.59,ae264e3637204a6fb9bb56bc8210ddfd,10,7.0,10,26.5,6.243071588340379,0.3,0.3,0.0,0.0,56,106000.0,1,1,0,0,0,1,1,1,0,0,3223
d32d9bdf55ad47879f502c3602e0a45f,18.0,22.8,ae264e3637204a6fb9bb56bc8210ddfd,10,7.0,10,24.315,2.1425335469952373,0.25,0.25,0.0,0.0,63,75000.0,1,1,0,0,0,1,1,0,1,0,2485
5c73c99208e449129397591e045e641f,19.0,8.57,2906b810c7d4411798c6938adc9daaa5,10,7.0,2,8.074545454545454,4.993334284095876,0.2307692307692307,0.0769230769230769,0.1538461538461538,0.0,65,36000.0,1,0,1,0,1,1,0,1,0,0,2647
628d90cb330a4c42aca731c80e06f7ec,14.0,12.33,ae264e3637204a6fb9bb56bc8210ddfd,10,7.0,10,15.791999999999998,4.096309450116178,0.4166666666666667,0.0833333333333333,0.3333333333333333,0.0,58,67000.0,1,1,0,0,0,1,1,0,1,0,3006
d3c24fa42d0947a4be104283c35a7983,14.75,24.18,0b1e1539f2cc45b7b9fa7c272da2e1d7,20,10.0,5,13.548666666666668,3.802491413847105,0.3333333333333333,0.2222222222222222,0.1111111111111111,0.0,87,61000.0,1,0,1,0,1,0,0,0,1,0,3515
37aa48f7035f4b808d8292f5c95fd55f,22.25,16.38,2906b810c7d4411798c6938adc9daaa5,10,7.0,2,8.826363636363636,3.582399399082332,0.25,0.0833333333333333,0.1666666666666666,0.0,66,40000.0,1,0,1,0,1,1,0,1,0,0,3316
f5de20984950433d9c206f7c42477342,15.0,10.7,4d5c57ea9a6940dd891ad53e9dbe8da0,10,5.0,10,20.228125,6.696315622539506,0.3333333333333333,0.3333333333333333,0.0,0.0,48,75000.0,1,1,0,0,1,1,1,1,0,0,3477


In [0]:
df_customers_transactions_consolidated.groupBy('event').count().display()

event,count
transaction,107175
transaction with offer,33182
transaction with information,1255
offer received,73878
offer viewed,53750


In [0]:
df_customers_transactions_consolidated_metrics.groupBy('event').count().display()

event,count
transaction with offer,27583
transaction with information,810
offer received,42547
offer viewed,31685


In [0]:
df_customers_transactions_consolidated_metrics.groupBy('gender').count().display()

gender,count
F,50500
M,50508
O,1617


In [0]:
df_customers_transactions_consolidated_metrics.groupBy('offer_type').count().orderBy(F.desc('count')).display()
df_customers_transactions_consolidated_metrics.groupBy('event').count().orderBy(F.desc('count')).display()

offer_type,count
,96963
bogo,61013
discount,60008
informational,22648


event,count
transaction,96963
offer received,64409
offer viewed,47202
transaction with offer,29142
offer completed,2916


In [0]:
df_customers_transactions_consolidated_metrics.select('discount_value').summary().display()

summary,discount_value
count,100008.0
mean,4.4896808255339575
stddev,3.3078342737188
min,0.0
25%,2.0
50%,5.0
75%,5.0
max,10.0


In [0]:
df_customers_transactions_consolidated_metrics.select('time_since_test_start').summary().display()
quantiles = df_customers_transactions_consolidated_metrics.approxQuantile('time_since_test_start', [0.15, 0.7, 0.85], relativeError=0.05)
print(quantiles)

summary,time_since_test_start
count,100008.0
mean,14.890213782897368
stddev,8.26108967268318
min,0.0
25%,7.25
50%,17.0
75%,21.25
max,29.75


[1.0, 21.0, 23.5]
