In [2]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
%matplotlib inline
import imp

import sys
  
# adding utils folder to the system path
sys.path.insert(0, '../utils')
  
# importing the utils_main library as utm
import utils_main as utm

# Read files

In [3]:
portfolio = pd.read_json('./../data/portfolio.json', lines=True)
profile = pd.read_json('./../data/profile.json', lines=True)
transcript = pd.read_json('./../data/transcript.json', lines=True)

The data comprises of three separate file described below.

# Portfolio data

portfolio.json contains information about the ten promotions.
* reward is the monetary value of the promotion.
* channels is the ways in which the promotion was advertised.
* diffculty is the amount the customer needs to spend in order to receive the reward.
* duration is the total number of days that the promotion was available.
* offer_type is the type of promotion. This is either a money off offer (discount), buy one get one free (BOGO) or
a news letter (informational).
* id is the promotion identitifcation --> rename as id_promotion


Preprocessing:
* Created the binary columns from the channels column of type list:
    * email column
    * mobile column
    * social column
    * web column
* Renamed id column into id_promotion column

In [4]:
imp.reload(utm)

<module 'utils_main' from 'C:\\Users\\BASTAMX2\\04_TargetingCustomers_DAIB\\TargetingCustomers_DAIB\\eda\\../utils\\utils_main.py'>

In [5]:
portfolio, profile, transcript = utm.readFiles(dropUnnecessaryCol = True)

In [6]:
portfolio

Unnamed: 0,reward,difficulty,duration,offer_type,id_promotion,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1
5,3,7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1,1
6,2,10,10,discount,fafdcd668e3743c1bb461111dcafc2a4,1,1,1,1
7,0,0,3,informational,5a8bc65990b245e5a138643cd4eb9837,1,1,1,0
8,5,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1
9,2,10,7,discount,2906b810c7d4411798c6938adc9daaa5,1,1,0,1


In [7]:
portfolio.shape

(10, 9)

In [8]:
print(portfolio.isnull().sum())

reward          0
difficulty      0
duration        0
offer_type      0
id_promotion    0
email           0
mobile          0
social          0
web             0
dtype: int64


In [9]:
print(portfolio.isna().sum())

reward          0
difficulty      0
duration        0
offer_type      0
id_promotion    0
email           0
mobile          0
social          0
web             0
dtype: int64


In [10]:
portfolio.dtypes

reward           int64
difficulty       int64
duration         int64
offer_type      object
id_promotion    object
email            int64
mobile           int64
social           int64
web              int64
dtype: object

In [11]:
portfolio_report = ProfileReport(portfolio)
portfolio_report.to_file("reports/portfolio.html")
#portfolio_report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Profile data

profile.json contains information about customers.
* gender is the identified gender of the customer.
* age is the age of the customer at the time of the promotion period.
* id is the customer membership identification --> rename as id_membership
* became_member is the date when the customer became a member.
* income is the self reported income of the customer at the time of the promotion period.


Assumption:
* You can assume that all customers were members before the promotional period began

Preprocessing:
* Changes format of the "became_member_on" column from int to datetime64[ns]
* Renamed id column into id_membership column

Things to do:
*  Since in this case we want the cluster customers, someone who joined in say March 2015 would have a longer membership than someone who joined say March 2018 and therefore the important aspect here may be the 'length of membership'.

Observation:
*  all the persons with **age 118** have their gender set to None  and their incomes set to NaN. Probably, these are persons who did not want to share their personal informations to the seller. While analysing this group later on, we notice that these persons tend to spend less than others if we look at the sumamry statistics of the total average spend per customer.

In [12]:
portfolio, profile, transcript = utm.readFiles(dropUnnecessaryCol = True)

In [77]:
profile.head()

Unnamed: 0,gender,age,id_membership,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,2017-02-12,
1,F,55,0610b486422d4921ae7d2bf64640c50b,2017-07-15,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,2018-07-12,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,2017-08-04,


All the persons with **age 118** seem to have their gender set to None  and their incomes set to NaN. Lets double check that

In [44]:
sum(profile.age==118)

2175

In [52]:
sum(profile.income.isna())

2175

In [51]:
sum(profile.gender.isna())

2175

In [14]:
profile.shape

(17000, 5)

In [54]:
sum(profile.income.isna() == profile.gender.isna())

17000

In [55]:
sum((profile.age==118) == (profile.gender.isna()))

17000

In [15]:
profile.dtypes

gender                      object
age                          int64
id_membership               object
became_member_on    datetime64[ns]
income                     float64
dtype: object

In [16]:
print(profile.isnull().sum())

gender              2175
age                    0
id_membership          0
became_member_on       0
income              2175
dtype: int64


In [17]:
print(profile.isna().sum())

gender              2175
age                    0
id_membership          0
became_member_on       0
income              2175
dtype: int64


In [18]:
profile_report = ProfileReport(profile)
profile_report.to_file("reports/profile.html")
#profile_report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Transcript data

transactions.json contains information specific transaction across the promotion period.
* person is the customer membership identification --> renamed id_membership
* event is the event - either offer received, offer viewed, offer complete or transaction.
* value.offer.id is the promotion identification --> renamed id_promotion
* value.amount is the amount spent in GBP for a given transaction.
* value.reward is the monetary value of the promotion.
* time is the time from the beginning of the promotion period.

Preprocessing:
* Extracted values from the "value" dictionnary column into:
    * id_promotion column
    * amount column
    * reward column
* Renamed person column into id_membership column
* Renamed value.offer.id column into id_promotion column

In [19]:
imp.reload(utm)

<module 'utils_main' from 'C:\\Users\\BASTAMX2\\04_TargetingCustomers_DAIB\\TargetingCustomers_DAIB\\eda\\../utils\\utils_main.py'>

In [20]:
portfolio, profile, transcript = utm.readFiles(dropUnnecessaryCol = True)

In [21]:
transcript.head()

Unnamed: 0,id_membership,event,time,id_promotion,amount,reward
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,0.0,0
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,0.0,0
2,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,0.0,0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,fafdcd668e3743c1bb461111dcafc2a4,0.0,0
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,4d5c57ea9a6940dd891ad53e9dbe8da0,0.0,0


In [22]:
transcript["id_promotion"].unique()

array(['9b98b8c7a33c4b65b9aebfe6a799e6d9',
       '0b1e1539f2cc45b7b9fa7c272da2e1d7',
       '2906b810c7d4411798c6938adc9daaa5',
       'fafdcd668e3743c1bb461111dcafc2a4',
       '4d5c57ea9a6940dd891ad53e9dbe8da0',
       'f19421c1d4aa40978ebb69ca19b0e20d',
       '2298d6c36e964ae4a3e7e9706d1fb8c2',
       '3f207df678b143eea3cee63160fa8bed',
       'ae264e3637204a6fb9bb56bc8210ddfd',
       '5a8bc65990b245e5a138643cd4eb9837', ''], dtype=object)

In [23]:
transcript["amount"].unique()

array([  0.  ,   0.83,  34.56, ..., 685.07, 405.04, 476.33])

In [24]:
transcript["reward"].unique()

array([ 0,  2,  5, 10,  3], dtype=int64)

In [25]:
transcript.shape

(306534, 6)

In [26]:
transcript.dtypes

id_membership     object
event             object
time               int64
id_promotion      object
amount           float64
reward             int64
dtype: object

In [27]:
print(transcript.isnull().sum())

id_membership    0
event            0
time             0
id_promotion     0
amount           0
reward           0
dtype: int64


In [28]:
print(transcript.isna().sum())

id_membership    0
event            0
time             0
id_promotion     0
amount           0
reward           0
dtype: int64


In [29]:
transcript_report = ProfileReport(transcript)
transcript_report.to_file("reports/transcript.html")
#transcript_report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Merging dataframes
This section does some investigation about merging dataframes

The idea is to merge the portfolio, profile, and transcript dataframes using the id_membership and id_promotion keys

In [30]:
portfolio.columns

Index(['reward', 'difficulty', 'duration', 'offer_type', 'id_promotion',
       'email', 'mobile', 'social', 'web'],
      dtype='object')

In [31]:
profile.columns

Index(['gender', 'age', 'id_membership', 'became_member_on', 'income'], dtype='object')

In [32]:
transcript.columns

Index(['id_membership', 'event', 'time', 'id_promotion', 'amount', 'reward'], dtype='object')

In [33]:
trans_pro = pd.merge(transcript, profile, on="id_membership")

In [34]:
transcript.shape

(306534, 6)

In [56]:
profile.shape

(17000, 5)

In [35]:
trans_pro.shape

(306534, 10)

The transcript df and the merged trans_pro df have the same number of rows --> no data has been lost --> Looks ok

In [36]:
trans_pro.head()

Unnamed: 0,id_membership,event,time,id_promotion,amount,reward,gender,age,became_member_on,income
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,0.0,0,F,75,2017-05-09,100000.0
1,78afa995795e4d85b5d9ceeca43f5fef,offer viewed,6,9b98b8c7a33c4b65b9aebfe6a799e6d9,0.0,0,F,75,2017-05-09,100000.0
2,78afa995795e4d85b5d9ceeca43f5fef,transaction,132,,19.89,0,F,75,2017-05-09,100000.0
3,78afa995795e4d85b5d9ceeca43f5fef,offer completed,132,,0.0,5,F,75,2017-05-09,100000.0
4,78afa995795e4d85b5d9ceeca43f5fef,transaction,144,,17.78,0,F,75,2017-05-09,100000.0


In [37]:
trans_pro_port = pd.merge(trans_pro, portfolio, on="id_promotion")

In [38]:
trans_pro.shape

(306534, 10)

In [39]:
trans_pro_port.shape

(134002, 18)

The trans_pro df and the merged trans_pro_port df do NOT have the same number of rows --> some data has been lost --> TO DOUBLE CHECK

In [40]:
trans_pro_port.head()

Unnamed: 0,id_membership,event,time,id_promotion,amount,reward_x,gender,age,became_member_on,income,reward_y,difficulty,duration,offer_type,email,mobile,social,web
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,0.0,0,F,75,2017-05-09,100000.0,5,5,7,bogo,1,1,0,1
1,78afa995795e4d85b5d9ceeca43f5fef,offer viewed,6,9b98b8c7a33c4b65b9aebfe6a799e6d9,0.0,0,F,75,2017-05-09,100000.0,5,5,7,bogo,1,1,0,1
2,e2127556f4f64592b11af22de27a7932,offer received,408,9b98b8c7a33c4b65b9aebfe6a799e6d9,0.0,0,M,68,2018-04-26,70000.0,5,5,7,bogo,1,1,0,1
3,e2127556f4f64592b11af22de27a7932,offer viewed,420,9b98b8c7a33c4b65b9aebfe6a799e6d9,0.0,0,M,68,2018-04-26,70000.0,5,5,7,bogo,1,1,0,1
4,68617ca6246f4fbc85e91a2a49552598,offer received,504,9b98b8c7a33c4b65b9aebfe6a799e6d9,0.0,0,,118,2017-10-02,,5,5,7,bogo,1,1,0,1


# Variables of interest

The idea is to derive a set of informative variables that we can use to cluster customers.

The name of the derived variables will start with "prep_" for preprocessing.

Some exploration first

In [80]:
transcript.event.unique()

array(['offer received', 'offer viewed', 'transaction', 'offer completed'],
      dtype=object)

There are four different types of events:
* offer received
* offer viewed
* transaction
* offer completed

In [86]:
sum(transcript[transcript.event == "transaction"].amount)

1775451.9699999907

In [162]:
sum(transcript[transcript.event == "offer received"].amount)

0.0

The "offer received", "offer viewed", and "offer completed" events have their corresponding amount set to 0.

Only the "transaction" events have a non-zero amount.

## Total average spend per customer

In [164]:
trans_mean = transcript.query('event == "transaction"') # Filter only on transactions events
trans_mean = trans_mean.groupby('id_membership').mean() #.sort_values(by="amount", ascending=False)
trans_mean = trans_mean.reset_index(level=[0]) # reset index
trans_mean = trans_mean[["id_membership", "amount"]] # only keep "id_membership" and "amount" columns for later merging to profile
trans_mean.rename(columns = {'amount':'prep_tot_aver_spend'}, inplace = True) # rename "amount" column to "prep_tot_aver_spend" column
trans_mean

Unnamed: 0,id_membership,prep_tot_aver_spend
0,0009655768c64bdeb2e877511632db8f,15.950000
1,00116118485d4dfda04fdbaba9a87b5c,1.363333
2,0011e0d4e6b944f998e987f904e8c1e5,15.892000
3,0020c2b971eb4e9188eac86d93036a77,24.607500
4,0020ccbbb6d84e358d3414a3ff76cffd,12.837500
...,...,...
16573,fff3ba4757bd42088c044ca26d73817a,52.816364
16574,fff7576017104bcc8677a8d63322b5e1,4.990000
16575,fff8957ea8b240a6b5e634b6ee8eafcf,2.430000
16576,fffad4f4828548d1b5583907f2e9906b,7.402500


In [142]:
trans_mean.shape

(16578, 2)

In [144]:
profile.shape

(17000, 5)

There is less rows in trans_mean than profile --> some customers do **NOT** do any transactions probably. lets do a full outer join to double check that

In [145]:
pro_test = pd.merge(trans_mean, profile, on="id_membership",how='outer')

In [146]:
pro_test

Unnamed: 0,id_membership,prep_tot_aver_spend,gender,age,became_member_on,income
0,0009655768c64bdeb2e877511632db8f,15.950000,M,33,2017-04-21,72000.0
1,00116118485d4dfda04fdbaba9a87b5c,1.363333,,118,2018-04-25,
2,0011e0d4e6b944f998e987f904e8c1e5,15.892000,O,40,2018-01-09,57000.0
3,0020c2b971eb4e9188eac86d93036a77,24.607500,F,59,2016-03-04,90000.0
4,0020ccbbb6d84e358d3414a3ff76cffd,12.837500,F,24,2016-11-11,60000.0
...,...,...,...,...,...,...
16995,122d6efefb634edeb6a7199163a012d5,,M,67,2018-06-22,73000.0
16996,8c8aab35a805417b9f818afd497f26a4,,M,63,2017-08-13,71000.0
16997,b29f03fcfc844f01b93c28ce950a2cd7,,M,34,2018-03-10,31000.0
16998,043b6b6be98c412d901f7f376e0548d0,,M,73,2017-10-05,77000.0


We see that the bottom rows have **NaN** for the prep_tot_aver_spend meaning that they did not do any transactions.

In [147]:
sum(pro_test.prep_tot_aver_spend.isna())

422

There are 422 customers that did not do any transactions

Lets do a inner merge this time to avoid any NaNs in the prep_tot_aver_spend column.

In [166]:
pro_ext = pd.merge(trans_mean, profile, on="id_membership", merge_how="inner")

TypeError: merge() got an unexpected keyword argument 'merge_how'

### Comparison of the 118 age group against the non-118 age group

We see that the **118** age group spend on average much **LESS** than the **non-118** age group.

In [167]:
pro_ext.prep_tot_aver_spend.describe()

count    16578.000000
mean        13.680266
std         16.056763
min          0.050000
25%          3.181392
50%         11.996607
75%         20.469643
max        451.470000
Name: prep_tot_aver_spend, dtype: float64

In [168]:
pro_ext[pro_ext.age==118].prep_tot_aver_spend.describe()

count    2086.000000
mean        2.759012
std         3.344033
min         0.050000
25%         1.632000
50%         2.265833
75%         3.062670
max        82.070000
Name: prep_tot_aver_spend, dtype: float64

In [169]:
pro_ext[pro_ext.age!=118].prep_tot_aver_spend.describe()

count    14492.000000
mean        15.252287
std         16.543371
min          0.150000
25%          3.890833
50%         14.311214
75%         21.467625
max        451.470000
Name: prep_tot_aver_spend, dtype: float64

## Total number of completed offer

In [113]:
transcript.head()

Unnamed: 0,id_membership,event,time,id_promotion,amount,reward
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,0.0,0
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,0.0,0
2,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,0.0,0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,fafdcd668e3743c1bb461111dcafc2a4,0.0,0
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,4d5c57ea9a6940dd891ad53e9dbe8da0,0.0,0


In [135]:
trans_count = transcript.groupby(['id_membership', 'event']).count()#.sort_values(by="amount", ascending=False)
trans_count.head(20)
#tot_aver_spend = trans_mean["amount"]

Unnamed: 0_level_0,Unnamed: 1_level_0,time,id_promotion,amount,reward
id_membership,event,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0009655768c64bdeb2e877511632db8f,offer completed,3,3,3,3
0009655768c64bdeb2e877511632db8f,offer received,5,5,5,5
0009655768c64bdeb2e877511632db8f,offer viewed,4,4,4,4
0009655768c64bdeb2e877511632db8f,transaction,8,8,8,8
00116118485d4dfda04fdbaba9a87b5c,offer received,2,2,2,2
00116118485d4dfda04fdbaba9a87b5c,offer viewed,2,2,2,2
00116118485d4dfda04fdbaba9a87b5c,transaction,3,3,3,3
0011e0d4e6b944f998e987f904e8c1e5,offer completed,3,3,3,3
0011e0d4e6b944f998e987f904e8c1e5,offer received,5,5,5,5
0011e0d4e6b944f998e987f904e8c1e5,offer viewed,5,5,5,5


We see different scenarios for customers:
* some customers view and use (e.g. complete) all the received offers
* some other customers do not use (e.g. complete) at all their received offers even if they have viewed them --> need to undestand why

In [172]:
trans_count_ind = trans_count.reset_index(level=[0,1]) # reset index
trans_count_filt = trans_count_ind[trans_count_ind["event"] == "offer completed"] # Filter on "offer completed" only
trans_count_filt.rename(columns = {'time':'prep_tot_comp_offer'}, inplace = True) # rename "amount" column to "prep_tot_aver_spend" column
trans_count_filt = trans_count_filt[["id_membership", "prep_tot_comp_offer"]] # only keep the prep_tot_aver_spend and id_membership columns before merging
trans_count_filt

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,id_membership,prep_tot_comp_offer
0,0009655768c64bdeb2e877511632db8f,3
7,0011e0d4e6b944f998e987f904e8c1e5,3
11,0020c2b971eb4e9188eac86d93036a77,3
15,0020ccbbb6d84e358d3414a3ff76cffd,3
19,003d66b6608740288d6cc97a6903f4f0,3
...,...,...
63157,fff29fb549084123bd046dbc5ceb4faa,6
63161,fff3ba4757bd42088c044ca26d73817a,3
63165,fff7576017104bcc8677a8d63322b5e1,3
63172,fffad4f4828548d1b5583907f2e9906b,3


In [117]:
trans_count_filt.shape

(12774, 6)

## Number of transactions over the set promotion period

In [181]:
trans_count = transcript.query('event == "transaction"') # Filter on transactions events
trans_count = trans_count.groupby('id_membership').count() # groupby id_membership and apply mean
trans_count = trans_count.reset_index(level=[0]) # reset index
trans_count.rename(columns = {'time':'prep_nb_of_transactions'}, inplace = True) # rename "amount" column to "prep_tot_aver_spend" column

trans_count = trans_count[["id_membership", "prep_nb_of_transactions"]] # only keep the prep_nb_of_transactions and id_membership columns before merging
trans_count

Unnamed: 0,id_membership,prep_nb_of_transactions
0,0009655768c64bdeb2e877511632db8f,8
1,00116118485d4dfda04fdbaba9a87b5c,3
2,0011e0d4e6b944f998e987f904e8c1e5,5
3,0020c2b971eb4e9188eac86d93036a77,8
4,0020ccbbb6d84e358d3414a3ff76cffd,12
...,...,...
16573,fff3ba4757bd42088c044ca26d73817a,11
16574,fff7576017104bcc8677a8d63322b5e1,6
16575,fff8957ea8b240a6b5e634b6ee8eafcf,5
16576,fffad4f4828548d1b5583907f2e9906b,12


## Test preprocessing function

In [196]:
imp.reload(utm)

<module 'utils_main' from 'C:\\Users\\BASTAMX2\\04_TargetingCustomers_DAIB\\TargetingCustomers_DAIB\\eda\\../utils\\utils_main.py'>

In [183]:
profile.shape

(17000, 5)

In [197]:
profile_prep = utm.preprocessing(portfolio, profile, transcript, merge_how="outer")
profile_prep.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,id_membership,prep_nb_of_offer_view,prep_nb_of_offer_rec,prep_nb_of_transactions,prep_nb_of_offer_comp,prep_tot_spend,prep_tot_aver_spend,gender,age,became_member_on,income
0,0009655768c64bdeb2e877511632db8f,4.0,5.0,8.0,3.0,127.6,15.95,M,33,2017-04-21,72000.0
1,00116118485d4dfda04fdbaba9a87b5c,2.0,2.0,3.0,,4.09,1.363333,,118,2018-04-25,
2,0011e0d4e6b944f998e987f904e8c1e5,5.0,5.0,5.0,3.0,79.46,15.892,O,40,2018-01-09,57000.0
3,0020c2b971eb4e9188eac86d93036a77,3.0,5.0,8.0,3.0,196.86,24.6075,F,59,2016-03-04,90000.0
4,0020ccbbb6d84e358d3414a3ff76cffd,4.0,4.0,12.0,3.0,154.05,12.8375,F,24,2016-11-11,60000.0


In [175]:
profile_prep.shape

(17000, 7)

In [198]:
sum(profile_prep.prep_tot_aver_spend.isna())

422

422 customers have not spend a single dollar (no transactions)

In [185]:
sum(profile_prep.prep_nb_of_transactions.isna())

422

422 customer have **not** executed any transactions

In [199]:
sum(profile_prep.prep_nb_of_offer_comp.isna())

4226

4226 customers have **not** completed any offers --> why

In [200]:
sum(profile_prep.prep_nb_of_offer_rec.isna())

6

6 customers have **not** received any offers --> why

In [202]:
sum(profile_prep.prep_nb_of_offer_view.isna())

166

166 customers have **not** viewed any offers --> why