## Extracting Customer Behavior Features

For each customer, the following features are extracted for clustering: the number of offers received, the number of offers viewed, the number of offers completed, the number of transactions made, and the total amount of money spent.

In [68]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import mysql.connector
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy

In [87]:
# Open the MySQL connection used to load the source tables.
# Connection settings are read from environment variables so that no credentials
# are stored in the notebook; the previous host/user/database values remain the
# defaults. If STARBUCKS_DB_PASSWORD is unset, the password is prompted for.
mydb = mysql.connector.connect(
    host=os.environ.get("STARBUCKS_DB_HOST", "localhost"),
    user=os.environ.get("STARBUCKS_DB_USER", "root"),
    password=os.environ.get("STARBUCKS_DB_PASSWORD") or getpass("MySQL password: "),
    database=os.environ.get("STARBUCKS_DB_NAME", "starbucks_db"),
)

print(mydb)

<mysql.connector.connection_cext.CMySQLConnection object at 0x000001D91BC1FE20>


In [70]:
# Load each source table in full into its own DataFrame over the open `mydb` connection.
customers = pd.read_sql(sql='SELECT * FROM customers', con=mydb)
offers = pd.read_sql(sql='SELECT * FROM offers', con=mydb)
transcripts = pd.read_sql(sql='SELECT * FROM transcripts', con=mydb)
members = pd.read_sql(sql='SELECT * FROM members', con=mydb)



In [71]:
# Preview the first rows of `transcripts` to check the columns that were loaded.
transcripts.head()

Unnamed: 0,index,person,event,dict_key,dict_value,hours_since_start
0,0,78afa995795e4d85b5d9ceeca43f5fef,offer received,offer id,9b98b8c7a33c4b65b9aebfe6a799e6d9,0
1,1,a03223e636434f42ac4c3df47e8bac43,offer received,offer id,0b1e1539f2cc45b7b9fa7c272da2e1d7,0
2,2,e2127556f4f64592b11af22de27a7932,offer received,offer id,2906b810c7d4411798c6938adc9daaa5,0
3,3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,offer id,fafdcd668e3743c1bb461111dcafc2a4,0
4,4,68617ca6246f4fbc85e91a2a49552598,offer received,offer id,4d5c57ea9a6940dd891ad53e9dbe8da0,0


In [72]:
# Start from an empty frame that declares one column per customer-level feature;
# the columns are filled in by the cells that follow.
customer_behavior = pd.DataFrame(
    columns=[
        'person',
        'num_received',
        'num_viewed',
        'num_completed',
        'num_transactions',
        'money_spent',
    ]
)

In [73]:
# Populate `person` with every distinct customer id found in `transcripts`,
# in ascending order (the counts themselves are discarded; only the sorted index is kept).
customer_behavior['person'] = (
    transcripts
    .value_counts(subset='person')
    .sort_index()
    .index
    .values
)

In [45]:
# Inspect the frame: at this point only `person` has been assigned.
customer_behavior

Unnamed: 0,person,num_received,num_viewed,num_completed,num_transactions,money_spent
0,0009655768c64bdeb2e877511632db8f,,,,,
1,00116118485d4dfda04fdbaba9a87b5c,,,,,
2,0011e0d4e6b944f998e987f904e8c1e5,,,,,
3,0020c2b971eb4e9188eac86d93036a77,,,,,
4,0020ccbbb6d84e358d3414a3ff76cffd,,,,,
...,...,...,...,...,...,...
16995,fff3ba4757bd42088c044ca26d73817a,,,,,
16996,fff7576017104bcc8677a8d63322b5e1,,,,,
16997,fff8957ea8b240a6b5e634b6ee8eafcf,,,,,
16998,fffad4f4828548d1b5583907f2e9906b,,,,,


In [74]:
# Order the event log by customer, then count the non-null entries of every
# remaining column for each (person, event) pair.
transcripts_filtered = transcripts.sort_values(by='person')
transcripts_grouped = (
    transcripts_filtered
    .groupby(by=['person', 'event'])
    .count()
)
transcripts_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,dict_key,dict_value,hours_since_start
person,event,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0009655768c64bdeb2e877511632db8f,offer completed,3,3,3,3
0009655768c64bdeb2e877511632db8f,offer received,5,5,5,5
0009655768c64bdeb2e877511632db8f,offer viewed,4,4,4,4
0009655768c64bdeb2e877511632db8f,transaction,8,8,8,8
00116118485d4dfda04fdbaba9a87b5c,offer received,2,2,2,2


In [75]:
# Index by customer id so the per-event counts below align by label.
# Guarded and not in-place, so re-running this cell does not raise a KeyError
# once `person` has already become the index.
if 'person' in customer_behavior.columns:
    customer_behavior = customer_behavior.set_index('person')

# Event type in `transcripts` -> feature column it populates.
event_to_feature = {
    'offer received': 'num_received',
    'offer viewed': 'num_viewed',
    'offer completed': 'num_completed',
    'transaction': 'num_transactions',
}

# Pivot the (person, event) counts into one column per event type, using the
# `dict_key` count as the per-pair count. This replaces a per-customer loop that
# rebuilt `list(transcripts_grouped.index.values)` four times per iteration, which
# made the cell quadratic in the number of customers.
# `reindex` guarantees a row per customer and a column per expected event type;
# combinations that never occur stay missing (<NA>), and the nullable 'Int64'
# dtype keeps the counts integral despite the missing values.
event_counts = (
    transcripts_grouped['dict_key']
    .unstack('event')
    .reindex(index=customer_behavior.index, columns=list(event_to_feature))
    .astype('Int64')
)

# Add data to `num_received`, `num_viewed`, `num_completed` and `num_transactions`
for event_name, feature_name in event_to_feature.items():
    customer_behavior[feature_name] = event_counts[event_name]

In [76]:
# Calculate how much money each customer spent
money_spent = transcripts_filtered[transcripts_filtered['event'] == 'transaction'].groupby('person').agg({'dict_value': 'sum'})

# Add to money_spent
customer_behavior['money_spent'] = money_spent
customer_behavior.head(3)

Unnamed: 0_level_0,num_received,num_viewed,num_completed,num_transactions,money_spent
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0009655768c64bdeb2e877511632db8f,5,4,3.0,8,18.418.5728.1614.1112.3622.1613.5610.27
00116118485d4dfda04fdbaba9a87b5c,2,2,,3,0.70000000000000010.23.19
0011e0d4e6b944f998e987f904e8c1e5,5,5,3.0,5,11.938.9623.0313.4922.05


In [77]:
# Replace NA with 0
customer_behavior.fillna(0, inplace = True)
customer_behavior.isna().sum()

num_received        0
num_viewed          0
num_completed       0
num_transactions    0
money_spent         0
dtype: int64

In [80]:
# Keep an independent snapshot of the feature table while it still contains `num_received`.
customer_behavior_with_num_received = customer_behavior.copy(deep=True)

Drop column num_received
I am going to use the customer-behavior features for the clustering analysis. However, the number of offers received was not actively decided by the customers; therefore, I will drop it before running the clustering.

In [81]:
# Drop `num_received`.
# Not in-place and with errors='ignore', so re-running the cell after the column
# is already gone does not raise.
customer_behavior = customer_behavior.drop(columns='num_received', errors='ignore')

# Calculate how much money each customer spent during the month of the campaign.
# The transaction rows are selected here explicitly so the cell does not depend on
# a `transactions` variable left over in the kernel, and `dict_value` (stored as
# text) is converted to numbers so the sum adds amounts instead of concatenating strings.
transactions = transcripts.loc[transcripts['event'] == 'transaction']
money_spent = (
    pd.to_numeric(transactions['dict_value'])
    .groupby(transactions['person'])
    .sum()
)

# Add to `money_spent`; aligned on the index labels of `customer_behavior`.
customer_behavior['money_spent'] = money_spent

I am going to use the customer-behavior features for the clustering analysis. However, the number of offers received was not actively decided by the customers; therefore, I will drop it before running the clustering.

In [82]:
# Replace NA with 0
customer_behavior.fillna(0, inplace = True)
customer_behavior.isna().sum()
customer_behavior.head(3)

Unnamed: 0_level_0,num_viewed,num_completed,num_transactions,money_spent
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0009655768c64bdeb2e877511632db8f,4,3,8,127.6
00116118485d4dfda04fdbaba9a87b5c,2,0,3,4.09
0011e0d4e6b944f998e987f904e8c1e5,5,3,5,79.46


In [83]:
# Turn the `person` index back into an ordinary column in both tables so it is
# kept as data when the frames are exported.
customer_behavior_export = customer_behavior.reset_index()
customer_behavior_with_num_received_export = customer_behavior_with_num_received.reset_index()

In [92]:
# Display the feature table (without `num_received`) that is written to the database below.
customer_behavior_export

Unnamed: 0,person,num_viewed,num_completed,num_transactions,money_spent
0,0009655768c64bdeb2e877511632db8f,4,3,8,127.60
1,00116118485d4dfda04fdbaba9a87b5c,2,0,3,4.09
2,0011e0d4e6b944f998e987f904e8c1e5,5,3,5,79.46
3,0020c2b971eb4e9188eac86d93036a77,3,3,8,196.86
4,0020ccbbb6d84e358d3414a3ff76cffd,4,3,12,154.05
...,...,...,...,...,...
16995,fff3ba4757bd42088c044ca26d73817a,3,3,11,580.98
16996,fff7576017104bcc8677a8d63322b5e1,4,3,6,29.94
16997,fff8957ea8b240a6b5e634b6ee8eafcf,2,0,5,12.15
16998,fffad4f4828548d1b5583907f2e9906b,4,3,12,88.83


In [93]:
# Connection settings are read from environment variables so that no credentials
# are stored in the notebook; the previous user/host/database values remain the
# defaults. If STARBUCKS_DB_PASSWORD is unset, the password is prompted for.
database_username = os.environ.get('STARBUCKS_DB_USER', 'root')
database_password = os.environ.get('STARBUCKS_DB_PASSWORD') or getpass('MySQL password: ')
database_ip       = os.environ.get('STARBUCKS_DB_HOST', '127.0.0.1')
database_name     = os.environ.get('STARBUCKS_DB_NAME', 'starbucks_db')

# Percent-encode the user name and password so that characters such as '@', ':'
# or '/' cannot corrupt the connection URL.
database_url = 'mysql+mysqlconnector://{0}:{1}@{2}/{3}'.format(
    quote_plus(database_username), quote_plus(database_password),
    database_ip, database_name)
database_engine = sqlalchemy.create_engine(database_url, pool_recycle=1, pool_timeout=57600)

# The `with` block closes the connection even if one of the writes fails.
# NOTE(review): if_exists='append' adds every row again each time this cell runs,
# so re-running the notebook duplicates the exported rows -- confirm whether
# 'replace' is the intended behavior.
with database_engine.connect() as database_connection:
    customer_behavior_export.to_sql(
        con=database_connection, name='customer_behavior_export_',
        if_exists='append', chunksize=100)
    customer_behavior_with_num_received_export.to_sql(
        con=database_connection, name='customer_behavior_with_num_received_',
        if_exists='append', chunksize=100)

17000