# Business Application: Analysis and Modeling of Customer Purchases

### Part 1: Preliminary Data Analysis

In [2]:
import numpy as np
import pandas as pd

In [55]:
df = pd.read_csv('./data/audiobook_data.csv', header=None)

In [56]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,994,1620.0,1620,19.73,19.73,1,10.00,0.99,1603.8,5,92,0
1,1143,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,0,0
2,2059,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,388,0
3,2882,1620.0,1620,5.96,5.96,0,8.91,0.42,680.4,1,129,0
4,3342,2160.0,2160,5.33,5.33,0,8.91,0.22,475.2,0,361,0
...,...,...,...,...,...,...,...,...,...,...,...,...
14079,28220,1620.0,1620,5.33,5.33,1,9.00,0.61,988.2,0,4,0
14080,28671,1080.0,1080,6.55,6.55,1,6.00,0.29,313.2,0,29,0
14081,31134,2160.0,2160,6.14,6.14,0,8.91,0.00,0.0,0,0,0
14082,32832,1620.0,1620,5.33,5.33,1,8.00,0.38,615.6,0,90,0


In [57]:
df.columns

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

In [58]:
headers = ['ID',
           'book_length_avg', 'book_length_total', 'price_avg', 'price_total', 
           'gave_review', 'avg_review', 'completion', 'time_listened',
           'support_requests', 'last_used_minus_purchase',
           'TARGETS']

In [59]:
keys = np.arange(0, 12)
column_map = dict(zip(keys, headers))

In [60]:
df.rename(column_map, axis=1, inplace=True) 

In [61]:
df.describe()

Unnamed: 0,ID,book_length_avg,book_length_total,price_avg,price_total,gave_review,avg_review,completion,time_listened,support_requests,last_used_minus_purchase,TARGETS
count,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0
mean,16772.491551,1591.281685,1678.608634,7.103791,7.543805,0.16075,8.909795,0.125659,189.888983,0.070222,61.935033,0.158833
std,9691.807248,504.340663,654.838599,4.931673,5.560129,0.367313,0.643406,0.241206,371.08401,0.472157,88.207634,0.365533
min,2.0,216.0,216.0,3.86,3.86,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,8368.0,1188.0,1188.0,5.33,5.33,0.0,8.91,0.0,0.0,0.0,0.0,0.0
50%,16711.5,1620.0,1620.0,5.95,6.07,0.0,8.91,0.0,0.0,0.0,11.0,0.0
75%,25187.25,2160.0,2160.0,8.0,8.0,0.0,8.91,0.13,194.4,0.0,105.0,0.0
max,33683.0,2160.0,7020.0,130.94,130.94,1.0,10.0,1.0,2160.0,30.0,464.0,1.0


In [62]:
df['num_books'] = round(df['price_total'] / df['price_avg'], 1)
df

Unnamed: 0,ID,book_length_avg,book_length_total,price_avg,price_total,gave_review,avg_review,completion,time_listened,support_requests,last_used_minus_purchase,TARGETS,num_books
0,994,1620.0,1620,19.73,19.73,1,10.00,0.99,1603.8,5,92,0,1.0
1,1143,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,0,0,1.0
2,2059,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,388,0,1.0
3,2882,1620.0,1620,5.96,5.96,0,8.91,0.42,680.4,1,129,0,1.0
4,3342,2160.0,2160,5.33,5.33,0,8.91,0.22,475.2,0,361,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14079,28220,1620.0,1620,5.33,5.33,1,9.00,0.61,988.2,0,4,0,1.0
14080,28671,1080.0,1080,6.55,6.55,1,6.00,0.29,313.2,0,29,0,1.0
14081,31134,2160.0,2160,6.14,6.14,0,8.91,0.00,0.0,0,0,0,1.0
14082,32832,1620.0,1620,5.33,5.33,1,8.00,0.38,615.6,0,90,0,1.0


In [63]:
df.describe()

Unnamed: 0,ID,book_length_avg,book_length_total,price_avg,price_total,gave_review,avg_review,completion,time_listened,support_requests,last_used_minus_purchase,TARGETS,num_books
count,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0
mean,16772.491551,1591.281685,1678.608634,7.103791,7.543805,0.16075,8.909795,0.125659,189.888983,0.070222,61.935033,0.158833,1.063689
std,9691.807248,504.340663,654.838599,4.931673,5.560129,0.367313,0.643406,0.241206,371.08401,0.472157,88.207634,0.365533,0.330884
min,2.0,216.0,216.0,3.86,3.86,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,8368.0,1188.0,1188.0,5.33,5.33,0.0,8.91,0.0,0.0,0.0,0.0,0.0,1.0
50%,16711.5,1620.0,1620.0,5.95,6.07,0.0,8.91,0.0,0.0,0.0,11.0,0.0,1.0
75%,25187.25,2160.0,2160.0,8.0,8.0,0.0,8.91,0.13,194.4,0.0,105.0,0.0,1.0
max,33683.0,2160.0,7020.0,130.94,130.94,1.0,10.0,1.0,2160.0,30.0,464.0,1.0,7.0


In [64]:
df['num_books_check'] = round(df['book_length_total'] / df['book_length_avg'], 1)
df

Unnamed: 0,ID,book_length_avg,book_length_total,price_avg,price_total,gave_review,avg_review,completion,time_listened,support_requests,last_used_minus_purchase,TARGETS,num_books,num_books_check
0,994,1620.0,1620,19.73,19.73,1,10.00,0.99,1603.8,5,92,0,1.0,1.0
1,1143,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,0,0,1.0,1.0
2,2059,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,388,0,1.0,1.0
3,2882,1620.0,1620,5.96,5.96,0,8.91,0.42,680.4,1,129,0,1.0,1.0
4,3342,2160.0,2160,5.33,5.33,0,8.91,0.22,475.2,0,361,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14079,28220,1620.0,1620,5.33,5.33,1,9.00,0.61,988.2,0,4,0,1.0,1.0
14080,28671,1080.0,1080,6.55,6.55,1,6.00,0.29,313.2,0,29,0,1.0,1.0
14081,31134,2160.0,2160,6.14,6.14,0,8.91,0.00,0.0,0,0,0,1.0,1.0
14082,32832,1620.0,1620,5.33,5.33,1,8.00,0.38,615.6,0,90,0,1.0,1.0


In [65]:
np.min(df['num_books'] - df['num_books_check'])

0.0

In [66]:
np.max(df['num_books'] - df['num_books_check'])

0.0

### Feature Selection

It's possible that we don't need to use all the available input variables. Some of them contain the same information: e.g., minutes listened, total minutes purchases, and completion all contain related variables. 

In [67]:
df2 = df[['num_books', 'price_total', 'completion', 'last_used_minus_purchase', 'TARGETS']]

In [68]:
df2

Unnamed: 0,num_books,price_total,completion,last_used_minus_purchase,TARGETS
0,1.0,19.73,0.99,92,0
1,1.0,5.33,0.00,0,0
2,1.0,5.33,0.00,388,0
3,1.0,5.96,0.42,129,0
4,1.0,5.33,0.22,361,0
...,...,...,...,...,...
14079,1.0,5.33,0.61,4,0
14080,1.0,6.55,0.29,29,0
14081,1.0,6.14,0.00,0,0
14082,1.0,5.33,0.38,90,0


In [75]:
A = df2.values

In [76]:
reduced_inputs = A[:, 0:-1]
reduced_inputs

array([[1.000e+00, 1.973e+01, 9.900e-01, 9.200e+01],
       [1.000e+00, 5.330e+00, 0.000e+00, 0.000e+00],
       [1.000e+00, 5.330e+00, 0.000e+00, 3.880e+02],
       ...,
       [1.000e+00, 6.140e+00, 0.000e+00, 0.000e+00],
       [1.000e+00, 5.330e+00, 3.800e-01, 9.000e+01],
       [2.000e+00, 1.067e+01, 0.000e+00, 0.000e+00]])

In [77]:
reduced_targets = A[:, -1]
reduced_targets

array([0., 0., 0., ..., 0., 0., 1.])

In [78]:
reduced_targets.shape

(14084,)

In [94]:
ones_count = int(reduced_targets.sum())
ones_count

2237

### Balance the Data

There are only two outputs: 
* (1) customer made a purchase within 6 months
* (0) customer did not make a purchase within 6 months

We need to balance the data so that there are an equal number of priors in the target data used for training and testing: 50% make a purchase (1) and 50% do not (0). 

In [84]:
# shuffle the original input and target data
shuffled_indices_all = np.arange(reduced_inputs.shape[0])
np.random.shuffle(shuffled_indices_all)

In [85]:
shuffled_indices_all

array([13552, 10364,  6588, ...,  3658,  8928, 13684])

In [86]:
shuffled_inputs = reduced_inputs[shuffled_indices_all]
shuffled_targets = reduced_targets[shuffled_indices_all]

In [95]:
# to balance the dataset, we want to have an equal number of 0 targets and 1 targets
zero_count = 0
remove_indices = []

for i in range (shuffled_targets.shape[0]):
    if (shuffled_targets[i] == 0):
        zero_count += 1
        if (zero_count > ones_count):
            remove_indices.append(i)

In [96]:
unscaled_inputs_equal_priors = np.delete(shuffled_inputs, remove_indices, axis=0)
targets_equal_priors = np.delete(shuffled_targets, remove_indices, axis=0)

(unscaled_inputs_equal_priors.shape, targets_equal_priors.shape)

((4474, 4), (4474,))

In [97]:
# standardize the input data so that each variable has a mean of 0 and standard deviation of 1
from sklearn import preprocessing

In [98]:
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)
scaled_inputs.shape

(4474, 4)

In [99]:
# shuffle the data again
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

In [100]:
shuffled_unscaled_inputs = unscaled_inputs_equal_priors[shuffled_indices]

shuffled_scaled_inputs = scaled_inputs[shuffled_indices]

shuffled_targets = targets_equal_priors[shuffled_indices]

### Split Dataset into Training, Validation, and Testing

In [101]:
num_samples = shuffled_inputs.shape[0]
num_samples

14084