### Import the relevant libraries

In [65]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [66]:
# The header row (which only holds the column labels) has been stripped from 'original.csv', so the file can be
# read directly into a numeric array.
raw_data = np.loadtxt('original.csv', delimiter=',')

# The last column is the target (conversion). Everything between the first column (the customer ID) and the
# target forms the model inputs.
all_targets = raw_data[:, -1]
unscaled_inputs = raw_data[:, 1:-1]

# To show the structure of the data, load the copy of the file that still has its headers
view_data = pd.read_csv('original_w_headers.csv')
view_data.head()

Unnamed: 0,id,overall_book_length(mins),avg_book_length(mins),overall_price,avg_price,review?,review(10/10),completion,mins_listened,support_requests,last_visited_minus_purchase_date,conversion
0,994,1620.0,1620,19.73,19.73,1,10.0,0.99,1603.8,5,92,0
1,1143,2160.0,2160,5.33,5.33,0,8.91,0.0,0.0,0,0,0
2,2059,2160.0,2160,5.33,5.33,0,8.91,0.0,0.0,0,388,0
3,2882,1620.0,1620,5.96,5.96,0,8.91,0.42,680.4,1,129,0
4,3342,2160.0,2160,5.33,5.33,0,8.91,0.22,475.2,0,361,0


### Balance the dataset

In [67]:
# Summary statistics for every column. The `count` row gives the total number of rows, which is needed below to
# judge how far the targets are from a 50/50 split.
view_data.describe()

Unnamed: 0,id,overall_book_length(mins),avg_book_length(mins),overall_price,avg_price,review?,review(10/10),completion,mins_listened,support_requests,last_visited_minus_purchase_date,conversion
count,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0
mean,16772.491551,1591.281685,1678.608634,7.103791,7.543805,0.16075,8.909795,0.125659,189.888983,0.070222,61.935033,0.158833
std,9691.807248,504.340663,654.838599,4.931673,5.560129,0.367313,0.643406,0.241206,371.08401,0.472157,88.207634,0.365533
min,2.0,216.0,216.0,3.86,3.86,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,8368.0,1188.0,1188.0,5.33,5.33,0.0,8.91,0.0,0.0,0.0,0.0,0.0
50%,16711.5,1620.0,1620.0,5.95,6.07,0.0,8.91,0.0,0.0,0.0,11.0,0.0
75%,25187.25,2160.0,2160.0,8.0,8.0,0.0,8.91,0.13,194.4,0.0,105.0,0.0
max,33683.0,2160.0,7020.0,130.94,130.94,1.0,10.0,1.0,2160.0,30.0,464.0,1.0


In [68]:
# The targets are 1 for a conversion and 0 otherwise, so their sum is the number of conversions
num_conversions = all_targets.sum()
num_conversions

2237.0

In [69]:
# From the table above, the dataset has 14084 rows in total. For it to be balanced, half of the targets would have
# to be 1s, i.e. the targets would have to sum to 14084/2 = 7042. The sum above (2237) shows that we have far fewer
# 1s (conversions) than 0s (non-conversions). To train our model on a balanced dataset, we therefore reduce the
# dataset by discarding non-conversion rows until their number matches the number of conversions.

In [70]:
# Every row that is not counted as a conversion is counted as a non-conversion
zero_count = all_targets.shape[0] - num_conversions

# Number of non-conversion rows that have to go for both classes to end up the same size. It is clamped at 0 so
# that nothing is removed when the non-conversions are not in the majority.
surplus_zeros = max(int(np.ceil(zero_count - num_conversions)), 0)

# Pick the first `surplus_zeros` rows (in file order) whose target is 0
# NOTE(review): the rows dropped are always the earliest ones in the file, not a random sample - confirm that the
# file order carries no information before relying on this.
indices_to_remove = np.flatnonzero(all_targets == 0)[:surplus_zeros].tolist()
zero_count -= len(indices_to_remove)

# Deleting those rows from both arrays (with the same index list, so inputs and targets stay aligned) gives the
# balanced inputs and targets
unscaled_inputs_eq_priors = np.delete(unscaled_inputs, indices_to_remove, axis=0)
all_targets_eq_priors = np.delete(all_targets, indices_to_remove, axis=0)

# With equal priors the dataset is balanced. Next comes scaling.

### Scaling the inputs

In [71]:
# Standardize each input column of the balanced dataset (preprocessing.scale centres every feature and scales it
# to unit variance).
# NOTE(review): the scaling statistics are computed over all rows, including those that are later held out for
# validation and testing - confirm this is acceptable, or fit the scaler on the training rows only.
all_inputs_scaled = preprocessing.scale(unscaled_inputs_eq_priors)

### Shuffling the data

In [72]:
# Since we will be batching our data, we want each batch to be as random as possible, so we shuffle the data.
# We shuffle an array of row indices and apply the same shuffled indices to both the inputs and the targets, so
# that every input row stays paired with its own target.

# A fixed seed makes the shuffle - and with it the train/validation/test split below - reproducible across runs.
# A local RandomState is used so that numpy's global random state is left untouched.
SHUFFLE_SEED = 42

shuffled_indices = np.arange(all_inputs_scaled.shape[0])
np.random.RandomState(SHUFFLE_SEED).shuffle(shuffled_indices)

# Index the *scaled* inputs: indexing the unscaled array here would silently discard the scaling step above and
# pass raw, unstandardized features on to the split and to the saved files.
shuffled_inputs = all_inputs_scaled[shuffled_indices]
shuffled_targets = all_targets_eq_priors[shuffled_indices]

### Splitting the data

In [73]:
# Partition the shuffled data into training, validation and test sets using an 80-10-10 split

samples_count = shuffled_inputs.shape[0]

train_samples = int(0.8*samples_count)
validation_samples = int(0.1*samples_count)
# The test set takes whatever is left, so no sample is lost to the rounding above
test_samples = samples_count - train_samples - validation_samples

# Row positions at which the training set and the validation set end
train_end = train_samples
validation_end = train_samples + validation_samples

training_inputs, training_targets = shuffled_inputs[:train_end], shuffled_targets[:train_end]
validation_inputs, validation_targets = (
    shuffled_inputs[train_end:validation_end],
    shuffled_targets[train_end:validation_end],
)
test_inputs, test_targets = shuffled_inputs[validation_end:], shuffled_targets[validation_end:]

# Check that each of the three sets is itself roughly balanced: for every set print its size, its number of
# conversions and the resulting proportion of conversions (which should be close to 0.5)
for split_size, split_targets in (
    (train_samples, training_targets),
    (validation_samples, validation_targets),
    (test_samples, test_targets),
):
    split_conversions = np.sum(split_targets)
    print(split_size, split_conversions, split_conversions/split_size)

3579 1800.0 0.5029337803855826
447 224.0 0.5011185682326622
448 213.0 0.47544642857142855


### Save the data

In [74]:
# Store each split in its own .npz archive (np.savez appends the '.npz' extension to the given name) so that it
# can be loaded when building the tensorflow model. Every archive holds two arrays, keyed 'inputs' and 'targets'.
for split_file, split_inputs, split_labels in (
    ('audiobooks_training_data', training_inputs, training_targets),
    ('audiobooks_validation_data', validation_inputs, validation_targets),
    ('audiobooks_test_data', test_inputs, test_targets),
):
    np.savez(split_file, inputs=split_inputs, targets=split_labels)