# Audiobooks business case

## Preprocess, balance, split and save data for use in tensorflow

### Get the data from the csv, using pandas DataFrames for all further steps here

In [18]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler

# Read the raw table. The file has no header row, so pandas assigns integer column labels.
raw_csv_data = pd.read_csv('Audiobooks_data.csv', header=None)

# The last column holds the targets. The first column is not used as a feature
# (presumably a customer ID -- TODO confirm), so both are excluded from the inputs.
non_feature_columns = raw_csv_data.columns[[0, -1]]
unscaled_inputs_all = raw_csv_data.drop(columns=non_feature_columns)
targets_all = raw_csv_data[raw_csv_data.columns[-1]]

### (Pre)Shuffle the data
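Optional. In this case, pre-shuffling lowers the final accuracy by ~10%, because the data seems to be arranged in a way that offers additional patterns (most likely the way non-buying customers are sorted). Note that the balancing step below keeps only the first 0s it encounters, so the row order determines which non-buying customers remain in the dataset.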

In [19]:
#idx = np.random.permutation(unscaled_inputs_all.index)

#unscaled_inputs_all = unscaled_inputs_all.reindex(idx).reset_index(drop=True)
#targets_all = targets_all.reindex(idx).reset_index(drop=True)

### Balance the dataset
The dataset has far more 0s than 1s, so we keep all 1s and drop the surplus 0s (every 0 encountered after as many 0s as there are 1s have been kept) until both classes have the same number of samples, i.e. a 50/50 balance.

In [20]:
# Number of 1s in the dataset (the targets contain only 0s and 1s, so the sum is the count).
num_one_targets = int(np.sum(targets_all))

# Work on a plain boolean array so that everything below is positional and does not
# depend on the index labels of `targets_all`. (A label lookup such as `targets_all[i]`
# only matches row position i while the index is exactly 0..n-1.)
is_zero_target = targets_all.to_numpy() == 0

# Running count of 0s seen so far, row by row. A 0 is surplus as soon as more 0s
# than there are 1s have been seen, i.e. only the first `num_one_targets` 0s are kept.
zero_targets_seen = np.cumsum(is_zero_target)
is_surplus_zero = is_zero_target & (zero_targets_seen > num_one_targets)

# Same names and final values as the loop-based version, in case later cells inspect them:
# the total number of 0s, and the row positions marked for removal.
zero_targets_counter = int(is_zero_target.sum())
indices_to_remove = np.flatnonzero(is_surplus_zero).tolist()

# Keep every row that is not a surplus 0. The original index labels are preserved.
unscaled_inputs_equal_priors = unscaled_inputs_all.iloc[~is_surplus_zero]
targets_equal_priors = targets_all.iloc[~is_surplus_zero]

### Shuffle the equal-priors-data

In [21]:
# Fixed seed so that Restart & Run All always produces the same shuffle, and therefore
# the same train/validation/test subsets. A local generator is used so that NumPy's
# global random state is left untouched.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# Randomly ordered copy of the index labels that survived balancing.
idx = shuffle_rng.permutation(unscaled_inputs_equal_priors.index.to_numpy())

# Apply the same random order to inputs and targets so the rows stay aligned, then
# renumber the rows 0..n-1. Re-running only this cell shuffles the already shuffled
# data again; run the notebook from the top for the reproducible order.
unscaled_inputs_equal_priors = unscaled_inputs_equal_priors.reindex(idx).reset_index(drop=True)
targets_equal_priors = targets_equal_priors.reindex(idx).reset_index(drop=True)

### Standardize the inputs

In [22]:
# NOTE(review): the scaler is fitted on the whole balanced dataset, i.e. before the
# train/validation/test split, so its statistics also include validation and test rows.
# Consider fitting on the training rows only -- confirm whether this is intended.
audiobook_scaler = StandardScaler().fit(unscaled_inputs_equal_priors)

# transform() returns an nd-array; wrap it in a DataFrame again for the next steps
# (the original column labels are replaced by default integer labels).
scaled_inputs = pd.DataFrame(audiobook_scaler.transform(unscaled_inputs_equal_priors))

### Split the dataset into train, validation, and test

In [23]:
# Total number of rows available for splitting.
samples_count = scaled_inputs.shape[0]

# Subset sizes: 80% train, 10% validation, and whatever remains goes to test.
train_samples_count = int(0.8*samples_count)
validation_samples_count = int(0.1*samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Row position at which the validation subset ends and the test subset begins.
validation_end = train_samples_count + validation_samples_count

# Positional slices: [0, train) -> train, [train, validation_end) -> validation, rest -> test.
train_inputs = scaled_inputs.iloc[:train_samples_count]
train_targets = targets_equal_priors.iloc[:train_samples_count]

validation_inputs = scaled_inputs.iloc[train_samples_count:validation_end]
validation_targets = targets_equal_priors.iloc[train_samples_count:validation_end]

test_inputs = scaled_inputs.iloc[validation_end:]
test_targets = targets_equal_priors.iloc[validation_end:]

# Per subset, print: number of 1s, number of samples, and the share of 1s.
# With two balanced classes the share should be roughly 0.5 in every subset.
for subset_targets, subset_count in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    ones_in_subset = np.sum(subset_targets)
    print(ones_in_subset, subset_count, ones_in_subset / subset_count)

1773 3579 0.4953897736797988
234 447 0.5234899328859061
230 448 0.5133928571428571


### Save the three datasets in *.npz
Save the three subsets as `.npz` archives, each containing two named arrays: `inputs` for the features and `targets` for the labels.

In [24]:
# Write each subset to its own archive with two named arrays, 'inputs' and 'targets'.
# np.savez appends the '.npz' extension to the given file names.
for archive_name, subset_inputs, subset_targets in (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
):
    np.savez(archive_name, inputs=subset_inputs, targets=subset_targets)

### Save the Scaler-Object

In [25]:
# Persist the fitted scaler object to disk in binary mode.
# Presumably this is so new data can later be standardized with the same
# statistics -- confirm against the consumer of this file.
with open('scaler.pickle', 'wb') as scaler_file:
    pickle.dump(audiobook_scaler, scaler_file)