# Audiobooks business case

## Libraries

In [40]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.model_selection import train_test_split

## Read data

In [3]:
# header=None: the first line of the file is read as data, and pandas labels
# the columns with integers instead of names.
raw_data = pd.read_csv(filepath_or_buffer='Audiobooks_data.csv', header=None)
# Preview the first five rows.
raw_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,873,2160.0,2160,10.13,10.13,0,8.91,0.0,0.0,0,0,1
1,611,1404.0,2808,6.66,13.33,1,6.5,0.0,0.0,0,182,1
2,705,324.0,324,10.13,10.13,1,9.0,0.0,0.0,1,334,1
3,391,1620.0,1620,15.31,15.31,0,9.0,0.0,0.0,0,183,1
4,819,432.0,1296,7.11,21.33,1,9.0,0.0,0.0,0,0,1


### Balance the dataset

In [22]:
# How many rows fall into each class of the last column (the targets).
raw_data[raw_data.columns[-1]].value_counts()

0    11847
1     2237
Name: 11, dtype: int64

In [26]:
# Keep only the rows whose last column (the target) equals 0.
zero_target = raw_data.loc[raw_data.iloc[:, -1].eq(0)]
zero_target

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
2237,508,1080.0,1080,10.13,10.13,1,9.00,0.85,0.0,1,81,0
2238,123,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,37,0
2239,558,648.0,648,10.13,10.13,1,8.00,0.65,0.0,0,134,0
2240,678,2160.0,2160,8.00,8.00,0,8.91,0.58,0.0,0,138,0
2241,604,1620.0,1620,8.96,8.96,0,8.91,0.09,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
14079,27398,2160.0,2160,7.99,7.99,0,8.91,0.00,0.0,0,54,0
14080,28220,1620.0,1620,5.33,5.33,1,9.00,0.61,0.0,0,4,0
14081,28671,1080.0,1080,6.55,6.55,1,6.00,0.29,0.0,0,29,0
14082,31134,2160.0,2160,6.14,6.14,0,8.91,0.00,0.0,0,0,0


In [27]:
# Keep only the rows whose last column (the target) equals 1.
one_target = raw_data.loc[raw_data.iloc[:, -1].eq(1)]
one_target

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,873,2160.0,2160,10.13,10.13,0,8.91,0.0,0.0,0,0,1
1,611,1404.0,2808,6.66,13.33,1,6.50,0.0,0.0,0,182,1
2,705,324.0,324,10.13,10.13,1,9.00,0.0,0.0,1,334,1
3,391,1620.0,1620,15.31,15.31,0,9.00,0.0,0.0,0,183,1
4,819,432.0,1296,7.11,21.33,1,9.00,0.0,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2232,5646,1620.0,1620,5.33,5.33,0,8.91,0.0,0.0,0,0,1
2233,8782,1476.0,4428,5.33,16.00,0,8.91,0.0,0.0,0,0,1
2234,15827,2160.0,2160,5.33,5.33,0,8.91,0.0,0.0,0,12,1
2235,20609,2160.0,2160,5.33,5.33,1,10.00,0.0,0.0,0,6,1


In [30]:
# Sanity check: slicing the target-0 rows to the size of the target-1 group
# should give exactly len(one_target) rows (derived, not hard-coded, so the
# check stays valid if the CSV changes).
len(zero_target.iloc[:len(one_target)])

2237

In [31]:
# Undersample the target-0 rows: keep as many of them (taken from the top of
# zero_target) as there are target-1 rows, so both classes end up the same size.
# The count is derived from one_target rather than hard-coded, so the dataset
# stays balanced if the input file changes.
zero_data = zero_target.iloc[:len(one_target)]

In [32]:
# Stack the undersampled target-0 rows on top of the target-1 rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat gives the same result (original index labels are kept).
all_data = pd.concat([zero_data, one_target])

In [33]:
# Confirm that both classes of the last column (the targets) are now equally represented.
all_data[all_data.columns[-1]].value_counts()

0    2237
1    2237
Name: 11, dtype: int64

### Select X and y

In [35]:
# Column 0 is an arbitrary ID and the final column holds the targets,
# so the inputs are every column in between.
X, y = all_data.iloc[:, 1:-1], all_data.iloc[:, -1]
(X.shape, y.shape)

((4474, 10), (4474,))

In [37]:
# Summary statistics, transposed so that each input column is shown as a row.
X.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,4474.0,1568.267156,497.491877,216.0,1188.0,1620.0,2160.0,2160.0
2,4474.0,1843.169423,870.538565,216.0,1188.0,1620.0,2160.0,7020.0
3,4474.0,7.060673,4.532291,3.86,5.33,5.99,8.0,104.0
4,4474.0,8.445823,6.333357,3.86,5.33,6.58,8.61,111.47
5,4474.0,0.167635,0.373584,0.0,0.0,0.0,0.0,1.0
6,4474.0,8.917789,0.692126,1.0,8.91,8.91,8.91,10.0
7,4474.0,0.074473,0.198747,0.0,0.0,0.0,0.0,1.0
8,4474.0,337.990031,391.459893,0.0,0.0,190.08,623.16,2116.8
9,4474.0,0.084265,0.41036,0.0,0.0,0.0,0.0,8.0
10,4474.0,72.560125,93.950475,0.0,0.0,21.0,135.0,464.0


### Scale data

In [38]:
# Learn per-column mean and standard deviation from X, then standardize X.
# NOTE(review): the scaler is fitted on every row of X, before any train/test
# split takes place, so rows that are later held out contribute to these
# statistics.
scale = StandardScaler().fit(X)
X_sc = scale.transform(X)

## Split data into train and test

In [41]:
# Hold out 20% of the rows for testing. The previous test_size=.80 left only
# 20% of the data for training, which inverts the usual train/test proportions.
# stratify=y keeps the balanced 0/1 ratio in both partitions; random_state
# makes the shuffle reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=.20, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and apply that same transform to the
# test rows, so no test-set statistics leak into preprocessing.
# `scale` is rebound here so that it is the scaler matching X_train / X_test.
scale = StandardScaler().fit(X_train_raw)
X_train = scale.transform(X_train_raw)
X_test = scale.transform(X_test_raw)