## Audiobooks Example Preprocessing

## Import relevant libraries

In [1]:
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import statsmodels.api
import seaborn as sns
import pandas as pd
sns.set()

## Extract data from csv

In [2]:
raw_data = pd.read_csv('Train.csv')
raw_data

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed
5,Kenya,2018,uniqueid_6,No,Rural,No,7,26,Female,Spouse,Married/Living together,Primary education,Informally employed
6,Kenya,2018,uniqueid_7,No,Rural,Yes,7,32,Female,Spouse,Married/Living together,Primary education,Self employed
7,Kenya,2018,uniqueid_8,No,Rural,Yes,1,42,Female,Head of Household,Married/Living together,Tertiary education,Formally employed Government
8,Kenya,2018,uniqueid_9,Yes,Rural,Yes,3,54,Male,Head of Household,Married/Living together,Secondary education,Farming and Fishing
9,Kenya,2018,uniqueid_10,No,Urban,Yes,3,76,Female,Head of Household,Divorced/Seperated,No formal education,Remittance Dependent


## Preprocessing

In [3]:
raw_data.describe(include='all')

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
count,23524,23524.0,23524,23524,23524,23524,23524.0,23524.0,23524,23524,23524,23524,23524
unique,4,,8735,2,2,2,,,2,6,5,6,10
top,Rwanda,,uniqueid_1088,No,Rural,Yes,,,Female,Head of Household,Married/Living together,Primary education,Self employed
freq,8735,,4,20212,14343,17454,,,13877,12831,10749,12791,6437
mean,,2016.975939,,,,,3.797483,38.80522,,,,,
std,,0.847371,,,,,2.227613,16.520569,,,,,
min,,2016.0,,,,,1.0,16.0,,,,,
25%,,2016.0,,,,,2.0,26.0,,,,,
50%,,2017.0,,,,,3.0,35.0,,,,,
75%,,2018.0,,,,,5.0,49.0,,,,,


## Determining the variables of interest

In [4]:
data = raw_data.drop(['uniqueid','year'],axis=1)
data.describe(include='all')

Unnamed: 0,country,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
count,23524,23524,23524,23524,23524.0,23524.0,23524,23524,23524,23524,23524
unique,4,2,2,2,,,2,6,5,6,10
top,Rwanda,No,Rural,Yes,,,Female,Head of Household,Married/Living together,Primary education,Self employed
freq,8735,20212,14343,17454,,,13877,12831,10749,12791,6437
mean,,,,,3.797483,38.80522,,,,,
std,,,,,2.227613,16.520569,,,,,
min,,,,,1.0,16.0,,,,,
25%,,,,,2.0,26.0,,,,,
50%,,,,,3.0,35.0,,,,,
75%,,,,,5.0,49.0,,,,,


## Dealing with missing values

In [5]:
data.isnull().sum()

country                   0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

## Dealing with categorical values

In [6]:
data['bank_account'] = data['bank_account'].map({'Yes':1,'No':0})
data['gender_of_respondent'] = data['gender_of_respondent'].map({'Male':1,'Female':0})
data['cellphone_access'] = data['cellphone_access'].map({'Yes':1,'No':0})

In [7]:
data.describe(include='all')

Unnamed: 0,country,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
count,23524,23524.0,23524,23524.0,23524.0,23524.0,23524.0,23524,23524,23524,23524
unique,4,,2,,,,,6,5,6,10
top,Rwanda,,Rural,,,,,Head of Household,Married/Living together,Primary education,Self employed
freq,8735,,14343,,,,,12831,10749,12791,6437
mean,,0.140792,,0.741966,3.797483,38.80522,0.410092,,,,
std,,0.347815,,0.437562,2.227613,16.520569,0.491861,,,,
min,,0.0,,0.0,1.0,16.0,0.0,,,,
25%,,0.0,,0.0,2.0,26.0,0.0,,,,
50%,,0.0,,1.0,3.0,35.0,0.0,,,,
75%,,0.0,,1.0,5.0,49.0,1.0,,,,


In [8]:
data_with_dummies = pd.get_dummies(data,drop_first=True)
data_with_dummies.describe()

Unnamed: 0,bank_account,cellphone_access,household_size,age_of_respondent,gender_of_respondent,country_Rwanda,country_Tanzania,country_Uganda,location_type_Urban,relationship_with_head_Head of Household,...,education_level_Vocational/Specialised training,job_type_Farming and Fishing,job_type_Formally employed Government,job_type_Formally employed Private,job_type_Government Dependent,job_type_Informally employed,job_type_No Income,job_type_Other Income,job_type_Remittance Dependent,job_type_Self employed
count,23524.0,23524.0,23524.0,23524.0,23524.0,23524.0,23524.0,23524.0,23524.0,23524.0,...,23524.0,23524.0,23524.0,23524.0,23524.0,23524.0,23524.0,23524.0,23524.0,23524.0
mean,0.140792,0.741966,3.797483,38.80522,0.410092,0.371323,0.281415,0.089313,0.390282,0.545443,...,0.034135,0.231296,0.016451,0.044848,0.0105,0.237927,0.026654,0.045911,0.107422,0.273635
std,0.347815,0.437562,2.227613,16.520569,0.491861,0.483169,0.449699,0.285201,0.487824,0.497941,...,0.181581,0.42167,0.127206,0.206974,0.101932,0.425823,0.161072,0.209296,0.309656,0.445834
min,0.0,0.0,1.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,2.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,3.0,35.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,5.0,49.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,21.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
unscaled_inputs_all = np.array(data_with_dummies.drop(['bank_account'],axis=1))
targets_all = np.array(data['bank_account'])

In [10]:
unscaled_inputs_all.shape

(23524, 31)

In [11]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
variables = data_with_dummies.drop(['bank_account'],axis=1)
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(variables.values,i) for i in range(variables.shape[1])]
vif['features'] = variables.columns

In [12]:
vif

Unnamed: 0,VIF,features
0,4.587878,cellphone_access
1,5.594623,household_size
2,11.476934,age_of_respondent
3,2.973198,gender_of_respondent
4,2.79821,country_Rwanda
5,3.715654,country_Tanzania
6,1.885956,country_Uganda
7,2.356462,location_type_Urban
8,10.826751,relationship_with_head_Head of Household
9,1.107949,relationship_with_head_Other non-relatives


## Balance the data set

In [13]:
"""nubmer_of_one_target = int(np.sum(targets_all))
zero_targets_counter = 0
indices_to_remove = []

for i in range(targets_all.shape[0]):
    if targets_all[i] ==0:
        zero_targets_counter += 1
        if zero_targets_counter > nubmer_of_one_target:
            indices_to_remove.append(i)
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove,axis = 0)
targets_equal_priors = np.delete(targets_all, indices_to_remove,axis = 0)"""


'nubmer_of_one_target = int(np.sum(targets_all))\nzero_targets_counter = 0\nindices_to_remove = []\n\nfor i in range(targets_all.shape[0]):\n    if targets_all[i] ==0:\n        zero_targets_counter += 1\n        if zero_targets_counter > nubmer_of_one_target:\n            indices_to_remove.append(i)\nunscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove,axis = 0)\ntargets_equal_priors = np.delete(targets_all, indices_to_remove,axis = 0)'

In [14]:
unscaled_inputs_equal_priors = unscaled_inputs_all
targets_equal_priors = targets_all

In [15]:
unscaled_inputs_equal_priors.shape

(23524, 31)

## Standardize the inputs

In [16]:
scaled_inputs = preprocessing.scale(unscaled_inputs_all)

## Shuffle the data

In [17]:
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_all[shuffled_indices]

## Split the dataset into train, validation and test

In [18]:
samples_count = shuffled_inputs.shape[0]

train_samples_count = int(0.8*samples_count)
validation_samples_count = int(0.1*samples_count)
test_samples_count = samples_count - validation_samples_count - train_samples_count

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

validation_inputs = shuffled_inputs[train_samples_count:validation_samples_count+train_samples_count]
validation_targets = shuffled_targets[train_samples_count:validation_samples_count+train_samples_count]

test_inputs = shuffled_inputs[validation_samples_count+train_samples_count:]
test_targets = shuffled_targets[validation_samples_count+train_samples_count:]


## Save the three datasets in *.npz

In [19]:
np.savez('AI_HACK_data_train',inputs=train_inputs,targets=train_targets)
np.savez('AI_HACK_data_validation',inputs=validation_inputs,targets=validation_targets)
np.savez('AI_HACK_data_test',inputs=test_inputs,targets=test_targets)