## Survey Test Data Preprocessing

## Import relevant libraries

In [1]:
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import statsmodels.api
import seaborn as sns
import pandas as pd
# Apply seaborn's default theme to subsequent matplotlib/seaborn plots.
sns.set()

## Extract data from csv

In [2]:
# Load the test set; the path is resolved relative to the working directory.
raw_data = pd.read_csv('Test.csv')
# Preview only the first rows rather than rendering the entire frame.
raw_data.head(10)

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_6056,Urban,Yes,3,30,Male,Head of Household,Married/Living together,Secondary education,Formally employed Government
1,Kenya,2018,uniqueid_6060,Urban,Yes,7,51,Male,Head of Household,Married/Living together,Vocational/Specialised training,Formally employed Private
2,Kenya,2018,uniqueid_6065,Rural,No,3,77,Female,Parent,Married/Living together,No formal education,Remittance Dependent
3,Kenya,2018,uniqueid_6072,Rural,No,6,39,Female,Head of Household,Married/Living together,Primary education,Remittance Dependent
4,Kenya,2018,uniqueid_6073,Urban,No,3,16,Male,Child,Single/Never Married,Secondary education,Remittance Dependent
5,Kenya,2018,uniqueid_6074,Rural,Yes,4,25,Female,Spouse,Married/Living together,Primary education,Self employed
6,Kenya,2018,uniqueid_6075,Rural,Yes,4,32,Female,Head of Household,Divorced/Seperated,Primary education,Informally employed
7,Kenya,2018,uniqueid_6076,Urban,Yes,3,24,Female,Spouse,Married/Living together,Vocational/Specialised training,Self employed
8,Kenya,2018,uniqueid_6077,Rural,Yes,4,22,Female,Child,Single/Never Married,Primary education,Informally employed
9,Kenya,2018,uniqueid_6078,Urban,Yes,1,42,Male,Head of Household,Divorced/Seperated,Secondary education,Farming and Fishing


## Preprocessing

In [3]:
# Summarise every column: counts/uniques/top for text columns,
# mean/std/quantiles for numeric ones.
raw_data.describe(include="all")

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
count,10086,10086.0,10086,10086,10086,10086.0,10086.0,10086,10086,10086,10086,10086
unique,4,,7313,2,2,,,2,6,5,6,10
top,Rwanda,,uniqueid_8535,Rural,Yes,,,Female,Head of Household,Married/Living together,Primary education,Self employed
freq,3745,,3,6189,7559,,,5847,5487,4663,5479,2773
mean,,2016.975907,,,,3.778802,38.308348,,,,,
std,,0.847356,,,,2.212721,16.270053,,,,,
min,,2016.0,,,,1.0,16.0,,,,,
25%,,2016.0,,,,2.0,26.0,,,,,
50%,,2017.0,,,,3.0,35.0,,,,,
75%,,2018.0,,,,5.0,48.0,,,,,


## Determining the variables of interest

In [4]:
# Keep only the variables of interest: remove the 'uniqueid' and 'year'
# columns, leaving raw_data itself untouched.
data = raw_data.drop(columns=['uniqueid', 'year'])
data.describe(include="all")

Unnamed: 0,country,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
count,10086,10086,10086,10086.0,10086.0,10086,10086,10086,10086,10086
unique,4,2,2,,,2,6,5,6,10
top,Rwanda,Rural,Yes,,,Female,Head of Household,Married/Living together,Primary education,Self employed
freq,3745,6189,7559,,,5847,5487,4663,5479,2773
mean,,,,3.778802,38.308348,,,,,
std,,,,2.212721,16.270053,,,,,
min,,,,1.0,16.0,,,,,
25%,,,,2.0,26.0,,,,,
50%,,,,3.0,35.0,,,,,
75%,,,,5.0,48.0,,,,,


## Dealing with missing values

In [5]:
# Number of missing entries in each column.
data.isna().sum()

country                   0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dtype: int64

## Dealing with categorical values

In [6]:
# Encode the two binary text columns as 0/1.
# The mapping reads from raw_data (never modified after loading) rather than
# from data itself, so this cell can be re-run safely: mapping data's own,
# already-encoded columns a second time would turn every value into NaN.
data['gender_of_respondent'] = raw_data['gender_of_respondent'].map({'Male': 1, 'Female': 0})
data['cellphone_access'] = raw_data['cellphone_access'].map({'Yes': 1, 'No': 0})

# .map() silently yields NaN for any label missing from the dictionaries;
# fail loudly here instead of propagating NaNs into the saved inputs.
assert data[['gender_of_respondent', 'cellphone_access']].notna().all().all(), \
    'unexpected label in gender_of_respondent or cellphone_access'

In [7]:
# Re-check the summary now that the binary columns are numeric.
data.describe(include="all")

Unnamed: 0,country,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
count,10086,10086,10086.0,10086.0,10086.0,10086.0,10086,10086,10086,10086
unique,4,2,,,,,6,5,6,10
top,Rwanda,Rural,,,,,Head of Household,Married/Living together,Primary education,Self employed
freq,3745,6189,,,,,5487,4663,5479,2773
mean,,,0.749455,3.778802,38.308348,0.420286,,,,
std,,,0.433349,2.212721,16.270053,0.493629,,,,
min,,,0.0,1.0,16.0,0.0,,,,
25%,,,0.0,2.0,26.0,0.0,,,,
50%,,,1.0,3.0,35.0,0.0,,,,
75%,,,1.0,5.0,48.0,1.0,,,,


In [8]:
# One-hot encode the remaining text columns, dropping the first level of each
# to avoid perfectly collinear dummies.
# dtype is pinned to uint8 (the pre-2.0 pandas default): pandas >= 2.0 creates
# boolean dummies by default, which .describe() below would leave out.
# NOTE(review): the dummy columns depend on the categories present in this
# file — confirm they line up with the columns produced for the training data.
data_with_dummies = pd.get_dummies(data, drop_first=True, dtype=np.uint8)
data_with_dummies.describe()

Unnamed: 0,cellphone_access,household_size,age_of_respondent,gender_of_respondent,country_Rwanda,country_Tanzania,country_Uganda,location_type_Urban,relationship_with_head_Head of Household,relationship_with_head_Other non-relatives,...,education_level_Vocational/Specialised training,job_type_Farming and Fishing,job_type_Formally employed Government,job_type_Formally employed Private,job_type_Government Dependent,job_type_Informally employed,job_type_No Income,job_type_Other Income,job_type_Remittance Dependent,job_type_Self employed
count,10086.0,10086.0,10086.0,10086.0,10086.0,10086.0,10086.0,10086.0,10086.0,10086.0,...,10086.0,10086.0,10086.0,10086.0,10086.0,10086.0,10086.0,10086.0,10086.0,10086.0
mean,0.749455,3.778802,38.308348,0.420286,0.371307,0.281479,0.089332,0.386377,0.544021,0.006841,...,0.035594,0.234979,0.016458,0.044021,0.009915,0.235177,0.027365,0.041939,0.109756,0.274936
std,0.433349,2.212721,16.270053,0.493629,0.483178,0.449743,0.285236,0.486943,0.498083,0.082432,...,0.185285,0.424007,0.127237,0.205153,0.099083,0.424131,0.163152,0.20046,0.312601,0.446504
min,0.0,1.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,3.0,35.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,5.0,48.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,20.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Work on an independent (deep) copy so data_with_dummies stays unchanged.
unscaled_inputs_all = data_with_dummies.copy(deep=True)

In [10]:
# Show (n_rows, n_columns) of the encoded frame before it is standardized.
unscaled_inputs_all.shape

(10086, 31)

## Standardize the inputs

In [11]:
# Standardize each column (axis=0), using statistics computed from this
# frame itself.
# NOTE(review): if a model was trained on inputs standardized with the
# training set's statistics, the same statistics should presumably be reused
# here — confirm against the training notebook.
scaled_inputs = preprocessing.scale(unscaled_inputs_all, axis=0)

## Save the scaled test inputs in *.npz

In [12]:
# Persist the standardized inputs under the key 'inputs'
# (np.savez appends the .npz extension to the file name).
np.savez(
    'AI_HACK_data_hack_test',
    inputs=scaled_inputs,
)

In [13]:
# Start from a fresh deep copy of the raw frame for building the row ids.
data_formated = raw_data.copy(deep=True)

In [14]:
# Preview the first five rows before the ids are rewritten.
data_formated.head(n=5)

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_6056,Urban,Yes,3,30,Male,Head of Household,Married/Living together,Secondary education,Formally employed Government
1,Kenya,2018,uniqueid_6060,Urban,Yes,7,51,Male,Head of Household,Married/Living together,Vocational/Specialised training,Formally employed Private
2,Kenya,2018,uniqueid_6065,Rural,No,3,77,Female,Parent,Married/Living together,No formal education,Remittance Dependent
3,Kenya,2018,uniqueid_6072,Rural,No,6,39,Female,Head of Household,Married/Living together,Primary education,Remittance Dependent
4,Kenya,2018,uniqueid_6073,Urban,No,3,16,Male,Child,Single/Never Married,Secondary education,Remittance Dependent


In [15]:
# Build the "<uniqueid> x <country>" identifier.
# Both parts are read from raw_data rather than from data_formated, so
# re-running this cell does not keep appending " x <country>" to ids that
# were already rewritten.
data_formated['uniqueid'] = raw_data['uniqueid'] + ' x ' + raw_data['country']

In [16]:
# Preview the first five rows to check the rewritten uniqueid column.
data_formated.head(n=5)

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_6056 x Kenya,Urban,Yes,3,30,Male,Head of Household,Married/Living together,Secondary education,Formally employed Government
1,Kenya,2018,uniqueid_6060 x Kenya,Urban,Yes,7,51,Male,Head of Household,Married/Living together,Vocational/Specialised training,Formally employed Private
2,Kenya,2018,uniqueid_6065 x Kenya,Rural,No,3,77,Female,Parent,Married/Living together,No formal education,Remittance Dependent
3,Kenya,2018,uniqueid_6072 x Kenya,Rural,No,6,39,Female,Head of Household,Married/Living together,Primary education,Remittance Dependent
4,Kenya,2018,uniqueid_6073 x Kenya,Urban,No,3,16,Male,Child,Single/Never Married,Secondary education,Remittance Dependent


In [17]:
# Persist the ids under the key 'uniquid' (spelling kept as-is, since readers
# of the file look the array up by this key).
# The Series is converted to a fixed-width unicode array first: saving it
# directly stores a pickled object array, which np.load refuses to read
# unless allow_pickle=True is passed.
np.savez(
    'AI_HACK_data_hack_test_uniquid',
    uniquid=data_formated['uniqueid'].to_numpy(dtype=str),
)