# Preprocess

This notebook prepares the dataset for modeling.

In [1]:
# Import modules
import numpy as np
import pandas as pd
import warnings

In [2]:
# Configurations
warnings.filterwarnings('ignore')

In [3]:
# Global variables
# Directory (relative to the kernel's working directory) that holds the
# processed CSV files; it is the base path for both reads and for the final
# write in this notebook.
DATASETS_PATH = '../datasets/processed'

In [4]:
# Load population
# Read the per-individual table from the processed datasets folder and
# preview its first rows.
population_csv = f'{DATASETS_PATH}/individuos_espec_eda.csv'
population = pd.read_csv(population_csv)
population.head()

Unnamed: 0,name,age,marital_status,n_children,study,work,play_sports,transportation,body_mass_index
0,1,44.0,divorced,1.0,1.0,0.0,1.0,public,222009600000000.0
1,2,24.0,married,0.0,0.0,0.0,1.0,public,253787200000000.0
2,3,35.0,single,1.0,0.0,0.0,1.0,private,199523900000000.0
3,4,50.0,married,1.0,1.0,1.0,0.0,public,267320500000000.0
4,5,30.0,single,2.0,1.0,0.0,1.0,public,152956700000000.0


In [5]:
# Load connections
# Read the infecting -> infected edge table from the processed datasets
# folder and preview its first rows.
connections_csv = f'{DATASETS_PATH}/conexoes_espec_eda.csv'
connections = pd.read_csv(connections_csv)
connections.head()

Unnamed: 0,infecting,infected,degree,proximity,transmission_rate
0,1,2,work,frequent_visit,0.589462
1,1,3,work,rare_visit,0.708465
2,2,4,work,casual_visit,
3,2,5,work,rare_visit,0.638842
4,3,6,friends,live_together,


In [6]:
# Merge datasets
# Attach the attributes of both endpoints of every connection. The first
# merge adds the infecting individual's columns; the second adds the infected
# individual's, and the suffixes disambiguate the two copies of each
# population column.
# Both merges are inner joins (the pandas default), so a connection whose
# endpoint is absent from `population` is not kept.
# validate='many_to_one' makes pandas raise a MergeError if a `name` appears
# more than once in `population`, instead of silently multiplying the
# connection rows.
dataframe = pd.merge(
    connections,
    population,
    left_on='infecting',
    right_on='name',
    validate='many_to_one',
)
dataframe = pd.merge(
    dataframe,
    population,
    left_on='infected',
    right_on='name',
    suffixes=('_infecting', '_infected'),
    validate='many_to_one',
)
dataframe.head()

Unnamed: 0,infecting,infected,degree,proximity,transmission_rate,name_infecting,age_infecting,marital_status_infecting,n_children_infecting,study_infecting,...,body_mass_index_infecting,name_infected,age_infected,marital_status_infected,n_children_infected,study_infected,work_infected,play_sports_infected,transportation_infected,body_mass_index_infected
0,1,2,work,frequent_visit,0.589462,1,44.0,divorced,1.0,1.0,...,222009600000000.0,2,24.0,married,0.0,0.0,0.0,1.0,public,253787200000000.0
1,1,3,work,rare_visit,0.708465,1,44.0,divorced,1.0,1.0,...,222009600000000.0,3,35.0,single,1.0,0.0,0.0,1.0,private,199523900000000.0
2,2,4,work,casual_visit,,2,24.0,married,0.0,0.0,...,253787200000000.0,4,50.0,married,1.0,1.0,1.0,0.0,public,267320500000000.0
3,2,5,work,rare_visit,0.638842,2,24.0,married,0.0,0.0,...,253787200000000.0,5,30.0,single,2.0,1.0,0.0,1.0,public,152956700000000.0
4,3,6,friends,live_together,,3,35.0,single,1.0,0.0,...,199523900000000.0,6,20.0,,1.0,0.0,1.0,0.0,public,204129400000000.0


In [7]:
# Drop IDs
# The merges matched `infecting` / `infected` against `name`, so the two
# suffixed name columns duplicate identifiers that are already present.
id_columns = ['name_infecting', 'name_infected']
dataframe_dropped = dataframe.drop(columns=id_columns)
dataframe_dropped.head()

Unnamed: 0,infecting,infected,degree,proximity,transmission_rate,age_infecting,marital_status_infecting,n_children_infecting,study_infecting,work_infecting,...,transportation_infecting,body_mass_index_infecting,age_infected,marital_status_infected,n_children_infected,study_infected,work_infected,play_sports_infected,transportation_infected,body_mass_index_infected
0,1,2,work,frequent_visit,0.589462,44.0,divorced,1.0,1.0,0.0,...,public,222009600000000.0,24.0,married,0.0,0.0,0.0,1.0,public,253787200000000.0
1,1,3,work,rare_visit,0.708465,44.0,divorced,1.0,1.0,0.0,...,public,222009600000000.0,35.0,single,1.0,0.0,0.0,1.0,private,199523900000000.0
2,2,4,work,casual_visit,,24.0,married,0.0,0.0,0.0,...,public,253787200000000.0,50.0,married,1.0,1.0,1.0,0.0,public,267320500000000.0
3,2,5,work,rare_visit,0.638842,24.0,married,0.0,0.0,0.0,...,public,253787200000000.0,30.0,single,2.0,1.0,0.0,1.0,public,152956700000000.0
4,3,6,friends,live_together,,35.0,single,1.0,0.0,0.0,...,private,199523900000000.0,20.0,,1.0,0.0,1.0,0.0,public,204129400000000.0


In [8]:
# Add column to remember missing values
# `had_nan` is True for every row with at least one missing value in any
# column other than `transmission_rate`, which is left out of the check.
dataframe_had_nan = dataframe_dropped.copy()
checked_columns = dataframe_had_nan.columns.drop('transmission_rate')
dataframe_had_nan['had_nan'] = dataframe_had_nan[checked_columns].isna().any(axis=1)
dataframe_had_nan.head()

Unnamed: 0,infecting,infected,degree,proximity,transmission_rate,age_infecting,marital_status_infecting,n_children_infecting,study_infecting,work_infecting,...,body_mass_index_infecting,age_infected,marital_status_infected,n_children_infected,study_infected,work_infected,play_sports_infected,transportation_infected,body_mass_index_infected,had_nan
0,1,2,work,frequent_visit,0.589462,44.0,divorced,1.0,1.0,0.0,...,222009600000000.0,24.0,married,0.0,0.0,0.0,1.0,public,253787200000000.0,False
1,1,3,work,rare_visit,0.708465,44.0,divorced,1.0,1.0,0.0,...,222009600000000.0,35.0,single,1.0,0.0,0.0,1.0,private,199523900000000.0,False
2,2,4,work,casual_visit,,24.0,married,0.0,0.0,0.0,...,253787200000000.0,50.0,married,1.0,1.0,1.0,0.0,public,267320500000000.0,False
3,2,5,work,rare_visit,0.638842,24.0,married,0.0,0.0,0.0,...,253787200000000.0,30.0,single,2.0,1.0,0.0,1.0,public,152956700000000.0,False
4,3,6,friends,live_together,,35.0,single,1.0,0.0,0.0,...,199523900000000.0,20.0,,1.0,0.0,1.0,0.0,public,204129400000000.0,True


In [9]:
# Replace NaN values on the submission samples
# The fill is applied to every row of the frame. `transmission_rate` is in
# neither list below, so its missing values are left untouched.

# Columns filled with their most frequent value.
MODE_FEATURES = [
    'degree',
    'proximity',
    'n_children_infecting',
    'study_infecting',
    'work_infecting',
    'play_sports_infecting',
    'marital_status_infecting',
    'transportation_infecting',
    'n_children_infected',
    'study_infected',
    'work_infected',
    'play_sports_infected',
    'marital_status_infected',
    'transportation_infected'
]
# Columns filled with their arithmetic mean.
MEAN_FEATURES = [
    'age_infecting',
    'body_mass_index_infecting',
    'age_infected',
    'body_mass_index_infected',
]

# mode() can return several rows when there are ties; the first row is used.
mode_fill = dataframe_had_nan[MODE_FEATURES].mode().iloc[0]
mean_fill = dataframe_had_nan[MEAN_FEATURES].mean(axis=0)

dataframe_replaced = dataframe_had_nan.fillna(mode_fill).fillna(mean_fill)

# Remaining missing values per column.
dataframe_replaced.isna().sum()

infecting                         0
infected                          0
degree                            0
proximity                         0
transmission_rate            500000
age_infecting                     0
marital_status_infecting          0
n_children_infecting              0
study_infecting                   0
work_infecting                    0
play_sports_infecting             0
transportation_infecting          0
body_mass_index_infecting         0
age_infected                      0
marital_status_infected           0
n_children_infected               0
study_infected                    0
work_infected                     0
play_sports_infected              0
transportation_infected           0
body_mass_index_infected          0
had_nan                           0
dtype: int64

In [10]:
# Categorical to numerical features
# `proximity` becomes an integer code in the order
# rare_visit < casual_visit < frequent_visit < live_together.
PROXIMITY_CODES = {
    'rare_visit': 0,
    'casual_visit': 1,
    'frequent_visit': 2,
    'live_together': 3,
}
# Nominal columns that are expanded into one indicator column per category.
ONE_HOT_FEATURES = [
    'degree',
    'marital_status_infecting',
    'transportation_infecting',
    'marital_status_infected',
    'transportation_infected',
]

dataframe_numerical = dataframe_replaced.copy()

# Series.replace used to downcast the object column to integers implicitly;
# that downcast is deprecated in recent pandas, so the conversion is made
# explicit with infer_objects(). Values missing from the mapping are still
# passed through unchanged, exactly as before.
dataframe_numerical['proximity'] = (
    dataframe_numerical['proximity']
    .replace(PROXIMITY_CODES)
    .infer_objects()
)

# A single get_dummies call removes each listed column and appends its
# indicator columns (prefixed with the column name) in the listed order.
dataframe_numerical = pd.get_dummies(dataframe_numerical, columns=ONE_HOT_FEATURES)

dataframe_numerical.head()

Unnamed: 0,infecting,infected,proximity,transmission_rate,age_infecting,n_children_infecting,study_infecting,work_infecting,play_sports_infecting,body_mass_index_infecting,...,transportation_infecting_private,transportation_infecting_public,transportation_infecting_taxi,marital_status_infected_divorced,marital_status_infected_married,marital_status_infected_single,marital_status_infected_widow,transportation_infected_private,transportation_infected_public,transportation_infected_taxi
0,1,2,2,0.589462,44.0,1.0,1.0,0.0,1.0,222009600000000.0,...,0,1,0,0,1,0,0,0,1,0
1,1,3,0,0.708465,44.0,1.0,1.0,0.0,1.0,222009600000000.0,...,0,1,0,0,0,1,0,1,0,0
2,2,4,1,,24.0,0.0,0.0,0.0,1.0,253787200000000.0,...,0,1,0,0,1,0,0,0,1,0
3,2,5,0,0.638842,24.0,0.0,0.0,0.0,1.0,253787200000000.0,...,0,1,0,0,0,1,0,0,1,0
4,3,6,3,,35.0,1.0,0.0,0.0,1.0,199523900000000.0,...,1,0,0,0,0,1,0,0,1,0


In [11]:
# Removing skewness
# Apply log1p, i.e. log(1 + x), to the age and body-mass-index columns of
# both individuals; every other column is carried over unchanged.
skewed_features = [
    'age_infecting',
    'body_mass_index_infecting',
    'age_infected',
    'body_mass_index_infected',
]
dataframe_skew = dataframe_numerical.assign(**{
    name: np.log1p(dataframe_numerical[name]) for name in skewed_features
})

In [12]:
# Combining body mass index and age
# health_<role> = body_mass_index_<role> / age_<role>, computed once for each
# side of the connection.
# NOTE(review): both inputs come from `dataframe_skew`, where they are already
# log1p-transformed, so the ratio and the ">= 1" floor below act on log
# values - confirm this is intended.
dataframe_eng = dataframe_skew.copy()

health_inputs = (
    ('age_infecting', 'body_mass_index_infecting', 'health_infecting'),
    ('age_infected', 'body_mass_index_infected', 'health_infected'),
)
for age_name, bmi_name, health_name in health_inputs:
    age_values = dataframe_eng[age_name]
    # Any age that is not >= 1 is replaced by 1 before it is used as divisor.
    divisor = age_values.where(age_values >= 1, 1)
    dataframe_eng[health_name] = dataframe_eng[bmi_name] / divisor

In [13]:
# Save new dataset
# Write the engineered frame into the processed datasets folder, without the
# index column.
output_csv = f'{DATASETS_PATH}/dataframe_preprocess.csv'
dataframe_eng.to_csv(output_csv, index=False)

**Observations:**

- datasets merged
- samples with missing values dropped, except for the submission samples
- missing values of the submission samples filled with mean and mode values
- categorical features replaced by new numerical features
- skewness removed
- created health index by combining *body_mass_index* and *age*