### This notebook contains the code to generate the preprocessed data for the "DengAI: Predicting Disease Spread" competition.

We are given the training data dengue_features_train.csv and dengue_labels_train.csv, and the test data dengue_features_test.csv.

This notebook preprocesses these and saves this data.

We'll need to impute missing values. Also note that the combination city-year-weekofyear is essentially the identifier for each case count, but these columns contain useful information, so they won't be dropped.

In this script we fill in missing values with a simple 'forward fill', i.e. each missing value takes the previous value in the time-ordered series, instead of the average over all other years.

In [36]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [37]:
# Read the raw feature tables: one for the training period, one for the test period
train_features_path = 'data/dengue_features_train.csv'
test_features_path = 'data/dengue_features_test.csv'
X, X_test = pd.read_csv(train_features_path), pd.read_csv(test_features_path)

# Preview the first ten training rows
X.head(10)

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8
5,sj,1990,23,1990-06-04,,0.17485,0.254314,0.181743,9.58,299.63,...,26.49,79.891429,9.58,17.212857,2.1,28.114286,6.942857,34.4,23.9,39.1
6,sj,1990,24,1990-06-11,0.1129,0.0928,0.205071,0.210271,3.48,299.207143,...,38.6,82.0,3.48,17.234286,2.042857,27.414286,6.771429,32.2,23.3,29.7
7,sj,1990,25,1990-06-18,0.0725,0.0725,0.151471,0.133029,151.12,299.591429,...,30.0,83.375714,151.12,17.977143,1.571429,28.371429,7.685714,33.9,22.8,21.1
8,sj,1990,26,1990-06-25,0.10245,0.146175,0.125571,0.1236,19.32,299.578571,...,37.51,82.768571,19.32,17.79,1.885714,28.328571,7.385714,33.9,22.8,21.1
9,sj,1990,27,1990-07-02,,0.12155,0.160683,0.202567,14.41,300.154286,...,28.4,81.281429,14.41,18.071429,2.014286,28.328571,6.514286,33.9,24.4,1.1


In [38]:
# Read the labels that belong to the training features
train_labels_path = 'data/dengue_labels_train.csv'
y = pd.read_csv(train_labels_path)

# Preview the first five label rows
y.head(5)

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,1990,18,4
1,sj,1990,19,5
2,sj,1990,20,4
3,sj,1990,21,3
4,sj,1990,22,6


In [39]:
# week_start_date does not add any new information, so remove it from both feature tables
X = X.drop(columns=['week_start_date'])
X_test = X_test.drop(columns=['week_start_date'])

# The data is ordered by city, so each city is handled separately.
# This notebook is run twice, once per city.
city = 'sj'  # 'iq'
X = X.loc[X['city'] == city]
X_test = X_test.loc[X_test['city'] == city]
y = y.loc[y['city'] == city]

In [40]:
# Report the number of training rows, then the per-column count of
# missing values for the training and the test features
print(X.shape[0])
for frame in (X, X_test):
    print(frame.isna().sum())

936
city                                       0
year                                       0
weekofyear                                 0
ndvi_ne                                  191
ndvi_nw                                   49
ndvi_se                                   19
ndvi_sw                                   19
precipitation_amt_mm                       9
reanalysis_air_temp_k                      6
reanalysis_avg_temp_k                      6
reanalysis_dew_point_temp_k                6
reanalysis_max_air_temp_k                  6
reanalysis_min_air_temp_k                  6
reanalysis_precip_amt_kg_per_m2            6
reanalysis_relative_humidity_percent       6
reanalysis_sat_precip_amt_mm               9
reanalysis_specific_humidity_g_per_kg      6
reanalysis_tdtr_k                          6
station_avg_temp_c                         6
station_diur_temp_rng_c                    6
station_max_temp_c                         6
station_min_temp_c                         6
statio

In [41]:
# Forward fill: every missing entry takes the most recent earlier value in its column.
# Both feature tables are updated in place.
for frame in (X, X_test):
    frame.ffill(inplace=True)

# Print the per-column missing counts again to see whether any gaps remain
for frame in (X, X_test):
    print(frame.isnull().sum())

city                                     0
year                                     0
weekofyear                               0
ndvi_ne                                  0
ndvi_nw                                  0
ndvi_se                                  0
ndvi_sw                                  0
precipitation_amt_mm                     0
reanalysis_air_temp_k                    0
reanalysis_avg_temp_k                    0
reanalysis_dew_point_temp_k              0
reanalysis_max_air_temp_k                0
reanalysis_min_air_temp_k                0
reanalysis_precip_amt_kg_per_m2          0
reanalysis_relative_humidity_percent     0
reanalysis_sat_precip_amt_mm             0
reanalysis_specific_humidity_g_per_kg    0
reanalysis_tdtr_k                        0
station_avg_temp_c                       0
station_diur_temp_rng_c                  0
station_max_temp_c                       0
station_min_temp_c                       0
station_precip_mm                        0
dtype: int6

In [42]:
# Hold out the last 20% of the rows for validation.
# shuffle=False keeps the rows in their original order, because this is a time series!
split_options = {'test_size': 0.2, 'shuffle': False}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [43]:
# Next we engineer the weekofyear feature to be cyclical, so that the last week of the year is close to the first.
# By encoding weekofyear as a (sin, cos) pair we keep the cyclical nature of the data.
WEEKS_PER_CYCLE = 52


def add_cyclical_weekofyear(frame, period=WEEKS_PER_CYCLE):
    """Return a new DataFrame with sin/cos encodings of ``weekofyear`` appended.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain a numeric ``weekofyear`` column.
    period : int or float, default 52
        Number of weeks that make up one full cycle.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) with two extra columns,
        ``weekofyear_sin`` and ``weekofyear_cos``, appended in that order.

    Notes
    -----
    With ``period=52`` a week number of 53 (if present in the data) lands on
    the same point of the circle as week 1.
    """
    angle = 2 * np.pi * frame['weekofyear'] / period
    # assign() builds a new frame, so nothing is written into an object that
    # train_test_split derived from X (no chained-assignment warnings).
    return frame.assign(weekofyear_sin=np.sin(angle), weekofyear_cos=np.cos(angle))


# Train/validation: the raw week number is replaced by its cyclical encoding
X_train = add_cyclical_weekofyear(X_train).drop(columns='weekofyear')
X_valid = add_cyclical_weekofyear(X_valid).drop(columns='weekofyear')

# Test: keep the raw weekofyear column as well, we still need it for the submission
X_test = add_cyclical_weekofyear(X_test)

# y is not changed here either; its identifier columns are dropped before training
X_train.head()

Unnamed: 0,city,year,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,weekofyear_sin,weekofyear_cos
0,sj,1990,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,...,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,0.822984,-0.568065
1,sj,1990,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,...,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,0.748511,-0.663123
2,sj,1990,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,...,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,0.663123,-0.748511
3,sj,1990,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,...,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,0.568065,-0.822984
4,sj,1990,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,...,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,0.464723,-0.885456


In [44]:
# The training and validation features drop the city column;
# the test features keep it, because it is still needed for the submission
X_train_prep, X_valid_prep = (frame.drop(columns='city') for frame in (X_train, X_valid))
X_test_prep = X_test

In [45]:
# Finally save the data
# Make sure the output folder exists first: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
out_dir = Path('prep_data_ffill')
out_dir.mkdir(parents=True, exist_ok=True)

# Features (the file name carries the city and the imputation method)
X_train_prep.to_csv(out_dir / f'X_train_prep_{city}_ffill.csv', index=False)
X_valid_prep.to_csv(out_dir / f'X_valid_prep_{city}_ffill.csv', index=False)
X_test_prep.to_csv(out_dir / f'X_test_prep_{city}_ffill.csv', index=False)

# Labels: strip the identifier columns so only the remaining label column(s) are written.
# Rebinding (instead of inplace=True) avoids mutating the objects returned by
# train_test_split, while leaving y_train / y_valid in the same final state.
label_id_columns = ['city', 'year', 'weekofyear']
y_train = y_train.drop(columns=label_id_columns)
y_train.to_csv(out_dir / f'y_train_prep_{city}.csv', index=False)
y_valid = y_valid.drop(columns=label_id_columns)
y_valid.to_csv(out_dir / f'y_valid_prep_{city}.csv', index=False)