### This notebook contains the code to generate the preprocessed data for the "DengAI: Predicting Disease Spread" competition.

We are given the training data dengue_features_train.csv and dengue_labels_train.csv, and the test data dengue_features_test.csv.

This notebook preprocesses these and saves this data.

We'll need to impute missing values. Also note that city-year-weekofyear is essentially the identifier for each case count, but these columns contain useful information, so they won't be dropped.

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [29]:
# Load the raw feature tables: one for training, one for the competition test set.
X = pd.read_csv('data/dengue_features_train.csv')
X_test = pd.read_csv('data/dengue_features_test.csv')
# Preview the first ten training rows.
X.head(10)

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8
5,sj,1990,23,1990-06-04,,0.17485,0.254314,0.181743,9.58,299.63,...,26.49,79.891429,9.58,17.212857,2.1,28.114286,6.942857,34.4,23.9,39.1
6,sj,1990,24,1990-06-11,0.1129,0.0928,0.205071,0.210271,3.48,299.207143,...,38.6,82.0,3.48,17.234286,2.042857,27.414286,6.771429,32.2,23.3,29.7
7,sj,1990,25,1990-06-18,0.0725,0.0725,0.151471,0.133029,151.12,299.591429,...,30.0,83.375714,151.12,17.977143,1.571429,28.371429,7.685714,33.9,22.8,21.1
8,sj,1990,26,1990-06-25,0.10245,0.146175,0.125571,0.1236,19.32,299.578571,...,37.51,82.768571,19.32,17.79,1.885714,28.328571,7.385714,33.9,22.8,21.1
9,sj,1990,27,1990-07-02,,0.12155,0.160683,0.202567,14.41,300.154286,...,28.4,81.281429,14.41,18.071429,2.014286,28.328571,6.514286,33.9,24.4,1.1


In [30]:
# Load the training labels (one total_cases value per city/year/weekofyear row).
y = pd.read_csv('data/dengue_labels_train.csv')
# Preview the first five label rows.
y.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,1990,18,4
1,sj,1990,19,5
2,sj,1990,20,4
3,sj,1990,21,3
4,sj,1990,22,6


In [31]:
# To start, week_start_date does not add any new info, so we'll drop it.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second run no longer raises KeyError on the missing column.
X = X.drop(columns='week_start_date', errors='ignore')
X_test = X_test.drop(columns='week_start_date', errors='ignore')

# Split data; don't shuffle because it's a time series!
# Also, as the data is ordered by city, we'll have to split by city
# I'll run the script twice, one for each city
city = 'iq' # 'sj'
X = X[X.city == city]
X_test = X_test[X_test.city == city]
y = y[y.city == city]

# train_test_split pairs features and labels purely by row position, so
# verify that both tables list the same city/year/weekofyear in the same order.
key_columns = ['city', 'year', 'weekofyear']
assert len(X) == len(y) and (X[key_columns].to_numpy() == y[key_columns].to_numpy()).all(), \
    'feature and label rows are not aligned on city/year/weekofyear'

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False)

In [32]:
# Report the number of training rows, then the per-column count of missing
# values for the training features and for the test features.
print(len(X))
for frame in (X, X_test):
    print(frame.isna().sum())

520
city                                      0
year                                      0
weekofyear                                0
ndvi_ne                                   3
ndvi_nw                                   3
ndvi_se                                   3
ndvi_sw                                   3
precipitation_amt_mm                      4
reanalysis_air_temp_k                     4
reanalysis_avg_temp_k                     4
reanalysis_dew_point_temp_k               4
reanalysis_max_air_temp_k                 4
reanalysis_min_air_temp_k                 4
reanalysis_precip_amt_kg_per_m2           4
reanalysis_relative_humidity_percent      4
reanalysis_sat_precip_amt_mm              4
reanalysis_specific_humidity_g_per_kg     4
reanalysis_tdtr_k                         4
station_avg_temp_c                       37
station_diur_temp_rng_c                  37
station_max_temp_c                       14
station_min_temp_c                        8
station_precip_mm           

In [None]:
# Try to impute missing values with the mean over all years of the same week (and city)
# The mean is taken using train data only, preventing data leakage
mean_values = X_train.drop('year', axis=1).groupby(['city', 'weekofyear']).mean()
if city=='sj':
    # Sanity check: this one is missing as seen below.
    # A bare expression inside an `if` block is not displayed by the notebook,
    # so print it explicitly; a single .loc[row, col] lookup also avoids chained indexing.
    print(mean_values.loc[('sj', 23), 'ndvi_ne'])

In [35]:
# Print to check if it will work
# (first ten training rows before imputation, for comparison with the imputed frame)
X_train.head(10)

Unnamed: 0,city,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
936,iq,2000,26,0.192886,0.132257,0.340886,0.2472,25.41,296.74,298.45,...,43.19,92.418571,25.41,16.651429,8.928571,26.4,10.775,32.5,20.7,3.0
937,iq,2000,27,0.216833,0.2761,0.289457,0.241657,60.61,296.634286,298.428571,...,46.0,93.581429,60.61,16.862857,10.314286,26.9,11.566667,34.0,20.8,55.6
938,iq,2000,28,0.176757,0.173129,0.204114,0.128014,55.52,296.415714,297.392857,...,64.77,95.848571,55.52,17.12,7.385714,26.8,11.466667,33.0,20.7,38.1
939,iq,2000,29,0.227729,0.145429,0.2542,0.200314,5.6,295.357143,296.228571,...,23.96,87.234286,5.6,14.431429,9.114286,25.766667,10.533333,31.5,14.7,30.0
940,iq,2000,30,0.328643,0.322129,0.254371,0.361043,62.76,296.432857,297.635714,...,31.8,88.161429,62.76,15.444286,9.5,26.6,11.48,33.3,19.1,4.0
941,iq,2000,31,0.205529,0.190757,0.231671,0.255314,16.24,297.191429,298.285714,...,1.0,74.728571,16.24,13.421429,13.771429,25.34,10.94,32.0,17.0,11.5
942,iq,2000,32,0.312486,0.329986,0.380586,0.387271,89.37,297.32,298.978571,...,26.68,83.275714,89.37,15.311429,11.471429,27.016667,11.65,34.0,19.9,72.9
943,iq,2000,33,0.384133,0.39224,0.34178,0.38275,42.08,297.627143,299.335714,...,16.9,82.49,42.08,15.465714,13.7,26.583333,10.316667,33.0,20.5,50.1
944,iq,2000,34,0.408157,0.322157,0.406714,0.302714,49.22,298.238571,299.571429,...,5.59,74.74,49.22,14.444286,13.771429,26.9,13.4,34.0,19.0,89.2
945,iq,2000,35,0.332043,0.321057,0.314614,0.324257,53.65,299.218571,300.928571,...,16.07,74.151429,53.65,15.057143,12.457143,27.116667,12.266667,34.0,20.0,78.0


In [36]:
# Fill each row's gaps from the mean row that matches its city and week
def impute_with_city_week_means(df, city_week_means):
    """
    Fill missing values row by row using per-(city, weekofyear) means.

    df: frame with "city" and "weekofyear" columns plus feature columns.
    city_week_means: frame indexed by (city, weekofyear) holding the fill values.

    Rows whose (city, weekofyear) pair is absent from the index of
    city_week_means are returned unchanged. Returns a new frame.
    """
    known_keys = city_week_means.index

    def _fill_row(row):
        key = (row["city"], row["weekofyear"])
        if key not in known_keys:
            # No mean available for this city/week: leave the row as it is
            return row
        return row.fillna(city_week_means.loc[key])

    return df.apply(_fill_row, axis=1)

# Apply the same train-derived means to all three splits
X_train_imputed, X_valid_imputed, X_test_imputed = (
    impute_with_city_week_means(frame, mean_values)
    for frame in (X_train, X_valid, X_test)
)
X_train_imputed.head(10)  # see that the right mean is filled in!

Unnamed: 0,city,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
936,iq,2000,26,0.192886,0.132257,0.340886,0.2472,25.41,296.74,298.45,...,43.19,92.418571,25.41,16.651429,8.928571,26.4,10.775,32.5,20.7,3.0
937,iq,2000,27,0.216833,0.2761,0.289457,0.241657,60.61,296.634286,298.428571,...,46.0,93.581429,60.61,16.862857,10.314286,26.9,11.566667,34.0,20.8,55.6
938,iq,2000,28,0.176757,0.173129,0.204114,0.128014,55.52,296.415714,297.392857,...,64.77,95.848571,55.52,17.12,7.385714,26.8,11.466667,33.0,20.7,38.1
939,iq,2000,29,0.227729,0.145429,0.2542,0.200314,5.6,295.357143,296.228571,...,23.96,87.234286,5.6,14.431429,9.114286,25.766667,10.533333,31.5,14.7,30.0
940,iq,2000,30,0.328643,0.322129,0.254371,0.361043,62.76,296.432857,297.635714,...,31.8,88.161429,62.76,15.444286,9.5,26.6,11.48,33.3,19.1,4.0
941,iq,2000,31,0.205529,0.190757,0.231671,0.255314,16.24,297.191429,298.285714,...,1.0,74.728571,16.24,13.421429,13.771429,25.34,10.94,32.0,17.0,11.5
942,iq,2000,32,0.312486,0.329986,0.380586,0.387271,89.37,297.32,298.978571,...,26.68,83.275714,89.37,15.311429,11.471429,27.016667,11.65,34.0,19.9,72.9
943,iq,2000,33,0.384133,0.39224,0.34178,0.38275,42.08,297.627143,299.335714,...,16.9,82.49,42.08,15.465714,13.7,26.583333,10.316667,33.0,20.5,50.1
944,iq,2000,34,0.408157,0.322157,0.406714,0.302714,49.22,298.238571,299.571429,...,5.59,74.74,49.22,14.444286,13.771429,26.9,13.4,34.0,19.0,89.2
945,iq,2000,35,0.332043,0.321057,0.314614,0.324257,53.65,299.218571,300.928571,...,16.07,74.151429,53.65,15.057143,12.457143,27.116667,12.266667,34.0,20.0,78.0


In [37]:
# Check if there are still missing values in any of the three imputed splits
for imputed in (X_train_imputed, X_valid_imputed, X_test_imputed):
    print(imputed.isna().sum())

city                                     0
year                                     0
weekofyear                               0
ndvi_ne                                  1
ndvi_nw                                  1
ndvi_se                                  1
ndvi_sw                                  1
precipitation_amt_mm                     1
reanalysis_air_temp_k                    1
reanalysis_avg_temp_k                    1
reanalysis_dew_point_temp_k              1
reanalysis_max_air_temp_k                1
reanalysis_min_air_temp_k                1
reanalysis_precip_amt_kg_per_m2          1
reanalysis_relative_humidity_percent     1
reanalysis_sat_precip_amt_mm             1
reanalysis_specific_humidity_g_per_kg    1
reanalysis_tdtr_k                        1
station_avg_temp_c                       1
station_diur_temp_rng_c                  1
station_max_temp_c                       1
station_min_temp_c                       1
station_precip_mm                        1
dtype: int6

In [38]:
# It seems like there are rows with no entries at all
# These are all at week 53, and there are no other rows with data at week 53, so we can't impute them
# Normally it might be a good idea to drop them, but since there is one week 53 in the test data, we need to get a submission for it and
# impute something there. We'll do the same for the train data

def impute_week_53(df):
    """
    Fill missing feature values in week-53 rows from the neighbouring weeks.

    Impute by taking the mean value of the week preceding and following the missing week.
    Also note that e.g. iq,2010,53 is in between iq,2009,52 and iq,2010,1 ! Year does not make sense here,
    so the neighbours of (city, year, 53) are (city, year-1, 52) and (city, year, 1).
    df should be X_train_imputed or X_valid_imputed or X_test_imputed

    Only cells that are actually missing are filled; observed week-53 values are kept.
    A neighbour that is absent or NaN for a feature is ignored, so the fill value is
    the mean of whichever neighbours are available. If none is available the cell
    stays NaN. If several rows match a neighbour key, the first one is used.

    Modifies df in place and also returns it.
    """
    feature_columns = df.columns.drop(['city', 'year', 'weekofyear'])

    # Every mask below is built on df itself, so they all share df's index
    is_week_53 = df["weekofyear"] == 53

    # One (city, year) pair per week-53 occurrence
    week_53_keys = df.loc[is_week_53, ["city", "year"]].drop_duplicates()

    for city, year in week_53_keys.itertuples(index=False, name=None):
        in_city = df["city"] == city
        target_rows = in_city & (df["year"] == year) & is_week_53

        # Neighbouring weeks, thinking of the weird order (week 53 comes first in its year)
        week_52_rows = df.loc[in_city & (df["year"] == year - 1) & (df["weekofyear"] == 52)]
        week_1_rows = df.loc[in_city & (df["year"] == year) & (df["weekofyear"] == 1)]
        neighbours = [rows.iloc[0] for rows in (week_52_rows, week_1_rows) if not rows.empty]

        for feature in feature_columns:
            # Drop NaN neighbours so a single missing neighbour does not poison the mean
            candidates = [row[feature] for row in neighbours if pd.notna(row[feature])]
            if not candidates:
                continue  # Nothing to impute from (rare case)

            # Only touch cells that are really missing
            missing = target_rows & df[feature].isna()
            df.loc[missing, feature] = sum(candidates) / len(candidates)
    return df

# Run the week-53 imputation on every split
## can check by hand in the output file that it worked properly!
X_train_imputed, X_valid_imputed, X_test_imputed = (
    impute_week_53(frame)
    for frame in (X_train_imputed, X_valid_imputed, X_test_imputed)
)

X_train_imputed.head()

Unnamed: 0,city,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
936,iq,2000,26,0.192886,0.132257,0.340886,0.2472,25.41,296.74,298.45,...,43.19,92.418571,25.41,16.651429,8.928571,26.4,10.775,32.5,20.7,3.0
937,iq,2000,27,0.216833,0.2761,0.289457,0.241657,60.61,296.634286,298.428571,...,46.0,93.581429,60.61,16.862857,10.314286,26.9,11.566667,34.0,20.8,55.6
938,iq,2000,28,0.176757,0.173129,0.204114,0.128014,55.52,296.415714,297.392857,...,64.77,95.848571,55.52,17.12,7.385714,26.8,11.466667,33.0,20.7,38.1
939,iq,2000,29,0.227729,0.145429,0.2542,0.200314,5.6,295.357143,296.228571,...,23.96,87.234286,5.6,14.431429,9.114286,25.766667,10.533333,31.5,14.7,30.0
940,iq,2000,30,0.328643,0.322129,0.254371,0.361043,62.76,296.432857,297.635714,...,31.8,88.161429,62.76,15.444286,9.5,26.6,11.48,33.3,19.1,4.0


In [39]:
# Check if there are still missing values: print the largest per-column NaN count of each split
for imputed in (X_train_imputed, X_valid_imputed, X_test_imputed):
    print(imputed.isna().sum().max())

0
0
0


In [40]:
# Next we engineer the weekofyear feature to be cyclical, so that the last week of the year is close to the first
# By assigning sin and cos to the weekofyear, we can keep the cyclical nature of the data
WEEKS_PER_YEAR = 52  # period of the encoding; week 53 therefore gets the same angle as week 1


def add_cyclical_week_features(frame, drop_week=True):
    """
    Return a copy of frame with weekofyear_sin / weekofyear_cos columns appended.

    frame: DataFrame with a numeric 'weekofyear' column.
    drop_week: when True the original 'weekofyear' column is removed from the result.

    The input frame is not modified.
    """
    # Vectorised: one ufunc call per column instead of a Python lambda per row
    angle = 2 * np.pi * frame['weekofyear'].astype(float) / WEEKS_PER_YEAR
    encoded = frame.assign(weekofyear_sin=np.sin(angle), weekofyear_cos=np.cos(angle))
    return encoded.drop(columns='weekofyear') if drop_week else encoded


X_train_imputed = add_cyclical_week_features(X_train_imputed)
X_valid_imputed = add_cyclical_week_features(X_valid_imputed)
# Don't drop weekofyear from the test data yet, we need it for submission
X_test_imputed = add_cyclical_week_features(X_test_imputed, drop_week=False)

# Also not for y, as these will be dropped from there before training
X_train_imputed.head()

Unnamed: 0,city,year,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,weekofyear_sin,weekofyear_cos
936,iq,2000,0.192886,0.132257,0.340886,0.2472,25.41,296.74,298.45,295.184286,...,25.41,16.651429,8.928571,26.4,10.775,32.5,20.7,3.0,-3.216245e-16,-1.0
937,iq,2000,0.216833,0.2761,0.289457,0.241657,60.61,296.634286,298.428571,295.358571,...,60.61,16.862857,10.314286,26.9,11.566667,34.0,20.8,55.6,-0.1205367,-0.992709
938,iq,2000,0.176757,0.173129,0.204114,0.128014,55.52,296.415714,297.392857,295.622857,...,55.52,17.12,7.385714,26.8,11.466667,33.0,20.7,38.1,-0.2393157,-0.970942
939,iq,2000,0.227729,0.145429,0.2542,0.200314,5.6,295.357143,296.228571,292.797143,...,5.6,14.431429,9.114286,25.766667,10.533333,31.5,14.7,30.0,-0.3546049,-0.935016
940,iq,2000,0.328643,0.322129,0.254371,0.361043,62.76,296.432857,297.635714,293.957143,...,62.76,15.444286,9.5,26.6,11.48,33.3,19.1,4.0,-0.4647232,-0.885456


In [None]:
# Remove the city column from train and validation features.
# The test frame is passed through untouched: city is still needed for submission.
X_train_prep, X_valid_prep = (
    frame.drop(columns='city') for frame in (X_train_imputed, X_valid_imputed)
)
X_test_prep = X_test_imputed

In [None]:
# Finally save the data (one set of files per city, see the `city` variable)
X_train_prep.to_csv(f'prep_data/X_train_prep_{city}.csv', index=False)
X_valid_prep.to_csv(f'prep_data/X_valid_prep_{city}.csv', index=False)
X_test_prep.to_csv(f'prep_data/X_test_prep_{city}.csv', index=False)

# The label files keep only the target: strip the identifier columns.
# Reassigning with errors='ignore' (instead of inplace=True) avoids mutating a
# slice of the original label frame and lets this cell be re-run without a KeyError.
label_key_columns = ['city', 'year', 'weekofyear']
y_train = y_train.drop(columns=label_key_columns, errors='ignore')
y_train.to_csv(f'prep_data/y_train_prep_{city}.csv', index=False)
y_valid = y_valid.drop(columns=label_key_columns, errors='ignore')
y_valid.to_csv(f'prep_data/y_valid_prep_{city}.csv', index=False)