### This notebook contains the code to generate the submission for the "DengAI: Predicting Disease Spread" competition.

The preprocessed data is stored in the `prep_data` folder.

This notebook trains the model and predicts the submission values. We want to predict the number of cases, rounded to an integer.

The score is evaluated using the mean absolute error metric.

Here we train XGBoost models: two separate models, one for each city. An attempt was made to train with only the most important features, but that did not actually improve performance.

In [None]:
import os
import warnings

import pandas as pd
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')

In [None]:
# Load the preprocessed train/validation splits, one set per city:
# the "sj" files feed the San Juan model, the "iq" files the Iquitos model.
X_train_sj, y_train_sj, X_valid_sj, y_valid_sj = (
    pd.read_csv(csv_path)
    for csv_path in (
        'prep_data/X_train_prep_sj.csv',
        'prep_data/y_train_prep_sj.csv',
        'prep_data/X_valid_prep_sj.csv',
        'prep_data/y_valid_prep_sj.csv',
    )
)
X_train_iq, y_train_iq, X_valid_iq, y_valid_iq = (
    pd.read_csv(csv_path)
    for csv_path in (
        'prep_data/X_train_prep_iq.csv',
        'prep_data/y_train_prep_iq.csv',
        'prep_data/X_valid_prep_iq.csv',
        'prep_data/y_valid_prep_iq.csv',
    )
)

In [None]:
# Preview the first rows of the San Juan training features.
X_train_sj.head()

Unnamed: 0,year,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,...,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,weekofyear_sin,weekofyear_cos
0,1990,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,...,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,0.822984,-0.568065
1,1990,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,...,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,0.748511,-0.663123
2,1990,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,300.5,...,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,0.663123,-0.748511
3,1990,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,301.4,...,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,0.568065,-0.822984
4,1990,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,301.9,...,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,0.464723,-0.885456


In [29]:
# Preview the first rows of the San Juan training target.
y_train_sj.head()

Unnamed: 0,total_cases
0,4
1,5
2,4
3,3
4,6


In [30]:
# Candidate feature subsets for each city.
# NOTE: these lists are currently NOT applied. The *_select names below are
# plain aliases of the full frames. To train on a subset instead, index the
# frame with the matching list, e.g. X_train_sj[features_sj].
features_sj = [
    'weekofyear_sin',
    'year',
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_dew_point_temp_k',
    'station_avg_temp_c',
    'reanalysis_max_air_temp_k',
    'station_max_temp_c',
    'reanalysis_min_air_temp_k',
    'reanalysis_air_temp_k',
    'station_min_temp_c',
    'reanalysis_avg_temp_k',
    'reanalysis_relative_humidity_percent',
    'weekofyear_cos',
    'reanalysis_precip_amt_kg_per_m2',
]
features_iq = [
    'weekofyear_cos',
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_dew_point_temp_k',
    'reanalysis_min_air_temp_k',
    'station_min_temp_c',
    'year',
    'reanalysis_tdtr_k',
    'reanalysis_relative_humidity_percent',
    'station_avg_temp_c',
    'reanalysis_precip_amt_kg_per_m2',
]

# All features are kept for both cities.
X_train_sj_select, X_valid_sj_select = X_train_sj, X_valid_sj
X_train_iq_select, X_valid_iq_select = X_train_iq, X_valid_iq
X_train_sj_select.head()

Unnamed: 0,year,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,...,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,weekofyear_sin,weekofyear_cos
0,1990,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,...,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,0.822984,-0.568065
1,1990,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,...,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,0.748511,-0.663123
2,1990,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,300.5,...,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,0.663123,-0.748511
3,1990,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,301.4,...,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,0.568065,-0.822984
4,1990,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,301.9,...,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,0.464723,-0.885456


In [31]:
# Baseline (trial) model for San Juan: up to 1000 boosting rounds, with
# early stopping evaluated on the San Juan validation split.
model_sj = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=-1,
    early_stopping_rounds=5,
    random_state=42,
)
model_sj.fit(
    X_train_sj_select,
    y_train_sj,
    eval_set=[(X_valid_sj_select, y_valid_sj)],
    verbose=False,
)

In [32]:
# Score the baseline on the San Juan validation split. Predictions are
# rounded to whole case counts before the MAE is computed.
y_pred_sj = [int(round(raw)) for raw in model_sj.predict(X_valid_sj_select)]
mae = mean_absolute_error(y_valid_sj, y_pred_sj)
print('MAE:', mae)

MAE: 23.643617021276597


In [33]:
# Coarse grid search for San Juan over n_estimators and learning_rate.
# Each candidate is scored by validation MAE on the rounded predictions.
param_grid = {
    'n_estimators': [500, 1000, 1200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.25],
}

scores = []
# Walk the grid with n_estimators as the outer axis and learning_rate as the inner one.
for n_est, lr in (
    (n_rounds, rate)
    for n_rounds in param_grid['n_estimators']
    for rate in param_grid['learning_rate']
):
    model = XGBRegressor(
        n_estimators=n_est,
        learning_rate=lr,
        n_jobs=-1,
        early_stopping_rounds=5,
        random_state=42,
    )
    model.fit(
        X_train_sj_select,
        y_train_sj,
        eval_set=[(X_valid_sj_select, y_valid_sj)],
        verbose=False,
    )
    y_pred = [int(round(raw)) for raw in model.predict(X_valid_sj_select)]
    mae = mean_absolute_error(y_valid_sj, y_pred)
    scores.append(mae)
    print(f"for {n_est}, {lr} have MAE: {mae:.4f}")
# Best (lowest) validation MAE found on this grid.
print(min(scores))

for 500, 0.01 have MAE: 23.3723
for 500, 0.05 have MAE: 23.6436
for 500, 0.1 have MAE: 20.8245
for 500, 0.2 have MAE: 23.5426
for 500, 0.25 have MAE: 22.4255
for 1000, 0.01 have MAE: 23.3723
for 1000, 0.05 have MAE: 23.6436
for 1000, 0.1 have MAE: 20.8245
for 1000, 0.2 have MAE: 23.5426
for 1000, 0.25 have MAE: 22.4255
for 1200, 0.01 have MAE: 23.3723
for 1200, 0.05 have MAE: 23.6436
for 1200, 0.1 have MAE: 20.8245
for 1200, 0.2 have MAE: 23.5426
for 1200, 0.25 have MAE: 22.4255
20.824468085106382


In [34]:
# Finer San Juan grid at learning rates around 0.05; n_estimators made no
# difference in the coarse search.
# NOTE(review): the original remark here read "lr=0.05 is best", but the
# recorded coarse-search output shows lr=0.1 with the lowest MAE. The remark
# presumably dates from a run on the reduced feature set; confirm.
param_grid = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.02, 0.03, 0.04, 0.07],
}

scores = []
# Walk the grid with n_estimators as the outer axis and learning_rate as the inner one.
for n_est, lr in (
    (n_rounds, rate)
    for n_rounds in param_grid['n_estimators']
    for rate in param_grid['learning_rate']
):
    model = XGBRegressor(
        n_estimators=n_est,
        learning_rate=lr,
        n_jobs=-1,
        early_stopping_rounds=5,
        random_state=42,
    )
    model.fit(
        X_train_sj_select,
        y_train_sj,
        eval_set=[(X_valid_sj_select, y_valid_sj)],
        verbose=False,
    )
    y_pred = [int(round(raw)) for raw in model.predict(X_valid_sj_select)]
    mae = mean_absolute_error(y_valid_sj, y_pred)
    scores.append(mae)
    print(f"for {n_est}, {lr} have MAE: {mae:.4f}")
# Best (lowest) validation MAE found on this grid.
print(min(scores))

for 500, 0.02 have MAE: 23.5106
for 500, 0.03 have MAE: 22.3883
for 500, 0.04 have MAE: 22.8191
for 500, 0.07 have MAE: 22.3298
for 1000, 0.02 have MAE: 23.5106
for 1000, 0.03 have MAE: 22.3883
for 1000, 0.04 have MAE: 22.8191
for 1000, 0.07 have MAE: 22.3298
22.329787234042552


In [41]:
# Final San Juan model. With all features the chosen setting is
# learning_rate=0.1, n_estimators=500, which is what is used here.
# (An earlier note picked lr=0.03, n_est=500, presumably for the reduced
# feature set; confirm.)
model_sj_fin = XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    n_jobs=-1,
    early_stopping_rounds=5,
    random_state=42,
)
model_sj_fin.fit(
    X_train_sj_select,
    y_train_sj,
    eval_set=[(X_valid_sj_select, y_valid_sj)],
    verbose=False,
)
# Validation MAE on predictions rounded to whole case counts.
y_pred_sj = [int(round(raw)) for raw in model_sj_fin.predict(X_valid_sj_select)]
mae = mean_absolute_error(y_valid_sj, y_pred_sj)
print('MAE:', mae)

MAE: 20.824468085106382


In [36]:
# Same search for Iquitos: n_estimators is fixed at 500 and only the
# learning rate is varied. Each candidate is scored by validation MAE on the
# rounded predictions.
param_grid = {
    'n_estimators': [500],
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.23, 0.24, 0.25, 0.26, 0.27, 0.3],
}

scores = []
# Walk the grid with n_estimators as the outer axis and learning_rate as the inner one.
for n_est, lr in (
    (n_rounds, rate)
    for n_rounds in param_grid['n_estimators']
    for rate in param_grid['learning_rate']
):
    model = XGBRegressor(
        n_estimators=n_est,
        learning_rate=lr,
        n_jobs=-1,
        early_stopping_rounds=5,
        random_state=42,
    )
    model.fit(
        X_train_iq_select,
        y_train_iq,
        eval_set=[(X_valid_iq_select, y_valid_iq)],
        verbose=False,
    )
    y_pred = [int(round(raw)) for raw in model.predict(X_valid_iq_select)]
    mae = mean_absolute_error(y_valid_iq, y_pred)
    scores.append(mae)
    print(f"for {n_est}, {lr} have MAE: {mae:.4f}")
# Best (lowest) validation MAE found on this grid.
print(min(scores))

for 500, 0.05 have MAE: 6.7404
for 500, 0.1 have MAE: 6.8558
for 500, 0.15 have MAE: 6.7500
for 500, 0.2 have MAE: 6.9038
for 500, 0.23 have MAE: 7.0769
for 500, 0.24 have MAE: 7.2308
for 500, 0.25 have MAE: 6.6442
for 500, 0.26 have MAE: 6.6442
for 500, 0.27 have MAE: 6.6923
for 500, 0.3 have MAE: 6.6923
6.644230769230769


In [42]:
# Final Iquitos model: n_estimators=500, learning_rate=0.25.
# The same setting was chosen when training on all features.
model_iq_fin = XGBRegressor(
    n_estimators=500,
    learning_rate=0.25,
    n_jobs=-1,
    early_stopping_rounds=5,
    random_state=42,
)
model_iq_fin.fit(
    X_train_iq_select,
    y_train_iq,
    eval_set=[(X_valid_iq_select, y_valid_iq)],
    verbose=False,
)
# Validation MAE on predictions rounded to whole case counts.
y_pred_iq = [int(round(raw)) for raw in model_iq_fin.predict(X_valid_iq_select)]
mae = mean_absolute_error(y_valid_iq, y_pred_iq)
print('MAE:', mae)

MAE: 6.644230769230769


In [43]:
# Prepare output
X_test_sj = pd.read_csv('prep_data/X_test_prep_sj.csv')
X_test_iq = pd.read_csv('prep_data/X_test_prep_iq.csv')

# Stack both cities (San Juan rows first) to build the submission skeleton.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# each city's own row labels are repeated, leaving duplicate index labels.
X_test = pd.concat([X_test_sj, X_test_iq], axis=0, ignore_index=True)

# Columns that identify each row of the submission; copy so that later
# assignments to `output` never touch X_test.
output = X_test[['city', 'year', 'weekofyear']].copy()

# Drop 'weekofyear' and 'city' from the per-city test frames before prediction.
# Rebinding (instead of inplace=True) leaves the names pointing at the reduced
# frames, as before, without mutating the frames read from disk.
X_test_sj = X_test_sj.drop(columns=['weekofyear', 'city'])
X_test_iq = X_test_iq.drop(columns=['weekofyear', 'city'])

# Select features: all remaining columns are kept, so these are plain aliases.
# To use a per-city subset instead, index with features_sj / features_iq.
X_test_sj_select = X_test_sj
X_test_iq_select = X_test_iq
X_test_sj_select.head()

Unnamed: 0,year,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,...,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,weekofyear_sin,weekofyear_cos
0,2008,-0.0189,-0.0189,0.102729,0.0912,78.6,298.492857,298.55,294.527143,301.1,...,78.6,15.918571,3.128571,26.528571,7.057143,33.3,21.7,75.2,0.822984,-0.568065
1,2008,-0.018,-0.0124,0.082043,0.072314,12.56,298.475714,298.557143,294.395714,300.8,...,12.56,15.791429,2.571429,26.071429,5.557143,30.0,22.2,34.3,0.748511,-0.663123
2,2008,-0.0015,0.073809,0.151083,0.091529,3.66,299.455714,299.357143,295.308571,302.2,...,3.66,16.674286,4.428571,27.928571,7.785714,32.8,22.8,3.0,0.663123,-0.748511
3,2008,0.071975,-0.019867,0.124329,0.125686,0.0,299.69,299.728571,294.402857,303.0,...,0.0,15.775714,4.342857,28.057143,6.271429,33.3,24.4,0.3,0.568065,-0.822984
4,2008,0.0568,0.039833,0.062267,0.075914,0.76,299.78,299.671429,294.76,302.3,...,0.76,16.137143,3.542857,27.614286,7.085714,33.3,23.3,84.1,0.464723,-0.885456


In [44]:
# Predict for each city. Predictions are rounded to whole case counts and
# floored at zero, because a regression model can output negative values but a
# case count cannot be negative.
y_pred_sj = [max(0, int(round(value))) for value in model_sj_fin.predict(X_test_sj_select)]
y_pred_iq = [max(0, int(round(value))) for value in model_iq_fin.predict(X_test_iq_select)]

# Row masks for each city in the submission frame.
is_sj = output['city'] == 'sj'
is_iq = output['city'] == 'iq'
# Every row must belong to one of the two cities, otherwise it would be left
# without a real prediction.
assert (is_sj | is_iq).all(), "output contains rows whose city is neither 'sj' nor 'iq'"

# Start from an integer column (an empty pd.Series() has a pandas-version-
# dependent dtype and can turn the counts into floats), then fill per city.
# The list assignment raises if the number of predictions does not match the
# number of rows for that city.
output['total_cases'] = 0
output.loc[is_sj, 'total_cases'] = y_pred_sj
output.loc[is_iq, 'total_cases'] = y_pred_iq
# Guarantee integer counts in the saved submission.
output['total_cases'] = output['total_cases'].astype(int)

output.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,18
1,sj,2008,19,18
2,sj,2008,20,18
3,sj,2008,21,22
4,sj,2008,22,18


In [None]:
# Save to csv
# Create the target folder first: to_csv does not create missing directories,
# so a fresh checkout without a submissions/ folder would otherwise fail here.
os.makedirs('submissions', exist_ok=True)
# index=False keeps the row index out of the submission file.
output.to_csv('submissions/submission_XGB.csv', index=False)

### Final note:

With only the selected features, this model gets a submission score of 26.7404. This is not great.

Doing everything the same but keeping ALL features gives 26.6034, which is actually the best so far (rank 3636). So dropping features is not a good strategy in this case.