# Exploratory Data Analysis for Dengue Competition
competition link: https://www.drivendata.org/competitions/44/dengai-predicting-disease-spread/data/
completed on 4 Feb 2020

trying out splitting by cities, as well as using select features
insights from benchmark: http://drivendata.co/blog/dengue-benchmark/


In [1]:
import numpy as np
import pandas as pd

In [2]:
test_features = pd.read_csv("dengue_features_test.csv") 
train_features = pd.read_csv("dengue_features_train.csv") 
train_labels = pd.read_csv("dengue_labels_train.csv") 
submission_format = pd.read_csv("submission_format.csv") 

In [3]:
train_features

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.122600,0.103725,0.198483,0.177617,12.42,297.572857,...,32.00,73.365714,12.42,14.012857,2.628571,25.442857,6.900000,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.169900,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.032250,0.172967,0.157200,0.170843,34.54,298.781429,...,26.10,82.052857,34.54,16.848571,2.300000,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.90,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.196200,0.262200,0.251200,0.247340,7.52,299.518571,...,12.20,80.460000,7.52,17.210000,3.014286,28.942857,9.371429,35.0,23.9,5.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,iq,2010,21,2010-05-28,0.342750,0.318900,0.256343,0.292514,55.30,299.334286,...,45.00,88.765714,55.30,18.485714,9.800000,28.633333,11.933333,35.4,22.4,27.0
1452,iq,2010,22,2010-06-04,0.160157,0.160371,0.136043,0.225657,86.47,298.330000,...,207.10,91.600000,86.47,18.070000,7.471429,27.433333,10.500000,34.7,21.7,36.6
1453,iq,2010,23,2010-06-11,0.247057,0.146057,0.250357,0.233714,58.94,296.598571,...,50.60,94.280000,58.94,17.008571,7.500000,24.400000,6.900000,32.2,19.2,7.4
1454,iq,2010,24,2010-06-18,0.333914,0.245771,0.278886,0.325486,59.67,296.345714,...,62.33,94.660000,59.67,16.815714,7.871429,25.433333,8.733333,31.2,21.0,16.0


In [4]:
train_labels

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,1990,18,4
1,sj,1990,19,5
2,sj,1990,20,4
3,sj,1990,21,3
4,sj,1990,22,6
...,...,...,...,...
1451,iq,2010,21,5
1452,iq,2010,22,8
1453,iq,2010,23,1
1454,iq,2010,24,1


# Gradient Boosting Model (XGBoost)

model, without using city and week date

In [5]:
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [6]:
X = train_features.drop(columns = ["city", "week_start_date"])
Y = train_labels["total_cases"]

In [7]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [8]:
# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [9]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [10]:
# evaluate predictions
accuracy = mean_absolute_error(y_test, predictions)
print("MAE: %.2f%%" % (accuracy))

MAE: 15.92%


## Submission on 4th Feb 2020

In [11]:
test = test_features.drop(columns = ["city", "week_start_date"])
test_pred = model.predict(test)
test_predictions = [round(value) for value in test_pred]

In [12]:
len(test_predictions)

416

In [14]:
submission_format

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,0
1,sj,2008,19,0
2,sj,2008,20,0
3,sj,2008,21,0
4,sj,2008,22,0
...,...,...,...,...
411,iq,2013,22,0
412,iq,2013,23,0
413,iq,2013,24,0
414,iq,2013,25,0


In [15]:
submission = submission_format
submission["total_cases"] = test_predictions

In [16]:
submission

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,3
1,sj,2008,19,3
2,sj,2008,20,3
3,sj,2008,21,11
4,sj,2008,22,1
...,...,...,...,...
411,iq,2013,22,2
412,iq,2013,23,1
413,iq,2013,24,6
414,iq,2013,25,1


In [28]:
submission.to_csv("submission_xgboost_5.csv", index = False)

In [21]:
only_cases = submission["total_cases"]
only_cases.to_csv("submission_xgboost_4.csv", header = True)

In [20]:
only_cases

0       3
1       3
2       3
3      11
4       1
       ..
411     2
412     1
413     6
414     1
415     2
Name: total_cases, Length: 416, dtype: int64

In [None]:
submission