In [16]:
import sys
import pandas as pd
import numpy as np

from sklearn import neighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing

In [2]:
### 1. Load the data
# Test-set features (no labels are read for these rows)
data_features_test = pd.read_csv('../dengue_fever/data/dengue_features_test.csv')

# Training features and their labels, joined on the (city, year, weekofyear) key
data_features = pd.read_csv('../dengue_fever/data/dengue_features_train.csv')
data_labels = pd.read_csv('../dengue_fever/data/dengue_labels_train.csv')
merge_keys = ['city', 'year', 'weekofyear']
data = data_features.merge(data_labels, on=merge_keys)

In [3]:
### 2. Missing Values

# Fill gaps in both frames by linear interpolation (the default method)
data = data.interpolate(method='linear')
data_features_test = data_features_test.interpolate(method='linear')

# Rows treated as outliers (they have a lot of missing data).
# The Iquitos numbers are shifted by 936 before being merged with the San Juan ones.
outliers_sj = [89, 141, 401, 453, 713, 765]
outliers_iq = [184, 236, 444, 496]
outliers = outliers_sj + [936 + row for row in outliers_iq]

# The index label of a row is its outlier number minus 2
outlier_index_labels = [row - 2 for row in outliers]
data = data.drop(outlier_index_labels)

In [4]:
### 3. Feature Selection

# Columns used as model inputs for San Juan
features_selected_sj = [
    'weekofyear',
    'reanalysis_dew_point_temp_k',
    'reanalysis_relative_humidity_percent',
    'reanalysis_precip_amt_kg_per_m2',
    'ndvi_se',
    'reanalysis_specific_humidity_g_per_kg',
]

# Columns used as model inputs for Iquitos
features_selected_iq = [
    'weekofyear',
    'reanalysis_min_air_temp_k',
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_precip_amt_kg_per_m2',
]

# Row masks per city
is_sj = data['city'] == 'sj'
is_iq = data['city'] == 'iq'

# Skip the first years: keep only the last 650 (sj) / 300 (iq) rows of each city
features_sj = data.loc[is_sj, features_selected_sj].tail(650)
features_iq = data.loc[is_iq, features_selected_iq].tail(300)
labels_sj = data.loc[is_sj, 'total_cases'].tail(650)
labels_iq = data.loc[is_iq, 'total_cases'].tail(300)

In [5]:
### 4. Execute the regressor and make predictions

## San Juan
data_features_test_sj = data_features_test.loc[data_features_test['city'] == 'sj']

# Parametrization
n_estimators = 50
max_depth = None
max_features = len(features_selected_sj)  # every selected feature is a split candidate

# Random Forest regressor.
# 'absolute_error' is the current spelling of the criterion formerly called 'mae'
# (deprecated in scikit-learn 1.0, removed in 1.2); the two are equivalent.
regressor_sj = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    criterion='absolute_error',
    random_state=0,
)
regressor_sj.fit(features_sj, labels_sj)

# Prediction (rounded to whole case counts)
pred_sj = [int(round(x)) for x in regressor_sj.predict(data_features_test_sj[features_selected_sj])]
data_features_test_sj = data_features_test_sj.assign(total_cases=pred_sj)


## Iquitos
data_features_test_iq = data_features_test.loc[data_features_test['city'] == 'iq']

# Normalization of the data.
# The scaler is fitted on the training features only and the SAME scaling is then
# applied to the test features, so both sets live in one feature space.
max_abs_scaler = preprocessing.MaxAbsScaler()
features_iq_norm = max_abs_scaler.fit_transform(features_iq)
data_features_test_iq_norm = max_abs_scaler.transform(data_features_test_iq[features_selected_iq])

# Parametrization
n_neighbors = 18
weights = 'distance'

# Knn regressor (p=2 -> Euclidean distance).
# It must be trained on the normalized features because it predicts on normalized
# test data; distances between scaled and unscaled points would be meaningless.
regressor_iq = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, p=2)
regressor_iq.fit(features_iq_norm, labels_iq)

# Prediction (rounded to whole case counts)
pred_iq = [int(round(x)) for x in regressor_iq.predict(data_features_test_iq_norm)]
data_features_test_iq = data_features_test_iq.assign(total_cases=pred_iq)

  warn(


In [7]:
# Display the San Juan predictions (a list of rounded integer case counts)
pred_sj

[10,
 8,
 14,
 8,
 10,
 13,
 8,
 9,
 24,
 12,
 5,
 16,
 27,
 25,
 93,
 59,
 27,
 39,
 107,
 64,
 34,
 51,
 35,
 36,
 33,
 60,
 32,
 37,
 30,
 31,
 20,
 30,
 15,
 15,
 19,
 14,
 16,
 32,
 23,
 37,
 25,
 16,
 14,
 13,
 9,
 11,
 5,
 12,
 9,
 7,
 7,
 10,
 7,
 8,
 7,
 6,
 22,
 14,
 13,
 11,
 8,
 16,
 16,
 27,
 40,
 25,
 26,
 35,
 22,
 41,
 32,
 48,
 27,
 59,
 46,
 40,
 41,
 51,
 52,
 37,
 21,
 35,
 45,
 48,
 31,
 45,
 21,
 18,
 42,
 29,
 52,
 42,
 18,
 14,
 24,
 19,
 47,
 21,
 16,
 12,
 11,
 7,
 9,
 9,
 22,
 23,
 13,
 17,
 14,
 103,
 34,
 94,
 64,
 42,
 53,
 65,
 105,
 61,
 88,
 98,
 92,
 97,
 135,
 167,
 73,
 37,
 59,
 51,
 72,
 57,
 59,
 32,
 55,
 24,
 24,
 26,
 32,
 27,
 24,
 21,
 19,
 33,
 34,
 28,
 20,
 18,
 20,
 18,
 28,
 12,
 10,
 8,
 17,
 18,
 10,
 6,
 11,
 9,
 8,
 6,
 36,
 19,
 22,
 104,
 64,
 22,
 44,
 67,
 21,
 77,
 49,
 166,
 120,
 205,
 67,
 26,
 39,
 51,
 53,
 76,
 56,
 52,
 46,
 48,
 40,
 33,
 37,
 41,
 36,
 34,
 20,
 20,
 27,
 25,
 14,
 24,
 23,
 17,
 16,
 15,
 12,
 14,
 9,


In [8]:
### 5. Save results
# Stack the San Juan rows on top of the Iquitos rows with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
result = pd.concat([data_features_test_sj, data_features_test_iq], ignore_index=True)

# Keep only the key columns plus the predicted case counts
result = result[['city', 'year', 'weekofyear', 'total_cases']]

  result = data_features_test_sj.append(data_features_test_iq, ignore_index=True)


In [10]:
# Split the combined result back into one frame per city
city_column = result['city']
sj_pred = result.loc[city_column.eq('sj')]
iq_pred = result.loc[city_column.eq('iq')]

In [11]:
# Replace each per-city frame by a NumPy array of its 'total_cases' column
sj_pred, iq_pred = (
    city_frame["total_cases"].to_numpy() for city_frame in (sj_pred, iq_pred)
)

In [12]:
# Wrap every San Juan prediction in its own single-element list
sj = [[cases] for cases in sj_pred]

In [13]:
# Wrap every Iquitos prediction in its own single-element list
iq = [[cases] for cases in iq_pred]

In [14]:
# Load the submission template; its first three columns become the row index
submission_index_columns = [0, 1, 2]
submission = pd.read_csv(
    '../dengue_fever/data/submission_format.csv',
    index_col=submission_index_columns,
)

In [18]:
# San Juan predictions first, then Iquitos, written into the template's total_cases
stacked_cases = np.concatenate((sj, iq))
submission.total_cases = stacked_cases

# Persist the filled-in template
submission.to_csv("../dengue_fever/data/ak_submission_KNN.csv")