# Import libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

sns.set_style("whitegrid")

import dengue_utils as dutils

# Load data

In [2]:
# Load the training features through the project helper module; the log line
# printed below reports which CSV file it reads.
train_dataset = dutils.load_train_dataset()

20190413 - 20:M:12: Loading train dataset from: dengue_features_train.csv


In [3]:
# Summary statistics of the numeric columns, transposed so each feature is a row.
# The `count` column doubles as a quick check for missing values per feature.
train_dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,1456.0,2001.031593,5.408314,1990.0,1997.0,2002.0,2005.0,2010.0
weekofyear,1456.0,26.503434,15.019437,1.0,13.75,26.5,39.25,53.0
ndvi_ne,1262.0,0.142294,0.140531,-0.40625,0.04495,0.128817,0.248483,0.508357
ndvi_nw,1404.0,0.130553,0.119999,-0.4561,0.049217,0.121429,0.2166,0.454429
ndvi_se,1434.0,0.203783,0.07386,-0.015533,0.155087,0.19605,0.248846,0.538314
ndvi_sw,1434.0,0.202305,0.083903,-0.063457,0.144209,0.18945,0.246982,0.546017
precipitation_amt_mm,1443.0,45.760388,43.715537,0.0,9.8,38.34,70.235,390.6
reanalysis_air_temp_k,1446.0,298.701852,1.36242,294.635714,297.658929,298.646429,299.833571,302.2
reanalysis_avg_temp_k,1446.0,299.225578,1.261715,294.892857,298.257143,299.289286,300.207143,302.928571
reanalysis_dew_point_temp_k,1446.0,295.246356,1.52781,289.642857,294.118929,295.640714,296.46,298.45


# Temperature conversion

In [4]:
# Convert the Kelvin reanalysis columns to Celsius with the project helper, then
# re-check the summary statistics of the converted frame.
# NOTE(review): the log below shows reanalysis_tdtr_k being converted as well; if
# that column is a temperature *range* rather than an absolute temperature, a
# Kelvin-to-Celsius offset would be wrong for it -- confirm inside
# dutils.temperature_conversion.
train_dataset = dutils.temperature_conversion(train_dataset)
train_dataset.describe().T

20190413 - 20:M:13: Temperature conversion: kelvin to celsius reanalysis_air_temp_k to reanalysis_air_temp_c
20190413 - 20:M:13: Temperature conversion: kelvin to celsius reanalysis_dew_point_temp_k to reanalysis_dew_point_temp_c
20190413 - 20:M:13: Temperature conversion: kelvin to celsius reanalysis_max_air_temp_k to reanalysis_max_air_temp_c
20190413 - 20:M:13: Temperature conversion: kelvin to celsius reanalysis_min_air_temp_k to reanalysis_min_air_temp_c
20190413 - 20:M:13: Temperature conversion: kelvin to celsius reanalysis_avg_temp_k to reanalysis_avg_temp_c
20190413 - 20:M:13: Temperature conversion: kelvin to celsius reanalysis_tdtr_k to reanalysis_tdtr_c


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,1456.0,2001.031593,5.408314,1990.0,1997.0,2002.0,2005.0,2010.0
weekofyear,1456.0,26.503434,15.019437,1.0,13.75,26.5,39.25,53.0
ndvi_ne,1262.0,0.142294,0.140531,-0.40625,0.04495,0.128817,0.248483,0.508357
ndvi_nw,1404.0,0.130553,0.119999,-0.4561,0.049217,0.121429,0.2166,0.454429
ndvi_se,1434.0,0.203783,0.07386,-0.015533,0.155087,0.19605,0.248846,0.538314
ndvi_sw,1434.0,0.202305,0.083903,-0.063457,0.144209,0.18945,0.246982,0.546017
precipitation_amt_mm,1443.0,45.760388,43.715537,0.0,9.8,38.34,70.235,390.6
reanalysis_precip_amt_kg_per_m2,1446.0,40.151819,43.434399,0.0,13.055,27.245,52.2,570.5
reanalysis_relative_humidity_percent,1446.0,82.161959,7.153897,57.787143,77.177143,80.301429,86.357857,98.61
reanalysis_sat_precip_amt_mm,1443.0,45.760388,43.715537,0.0,9.8,38.34,70.235,390.6


# Detect outliers
## Z-score

In [5]:
def detect_outlier(data_1, threshold=3):
    """Return the values of ``data_1`` whose absolute z-score exceeds ``threshold``.

    The z-score of a value is ``(value - mean) / std``, where the mean and the
    population standard deviation (``ddof=0``) are computed over the non-missing
    values only, so a single NaN does not prevent every other value from being
    scored.

    Parameters
    ----------
    data_1 : array-like of numbers
        One-dimensional numeric data (list, NumPy array, pandas Series, ...).
        NaN entries are ignored when computing the statistics and are never
        reported as outliers.
    threshold : float, default 3
        A value is flagged when ``abs(z_score) > threshold``.

    Returns
    -------
    list
        The outlying values, in their original order. Empty when the input is
        empty, contains only NaN, or has zero standard deviation.

    Raises
    ------
    TypeError
        If ``data_1`` cannot be interpreted as numeric data.
    """
    raw = np.asarray(data_1)
    try:
        values = raw.astype(float)
    except (TypeError, ValueError) as exc:
        raise TypeError("detect_outlier expects numeric data") from exc

    observed = values[~np.isnan(values)]
    if observed.size == 0:
        return []

    mean_1 = observed.mean()
    std_1 = observed.std()
    # All observed values identical: no value can deviate from the mean, and
    # dividing by a zero standard deviation would only produce NaN plus warnings.
    if std_1 == 0:
        return []

    # NaN z-scores (from missing inputs) compare as False, so they are skipped.
    with np.errstate(invalid="ignore"):
        is_outlier = np.abs((values - mean_1) / std_1) > threshold

    # Index the untouched input so integer data is returned as integers.
    return raw[is_outlier].tolist()



In [6]:
# List the columns of the frame after the temperature conversion step.
train_dataset.columns

Index(['city', 'year', 'weekofyear', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
       'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm',
       'reanalysis_precip_amt_kg_per_m2',
       'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
       'reanalysis_specific_humidity_g_per_kg', 'station_avg_temp_c',
       'station_diur_temp_rng_c', 'station_max_temp_c', 'station_min_temp_c',
       'station_precip_mm', 'total_cases', 'reanalysis_air_temp_c',
       'reanalysis_dew_point_temp_c', 'reanalysis_max_air_temp_c',
       'reanalysis_min_air_temp_c', 'reanalysis_avg_temp_c',
       'reanalysis_tdtr_c'],
      dtype='object')

In [7]:

# Count z-score outliers per feature. The z-score needs a mean and a standard
# deviation, so only numeric columns are scanned; text columns such as the
# city name or the week start date would otherwise make detect_outlier fail.
for feature in train_dataset.select_dtypes(include=np.number).columns:
    outliers = detect_outlier(train_dataset[feature])
    print(feature + " " + str(len(outliers)))

year 0
weekofyear 0
ndvi_ne 5
ndvi_nw 4
ndvi_se 9
ndvi_sw 11
precipitation_amt_mm 17
reanalysis_precip_amt_kg_per_m2 33
reanalysis_relative_humidity_percent 2
reanalysis_sat_precip_amt_mm 17
reanalysis_specific_humidity_g_per_kg 3
station_avg_temp_c 2
station_diur_temp_rng_c 3
station_max_temp_c 2
station_min_temp_c 8
station_precip_mm 29
total_cases 24
reanalysis_air_temp_c 0
reanalysis_dew_point_temp_c 10
reanalysis_max_air_temp_c 3
reanalysis_min_air_temp_c 6
reanalysis_avg_temp_c 3
reanalysis_tdtr_c 1


In [21]:
def remove_outliers_iqr(data, feature, exclude=None, iqr_multiplier=1.5):
    """Return column ``feature`` of ``data`` with IQR outliers replaced by NaN.

    A value is an outlier when it lies below ``Q1 - iqr_multiplier * IQR`` or
    above ``Q3 + iqr_multiplier * IQR``, where ``Q1``/``Q3`` are the 25th/75th
    percentiles of the column and ``IQR = Q3 - Q1``. The input frame is not
    modified; the caller decides whether to assign the result back.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to filter. The column is read from this
        argument, not from any notebook-level variable.
    feature : str
        Name of the column to process.
    exclude : collection of str, optional
        Column names that must be returned untouched (for example the target).
    iqr_multiplier : float, default 1.5
        Width of the accepted band, expressed in IQR units.

    Returns
    -------
    pandas.Series
        The filtered column. Non-numeric, boolean and excluded columns are
        returned unchanged.
    """
    column = data[feature]
    excluded = () if exclude is None else exclude

    # Quartiles only make sense for real numeric data; booleans are skipped
    # explicitly because pandas reports them as numeric.
    is_numeric = (pd.api.types.is_numeric_dtype(column)
                  and not pd.api.types.is_bool_dtype(column))
    if feature in excluded or not is_numeric:
        return column

    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - (iqr_multiplier * iqr)
    upper_bound = q3 + (iqr_multiplier * iqr)

    # Comparisons against NaN are False, so already-missing entries stay as they are.
    is_outlier = (column < lower_bound) | (column > upper_bound)
    return column.mask(is_outlier)

In [22]:
# Replace IQR outliers with NaN in every column, leaving the target untouched.
# Each column is overwritten in place, so re-running this cell recomputes the
# quartiles on the already-filtered values.
excluded_features = ['total_cases']
for column_name in list(train_dataset.columns):
    filtered_column = remove_outliers_iqr(train_dataset, column_name,
                                          exclude = excluded_features)
    train_dataset[column_name] = filtered_column


In [23]:
# Re-inspect the summary statistics after the IQR filtering; the `count` of a
# feature drops wherever values were replaced with NaN.
train_dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,1456.0,2001.031593,5.408314,1990.0,1997.0,2002.0,2005.0,2010.0
weekofyear,1456.0,26.503434,15.019437,1.0,13.75,26.5,39.25,53.0
ndvi_ne,1255.0,0.144806,0.136782,-0.2517,0.045892,0.129243,0.24916,0.508357
ndvi_nw,1399.0,0.13208,0.117346,-0.178625,0.049733,0.122,0.216863,0.454429
ndvi_se,1412.0,0.200866,0.068728,0.028343,0.154493,0.195571,0.245348,0.387883
ndvi_sw,1401.0,0.197257,0.075781,0.01025,0.143171,0.187214,0.241929,0.400686
precipitation_amt_mm,1419.0,42.893404,37.226576,0.0,9.55,37.3,68.175,157.86
reanalysis_precip_amt_kg_per_m2,1359.0,31.96443,24.93137,0.0,12.425,25.3,45.325,110.1
reanalysis_relative_humidity_percent,1444.0,82.193743,7.107425,64.658571,77.196429,80.305714,86.406429,98.61
reanalysis_sat_precip_amt_mm,1419.0,42.893404,37.226576,0.0,9.55,37.3,68.175,157.86
