## Libraries

In [5]:
from itertools import count
import pandas as pd
import matplotlib.pyplot as plt
import seaborn           as sns
import numpy             as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors        import KNeighborsClassifier
from sklearn.metrics          import accuracy_score, confusion_matrix
from sklearn.naive_bayes      import GaussianNB
from sklearn.preprocessing    import StandardScaler, Normalizer
from sklearn.feature_selection import SequentialFeatureSelector

#### Understanding the data

##### Read the data

In [131]:
data = pd.read_csv('data/drought_forecasting.csv', dayfirst=True, parse_dates =["date"], index_col ="date")
data

Unnamed: 0_level_0,PRECTOT,PS,T2M,T2MDEW,T2MWET,TS,QV2M
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,0.22,100.51,14.74,13.51,13.51,14.65,9.65
2000-01-02,0.20,100.55,16.69,14.71,14.71,16.60,10.42
2000-01-03,3.65,100.15,18.49,16.52,16.52,18.41,11.76
2000-01-04,15.95,100.29,11.40,6.09,6.10,11.31,6.42
2000-01-05,0.00,101.15,3.86,-3.29,-3.20,2.65,2.95
...,...,...,...,...,...,...,...
2020-12-27,0.00,100.73,3.50,0.87,2.18,3.32,4.03
2020-12-28,0.05,101.07,7.20,6.39,6.80,6.95,5.95
2020-12-29,0.11,101.10,10.01,8.78,9.39,9.96,7.01
2020-12-30,0.06,100.62,12.22,10.54,11.38,12.14,7.93


##### Missing values

In [132]:
nullValues = data.isnull().sum()
nullValues

PRECTOT    0
PS         0
T2M        0
T2MDEW     0
T2MWET     0
TS         0
QV2M       0
dtype: int64

##### Data types and data summary

In [133]:
data.head()

Unnamed: 0_level_0,PRECTOT,PS,T2M,T2MDEW,T2MWET,TS,QV2M
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,0.22,100.51,14.74,13.51,13.51,14.65,9.65
2000-01-02,0.2,100.55,16.69,14.71,14.71,16.6,10.42
2000-01-03,3.65,100.15,18.49,16.52,16.52,18.41,11.76
2000-01-04,15.95,100.29,11.4,6.09,6.1,11.31,6.42
2000-01-05,0.0,101.15,3.86,-3.29,-3.2,2.65,2.95


In [134]:
data.describe()
# looking the data description is already possible to see that aggregate the data could be not so good, 
# because there is a high value in PRECTOT that can be an outliers and will 'desapear' in the aggregation process.
# So use the atomic granularity possibly is the best option

Unnamed: 0,PRECTOT,PS,T2M,T2MDEW,T2MWET,TS,QV2M
count,7671.0,7671.0,7671.0,7671.0,7671.0,7671.0,7671.0
mean,3.718138,100.283734,17.687126,12.902799,12.89532,17.585104,10.44732
std,8.118149,0.497352,8.444942,8.096155,8.066492,8.532297,4.686545
min,0.0,97.97,-6.49,-15.45,-14.52,-8.36,1.04
25%,0.0,99.96,11.24,6.98,6.97,11.03,6.32
50%,0.22,100.24,18.83,14.69,14.65,18.7,10.51
75%,3.545,100.58,25.16,19.84,19.82,25.14,14.54
max,137.59,102.47,32.97,24.81,24.81,33.45,19.79


In [135]:
data.dtypes

PRECTOT    float64
PS         float64
T2M        float64
T2MDEW     float64
T2MWET     float64
TS         float64
QV2M       float64
dtype: object

In [13]:
"""data['day'] = pd.DatetimeIndex(data['date'], dayfirst=True).day
data['month'] = pd.DatetimeIndex(data['date'], dayfirst=True).month
data['year'] = pd.DatetimeIndex(data['date'], dayfirst=True).year
data = data[[col for col in data if col not in ['QV2M']] + ['QV2M']]
data.pop('date')"""

0       01/01/2000
1       02/01/2000
2       03/01/2000
3       04/01/2000
4       05/01/2000
           ...    
7666    27/12/2020
7667    28/12/2020
7668    29/12/2020
7669    30/12/2020
7670    31/12/2020
Name: date, Length: 7671, dtype: object

#### 1. Data Profiling

##### 1.1. Data Granularity 

###### 1.1.1. Resample

In [136]:
# Granularity: atomic (daily)
daily_data = data
# Granularity: weekly
weekly_data = data.resample('W').mean()
# Granularity: monthly
monthly_data = data.resample('M').mean()

##### 1.2. Data Distribution and Stationarity  

###### 1.2.1. Boxplots

In [137]:
def boxplot(data, filename):
    """"""

    #fig, ax = plt.subplots()
    sns.boxplot(data=data)
    plt.savefig('plots/'+filename + ".png")
    plt.close()

In [138]:
boxplot(daily_data, 'boxplot_drought_forecasting_dailySeparated')
boxplot(weekly_data, 'boxplot_drought_forecasting_weeklySeparated')
boxplot(monthly_data, 'boxplot_drought_forecasting_monthlySeparated')

###### 1.2.2. Histograms 

In [139]:
def histograms(data, filename, dimension):
    i, j = dimension
    fig, ax = plt.subplots(i, j, figsize=(24, 14))
    
    for position in range(len(data.columns)):
        col = data.columns[position]

        pos_i = position//j
        pos_j = position%j

        ax[pos_i][pos_j].hist(data[col])
        ax[pos_i][pos_j].set_title(col)
        ax[pos_i][pos_j].legend()
        
    plt.savefig('plots/' + filename + '.png')
    plt.close()

In [140]:
histograms(daily_data, 'hist_drought_forecasting_dailySeparated', (2, 4))
histograms(weekly_data, 'hist_drought_forecasting_weeklySeparated', (2, 4))
histograms(monthly_data, 'hist_drought_forecasting_monthlySeparated', (2, 4))

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that 

###### 1.2.3. Stationarity

In [145]:
def plot_stationarity(data, filename, dimension):
    i, j = dimension
    fig, ax = plt.subplots(i, j, figsize=(24, 14))
    
    for position in range(len(data.columns)):
        col = data.columns[position]

        pos_i = position//j
        pos_j = position%j

        ax[pos_i][pos_j].plot(data[col])
        ax[pos_i][pos_j].set_title(col)
        ax[pos_i][pos_j].legend()
        
    plt.savefig('plots/' + filename + '.png')
    plt.close()
    

In [146]:
plot_stationarity(daily_data, 'stationarity_drought_forecantig_dailySeparated', (2, 4))
plot_stationarity(weekly_data, 'stationarity_drought_forecantig_weeklySeparated', (2, 4))
plot_stationarity(monthly_data, 'stationarity_drought_forecantig_monthlySeparated', (2, 4))

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
No artists with labels found to put in legend.  Note that 