### Libraries

In [None]:
from itertools import count
import pandas as pd
import matplotlib.pyplot as plt
import seaborn           as sns
import numpy             as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors        import KNeighborsClassifier
from sklearn.metrics          import accuracy_score, confusion_matrix
from sklearn.naive_bayes      import GaussianNB
from sklearn.preprocessing    import StandardScaler, Normalizer
from sklearn.feature_selection import SequentialFeatureSelector

### 1. Data Profiling

In [None]:
# Load the raw data set and rename the target column from "class" to "drought".
data = pd.read_csv('data/drought_forecasting.csv').rename(columns={"class": "drought"})

# Parse the day-first date column once and expand it into numeric
# day / month / year features.
parsed_dates = pd.DatetimeIndex(data['date'], dayfirst=True)
data['day'] = parsed_dates.day
data['month'] = parsed_dates.month
data['year'] = parsed_dates.year

# Move the target to the last position, then discard the raw date column.
feature_columns = [name for name in data.columns if name != 'drought']
data = data[feature_columns + ['drought']]
data.pop('date')
data

#### 1.1. Data Dimensionality

#### 1.2. Data Distribution

##### 1.2.1. Histograms

In [None]:
def histograms(data, dimension):
    """Plot a per-class histogram of every column and save the grid to disk.

    Each column of `data` gets its own subplot showing two side-by-side
    histograms: rows with drought == 0 and rows with drought == 1.
    The figure is saved to 'plots/drought_hist01.png' and left open.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to plot; must contain a 'drought' column.
    dimension : tuple of (rows, columns)
        Shape of the subplot grid. rows * columns must be at least the
        number of columns in `data`.
    """
    n_rows, n_cols = dimension
    # squeeze=False keeps `axes` two-dimensional even when the grid has a
    # single row or column, so it can always be indexed as axes[row][col].
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(50, 80), squeeze=False)

    # Split the rows by class once instead of re-filtering for every column.
    rows_class_0 = data[data['drought'] == 0]
    rows_class_1 = data[data['drought'] == 1]

    for position, col in enumerate(data.columns):
        # Fill the grid row by row.
        axis = axes[position // n_cols][position % n_cols]
        axis.hist([rows_class_0[col], rows_class_1[col]],
                  stacked=False,
                  label=['drought = 0', 'drought = 1'],
                  color=['#7547B8', '#8AB847'])
        axis.set_title(col)
        axis.legend()

    plt.savefig('plots/drought_hist01.png')


In [None]:
# Render the per-class histograms on a 9 x 6 grid.
histograms(data, dimension=(9, 6))

##### 1.2.2. Boxplots

In [None]:
def boxplotAllInd(data, filename, dimension):
    """Draw an individual box plot for every column and save the grid to disk.

    The figure is written to 'plots/<filename>.png' and then closed.

    Parameters
    ----------
    data : pandas.DataFrame
        Data whose columns are plotted, one subplot per column.
    filename : str
        Output file name, without directory or extension.
    dimension : tuple of (rows, columns)
        Shape of the subplot grid. rows * columns must be at least the
        number of columns in `data`.
    """
    n_rows, n_cols = dimension
    # squeeze=False keeps `axes` two-dimensional even when the grid has a
    # single row or column, so it can always be indexed as axes[row][col].
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(50, 80), squeeze=False)

    for position, col in enumerate(data.columns):
        # Fill the grid row by row.
        axis = axes[position // n_cols][position % n_cols]
        axis.boxplot(data[col])
        axis.set_title(col)

    plt.savefig('plots/' + filename + '.png')
    plt.close()

In [None]:
# One box plot per column on a 9 x 6 grid.
boxplotAllInd(data, filename='boxplot_drought_classification_allInd', dimension=(9, 6))

In [None]:
def boxplot(data, filename):
    """Draw a seaborn box plot of `data` and save it as 'plots/<filename>.png'.

    The seaborn figure size is set to 16 x 14 through sns.set, the x tick
    labels are rotated vertically, and the figure is closed after saving.
    """
    figure_size = (16, 14)
    sns.set(rc={"figure.figsize": figure_size})

    sns.boxplot(data=data)
    plt.xticks(rotation='vertical')

    output_path = 'plots/' + filename + ".png"
    plt.savefig(output_path)
    plt.close()

In [None]:
# Single figure with one box per column of the full data set.
boxplot(data, filename='boxplot_drought_classification_all')

In [None]:
# Attributes are grouped by value range so that every box plot shares a
# readable scale. Groups, as noted during profiling:
# fips and year
# Interval of values: [0,1) - slope1, slope2, slope3, slope4, slope5, slope6, slope7, slope8, aspectN, aspectE, aspectS, aspectW, aspectUnknown
# Interval of values: [1,10) - WS10M, WS10M_MIN, WS50M_MIN, SQ1, SQ2, SQ3, SQ4, SQ5, SQ6, SQ7, drought
# Interval of values: [0,21) - QV2M, WS10M_MAX, WS10M_RANGE, WS50M, WS50M_MAX, month
# Interval of values: [-20, 50) - T2M, T2MDEW, T2MWET, T2M_MAX, T2M_MIN, T2M_RANGE, TS
# Interval of values: [-100, 0) - lon
# Interval of values: [-100, 0) - lat
# Interval of values: [0, 800) - elevation
# Interval of values: [0, 102) - WAT_LAND, NVG_LAND, URB_LAND, GRS_LAND, FOR_LAND, CULTRF_LAND, CULTIR_LAND, CULT_LAND, day, PRECTOT

dataValuesPerIntervals = [data[["slope1", "slope2", "slope3", "slope4", "slope5", "slope6", "slope7",
                                "slope8", "aspectN", "aspectE", "aspectS", "aspectW", "aspectUnknown"]],
                          data[["WS10M", "WS10M_MIN", "WS50M_MIN", "SQ1", "SQ2", "SQ3", "SQ4", "SQ5",
                                "SQ6", "SQ7", "drought"]],
                          data[["QV2M", "WS10M_MAX", "WS10M_RANGE", "WS50M", "WS50M_MAX", "month"]],
                          data[["T2M", "T2MDEW", "T2MWET", "T2M_MAX", "T2M_MIN", "T2M_RANGE", "TS"]],
                          data["lon"], data["lat"], data["elevation"], data[["fips", "year"]],
                          data[["WAT_LAND", "NVG_LAND", "URB_LAND", "GRS_LAND", "FOR_LAND", "CULTRF_LAND",
                                "CULTIR_LAND", "CULT_LAND", "day", "PRECTOT"]]]

# One figure per group; the group's position in the list is the file suffix.
for filenameC, d in enumerate(dataValuesPerIntervals):
    # The counter is an int, so it must be converted before being
    # concatenated to the file name (str + int raises TypeError).
    boxplot(d, "boxplot_drought_classification_" + str(filenameC))

#### 1.3. Data Granularity


#### 1.4. Sparsity

In [None]:
def heatmap(data):
    """Save a heat map of the pairwise column correlations of `data`.

    The correlation matrix from data.corr() is drawn on a 15 x 15 figure
    with a 1000-step "RdBu_r" palette spanning [-1, 1], written to
    'plots/drought_heatmap01.png', and the figure is then closed.
    """
    correlations = data.corr()

    plt.subplots(figsize=(15, 15))
    palette = sns.color_palette("RdBu_r", 1000)
    sns.heatmap(correlations, cmap=palette, vmin=-1, vmax=1, square=True)

    plt.savefig('plots/drought_heatmap01.png')
    plt.close()

In [None]:
# Correlation heat map over all columns of the data set.
heatmap(data=data)

In [None]:
def scatterplots(data):
    """Save a seaborn pair plot of `data` with histograms on the diagonal.

    The figure is written to "plots/scatterplotGeneral.png" and then closed.
    """
    pair_grid = sns.pairplot(data, diag_kind='hist')
    pair_grid.fig.savefig("plots/scatterplotGeneral.png")
    plt.close()

In [None]:
# Pair plot over all columns of the data set.
scatterplots(data=data)

### 2. Data Preparation

#### 2.1. Data types and summary

In [None]:
# Preview the first rows of the data set.
data.head()

In [None]:
# Summary statistics of the data set's columns.
data.describe()

In [None]:
# Storage type of each column.
data.dtypes

In [None]:
# Number of rows for each distinct value of the 'fips' column.
fipsCount = data['fips'].value_counts()

In [None]:
# Count how many columns share each dtype.
dtypesCount = data.dtypes
dfDtypesCount = pd.DataFrame(dtypesCount)
sumDtypes = dfDtypesCount.value_counts()

In [None]:
# Number of distinct values in the 'fips' column.
uniqueValues = data['fips'].nunique()

#### 2.2. Missing Values

In [None]:
# Number of missing values in each column.
nullValues = data.isnull().sum()

#### 2.3. Models to evaluate the intermediate steps

In [None]:
def knn(X_train, X_test, Y_train, Y_test):
    """Fit a k-nearest-neighbours classifier and score it on the test split.

    Parameters
    ----------
    X_train, Y_train : training features and binary (0/1) target.
    X_test, Y_test : test features and binary (0/1) target.

    Returns
    -------
    tuple
        (accuracy, tn, fp, fn, tp) measured on the test split.
    """
    # p=1 is used because p=2 resulted in a lower accuracy.
    model = KNeighborsClassifier(p=1)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    # Pin the label set so the matrix is always 2x2; without it, a test
    # split (and predictions) holding a single class yields a 1x1 matrix
    # and the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred, labels=[0, 1]).ravel()
    return accuracy_score(Y_test, Y_pred), tn, fp, fn, tp

# Naive Bayes
def nb(X_train, X_test, Y_train, Y_test):
    """Fit a Gaussian naive Bayes classifier and score it on the test split.

    Parameters
    ----------
    X_train, Y_train : training features and binary (0/1) target.
    X_test, Y_test : test features and binary (0/1) target.

    Returns
    -------
    tuple
        (accuracy, tn, fp, fn, tp) measured on the test split.
    """
    model = GaussianNB()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    # Pin the label set so the matrix is always 2x2; without it, a test
    # split (and predictions) holding a single class yields a 1x1 matrix
    # and the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred, labels=[0, 1]).ravel()
    return accuracy_score(Y_test, Y_pred), tn, fp, fn, tp


def temporal_data_split(data, target, train_size=0.95):
    """Split `data` into train and test sets while preserving row order.

    The first round(len(data) * train_size) rows form the training set and
    the remaining rows form the test set; no shuffling is done.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in the order they should be split.
    target : str
        Name of the target column.
    train_size : float, default 0.95
        Fraction of rows assigned to the training set.

    Returns
    -------
    tuple
        (X_train, X_test, Y_train, Y_test)
    """
    lim = round(len(data) * train_size)
    # Slice by position, not by label: .loc[0:lim] includes its end label,
    # which put row `lim` in both sets, and it miscounts rows whenever the
    # index has gaps (e.g. after outliers were dropped).
    tmp_data_train = data.iloc[:lim]
    tmp_data_test = data.iloc[lim:]
    X_train = tmp_data_train.drop(target, axis=1)
    Y_train = tmp_data_train[target]
    X_test = tmp_data_test.drop(target, axis=1)
    Y_test = tmp_data_test[target]
    return X_train, X_test, Y_train, Y_test

#### 2.4. Outliers Treatment

In [None]:
# Parameter grid for test_outliers(), which evaluates distance-based outlier
# removal. Each entry is a tuple (r, f):
# - r: distance threshold that defines a row's neighbourhood
# - f: fraction of the data set that must lie inside that neighbourhood;
#      rows with fewer neighbours are treated as outliers
# Every pair is used to select the outliers and, subsequently, to train and
# test the models on the remaining data.
params = [(r, f)
          for r in (298, 574, 1000)
          for f in (0.01, 0.05, 0.1, 0.2, 0.5)]

def test_outliers(data, params):
    """Evaluate distance-based outlier removal for several (r, f) settings.

    A row is flagged as an outlier for a setting (r, f) when fewer than
    f * len(data) rows (the row itself included) lie at a Euclidean distance
    below r from it. For each setting the flagged rows are dropped, the
    remaining data is split with temporal_data_split and scored with knn and
    nb. A final baseline row (r = f = n_outliers = 0) holds the scores
    obtained with no rows removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric data including the 'drought' target column.
    params : sequence of (r, f) pairs
        Settings to evaluate; any length is accepted.

    Returns
    -------
    pandas.DataFrame
        One row per setting that left non-empty train and test sets, indexed
        by the setting's position in `params`, plus the baseline row at
        index len(params).
    """
    test_outliers_stats = pd.DataFrame(columns=['r', 'f', 'n_outliers',
                                                'knn_accuracy', 'knn_tn', 'knn_fp', 'knn_fn', 'knn_tp',
                                                'nb_accuracy', 'nb_tn', 'nb_fp', 'nb_fn', 'nb_tp'])

    # One list of flagged row positions per setting, sized from `params`
    # rather than hard-coded to a fixed number of settings.
    flagged = [[] for _ in params]

    n_rows = len(data)
    for position in range(n_rows):
        # Distances from this row to every row; handling one row at a time
        # avoids materialising the full n x n distance matrix.
        distances = euclidean_distances(data.iloc[[position]], data)

        for slot, setting in enumerate(params):
            neighbours = (distances < setting[0]).sum()
            if neighbours < setting[1] * n_rows:
                flagged[slot].append(position)

    for slot, setting in enumerate(params):
        positions = flagged[slot]
        # `positions` holds row positions; translate them to index labels so
        # the drop is correct even when the index is not 0..n-1.
        labels = data.index[np.asarray(positions, dtype=int)]
        dataOutliers = data.drop(labels, axis=0)

        X_train, X_test, Y_train, Y_test = temporal_data_split(dataOutliers, 'drought')
        # Skip settings that leave nothing to train or test on.
        if len(X_train) != 0 and len(X_test) != 0 and len(Y_train) != 0 and len(Y_test) != 0:
            knn_scores = knn(X_train, X_test, Y_train, Y_test)
            nb_scores = nb(X_train, X_test, Y_train, Y_test)
            test_outliers_stats.loc[slot] = [setting[0], setting[1], len(positions),
                                             *knn_scores, *nb_scores]

    # Baseline: no outliers removed.
    X_train, X_test, Y_train, Y_test = temporal_data_split(data, 'drought')
    knn_scores = knn(X_train, X_test, Y_train, Y_test)
    nb_scores = nb(X_train, X_test, Y_train, Y_test)
    test_outliers_stats.loc[len(params)] = [0, 0, 0, *knn_scores, *nb_scores]

    return test_outliers_stats

In [None]:
# Remove distance-based outliers: a row is dropped when fewer than 5% of all
# rows (itself included) lie at a Euclidean distance below 299 from it.
frac = len(data) * 0.05
pos_outliers = []
for c in range(len(data)):
    # Distances from row c to every row of the data set.
    euclidean_matrix = euclidean_distances(data.iloc[[c]], data)
    tmp_df = pd.DataFrame(euclidean_matrix) < 299
    neighbours = tmp_df.values.sum()
    if neighbours < frac:
        pos_outliers.append(c)

countOutliers = len(pos_outliers)
dataWithoutOutliers = data.drop(index=pos_outliers)

#### 2.5. Scaling

In [None]:
# Renumber the rows 0..n-1 so the gaps left by the dropped outliers disappear.
dataWithoutOutliers.index = pd.RangeIndex(len(dataWithoutOutliers))
dataWithoutOutliers

In [None]:
# Empty results table: one row per scaling method, holding the accuracy and
# confusion-matrix counts of both evaluation models.
knn_stat_columns = ['knn_accuracy', 'knn_tn', 'knn_fp', 'knn_fn', 'knn_tp']
nb_stat_columns = ['nb_accuracy', 'nb_tn', 'nb_fp', 'nb_fn', 'nb_tp']
test_scaling_stats = pd.DataFrame(columns=['method'] + knn_stat_columns + nb_stat_columns)

##### 2.5.1. Standardization

In [None]:
# Standardize every feature with StandardScaler; the 'drought' target is
# left out of the scaling and re-attached unchanged afterwards.
tmp_dataWithoutOutliers = dataWithoutOutliers.drop(columns='drought')
std_scaler = StandardScaler()
data_std = pd.DataFrame(
    std_scaler.fit_transform(tmp_dataWithoutOutliers),
    columns=tmp_dataWithoutOutliers.columns,
)
data_std['drought'] = dataWithoutOutliers['drought']

In [None]:
# Score both models on the standardized data and record the results.
X_train, X_test, Y_train, Y_test = temporal_data_split(data_std, 'drought')

knn_scores = knn(X_train, X_test, Y_train, Y_test)
knn_acc, knn_tn, knn_fp, knn_fn, knn_tp = knn_scores

nb_scores = nb(X_train, X_test, Y_train, Y_test)
nb_acc, nb_tn, nb_fp, nb_fn, nb_tp = nb_scores

test_scaling_stats.loc[0] = ['standardization', *knn_scores, *nb_scores]

In [None]:
# Persist the standardized data set. The path is relative, like every other
# intermediate output of this notebook; the former leading "/" pointed at a
# directory in the filesystem root.
data_std.to_csv("intermediate_data/data_step_std.csv", encoding='utf-8', index=False, columns=data_std.columns)

##### 2.5.2. Normalization

In [None]:

# Rescale the features with sklearn's Normalizer; the 'drought' target is
# left out of the transformation and re-attached unchanged afterwards.
tmp_dataWithoutOutliers = dataWithoutOutliers.drop(columns='drought')
norm = Normalizer()
data_norm = pd.DataFrame(
    norm.fit_transform(tmp_dataWithoutOutliers),
    columns=tmp_dataWithoutOutliers.columns,
)
data_norm['drought'] = dataWithoutOutliers['drought']

In [None]:
# Score both models on the normalized data and record the results.
X_train, X_test, Y_train, Y_test = temporal_data_split(data_norm, 'drought')

knn_scores = knn(X_train, X_test, Y_train, Y_test)
knn_acc, knn_tn, knn_fp, knn_fn, knn_tp = knn_scores

nb_scores = nb(X_train, X_test, Y_train, Y_test)
nb_acc, nb_tn, nb_fp, nb_fn, nb_tp = nb_scores

test_scaling_stats.loc[1] = ['normalization', *knn_scores, *nb_scores]

##### 2.5.3. No changes

In [None]:
# Score both models on the unscaled data as a reference point.
X_train, X_test, Y_train, Y_test = temporal_data_split(dataWithoutOutliers, 'drought')

knn_scores = knn(X_train, X_test, Y_train, Y_test)
knn_acc, knn_tn, knn_fp, knn_fn, knn_tp = knn_scores

nb_scores = nb(X_train, X_test, Y_train, Y_test)
nb_acc, nb_tn, nb_fp, nb_fn, nb_tp = nb_scores

test_scaling_stats.loc[2] = ['none', *knn_scores, *nb_scores]

##### 2.5.4. Stats

In [None]:
# Show the scores recorded for each scaling method.
test_scaling_stats

#### 2.6. Feature Selection

In [None]:
# The estimator gets its own name: binding it to `knn` would shadow the
# knn() scoring helper, and every later knn(X_train, ...) call would fail
# with "'KNeighborsClassifier' object is not callable".
knn_estimator = KNeighborsClassifier(p=1)

# Names of the candidate features (every column except the target).
feature_names = np.array(data_std.drop('drought', axis=1).columns)

X_train, X_test, Y_train, Y_test = temporal_data_split(data_std, 'drought')
X = data_std.drop('drought', axis=1)
y = data_std['drought']

# Greedy forward selection: start from no features and add one at a time.
sfs_forward_knn = SequentialFeatureSelector(
    knn_estimator, n_features_to_select=50, direction="forward"
).fit(X, y)

# Greedy backward selection: start from all features and remove one at a time.
sfs_backward_knn = SequentialFeatureSelector(
    knn_estimator, n_features_to_select=50, direction="backward"
).fit(X, y)

In [None]:
# Names of the candidate features: every column except the target.
feature_names = data_std.columns.drop('drought').to_numpy()

In [None]:
# n_features = 50
# Boolean support masks of the fitted selectors pick the retained feature names.
features_selected_knn_forward = feature_names[sfs_forward_knn.get_support()]
features_selected_knn_backward = feature_names[sfs_backward_knn.get_support()]

# Take explicit copies: these frames get a 'drought' column assigned later,
# and assigning into a plain column selection of data_std triggers pandas'
# SettingWithCopyWarning.
data_features_selected_knn_forward = data_std[features_selected_knn_forward].copy()
data_features_selected_knn_backward = data_std[features_selected_knn_backward].copy()

In [None]:
# Persist both selected-feature data sets as UTF-8 CSV files without the index.
for frame, path in (
    (data_features_selected_knn_forward, "intermediate_data/data_step_fs_knn_forward.csv"),
    (data_features_selected_knn_backward, "intermediate_data/data_step_fs_knn_backward.csv"),
):
    frame.to_csv(path, encoding='utf-8', index=False, columns=frame.columns)

In [None]:
# Empty results table: one row per (estimator, direction) selection run,
# holding the accuracy and confusion-matrix counts of both evaluation models.
knn_stat_columns = ['knn_accuracy', 'knn_tn', 'knn_fp', 'knn_fn', 'knn_tp']
nb_stat_columns = ['nb_accuracy', 'nb_tn', 'nb_fp', 'nb_fn', 'nb_tp']
features_selection_scaling_stats = pd.DataFrame(
    columns=['estimator', 'direction'] + knn_stat_columns + nb_stat_columns
)

In [None]:
# Attach the target to the forward-selected features so the frame can be split.
# NOTE: this is a plain assignment, not a copy, so the 'drought' column is
# also added to data_features_selected_knn_forward itself.
tmp_data_features_selected_knn_forward = data_features_selected_knn_forward
tmp_data_features_selected_knn_forward['drought'] = data_std['drought']
tmp_data_features_selected_knn_forward

In [None]:

# Score both models on the forward-selected features and record the results.
X_train, X_test, Y_train, Y_test = temporal_data_split(tmp_data_features_selected_knn_forward, 'drought')

knn_scores = knn(X_train, X_test, Y_train, Y_test)
knn_acc, knn_tn, knn_fp, knn_fn, knn_tp = knn_scores

nb_scores = nb(X_train, X_test, Y_train, Y_test)
nb_acc, nb_tn, nb_fp, nb_fn, nb_tp = nb_scores

features_selection_scaling_stats.loc[0] = ['knn', 'forward', *knn_scores, *nb_scores]

In [None]:
# Attach the target to the backward-selected features so the frame can be split.
# NOTE: this is a plain assignment, not a copy, so the 'drought' column is
# also added to data_features_selected_knn_backward itself.
tmp_data_features_selected_knn_backward = data_features_selected_knn_backward
tmp_data_features_selected_knn_backward['drought'] = data_std['drought']
tmp_data_features_selected_knn_backward

In [None]:
# Score both models on the backward-selected features and record the results.
X_train, X_test, Y_train, Y_test = temporal_data_split(tmp_data_features_selected_knn_backward, 'drought')

knn_scores = knn(X_train, X_test, Y_train, Y_test)
knn_acc, knn_tn, knn_fp, knn_fn, knn_tp = knn_scores

nb_scores = nb(X_train, X_test, Y_train, Y_test)
nb_acc, nb_tn, nb_fp, nb_fn, nb_tp = nb_scores

features_selection_scaling_stats.loc[1] = ['knn', 'backward', *knn_scores, *nb_scores]

In [None]:
# Show the scores recorded for each feature-selection direction.
features_selection_scaling_stats

In [None]:
# Save the backward-selected features plus target as the final prepared data set.
prepared = tmp_data_features_selected_knn_backward
prepared.to_csv(
    "intermediate_data/data_prepared.csv",
    index=False,
    encoding='utf-8',
    columns=prepared.columns,
)