# Data preparation/exploration for the Aguathon

## Problem definition

Find the best model that predict the level of the Evro river when cros Zaragoza.

----

## Index

1. [Import modules and Data](#first-bullet)
2. [Missing Values](#second-bullet)
3. [Feature engineering](#third-bullet)
    * Split the data in train and test set
4. [Random forest regressor](#forth-bullet)
    * Cross-validation

## Import modules and Data <a class="anchor" id="first-bullet"></a>

In [1]:
%load_ext autoreload
%matplotlib inline
%autoreload 1


from __future__ import print_function

# STD lib imports
import os
import sys

# Third party libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklear stuff
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# root path of the project
PROJ_ROOT = os.path.join(os.pardir)

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(PROJ_ROOT, 'src')
sys.path.append(src_dir)

# import my method from the source code
%aimport data.make_dataset
import data.make_dataset as md

print(os.path.abspath(PROJ_ROOT))

raw_file = 'datos.csv'
path = '../ENTRADA/'

raw_data = pd.read_csv(os.path.join(path, raw_file))
raw_data.head()

/Users/miloc/Documents/git-hub/Apsu


Unnamed: 0,time,ALAGON_NR,GRISEN_NR,NOVILLAS_NR,TAUSTE_NR,TUDELA_NR,ZGZ_NR,RIESGO,pred_24h,pred_48h,pred_72h
0,2008-01-01 00:00:00,0.81,0.4375,1.6,0.2675,0.7875,0.74,False,0.75,0.74,0.76
1,2008-01-01 01:00:00,0.81,0.4725,1.6075,0.265,0.79,0.74,False,0.745,0.7325,0.76
2,2008-01-01 02:00:00,0.81,0.5425,1.61,0.2675,0.79,0.74,False,0.74,0.73,0.76
3,2008-01-01 03:00:00,0.8075,0.55,1.61,0.26,0.79,0.74,False,0.74,0.72,0.76
4,2008-01-01 04:00:00,0.8,0.5525,1.6025,0.265,0.79,0.74,False,0.74,0.72,0.76


## Missing Values <a class="anchor" id="second-bullet"></a>

In [2]:
# List the columns that contain missing values
raw_data.isnull().sum()

time               0
ALAGON_NR      16968
GRISEN_NR       8710
NOVILLAS_NR     1468
TAUSTE_NR        519
TUDELA_NR        277
ZGZ_NR             0
RIESGO             0
pred_24h           0
pred_48h           0
pred_72h           0
dtype: int64

In [3]:
X_imputation = raw_data[['TAUSTE_NR', 'TUDELA_NR', 'ZGZ_NR', 'ALAGON_NR']]
X_imputation = X_imputation.dropna()

Y_imputation = X_imputation['ALAGON_NR'].values
X_imputation = X_imputation[['TAUSTE_NR', 'TUDELA_NR', 'ZGZ_NR']]

x_train_imputation, x_test_imputation, \
y_train_imputation, y_test_imputation = train_test_split(X_imputation, 
                                                         Y_imputation, 
                                                         test_size = 0.3, 
                                                         random_state = 0)

rf_imputation = RandomForestRegressor(max_depth=9, 
                                     random_state=0,
                                     n_estimators=100)

rf_imputation.fit(x_train_imputation, y_train_imputation)

print('r2_score = %0.3f' % rf_imputation.score(x_test_imputation, y_test_imputation))

r2_score = 0.996


In [4]:
A = raw_data[['TAUSTE_NR', 'TUDELA_NR', 'ZGZ_NR']][raw_data.ALAGON_NR.isnull()]
ix = A[A.isnull().sum(axis=1) == 0].index
prediction = rf_imputation.predict(A[A.isnull().sum(axis=1) == 0])
raw_data['ALAGON_NR'].iloc[ix] = prediction

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [5]:
raw_data.isnull().sum()

time              0
ALAGON_NR         1
GRISEN_NR      8710
NOVILLAS_NR    1468
TAUSTE_NR       519
TUDELA_NR       277
ZGZ_NR            0
RIESGO            0
pred_24h          0
pred_48h          0
pred_72h          0
dtype: int64

In [6]:
# Most simple thing, remove missing values
data_s1 = raw_data.dropna()
data_s1.isnull().any()

time           False
ALAGON_NR      False
GRISEN_NR      False
NOVILLAS_NR    False
TAUSTE_NR      False
TUDELA_NR      False
ZGZ_NR         False
RIESGO         False
pred_24h       False
pred_48h       False
pred_72h       False
dtype: bool

## Feature engineering <a class="anchor" id="third-bullet"></a>

In [7]:
i = 2

# No feature engineering yet
columns_x = ['ALAGON_NR', 'GRISEN_NR', 'NOVILLAS_NR', 'TAUSTE_NR', 'TUDELA_NR', 'ZGZ_NR', 'RIESGO']
columns_y = ['pred_24h', 'pred_48h', 'pred_72h']
X = data_s1[columns_x]
Y = data_s1[columns_y[i]]

In [8]:
dAL = (X['ALAGON_NR'].shift(5).fillna(0).values - X['ALAGON_NR'].values)/5
X = X.assign(dAL = dAL)

dGR = (X['GRISEN_NR'].shift(5).fillna(0).values - X['GRISEN_NR'].values)/5
X = X.assign(dGR = dGR)

dNO = (X['NOVILLAS_NR'].shift(5).fillna(0).values - X['NOVILLAS_NR'].values)/5
X = X.assign(dNO = dNO)

dTA = (X['TAUSTE_NR'].shift(5).fillna(0).values - X['TAUSTE_NR'].values)/5
X = X.assign(dTA = dTA)

dTU = (X['TUDELA_NR'].shift(5).fillna(0).values - X['TUDELA_NR'].values)/5
X = X.assign(dTU = dTU)

dZG = (X['ZGZ_NR'].shift(5).fillna(0).values - X['ZGZ_NR'].values)/5
X = X.assign(dZG = dZG)

X.head()

Unnamed: 0,ALAGON_NR,GRISEN_NR,NOVILLAS_NR,TAUSTE_NR,TUDELA_NR,ZGZ_NR,RIESGO,dAL,dGR,dNO,dTA,dTU,dZG
0,0.81,0.4375,1.6,0.2675,0.7875,0.74,False,-0.162,-0.0875,-0.32,-0.0535,-0.1575,-0.148
1,0.81,0.4725,1.6075,0.265,0.79,0.74,False,-0.162,-0.0945,-0.3215,-0.053,-0.158,-0.148
2,0.81,0.5425,1.61,0.2675,0.79,0.74,False,-0.162,-0.1085,-0.322,-0.0535,-0.158,-0.148
3,0.8075,0.55,1.61,0.26,0.79,0.74,False,-0.1615,-0.11,-0.322,-0.052,-0.158,-0.148
4,0.8,0.5525,1.6025,0.265,0.79,0.74,False,-0.16,-0.1105,-0.3205,-0.053,-0.158,-0.148


In [9]:
X[columns_x] = X[columns_x].shift(5).fillna(0)
X = X[5:]

Y = Y.shift(5).fillna(0)[5:]

### Split the data in train and test set

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)
x_train.head()

Unnamed: 0,ALAGON_NR,GRISEN_NR,NOVILLAS_NR,TAUSTE_NR,TUDELA_NR,ZGZ_NR,RIESGO,dAL,dGR,dNO,dTA,dTU,dZG
8727,2.135,0.64,2.975,0.665,1.2,1.5,False,0.0035,0.002,0.0165,-0.0035,0.0,0.002
28724,1.1975,0.5425,2.0575,0.47,0.91,1.06,False,0.023,0.0355,-0.0005,-0.01,0.0015,0.006
60127,1.13,0.53,2.01,0.55,0.85,1.045,False,0.0055,0.0,-0.0015,0.0,0.0,0.003
44113,0.9425,0.33,1.8525,0.665,0.8775,0.9,False,-0.0055,0.0115,-0.0015,0.0005,0.0015,0.0005
90863,1.636875,0.14,2.44,0.75,1.05,1.325,False,0.00093,0.0,-0.008,0.006,0.001,0.003


## Random forest regressor <a class="anchor" id="forth-bullet"></a>

In [11]:
rf_regressor = RandomForestRegressor(max_depth=9, 
                                     random_state=0,
                                     n_estimators=100)

rf_regressor.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [12]:
print('r2_score = %0.3f' % rf_regressor.score(x_test, y_test))

r2_score = 0.912


### Cross-validation

In [13]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf_regressor, X, Y, scoring='r2', cv=5)

print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

Accuracy: 0.79 (+/- 0.16)
