## AGUATHON ITA

#### Resumen V4

* v4 eliminado Tmax y Tmin, solo Tmean
* v4 se carga por separado datos de lluvia y temp para procesarlos por separado
* Los datos de Temp aparentemente no ayudan mucho a subir R2
* Los datos de lluvie suben R2
* generar columnas desfasadas en tiempo ayuda a R2

In [215]:
%matplotlib inline
import os
import scipy
import numpy as np
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import datetime as dt

## Fun

In [268]:
def load_rios():
    ''' 
    Load CSV using Pandas saved in current working directory
    '''
    cwd = os.getcwd()
    in_path = os.path.join(cwd,'data','ENTRADA')
    in_file = 'datos.csv'
    filename  = os.path.join(in_path,in_file)
    data = pd.read_csv(filename, parse_dates = ['time'], index_col = 'time') #, names=names)
    return data

def load_meteo(csvfile, cols):
    ''' 
    Load CSV using Pandas saved in current working directory
    '''
    cwd = os.getcwd()
    in_path = os.path.join(cwd, 'data', 'DatosPorEstacion')
    in_file = csvfile + '.csv'
    filename  = os.path.join(in_path,in_file)
    data = pd.read_csv(filename,sep=";", usecols = cols,
                       parse_dates = ['FECHA'], index_col='FECHA')
    return data

    
def derived_features(df, feature, delta):
    '''
    Deriving new features using data from prior two days. That is each time series 
    shifted back 24, 48 and 72 hrs
    '''
    rows = df.shape[0]
    previous_time = [np.nan]*delta + [df[feature][i-delta] for i in range(delta, rows)]
    col_name = "{}_{}".format(feature, delta)
    df[col_name] = previous_time

### DATA LOADING

In [273]:
### RIOS
df_rios = load_rios()
df_rios.columns = ['Ala','Gri','Nov','Tau','Tud','Zar','Risk','P24','P48','P72']
idx = df_rios.resample('D').mean().index

# Estaciones
estaciones = {'pa1':'9262-19530901-20190131',   # pna
              'pa2':'9263D-19750101-20190302',  # pna Aerop
              'za1':'9434-19410101-20190302',   # zar Aerop
              'za2':'9244X-19920204-20190302',  # zar Sos rey 
              'hu1':'9208E-20060201-20190302',  # huesca aragues
              'hu2':'9201K-19920101-20190302',  # huesca jaca
              'log':'9170-19481101-20190302',   # Logroño
             }

### TEMP
cols = ['FECHA','TMEDIA']
dframes = []
for k,v in estaciones.items():
    #  Read csv into a DataFrame: df
    df = load_meteo(v, cols)
    df = df.loc[idx]
    #colnames = ['Tm','rain']
    #colnames = [x+'_'+k for x in colnames]
    df.columns = ['Tm_'+k]
    # Append df to frames
    dframes.append(df)

# Concatenate frames into a single DataFrame: uber
df_temp = pd.concat(dframes, axis=1)

### RAIN
cols = ['FECHA','PRECIPITACION']
dframes = []
for k,v in estaciones.items():
    #  Read csv into a DataFrame: df
    df = load_meteo(v, cols)
    df = df.loc[idx]
    #colnames = ['Tm','rain']
    #colnames = [x+'_'+k for x in colnames]
    df.columns = ['rain_'+ k] 
    # Append df to frames
    dframes.append(df)

# Concatenate frames into a single DataFrame: uber
df_rain = pd.concat(dframes, axis=1)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [274]:
df_rain.head()

Unnamed: 0_level_0,rain_pa1,rain_pa2,rain_za1,rain_za2,rain_hu1,rain_hu2,rain_log
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-01-01,0.0,0.0,0.0,0.0,0.0,,0.0
2008-01-02,6.8,1.5,1.2,2.6,0.6,1.2,0.3
2008-01-03,4.6,6.1,2.0,,17.0,18.4,0.1
2008-01-04,1.4,1.1,0.0,0.3,1.4,,0.1
2008-01-05,5.0,2.6,Ip,0.6,10.5,,0.4


### DATA PREPARATION

In [298]:
#### RAIN DATA
df_rain.fillna(value = 0,inplace=True) 
# replace string values defined by AEMET
df_rain.replace(to_replace =['Ip','Acum','Varias'] , value =0 , inplace = True)
#Text columns to numeric
for col in df_rain.columns:
    df_rain[col] = pd.to_numeric(df_rain[col], errors='coerce')

#### TEMP DATA
df_temp.fillna(method = 'ffill',inplace=True) 
df_temp.fillna(method = 'bfill',inplace=True)
# replace string values defined by AEMET    
# meteo dataframe resample

### DATA resample
df_rain = df_rain.resample('H').ffill()/24.0
df_temp = df_temp.resample('H').ffill()

### RIOS DATA
target = 'P72'
cols = ['Ala','Tud','Zar']
cols.append(target)
df_rios = df_rios[cols]

### RIOS + TEMP + LLUVIA
df = pd.concat([df_rios, df_rain], axis=1)

In [285]:
#dfrain
df.head()
#dfrain[dfrain['date']=='07/08/2017']

Unnamed: 0_level_0,Ala,Tud,Zar,P72,rain_pa1,rain_pa2,rain_za1,rain_za2,rain_hu1,rain_hu2,rain_log
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2008-01-01 00:00:00,0.81,0.7875,0.74,0.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2008-01-01 01:00:00,0.81,0.79,0.74,0.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2008-01-01 02:00:00,0.81,0.79,0.74,0.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2008-01-01 03:00:00,0.8075,0.79,0.74,0.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2008-01-01 04:00:00,0.8,0.79,0.74,0.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [302]:
### GENERAR COLUMNAS Desfasadas en T
#derive_new_features = ['Tud', 'Ala']
derive_new_features = [x for x in df.columns if x != target]

N = 5       #number of derived columns per feature
delta = 1   # time delta in hr 
for column in derive_new_features:  
    #if feature != 'date':
    for D in range(1, N + 1):
        derived_features(df, column, D*delta)
        
df.dropna(axis=0, how='any', inplace=True)
df.head()

Unnamed: 0_level_0,Ala,Tud,Zar,P72,rain_pa1,rain_pa2,rain_za1,rain_za2,rain_hu1,rain_hu2,...,Ala_4_1,Ala_4_2,Ala_4_3,Ala_4_4,Ala_4_5,Ala_5_1,Ala_5_2,Ala_5_3,Ala_5_4,Ala_5_5
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-01-01 15:00:00,0.79,0.79,0.75,0.755,0.0,0.0,0.0,0.0,0.0,0.0,...,0.79,0.79,0.79,0.795,0.8,0.79,0.79,0.795,0.8,0.8
2008-01-01 16:00:00,0.79,0.79,0.75,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.79,0.79,0.79,0.79,0.795,0.79,0.79,0.79,0.795,0.8
2008-01-01 17:00:00,0.79,0.79,0.75,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.79,0.79,0.79,0.79,0.79,0.79,0.79,0.79,0.79,0.795
2008-01-01 18:00:00,0.79,0.79,0.75,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.7825,0.79,0.79,0.79,0.79,0.79,0.79,0.79,0.79,0.79
2008-01-01 19:00:00,0.79,0.79,0.75,0.75,0.0,0.0,0.0,0.0,0.0,0.0,...,0.7875,0.7825,0.79,0.79,0.79,0.7825,0.79,0.79,0.79,0.79


In [None]:
### PLot de correlacion entre features
colormap = plt.cm.RdBu
plt.figure(figsize=(40,40))
plt.title(u'10 hours', y=1.05, size=16)

mask = np.zeros_like(df.corr())
mask[np.triu_indices_from(mask)] = True

svm = sns.heatmap(df.corr(), mask=mask, linewidths=0.1,vmax=1.0, 
            square=True, cmap=colormap, linecolor='white', annot=True)

In [303]:
features = [x for x in df.columns if x != target]
Y = df[target]
X = df.loc[:,features]#, 'Gri', 'Tau')]

In [304]:
# Linear Regression

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression


kfold = KFold(n_splits=10, random_state=0)
model = LinearRegression()
scoring = 'r2'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print("Accuracy: {:.3f}% ({:.3f}%)".format(results.mean()*100.0, results.std()*100.0))

Accuracy: 84.200% (7.315%)


In [266]:
# Compare Algorithms
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor


# prepare models
models = []
models.append(( 'LINR' , LinearRegression()))
models.append(( 'RDGE' , Ridge()))
#models.append(( 'KNB' , KNeighborsRegressor()))
#models.append(( 'DTR' , DecisionTreeRegressor()))
models.append(( 'GBR' , GradientBoostingRegressor()))

# evaluate each model in turn
results = []
names = []
scoring = 'r2'

for name, model in models:
    kfold = KFold(n_splits=10, random_state=7)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = '{}: {} ({})'.format(name, cv_results.mean(), cv_results.std())
    print(msg)


LINR: 0.835816041355727 (0.07991426165350195)
RDGE: 0.8358043022256805 (0.07984665867098886)
GBR: 0.8240378519403928 (0.08604958499925439)
