## AGUATHON ITA

#### Resumen 5

Estudio sobre la influencia de la temperatura, la presión y la localización de la estación meteorológica en R². En este script se simplifica el código para automatizar la generación de resultados. No se trata de un script final optimizado, sino de uno simplificado para analizar la influencia de cada variable.


In [19]:
import os
import scipy
import numpy as np
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import datetime as dt

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

## Funciones

In [45]:
def load_rios():
    '''
    Read the river data set from ``<cwd>/data/ENTRADA/datos.csv``.

    The ``time`` column is parsed as dates and used as the index of the
    returned DataFrame; every other column is returned as read by pandas.
    '''
    csv_path = os.path.join(os.getcwd(), 'data', 'ENTRADA', 'datos.csv')
    return pd.read_csv(csv_path, index_col='time', parse_dates=['time'])

def load_meteo(csvfile, cols):
    '''
    Read one weather-station file,
    ``<cwd>/data/DatosPorEstacion/<csvfile>.csv``.

    The file is parsed as ';'-separated text. Only the columns listed in
    `cols` are loaded, and ``FECHA`` is parsed as dates and used as index.

    Parameters
    ----------
    csvfile : str
        File name without the ``.csv`` extension.
    cols : list of str
        Columns to load; must include ``'FECHA'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``FECHA``; the remaining columns follow the order given
        in `cols`.
    '''
    csv_path = os.path.join(os.getcwd(), 'data', 'DatosPorEstacion',
                            csvfile + '.csv')
    data = pd.read_csv(csv_path, sep=";", usecols=cols,
                       parse_dates=['FECHA'], index_col='FECHA')

    # read_csv ignores the element order of `usecols` and keeps the file's
    # own column order. Callers rename the columns positionally, so put them
    # back in the requested order. The reorder is skipped when `cols` is not
    # a plain list of the loaded column names (None, callable, positions).
    if cols is not None and not callable(cols):
        ordered = [c for c in cols if c in data.columns]
        if len(ordered) == data.shape[1]:
            data = data[ordered]
    return data

    
def derived_features(df, column, delta):
    '''
    Add a lagged copy of `column` to `df`.

    The new column is named ``"<column>_<delta>"`` and holds the values of
    `column` moved down by `delta` rows, so row ``i`` receives the value
    of row ``i - delta``. The first `delta` rows are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    column : str
        Name of the column to lag.
    delta : int
        Number of rows to shift by. A value larger than the number of rows
        gives an all-NaN column; a negative value shifts the other way.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the extra column.
    '''
    col_name = "{}_{}".format(column, delta)
    # shift() works by position whatever the index type is. Assigning the
    # raw values keeps the assignment positional as well.
    df[col_name] = df[column].shift(delta).values
    return df

### DATA LOADING

In [49]:
##############################################################
##     DATA LOADING
##############################################################

### RIVERS
df_rios = load_rios()
df_rios.columns = ['Ala','Gri','Nov','Tau','Tud','Zar','Risk','P24','P48','P72']
# The time span of the river data; every station frame is cut to it.
start = df_rios.index[0]
end = df_rios.index[-1]

### Weather stations: short key -> CSV file name (without extension)
estaciones = {'pa1':'9262-19530901-20190131',   # pna
              'pa2':'9263D-19750101-20190302',  # pna Aerop
              'za1':'9434-19410101-20190302',   # zar Aerop
              'za2':'9244X-19920204-20190302',  # zar Sos rey
              'hu1':'9208E-20060201-20190302',  # huesca aragues
              'hu2':'9201K-19920101-20190302',  # huesca jaca
              'log':'9170-19481101-20190302',   # Logroño
              'cal':'9394X-19930401-20190302',  # Calatayud
             }


def _load_station_frames(stations, selected, cols, prefixes, start, end):
    '''
    Load the requested columns for every selected station and join them
    side by side.

    For each key of `stations` that is also in `selected` (visited in the
    order of `stations`), the station file is read with `load_meteo`,
    resampled to hourly rows with forward fill, cut to ``start:end`` and
    its columns renamed to ``<prefix><key>``, with one prefix per value
    column of `cols`, in the same order.
    '''
    value_cols = [c for c in cols if c != 'FECHA']
    frames = []
    for key, station_file in stations.items():
        if key not in selected:
            continue
        # read_csv does not honour the order of `usecols`; select the
        # columns explicitly so the positional rename below cannot mislabel.
        frame = load_meteo(station_file, cols)[value_cols]
        frame = frame.resample('H').ffill()
        frame = frame[start:end]
        frame.columns = [prefix + key for prefix in prefixes]
        frames.append(frame)
    return pd.concat(frames, axis=1)


### TEMPERATURE
locations = ['pa1', 'pa2', 'za1', 'za2', 'log', 'cal', 'hu1', 'hu2']
cols = ['FECHA','TMEDIA']
df_temp = _load_station_frames(estaciones, locations, cols, ['Tm_'], start, end)


### RAIN
locations = ['pa1', 'pa2', 'hu1', 'hu2', 'log'] # best-performing stations for rainfall
cols = ['FECHA','PRECIPITACION']
df_rain = _load_station_frames(estaciones, locations, cols, ['rain_'], start, end)


### PRESSURE
locations = ['pa2', 'za1', 'log'] # only these stations have pressure data
cols = ['FECHA','PRESMAX', 'PRESMIN']
df_pres = _load_station_frames(estaciones, locations, cols,
                               ['Pmax_', 'Pmin_'], start, end)



#######################################################################
######     DATA PREPARATION
######################################################################


#### RAIN DATA
# Missing readings are treated as "no rain".
df_rain.fillna(value=0, inplace=True)
# Replace the non-numeric markers used by AEMET with 0.
df_rain.replace(to_replace=['Ip', 'Acum', 'Varias'], value=0, inplace=True)
# Convert text columns to numeric. Any value that is still not parseable
# becomes NaN here, and its row is removed by the dropna() further down.
for col in df_rain.columns:
    df_rain[col] = pd.to_numeric(df_rain[col], errors='coerce')


#### TEMP DATA
# Forward fill, then backward fill so leading gaps are covered as well.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
df_temp.ffill(inplace=True)
df_temp.bfill(inplace=True)

#### PRES DATA
df_pres.ffill(inplace=True)
df_pres.bfill(inplace=True)

### SELECT RIVER DATA
target = 'P72'
# selected features
cols = ['Ala','Tud','Nov','Zar']#,'Gri']
cols.append(target)
df_rios = df_rios[cols]

### RIVERS + RAIN + PRESSURE (temperature removed because it does not help)
# Alternative feature sets used for the sensitivity study:
#df = pd.concat([df_rios, df_rain, df_pres], axis=1)
#df = df_rios
df = pd.concat([df_rios, df_rain], axis=1)

# Keep only rows where every column has a value.
df.dropna(axis=0, how='any', inplace=True)


###############################################################
########       ML SPLIT AND REGRESSION
###############################################################

# Every column except the target is used as a feature.
features = [name for name in df.columns if name != target]
Y = df[target]
X = df.loc[:, features]


# Ten consecutive, unshuffled folds. `random_state` is not passed: it has no
# effect without shuffle=True, and recent scikit-learn versions raise a
# ValueError for that combination. The folds are the same as before.
kfold = KFold(n_splits=10)
model = LinearRegression()
scoring = 'r2'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# The printed figure is the mean and standard deviation of R2 across folds,
# times 100. The "Accuracy" label is kept so it matches the recorded outputs.
print("Accuracy: {:.3f}% ({:.3f}%)".format(results.mean()*100.0, results.std()*100.0))

Accuracy: 80.744% (9.111%)


### Estudio Sensibilidad Variables
```
Rios
* 66.601% (15.258%) Ala
* 70.748% (13.848%) Ala,Tud
* 70.570% (14.261%) Ala,Tud,Zar
* 70.027% (14.635%) Ala,Tud,Gri
* 69.781% (15.112%) Ala,Tud,Gri,Zar
* 71.320% (14.805%) Ala,Tud,Nov        <= (2)
* 71.393% (14.716%) Ala,Tud,Nov,Zar    <= (1)
* 71.279% (14.391%) Ala,Tud,Nov,Zar,Gri

Rios + Lluvia por estacion
* 80.646%  (9.153%) (1) + todas estaciones
* 80.002%  (9.534%) pa1
* 78.740% (10.846%) pa2
* 73.261% (13.912%) za1
* 73.850% (14.069%) za2
* 74.850% (13.424%) log
* 72.488% (13.854%) cal
* 76.010% (11.421%) hu1
* 74.148% (13.295%) hu2
* 80.805% (8.939%)  pa1 + pa2 + hu1 + hu2
* 80.744% (9.111%)  pa1 + pa2 + hu1 + hu2 + log

Rios + Temp
* 71.781% (15.308%) (1) + todas estaciones
* 72.175% (14.747%) pa1
* 72.171% (14.655%) pa2
* 72.128% (14.525%) za1
* 71.894% (15.165%) za2
* 72.197% (14.415%) log
* 71.738% (14.184%) cal
* 72.630% (14.121%) hu1
* 72.156% (14.855%) hu2

Rios + Pres (max & min)
* 74.339% (12.836%) pa2
* 73.991% (13.042%) za1
* 73.870% (13.228%) log
* 76.010% (11.205%) pa2, za1, log

```