# Preprocesamiento

## Importación de librerías

In [236]:
import pandas as pd
import numpy as np
from pyproj import Transformer
from sklearn.preprocessing import StandardScaler

## Lectura de datos

In [191]:
df = pd.read_csv("../../data/raw/historico_completo.csv")
df_est = pd.read_csv("../../data/raw/estaciones.csv")
df_integrado = pd.merge(df, df_est, on=["codigoEstacion", "provincia_id"])
df_integrado.head()

Unnamed: 0,fecha,dia,tempMedia,tempMax,horMinTempMax,tempMin,horMinTempMin,humedadMedia,humedadMax,horMinHumMax,...,bajoplastico,activa,visible,longitud,latitud,altitud,xutm,yutm,huso,provincia_nombre
0,2005-01-01,1,9.07,18.58,14:30,0.658,07:20,67.31,87.2,07:40,...,False,True,True,060102000W,364525000N,39,230650.0,4072170.0,30,Cádiz
1,2005-01-02,2,9.02,12.17,14:50,5.81,21:30,84.7,91.8,23:30,...,False,True,True,060102000W,364525000N,39,230650.0,4072170.0,30,Cádiz
2,2005-01-03,3,8.18,15.91,14:20,3.196,06:10,80.6,92.9,11:50,...,False,True,True,060102000W,364525000N,39,230650.0,4072170.0,30,Cádiz
3,2005-01-04,4,10.55,18.26,12:40,3.608,07:30,68.63,87.2,06:50,...,False,True,True,060102000W,364525000N,39,230650.0,4072170.0,30,Cádiz
4,2005-01-05,5,9.82,17.78,14:50,2.802,07:30,67.62,87.5,06:10,...,False,True,True,060102000W,364525000N,39,230650.0,4072170.0,30,Cádiz


In [166]:
df_integrado.to_csv("../../data/processed/datos_integrados.csv", sep=";", index=False)

In [192]:
df_integrado.shape

(690709, 35)

In [193]:
df_integrado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690709 entries, 0 to 690708
Data columns (total 35 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   fecha             690709 non-null  object 
 1   dia               690709 non-null  int64  
 2   tempMedia         689485 non-null  float64
 3   tempMax           689588 non-null  float64
 4   horMinTempMax     690709 non-null  object 
 5   tempMin           689146 non-null  float64
 6   horMinTempMin     690709 non-null  object 
 7   humedadMedia      689194 non-null  float64
 8   humedadMax        688772 non-null  float64
 9   horMinHumMax      690709 non-null  object 
 10  humedadMin        688507 non-null  float64
 11  horMinHumMin      690709 non-null  object 
 12  velViento         681821 non-null  float64
 13  dirViento         681604 non-null  float64
 14  velVientoMax      681822 non-null  float64
 15  horMinVelMax      690709 non-null  object 
 16  dirVientoVelMax   68

## Eliminación de registros duplicados

In [194]:
print(f"{df_integrado.duplicated().sum()} instancias duplicadas.")
df_integrado=df_integrado.drop_duplicates()
print(f"{df_integrado.duplicated().sum()} instancias duplicadas tras eliminar los duplicados.")

107 instancias duplicadas.
0 instancias duplicadas tras eliminar los duplicados.


## Eliminación de estaciones problemáticas

Como se ha visto en el análisis exploratorio, se van a eliminar los registros de las estaciones:
* Estación 102, provincia 21: mala calidad (datos nulos, serie temporal inconsistente)
* Estación 103, provincia 21: mala calidad (datos nulos, serie temporal inconsistente)
* Estación 12, provincia 21: bajo número de registros disponibles (menos de un año)

In [195]:
df_integrado= df_integrado.query('not (codigoEstacion == 12 and provincia_id == 21)')
df_integrado= df_integrado.query('not (codigoEstacion == 102 and provincia_id == 21)')
df_integrado= df_integrado.query('not (codigoEstacion == 103 and provincia_id == 21)')

In [196]:
df_integrado.shape

(682109, 35)

## Eliminación de registros problemáticos

Para evitar incluir segos, se va a eliminar los registros cuyo valor de et0 sea nulo.

In [197]:
df_integrado = df_integrado[df_integrado['et0'].notna()]
df_integrado.shape

(678890, 35)

## Corrección de formato de variables

Fecha:

In [198]:
df_integrado['fecha']= pd.to_datetime(df_integrado['fecha'])

Coordenadas:

In [199]:
def convertir_utm_a_latlon(utm_x, utm_y):
    lon, lat = transformer.transform(utm_x, utm_y)
    return lon, lat

transformer = Transformer.from_crs("epsg:32630", "epsg:4326", always_xy=True)
df_integrado['lon'], df_integrado['lat'] = convertir_utm_a_latlon(df_integrado['xutm'],df_integrado['yutm'])


Día del año (para indicarle que es una variable cíclica):

In [200]:
df_integrado['dia_del_año_sin'] = np.sin(2 * np.pi * df_integrado['dia']/365)
df_integrado['dia_del_año_cos'] = np.cos(2 * np.pi * df_integrado['dia']/365)

## Creación de nuevas variables

In [201]:
df_integrado['año'] = df_integrado['fecha'].dt.year

In [202]:
df_integrado['mes'] = df_integrado['fecha'].dt.month

In [203]:
df_integrado['mes_sin'] = np.sin(2 * np.pi * df_integrado['mes']/12)
df_integrado['mes_cos'] = np.cos(2 * np.pi * df_integrado['mes']/12)

## Selección de variables de interés

In [204]:
df_integrado.columns

Index(['fecha', 'dia', 'tempMedia', 'tempMax', 'horMinTempMax', 'tempMin',
       'horMinTempMin', 'humedadMedia', 'humedadMax', 'horMinHumMax',
       'humedadMin', 'horMinHumMin', 'velViento', 'dirViento', 'velVientoMax',
       'horMinVelMax', 'dirVientoVelMax', 'radiacion', 'precipitacion',
       'bateria', 'fechaUtlMod', 'et0', 'provincia_id', 'codigoEstacion',
       'nombre', 'bajoplastico', 'activa', 'visible', 'longitud', 'latitud',
       'altitud', 'xutm', 'yutm', 'huso', 'provincia_nombre', 'lon', 'lat',
       'dia_del_año_sin', 'dia_del_año_cos', 'año', 'mes', 'mes_sin',
       'mes_cos'],
      dtype='object')

In [205]:
df_integrado = df_integrado[['fecha','tempMedia', 'tempMax', 'tempMin','humedadMedia', 'humedadMax', 'humedadMin', 'velViento',
        'dirViento', 'velVientoMax', 'dirVientoVelMax', 'radiacion', 'precipitacion','et0', 'altitud','lon', 'lat', 
        'dia_del_año_sin', 'dia_del_año_cos', 'año', 'mes', 'mes_sin', 'mes_cos', 'provincia_id', 'codigoEstacion']]

In [206]:
print(df_integrado.shape)
df_integrado.head()

(678890, 25)


Unnamed: 0,fecha,tempMedia,tempMax,tempMin,humedadMedia,humedadMax,humedadMin,velViento,dirViento,velVientoMax,...,lon,lat,dia_del_año_sin,dia_del_año_cos,año,mes,mes_sin,mes_cos,provincia_id,codigoEstacion
0,2005-01-01,9.07,18.58,0.658,67.31,87.2,38.85,1.162,44.99,5.008,...,-6.017246,36.757068,0.017213,0.999852,2005,1,0.5,0.866025,11,1
1,2005-01-02,9.02,12.17,5.81,84.7,91.8,76.9,1.287,43.56,4.106,...,-6.017246,36.757068,0.034422,0.999407,2005,1,0.5,0.866025,11,1
2,2005-01-03,8.18,15.91,3.196,80.6,92.9,51.89,1.486,61.36,5.174,...,-6.017246,36.757068,0.05162,0.998667,2005,1,0.5,0.866025,11,1
3,2005-01-04,10.55,18.26,3.608,68.63,87.2,41.21,1.747,98.8,7.68,...,-6.017246,36.757068,0.068802,0.99763,2005,1,0.5,0.866025,11,1
4,2005-01-05,9.82,17.78,2.802,67.62,87.5,42.28,1.082,88.5,4.508,...,-6.017246,36.757068,0.085965,0.996298,2005,1,0.5,0.866025,11,1


In [207]:
df_integrado.columns

Index(['fecha', 'tempMedia', 'tempMax', 'tempMin', 'humedadMedia',
       'humedadMax', 'humedadMin', 'velViento', 'dirViento', 'velVientoMax',
       'dirVientoVelMax', 'radiacion', 'precipitacion', 'et0', 'altitud',
       'lon', 'lat', 'dia_del_año_sin', 'dia_del_año_cos', 'año', 'mes',
       'mes_sin', 'mes_cos', 'provincia_id', 'codigoEstacion'],
      dtype='object')

## División Dataset

In [208]:
año_corte_test = 2020

In [209]:
porc_test=len(df_integrado[df_integrado['año'] > año_corte_test])/len(df_integrado)*100
print(f"Tomando como año de corte {año_corte_test}, el conjunto de train supone un {round(100-porc_test,2)}% y el test un {round(porc_test,2)}%")

Tomando como año de corte 2020, el conjunto de train supone un 76.35% y el test un 23.65%


In [210]:
var = ['fecha','tempMedia', 'tempMax', 'tempMin', 'humedadMedia', 'humedadMax',
       'humedadMin', 'velViento', 'dirViento', 'velVientoMax',
       'dirVientoVelMax', 'radiacion', 'precipitacion', 'altitud',
       'lon', 'lat', 'dia_del_año_sin', 'dia_del_año_cos', 'año', 'mes',
       'mes_sin', 'mes_cos', 'provincia_id', 'codigoEstacion']

In [219]:
train_indices = df_integrado[df_integrado['año'] <= año_corte_test].index
test_indices = df_integrado[df_integrado['año'] > año_corte_test].index

X_train, X_test = df_integrado[var].loc[train_indices], df_integrado[var].loc[test_indices]

y_train, y_test = df_integrado['et0'].loc[train_indices], df_integrado['et0'].loc[test_indices]

print(f"Tamaño X: Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Tamaño Y: Train: {y_train.shape}, Test: {y_train.shape}")

Tamaño X: Train: (518312, 24), Test: (160578, 24)
Tamaño Y: Train: (518312,), Test: (518312,)


In [212]:
y_train

0         1.495588
1         0.776324
2         1.327342
3         1.796508
4         1.371563
            ...   
689035    1.497096
689036    4.001301
689037    2.310957
689038    2.191842
689039    2.319915
Name: et0, Length: 518312, dtype: float64

In [213]:
X_train

Unnamed: 0,fecha,tempMedia,tempMax,tempMin,humedadMedia,humedadMax,humedadMin,velViento,dirViento,velVientoMax,...,lon,lat,dia_del_año_sin,dia_del_año_cos,año,mes,mes_sin,mes_cos,provincia_id,codigoEstacion
0,2005-01-01,9.07,18.58,0.658,67.31,87.20,38.85,1.162,44.99,5.008,...,-6.017246,36.757068,1.721336e-02,0.999852,2005,1,5.000000e-01,0.866025,11,1
1,2005-01-02,9.02,12.17,5.810,84.70,91.80,76.90,1.287,43.56,4.106,...,-6.017246,36.757068,3.442161e-02,0.999407,2005,1,5.000000e-01,0.866025,11,1
2,2005-01-03,8.18,15.91,3.196,80.60,92.90,51.89,1.486,61.36,5.174,...,-6.017246,36.757068,5.161967e-02,0.998667,2005,1,5.000000e-01,0.866025,11,1
3,2005-01-04,10.55,18.26,3.608,68.63,87.20,41.21,1.747,98.80,7.680,...,-6.017246,36.757068,6.880243e-02,0.997630,2005,1,5.000000e-01,0.866025,11,1
4,2005-01-05,9.82,17.78,2.802,67.62,87.50,42.28,1.082,88.50,4.508,...,-6.017246,36.757068,8.596480e-02,0.996298,2005,1,5.000000e-01,0.866025,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689035,2020-12-27,7.63,16.95,0.287,68.10,88.90,24.40,1.098,313.10,4.831,...,-4.131663,36.796094,-5.161967e-02,0.998667,2020,12,-2.449294e-16,1.000000,29,2
689036,2020-12-28,13.62,18.89,9.920,49.09,59.44,22.53,4.120,324.90,11.020,...,-4.131663,36.796094,-3.442161e-02,0.999407,2020,12,-2.449294e-16,1.000000,29,2
689037,2020-12-29,11.44,15.50,7.990,52.50,78.70,40.24,3.255,321.40,9.910,...,-4.131663,36.796094,-1.721336e-02,0.999852,2020,12,-2.449294e-16,1.000000,29,2
689038,2020-12-30,10.08,18.62,1.754,45.49,71.90,15.05,1.587,315.40,7.680,...,-4.131663,36.796094,6.432491e-16,1.000000,2020,12,-2.449294e-16,1.000000,29,2


In [214]:
X_train.to_csv("../../data/processed/X_train.csv", sep=";", index=False)
X_test.to_csv("../../data/processed/X_test.csv", sep=";", index=False)
y_train.to_csv("../../data/processed/y_train.csv", sep=";", index=False)
y_test.to_csv("../../data/processed/y_test.csv", sep=";", index=False)

## Valores nulos

In [230]:
def interpolacion_nulos(df):
    data = df.copy()
    var_nulas = data.isnull().sum()
    
    list_var_nulas = var_nulas[var_nulas>0].index.to_list()
    print(f'Listado de variables con valores nulos: {list_var_nulas}: {var_nulas[var_nulas>0]}')
    interpolacion = data.groupby(['provincia_id','codigoEstacion']).apply(
        lambda x: x.set_index('fecha').interpolate(method='time', limit_direction='both').reset_index())

    data.set_index(['provincia_id','codigoEstacion', 'fecha'], inplace=True)
    interpolacion.set_index(['provincia_id','codigoEstacion', 'fecha'], inplace=True)
    data[list_var_nulas] = interpolacion[list_var_nulas]
    data.reset_index(inplace=True)

    print(f'Nulos tras interpolación: {data.isnull().sum()}')
    return data

In [221]:
X_train = interpolacion_nulos(X_train)

Listado de variables con valores nulos: ['humedadMedia', 'dirViento', 'velVientoMax', 'dirVientoVelMax', 'precipitacion']: humedadMedia         1
dirViento          305
velVientoMax         7
dirVientoVelMax      9
precipitacion       67
dtype: int64
Nulos tras interpolación: provincia_id       0
codigoEstacion     0
fecha              0
tempMedia          0
tempMax            0
tempMin            0
humedadMedia       0
humedadMax         0
humedadMin         0
velViento          0
dirViento          0
velVientoMax       0
dirVientoVelMax    0
radiacion          0
precipitacion      0
altitud            0
lon                0
lat                0
dia_del_año_sin    0
dia_del_año_cos    0
año                0
mes                0
mes_sin            0
mes_cos            0
dtype: int64


  interpolacion = data.groupby(['provincia_id','codigoEstacion']).apply(


In [222]:
X_test = interpolacion_nulos(X_test)

Listado de variables con valores nulos: ['dirViento', 'precipitacion']: dirViento        25
precipitacion    58
dtype: int64
Nulos tras interpolación: provincia_id       0
codigoEstacion     0
fecha              0
tempMedia          0
tempMax            0
tempMin            0
humedadMedia       0
humedadMax         0
humedadMin         0
velViento          0
dirViento          0
velVientoMax       0
dirVientoVelMax    0
radiacion          0
precipitacion      0
altitud            0
lon                0
lat                0
dia_del_año_sin    0
dia_del_año_cos    0
año                0
mes                0
mes_sin            0
mes_cos            0
dtype: int64


  interpolacion = data.groupby(['provincia_id','codigoEstacion']).apply(


## Valores atípicos

In [223]:
def deteccion_outlier(data, var):

    Q1 = data[var].quantile(0.25)
    Q3 = data[var].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    outliers = (data[var] < lower) | (data[var] > upper)
    return outliers

In [233]:
var_imputar = ["tempMin","tempMax", "tempMedia", "radiacion"]
var_capping = ["precipitacion", "velViento", "velVientoMax","humedadMax", 
               "dirVientoVelMax", "dirViento", "humedadMin", "humedadMedia"]

In [None]:
def outliers_interpolar(df, var_imputar):
    data = df.copy()
    data.set_index('fecha', inplace=True)
    outlier_imputar = data.groupby(['provincia_id', 'codigoEstacion'])[var_imputar].apply(
    lambda x: deteccion_outlier(x,var_imputar))
    sum_outlier = outlier_imputar.sum().sum()
    print(f"Detectados {sum_outlier}")

    if sum_outlier==0:
        print(f'No existen valores outliers en el dataset para las variables {var_imputar}')
    else:
        data.reset_index(inplace=True)
        data.set_index(['provincia_id', 'codigoEstacion', 'fecha'], inplace=True)
        data[var_imputar] = data[var_imputar].mask(outlier_imputar)
        data.reset_index(inplace=True)
        interpolacion = data.groupby(['provincia_id','codigoEstacion']).apply(
            lambda x: x.set_index('fecha').interpolate(method='time', limit_direction='both').reset_index())

        interpolacion = interpolacion.reset_index(drop=True)
        data.set_index(['provincia_id','codigoEstacion','fecha'], inplace=True)
        interpolacion.set_index(['provincia_id','codigoEstacion','fecha'], inplace=True)
        data[var_imputar]=interpolacion[var_imputar]
        data.reset_index(inplace=True)
        return data


In [229]:
X_train = outliers_interpolar(X_train, var_imputar)
X_test = outliers_interpolar(X_test, var_imputar)

Detectados 79


  interpolacion = data.groupby(['provincia_id','codigoEstacion']).apply(


Detectados 27


  interpolacion = data.groupby(['provincia_id','codigoEstacion']).apply(


In [231]:
def outliers_capping(df, var_capping):
    data = df.copy()
    for col in var_capping:
        upper_limit = data.groupby(['provincia_id', 'codigoEstacion'])[col].transform(lambda x: x.quantile(0.99))
        lower_limit = data.groupby(['provincia_id', 'codigoEstacion'])[col].transform(lambda x: x.quantile(0.01))
        data[col] = data[col].clip(lower=lower_limit, upper=upper_limit)

    print("Capping completado.")
    return data

In [235]:
X_train = outliers_capping(X_train, var_capping)
X_test = outliers_capping(X_test, var_capping)

Capping completado.
Capping completado.


## Selección de variables y estandarización

In [241]:
var_num = ['tempMedia', 'tempMax', 'tempMin', 'humedadMedia', 'humedadMax', 
           'humedadMin', 'velViento', 'dirViento', 'velVientoMax', 'dirVientoVelMax', 
           'radiacion','precipitacion', 'altitud', 'lon', 'lat', 'dia_del_año_sin',
           'dia_del_año_cos', 'año', 'mes', 'mes_sin', 'mes_cos']

In [None]:
scaler = StandardScaler()
scaler.fit(X_train[var_num])
X_train_scaled = scaler.transform(X_train[var_num])
X_test_scaled = scaler.transform(X_test[var_num])

In [243]:
X_train_final = pd.DataFrame(X_train_scaled, columns=var_num, index=X_train.index)
X_test_final = pd.DataFrame(X_test_scaled, columns=var_num, index=X_test.index)

In [246]:
X_train_final.head()

Unnamed: 0,tempMedia,tempMax,tempMin,humedadMedia,humedadMax,humedadMin,velViento,dirViento,velVientoMax,dirVientoVelMax,...,precipitacion,altitud,lon,lat,dia_del_año_sin,dia_del_año_cos,año,mes,mes_sin,mes_cos
0,-1.149191,-0.644914,-1.635524,0.18961,0.079664,0.032829,-0.429441,-1.506338,-0.459902,1.510954,...,-0.292242,-0.821274,-0.961184,-1.212781,0.025922,1.417772,-1.705504,-1.607476,0.717969,1.22936
1,-1.156422,-1.448554,-0.80796,1.178122,0.419198,2.146815,-0.29923,-1.52109,-0.855152,1.46254,...,-0.292242,-0.821274,-0.961184,-1.212781,0.050257,1.417144,-1.705504,-1.607476,0.717969,1.22936
2,-1.27791,-0.97966,-1.227846,0.945063,0.500391,0.757307,-0.091935,-1.337465,-0.387162,1.301512,...,-0.24223,-0.821274,-0.961184,-1.212781,0.074578,1.416096,-1.705504,-1.607476,0.717969,1.22936
3,-0.935141,-0.685034,-1.161667,0.264644,0.079664,0.163946,0.179945,-0.951233,0.710951,-0.820273,...,-0.292242,-0.821274,-0.961184,-1.212781,0.098877,1.41463,-1.705504,-1.607476,0.717969,1.22936
4,-1.040719,-0.745213,-1.291134,0.207231,0.101808,0.223394,-0.512776,-1.057488,-0.678998,-0.514003,...,-0.292242,-0.821274,-0.961184,-1.212781,0.123147,1.412746,-1.705504,-1.607476,0.717969,1.22936


In [247]:
X_test_final.head()

Unnamed: 0,tempMedia,tempMax,tempMin,humedadMedia,humedadMax,humedadMin,velViento,dirViento,velVientoMax,dirVientoVelMax,...,precipitacion,altitud,lon,lat,dia_del_año_sin,dia_del_año_cos,año,mes,mes_sin,mes_cos
0,-1.198364,-1.299361,-1.23138,0.621054,0.36753,0.752307,0.637245,1.05317,0.210096,1.105752,...,0.457943,-0.821274,-0.961184,-1.212781,0.025922,1.417772,1.792456,-1.607476,0.717969,1.22936
1,-1.810143,-1.518763,-1.920642,0.751794,0.278956,0.445626,-0.772155,1.554529,-1.026924,1.539413,...,-0.24223,-0.821274,-0.961184,-1.212781,0.050257,1.417144,1.792456,-1.607476,0.717969,1.22936
2,-1.897498,-1.451062,-2.146487,0.405048,0.020615,-0.194403,-0.877366,-0.825378,-1.013778,1.136274,...,-0.24223,-0.821274,-0.961184,-1.212781,0.074578,1.416096,1.792456,-1.607476,0.717969,1.22936
3,-1.717725,-1.400913,-1.984412,0.490313,0.027996,0.378956,-0.681529,-0.94917,-0.610202,1.309932,...,-0.24223,-0.821274,-0.961184,-1.212781,0.098877,1.41463,1.792456,-1.607476,0.717969,1.22936
4,-1.345885,-1.333211,-1.307037,0.507366,-0.00891,0.703416,-0.528401,-1.366453,-0.881005,-1.871693,...,-0.292242,-0.821274,-0.961184,-1.212781,0.123147,1.412746,1.792456,-1.607476,0.717969,1.22936


## Dataset final

In [248]:
X_train_final.to_csv("../../data/final/X_train_final.csv", sep=";", index=False)
X_test_final.to_csv("../../data/final/X_test_final.csv", sep=";", index=False)
y_train.to_csv("../../data/final/y_train_final.csv", sep=";", index=False)
y_test.to_csv("../../data/final/y_test_final.csv", sep=";", index=False)