<a href="https://colab.research.google.com/github/paulomarc49/ETo_climate/blob/main/ETo_weather_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Data Loading

### 1.1. The data for the years 2017, 2018, 2019 and 2020 were previously preprocessed and merged on a Linux system.
### 1.2. The data are stored in Google Drive:
* **2017:** https://drive.google.com/file/d/1xLvLI0ftcs6M32fTA3-DBdAplFCDmGQ3/view?usp=drive_link
* **2018:** https://drive.google.com/file/d/1cx0pssucEfZiwJQCCUTGvGssKNWx1COa/view?usp=sharing
* **2019:** https://drive.google.com/file/d/1qGx75lNpyVC-1N9Kr8TjJ-piuSJxLTr2/view?usp=drive_link
* **2020:** https://drive.google.com/file/d/15ENdwtDGKiPjZVLn7zTwMl9ZOVq4__OS/view?usp=sharing

In [1]:
import numpy as np
import pandas as pd
import joblib
import os
from tqdm import tqdm

# Load the preprocessed data for 2017, 2018, 2019 and 2020 as NumPy arrays,
# one array per year.
# If Drive access is unavailable, download the files from the links listed
# above and change the paths below to point at your local copies.
# NOTE(review): these paths assume Google Drive is already mounted at
# /content/drive; no mount call is visible in this notebook.

numpy_data_2017 = np.load('/content/drive/MyDrive/DATA_ETO_NUMPY/2017_numpy.npy')
numpy_data_2018 = np.load('/content/drive/MyDrive/DATA_ETO_NUMPY/2018_numpy.npy')
numpy_data_2019 = np.load('/content/drive/MyDrive/DATA_ETO_NUMPY/2019_numpy.npy')
numpy_data_2020 = np.load('/content/drive/MyDrive/DATA_ETO_NUMPY/2020_numpy.npy')

In [2]:
# Turn every yearly array into a 2-D table with one row per
# (day, hour, x pixel, y pixel) sample and one column per variable.
#
#   d           : number of days in a non-leap year
#   x           : number of pixels along the "X" axis
#   y           : number of pixels along the "Y" axis
#   n_variables : 8 variables * 24 hours stored for each pixel and day

d, x, y, n_variables = 365, 171, 171, 192


def _to_sample_rows(year_array, n_days):
    """Return ``year_array`` as a ``(n_days * 24 * x * y, 8)`` table.

    The input is viewed as (day, x, y, hour, variable), the hour axis is moved
    right after the day axis, and the four leading axes are collapsed into one.
    """
    cube = year_array.reshape(n_days, x, y, 24, 8)
    hour_first = cube.transpose(0, 3, 1, 2, 4)
    return hour_first.reshape(-1, 8)


numpy_data_2017 = _to_sample_rows(numpy_data_2017, d)
numpy_data_2018 = _to_sample_rows(numpy_data_2018, d)
numpy_data_2019 = _to_sample_rows(numpy_data_2019, d)
numpy_data_2020 = _to_sample_rows(numpy_data_2020, 366)  # leap year

shape_report = (
    "The shapes of the flattened 2017, 2018, 2019, and 2020 input data are:\n"
    f"2017: {numpy_data_2017.shape}\n"
    f"2018: {numpy_data_2018.shape}\n"
    f"2019: {numpy_data_2019.shape}\n"
    f"2020: {numpy_data_2020.shape}"
)
print(shape_report)

The shapes of the flattened 2017, 2018, 2019, and 2020 input data are:
2017: (256151160, 8)
2018: (256151160, 8)
2019: (256151160, 8)
2020: (256852944, 8)


## 2. SOM

* The project is based on a custom scikit-learn-compatible SOM (a wrapper around the `sklearn-som` package).

In [5]:
# Installing sklearn-som package

!pip install sklearn-som

# Making Custom sklearn-som package

from sklearn.base import BaseEstimator, ClusterMixin
from sklearn_som.som import SOM as SklearnSOM

class CustomSOM(BaseEstimator, ClusterMixin):
    """scikit-learn style wrapper around ``sklearn_som.som.SOM``.

    The hyper-parameters are stored unchanged on the instance; the wrapped SOM
    is only created when :meth:`fit` is called.

    Parameters
    ----------
    m, n, dim, sigma, lr, max_iter, random_state
        Forwarded as keyword arguments of the same name to the wrapped SOM.

    Attributes
    ----------
    model_ : SklearnSOM or None
        The wrapped SOM created by :meth:`fit`; ``None`` before fitting.
    """

    def __init__(self, m=1, n=1, dim=1, sigma=1.7, lr=1, max_iter=10, random_state=None):
        self.m = m
        self.n = n
        self.dim = dim
        self.sigma = sigma
        self.lr = lr
        self.max_iter = max_iter
        self.random_state = random_state
        self.model_ = None

    def fit(self, X, y=None):
        """Create a new wrapped SOM from the stored hyper-parameters and fit it on ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        self.model_ = SklearnSOM(
            m=self.m,
            n=self.n,
            dim=self.dim,
            lr=self.lr,
            # Previously omitted, so the wrapped SOM always ran with its own
            # default sigma regardless of the value given to this wrapper.
            sigma=self.sigma,
            max_iter=self.max_iter,
            random_state=self.random_state,
        )
        self.model_.fit(X)
        return self

    def predict(self, X):
        """Return the wrapped SOM's predictions for ``X``."""
        return self.model_.predict(X)

    def transform(self, X):
        """Return the wrapped SOM's ``transform`` of ``X`` (one row per sample)."""
        return self.model_.transform(X)

    def score(self, X, y=None):
        """Return the total distortion of ``X``.

        For every sample the smallest value in its ``transform`` row is squared,
        and the squares are summed over all samples.

        NOTE(review): this is an error measure (lower is better), whereas
        scikit-learn model selection tools treat a larger ``score`` as better —
        confirm how callers use it.
        """
        distances = np.asarray(self.transform(X))
        if len(distances) == 0:
            return 0
        # Vectorized replacement for the per-sample Python loop.
        nearest = distances.min(axis=1)
        return np.sum(nearest ** 2)



## 3. Normalization

In [9]:
import os
from sklearn.preprocessing import StandardScaler

# Initialize the scaler.
# `fit_transform` below re-fits this object on every call, so each year is
# standardized with its own per-column statistics.
# NOTE(review): confirm this per-year scaling matches the scaling that was
# applied to the data used to train the SOM model.
scaler = StandardScaler()

# Define the file output paths for the normalized data:

folder_path = '/content/ETo_prediction'
# makedirs(..., exist_ok=True) instead of mkdir: re-running this cell no longer
# raises FileExistsError once the folder exists, so the "skip if the file
# already exists" logic below is actually reachable on a second run.
os.makedirs(folder_path, exist_ok=True)

file_paths = [
    '/content/ETo_prediction/2017_numpy_normalized.npy',
    '/content/ETo_prediction/2018_numpy_normalized.npy',
    '/content/ETo_prediction/2019_numpy_normalized.npy',
    '/content/ETo_prediction/2020_numpy_normalized.npy'
]

# Define the numpy datasets (same order as `file_paths`)
numpy_datasets = [numpy_data_2017, numpy_data_2018, numpy_data_2019, numpy_data_2020]

# Iterate through the datasets and save the normalized data only if the file does not exist
for i, (data, path) in enumerate(zip(numpy_datasets, file_paths)):
    if not os.path.exists(path):
        normalized_data = scaler.fit_transform(data)
        np.save(path, normalized_data, allow_pickle=False)
        print(f"Data for year {2017 + i} normalized and saved.")
    else:
        print(f"File for year {2017 + i} already exists. Skipping.")

Data for year 2017 normalized and saved.
Data for year 2018 normalized and saved.
Data for year 2019 normalized and saved.
Data for year 2020 normalized and saved.


In [10]:
# Load the normalized data back from disk.
# The yearly names are rebound one at a time, so from this cell onwards
# `numpy_data_YYYY` refers to the normalized arrays rather than the raw ones.
numpy_data_2017 = np.load('/content/ETo_prediction/2017_numpy_normalized.npy')
numpy_data_2018 = np.load('/content/ETo_prediction/2018_numpy_normalized.npy')
numpy_data_2019 = np.load('/content/ETo_prediction/2019_numpy_normalized.npy')
numpy_data_2020 = np.load('/content/ETo_prediction/2020_numpy_normalized.npy')

## 4. Prediction

The trained model file is available at the following Google Drive link:

* https://drive.google.com/file/d/1E771bVe5fM3JDX-otPUqUdcIXk0l-lJ-/view?usp=sharing

In [11]:
from joblib import Parallel, delayed
import numpy as np
import os

# Years handled by this cell, in processing order.
_YEARS = (2017, 2018, 2019, 2020)

# Output file for the predicted labels of each year.
label_paths = dict(zip(_YEARS, (
    '/content/ETo_prediction/2017_ETo_weahter_labels.npy',
    '/content/ETo_prediction/2018_ETo_weahter_labels.npy',
    '/content/ETo_prediction/2019_ETo_weahter_labels.npy',
    '/content/ETo_prediction/2020_ETo_weahter_labels.npy',
)))

# Trained SOM model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a source you trust.
som_model = joblib.load('/content/drive/MyDrive/ETo/som_model_fit.pkl')

# Input array for each year, keyed like `label_paths`.
numpy_data_dict = dict(zip(_YEARS, (
    numpy_data_2017,
    numpy_data_2018,
    numpy_data_2019,
    numpy_data_2020,
)))

def predict_chunk(som_model, data_chunk):
    """Return ``som_model.predict`` applied to one chunk of samples."""
    chunk_labels = som_model.predict(data_chunk)
    return chunk_labels

def process_data_in_parallel(som_model, data, n_jobs=-1, chunk_size=None):
    """Predict labels for ``data`` chunk by chunk with joblib and join the results.

    Parameters
    ----------
    som_model : object
        Model handed to ``predict_chunk`` together with every chunk.
    data : array-like with a ``shape`` attribute
        Samples along the first axis.
    n_jobs : int
        Forwarded to ``joblib.Parallel``.
    chunk_size : int or None
        Rows per chunk. ``None`` uses ``n_samples // 90`` (about 90 chunks),
        but never less than one row.

    Returns
    -------
    numpy.ndarray
        The per-chunk predictions concatenated in the original row order.

    Raises
    ------
    ValueError
        If an explicit ``chunk_size`` is smaller than 1.
    """
    n_samples = data.shape[0]
    if n_samples == 0:
        # Nothing to predict; np.concatenate would fail on an empty result list.
        return np.empty(0, dtype=np.intp)

    if chunk_size is None:
        # max(1, ...) guards inputs with fewer than 90 rows, where the integer
        # division is 0 and range() would be given a zero step.
        chunk_size = max(1, n_samples // 90)
    elif chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Parallel processing of each chunk
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_chunk)(som_model, data[start:start + chunk_size])
        for start in range(0, n_samples, chunk_size)
    )

    # Join the predicted labels
    return np.concatenate(results)

# Generate and cache the SOM labels year by year; a year whose label file is
# already on disk is left untouched.
for year, data in numpy_data_dict.items():
    outpath = label_paths[year]

    # Cached result available: nothing to do for this year.
    if os.path.exists(outpath):
        print(f'File already exists for {year} at {outpath}, skipping label generation.')
        continue

    # Predict the labels in parallel, then store them as int16.
    labels_SOM_train = process_data_in_parallel(som_model, data)
    np.save(outpath, labels_SOM_train.astype(np.int16), allow_pickle=False)
    print(f'Labels array for {year} saved to: {outpath}')


Labels array for 2017 saved to: /content/ETo_prediction/2017_ETo_weahter_labels.npy
Labels array for 2018 saved to: /content/ETo_prediction/2018_ETo_weahter_labels.npy
Labels array for 2019 saved to: /content/ETo_prediction/2019_ETo_weahter_labels.npy
Labels array for 2020 saved to: /content/ETo_prediction/2020_ETo_weahter_labels.npy


In [14]:
import numpy as np
import os

# Define file paths for labels and ETo_weather datasets for each year
label_paths = {
    2017: '/content/ETo_prediction/2017_ETo_weahter_labels.npy',
    2018: '/content/ETo_prediction/2018_ETo_weahter_labels.npy',
    2019: '/content/ETo_prediction/2019_ETo_weahter_labels.npy',
    2020: '/content/ETo_prediction/2020_ETo_weahter_labels.npy',
}

eto_weather_paths = {
    2017: '/content/ETo_prediction/ETo_weather_2017.npy',
    2018: '/content/ETo_prediction/ETo_weather_2018.npy',
    2019: '/content/ETo_prediction/ETo_weather_2019.npy',
    2020: '/content/ETo_prediction/ETo_weather_2020.npy',
}

# Dictionary of numpy data for each year
numpy_data_dict = {
    2017: numpy_data_2017,
    2018: numpy_data_2018,
    2019: numpy_data_2019,
    2020: numpy_data_2020,
}

# Iterate over each year, append the labels as an extra last column, and save ETo_weather
for year, data in numpy_data_dict.items():
    outpath_labels = label_paths[year]
    outpath_ETo_weather = eto_weather_paths[year]

    # Check if the ETo_weather file already exists
    if not os.path.exists(outpath_ETo_weather):
        # Load the predicted labels for the corresponding year
        labels_SOM_train = np.load(outpath_labels)
        print(f'Shape of labels for {year}: {labels_SOM_train.shape}')
        print(f'Shape of data for {year}: {data.shape}')

        if labels_SOM_train.shape[0] != data.shape[0]:
            raise ValueError(
                f'Labels and data for {year} differ in length: '
                f'{labels_SOM_train.shape[0]} vs {data.shape[0]}'
            )

        # The data columns are floating point (they come from the scaler), so
        # casting the combined table to int16 truncated every value to an
        # integer. A 16-bit float keeps the fractional part at the same size
        # per element; integers up to 2048 are exact in float16, so fall back
        # to float32 if the labels go beyond that.
        storage_dtype = np.float16 if labels_SOM_train.max(initial=0) <= 2048 else np.float32

        # Fill a preallocated output directly instead of concatenating first
        # and casting afterwards, which needed a full-size float temporary.
        ETo_weather = np.empty((data.shape[0], data.shape[1] + 1), dtype=storage_dtype)
        ETo_weather[:, :-1] = data
        ETo_weather[:, -1] = labels_SOM_train.reshape(-1)

        # Save the combined dataset
        np.save(outpath_ETo_weather, ETo_weather, allow_pickle=False)
        print(f'ETo_weather array for {year} saved to: {outpath_ETo_weather}')
        print(f'ETo_weather shape for {year}: {ETo_weather.shape}')
    else:
        print(f'File already exists for {year} at {outpath_ETo_weather}, skipping.')



Shape of labels for 2017: (256151160,)
Shape of data for 2017: (256151160, 8)
ETo_weather array for 2017 saved to: /content/ETo_prediction/ETo_weather_2017.npy
ETo_weather shape for 2017: (256151160, 9)
Shape of labels for 2018: (256151160,)
Shape of data for 2018: (256151160, 8)
ETo_weather array for 2018 saved to: /content/ETo_prediction/ETo_weather_2018.npy
ETo_weather shape for 2018: (256151160, 9)
Shape of labels for 2019: (256151160,)
Shape of data for 2019: (256151160, 8)
ETo_weather array for 2019 saved to: /content/ETo_prediction/ETo_weather_2019.npy
ETo_weather shape for 2019: (256151160, 9)
Shape of labels for 2020: (256852944,)
Shape of data for 2020: (256852944, 8)
ETo_weather array for 2020 saved to: /content/ETo_prediction/ETo_weather_2020.npy
ETo_weather shape for 2020: (256852944, 9)


In [15]:
import numpy as np
import os

# Input tables (one row per sample) and the files their gridded versions go to.
eto_weather_paths = dict([
    (2017, '/content/ETo_prediction/ETo_weather_2017.npy'),
    (2018, '/content/ETo_prediction/ETo_weather_2018.npy'),
    (2019, '/content/ETo_prediction/ETo_weather_2019.npy'),
    (2020, '/content/ETo_prediction/ETo_weather_2020.npy'),
])

eto_weather_reshaped_paths = dict([
    (2017, '/content/ETo_prediction/ETo_weather_2017_reshaped.npy'),
    (2018, '/content/ETo_prediction/ETo_weather_2018_reshaped.npy'),
    (2019, '/content/ETo_prediction/ETo_weather_2019_reshaped.npy'),
    (2020, '/content/ETo_prediction/ETo_weather_2020_reshaped.npy'),
])

# Reshaping parameters.
n_variables = 8 * 24      # 8 variables for each of the 24 hours
grid_size = (171, 171)    # pixels along x and y

# Days in each year: 365 everywhere except the leap year 2020.
days_per_year = dict.fromkeys((2017, 2018, 2019), 365)
days_per_year[2020] = 366

def reshape_data(year, data):
    """Rearrange a per-sample table into a ``(days * x, y, 24 * columns)`` array.

    Parameters
    ----------
    year : int
        Key into the module-level ``days_per_year`` mapping.
    data : numpy.ndarray
        2-D table with one row per (day, hour, x, y) sample; the number of
        columns is taken from the array itself instead of being fixed at 9.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(days * x, y, 24 * columns)`` where ``x, y`` come from
        the module-level ``grid_size``.

    Raises
    ------
    ValueError
        If ``data`` is not 2-D or its row count is not ``days * 24 * x * y``.
    """
    d = days_per_year[year]
    x, y = grid_size

    print(f"Original data shape for {year}: {data.shape}")

    expected_rows = d * 24 * x * y
    if data.ndim != 2 or data.shape[0] != expected_rows:
        # Fail early: a mismatching table could otherwise be reshaped into a
        # silently misaligned grid.
        raise ValueError(
            f"Data for {year} has shape {data.shape}; expected ({expected_rows}, n_columns)"
        )
    n_columns = data.shape[1]

    # Reshape the array: (days, 24 hours, x, y, columns)
    reshaped_data = data.reshape(d, 24, x, y, n_columns)
    print(f"Shape after first reshaping for {year}: {reshaped_data.shape}")

    # Transpose to: (days, x, y, 24 hours, columns)
    reshaped_data = reshaped_data.transpose(0, 2, 3, 1, 4)
    print(f"Shape after transposing for {year}: {reshaped_data.shape}")

    # Final reshape to: (days * x, y, 24 * columns)
    reshaped_data = reshaped_data.reshape(d * x, y, 24 * n_columns)
    print(f"Final reshaped data shape for {year}: {reshaped_data.shape}")

    return reshaped_data

# Iterate over each year, reshape, and save if not already saved
for year, outpath in eto_weather_reshaped_paths.items():
    if not os.path.exists(outpath):
        # Load the per-sample ETo_weather table
        eto_weather_data = np.load(eto_weather_paths[year])

        # Reshape the data for the current year
        eto_weather_reshaped = reshape_data(year, eto_weather_data)

        # Save the reshaped array with the dtype it was loaded with. The
        # previous unconditional astype(np.int16) made an extra full copy and
        # would truncate any non-integer values in the input.
        np.save(outpath, eto_weather_reshaped, allow_pickle=False)
        print(f'ETo_weather reshaped array for {year} saved to: {outpath}')
    else:
        print(f'File already exists for {year} at {outpath}, skipping.')


Forma de los datos originales para 2017: (256151160, 9)
Forma 2 del array original con etiquetas para 2017: (365, 24, 171, 171, 9)
Forma 3 del array original con etiquetas para 2017: (365, 171, 171, 24, 9)
Forma final del array original con etiquetas para 2017: (62415, 171, 216)
ETo_weather reshaped array for 2017 saved to: /content/ETo_prediction/ETo_weather_2017_reshaped.npy
Forma de los datos originales para 2018: (256151160, 9)
Forma 2 del array original con etiquetas para 2018: (365, 24, 171, 171, 9)
Forma 3 del array original con etiquetas para 2018: (365, 171, 171, 24, 9)
Forma final del array original con etiquetas para 2018: (62415, 171, 216)
ETo_weather reshaped array for 2018 saved to: /content/ETo_prediction/ETo_weather_2018_reshaped.npy
Forma de los datos originales para 2019: (256151160, 9)
Forma 2 del array original con etiquetas para 2019: (365, 24, 171, 171, 9)
Forma 3 del array original con etiquetas para 2019: (365, 171, 171, 24, 9)
Forma final del array original con

## 5. Visualization and simulation

In [16]:
# Load the reshaped data of the years 2017, 2018, 2019 and 2020

ETo_weather_2017_reshaped = np.load('/content/ETo_prediction/ETo_weather_2017_reshaped.npy')
ETo_weather_2018_reshaped = np.load('/content/ETo_prediction/ETo_weather_2018_reshaped.npy')
ETo_weather_2019_reshaped = np.load('/content/ETo_prediction/ETo_weather_2019_reshaped.npy')
ETo_weather_2020_reshaped = np.load('/content/ETo_prediction/ETo_weather_2020_reshaped.npy')

# Each message now names its own year (all four used to say "2017has").
print('The data of the year 2017 has a size of: ', ETo_weather_2017_reshaped.shape)
print('The data of the year 2018 has a size of: ', ETo_weather_2018_reshaped.shape)
print('The data of the year 2019 has a size of: ', ETo_weather_2019_reshaped.shape)
print('The data of the year 2020 has a size of: ', ETo_weather_2020_reshaped.shape)

The data of the year 2017has a size of:  (62415, 171, 216)
The data of the year 2017has a size of:  (62415, 171, 216)
The data of the year 2017has a size of:  (62415, 171, 216)
The data of the year 2017has a size of:  (62586, 171, 216)


In [17]:
%matplotlib inline

!pip install ipywidgets>=7,<8

import numpy              as np
import ipywidgets         as widgets
import matplotlib.pyplot  as plt
import joblib
from IPython.display      import display
from PIL                  import Image
from matplotlib.widgets   import Cursor
from tqdm                 import tqdm
from google.colab         import output

/bin/bash: line 1: 8: No such file or directory


In [18]:
# Enable Colab's custom widget manager for the widgets below.
output.enable_custom_widget_manager()

# Variable selector: each option value is the variable index passed to
# Dropdown_Menu as `value`.
dropdown = widgets.Dropdown(
    options=[('Net radiation', 0),
             ('Ground Flux', 1),
             ('Air temperature at 2 m', 2),
             ('Slope vapour pressure curve', 3),
             ('Psychrometric const', 4),
             ('Saturation vapour pressure', 5),
             ('Actual vapour pressure', 6),
             ('Wind speed magnitude at 2m', 7),
             ('ETo Weather Classes', 8)],
    value=8,
    description='VARIABLE:',
)

# Hour of the day (0-23) and 1-based day of the year.
# NOTE(review): with max=364 the last day(s) of each year cannot be selected
# from the UI.
dropdown1 = widgets.IntSlider(min=0, max=23, step=1, description='HOUR:')
dropdown2 = widgets.IntSlider(min=1, max=364, step=1, description='DAY:')

# Animation control stepping through the days every 2000 ms.
# The initial value is 1: the previous value of 0 was below `min`.
play = widgets.Play(
    value=1,
    min=1,
    max=364,
    step=1,
    interval=2000,
    description="Press play",
    disabled=False
)

def Dropdown_Menu(value=1, hora=1, dia=1):
    """Plot one variable at a given hour and day as a 2x2 panel, one map per year.

    Reads the module-level arrays ``ETo_weather_2017_reshaped`` ...
    ``ETo_weather_2020_reshaped``, which stack 171 rows per day along the first
    axis and 9 columns per hour along the last axis.

    Parameters
    ----------
    value : int
        Variable index, 0-8 (8 selects the ETo weather clusters).
    hora : int or float
        Hour of the day; must be 0-23 after rounding.
    dia : int or float
        1-based day of the year; must be 1-365 after rounding.

    Raises
    ------
    ValueError
        If any selector is outside its range. Day 0 used to be accepted and
        produced an empty slice; day 365 used to be rejected.

    NOTE(review): the fixed colour limits below look like physical-unit ranges;
    confirm they match the scale of the values stored in the arrays.
    NOTE(review): 2020 has 366 days; day 366 is not selectable here.
    """
    var_labels = ('Net radiation MJ/(m2*hr)', 'Ground Flux MJ/(m2*hr)', 'Air temperature at 2 m °C',
                  'Slope vapour pressure curve kPa °C-1', 'Psychrometric const kPa °C-1',
                  'Saturation vapour pressure kPa', 'Actual vapour pressure kPa',
                  'Wind speed magnitude at 2m m/s', 'ETo Weather Clusters')
    # Lower / upper colour limits per variable, in the same order as var_labels.
    scala_inf = (-1.0, -0.05,  5.00, 0.00, 0.03, 0, 0, 0, 0)
    scala_sup = ( 0.9,  0.20, 31.00, 0.25, 0.08, 4, 3, 5, 36)

    rows_per_day = 171
    columns_per_hour = len(var_labels)
    last_day = 365

    # Validate the three selectors.
    if value not in range(columns_per_hour):
        raise ValueError(f'value must be an integer in 0-{columns_per_hour - 1}, got {value!r}')
    pos = int(value)

    horas = round(hora)
    if horas not in range(24):
        raise ValueError(f'hora must be in 0-23, got {hora!r}')

    dias = round(dia)
    if not 1 <= dias <= last_day:
        raise ValueError(f'dia must be in 1-{last_day}, got {dia!r}')

    # Rows belonging to the selected day and the column of (hour, variable).
    first_row = rows_per_day * (dias - 1)
    last_row = first_row + rows_per_day
    column = pos + horas * columns_per_hour

    var_name = var_labels[pos]
    titulo_dinamico = f'\n"{var_name}"'

    fig, axes = plt.subplots(2, 2, figsize=(8, 8))
    fig.dpi = 90
    fig.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.4,
                        hspace=0.1)

    # axes.flat runs row by row: 2017 and 2018 on top, 2019 and 2020 below.
    yearly_arrays = (
        (2017, ETo_weather_2017_reshaped),
        (2018, ETo_weather_2018_reshaped),
        (2019, ETo_weather_2019_reshaped),
        (2020, ETo_weather_2020_reshaped),
    )
    for ax, (year, year_array) in zip(axes.flat, yearly_arrays):
        day_map = year_array[first_row:last_row, :, column]

        image = ax.imshow(day_map, cmap='gnuplot2', vmin=scala_inf[pos], vmax=scala_sup[pos],
                          alpha=1, interpolation='bilinear')

        ax.set_title(titulo_dinamico + f'\nYear {year}', color="steelblue")
        ax.set_xlabel('Geographic pixel in X axis')
        ax.set_ylabel('Geographic pixel in Y axis')

        fig.colorbar(image, ax=ax, label=var_name)

    plt.show()

# Keep the Play control and the DAY slider in sync.
widgets.jslink((play, 'value'), (dropdown2, 'value'))

# Call Dropdown_Menu with the current variable, hour and day whenever one of
# the three controls changes.
widgets.interact(Dropdown_Menu, value=dropdown, hora=dropdown1, dia=dropdown2)
# Last expression of the cell: shows the Play control below the figure.
widgets.HBox([play])


interactive(children=(Dropdown(description='VARIABLE:', index=8, options=(('Net radiation', 0), ('Ground Flux'…

HBox(children=(Play(value=1, description='Press play', interval=2000, max=364, min=1),))