In [2]:
import os
import cv2
import mlflow
import tensorflow as tf
from sklearn import model_selection, metrics
from datetime import datetime

from src_nowcasting import image_preprocessing, sequence_img_generator, get_models, constants


### Data pre-processing

- preparation of the dataframe with ***'Target', 'Irr', 'Image'*** columns
- preprocessing and saving of the acquired images

In [2]:
# Image pre-processor; its transform_image() method is applied to every raw
# image in the conversion loop that follows.
pre_processor = image_preprocessing.PreProcessImage()

In [None]:
# Convert the raw image tree (one sub-folder per day) into pre-processed JPEGs
# stored under the same day name inside constants.PATH_OUTPUT_FOLDER.
# Entries are visited in sorted order so repeated runs behave identically.
for day in sorted(os.listdir(constants.PATH_INPUT_FOLDER)):

    day_in_dir = os.path.join(constants.PATH_INPUT_FOLDER, day)
    # Stray files next to the day folders would make os.listdir() below raise
    if not os.path.isdir(day_in_dir):
        continue

    # Create the output folder once per day; makedirs also creates a missing
    # PATH_OUTPUT_FOLDER and does not fail when the folder already exists
    folder_path = os.path.join(constants.PATH_OUTPUT_FOLDER, day)
    os.makedirs(folder_path, exist_ok=True)

    # go through all the images in the day
    files = sorted(os.listdir(day_in_dir))

    for f in files:

        # Load image
        in_path = os.path.join(day_in_dir, f)
        image = cv2.imread(in_path, cv2.IMREAD_UNCHANGED)

        # cv2.imread reports failure by returning None instead of raising
        if image is None:
            print(f'Skipping unreadable file: {in_path}')
            continue

        # Transform image
        new_image = pre_processor.transform_image(image)

        # Save image; splitext drops only the last extension, so file names
        # that contain additional dots keep their full stem
        out_path = os.path.join(folder_path, os.path.splitext(f)[0] + '.jpg')

        # cv2.imwrite returns False when the image could not be written
        if not cv2.imwrite(out_path, new_image):
            print(f'Could not write image: {out_path}')

In [3]:
# Build the sample dataframe from the weather files.
# NOTE(review): the horizon is hard-coded to 15 here, while the training cell
# logs constants.FORECAST_HORIZON - confirm that both values agree.
df_data = sequence_img_generator.generate_dataframe(constants.PATH_WEATHER_FILES, FORECAST_HORIZON = 15)

# Persist the dataframe. The path is assembled with os.path.join so it also
# resolves on non-Windows systems, and gzip compression is requested explicitly
# so the file content matches its '.gzip' suffix.
df_data.to_parquet(os.path.join('..', 'dataset', 'df_data.parquet.gzip'), compression='gzip')


### Training

Initialize keras session (ML library)

In [5]:
# Reset the global Keras state before a model is built
tf.keras.backend.clear_session()
# Last expression of the cell: displays the GPU devices visible to TensorFlow
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Initialize mlflow session (experiment tracking library):

- the mlflow-related parts can be deleted, provided that checkpointing is active
- the provided code runs only a single model with the best parameters (a trivial experiment)

In [6]:
# Keep the MLflow tracking data in a local SQLite database file (mlflow.db)
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Select the experiment by name; the returned Experiment is the cell output.
# NOTE(review): the saved output shows name='sCNN_best_model_tests', so it
# appears to stem from an earlier version of this cell - re-run to refresh it.
mlflow.set_experiment("sCNN_best_model")

<Experiment: artifact_location='file:///c:/Users/Admin/Code/maciej-solar-nowcasting/nowcasting/mlruns/4', creation_time=1696335798033, experiment_id='4', last_update_time=1696335798033, lifecycle_stage='active', name='sCNN_best_model_tests', tags={}>

Prepare the DF

In [9]:
# Choose which column the network is trained on (here: Target_CSI)
df_data['Target'] = df_data['Target_CSI']

# Keep only the samples whose elevation exceeds the configured threshold
above_threshold = df_data['elevation'] > constants.ELEVATION_THRESHOLD
df_data_reduced = df_data.loc[above_threshold]

# Both splits keep the row order (no shuffling): the leading TRAIN_SIZE share
# becomes train+validation and the remainder test; the same share is then
# applied again to separate training from validation.
split_kwargs = dict(train_size=constants.TRAIN_SIZE, shuffle=False)
df_train_full, df_test = model_selection.train_test_split(df_data_reduced, **split_kwargs)
df_train, df_val = model_selection.train_test_split(df_train_full, **split_kwargs)

# Batch generators reading the pre-processed images from the output folder
image_root = constants.PATH_OUTPUT_FOLDER
train_generator = sequence_img_generator.DataGeneratorGHI_SCNN(df_train, image_root, **constants.train_params)
val_generator = sequence_img_generator.DataGeneratorGHI_SCNN(df_val, image_root, **constants.valid_params)

Test cases: 

- Sunny day test - 19/08/2023, 23/08/2023
- Partially cloudy day test - 26/08/2023, 29/08/2023
- Mostly cloudy / rainy day test - 27/08/2023

In [10]:
# Calendar days evaluated as separate test cases (order defines test_cases)
test_days = [
    datetime(2023, 8, 19).date(),  # Sunny day
    datetime(2023, 8, 23).date(),  # Sunny day
    datetime(2023, 8, 26).date(),  # Partially cloudy day
    datetime(2023, 8, 27).date(),  # Mostly cloudy / rainy day
    datetime(2023, 8, 29).date(),  # Partially cloudy day
]

# Calendar date of every test sample, computed once and reused for each filter
sample_days = df_test.date.dt.date

# One independent copy of the matching rows per test day
test_cases = [df_test[sample_days == test_day].copy() for test_day in test_days]

# Individual names kept for direct access to a single day
df_test_1, df_test_2, df_test_3, df_test_4, df_test_5 = test_cases

Model training

In [3]:
# Input shape: the first two entries of IMG_SIZE followed by NO_IMAGES as the last axis
img_dim_0, img_dim_1 = constants.IMG_SIZE[0], constants.IMG_SIZE[1]
model_input_shape = [img_dim_0, img_dim_1, constants.NO_IMAGES]

model = get_models.SCNN_small(input_shape=model_input_shape)

In [None]:
# Autologging is switched off: parameters, metrics and the model are logged
# explicitly inside the run below.
mlflow.tensorflow.autolog(disable=True)

# Model parameters
BETA_1 = 0.9
BETA_2 = 0.999
LEARNING_RATE_START = 0.0003
LOSS = 'mean_squared_error'

RUN_ID = 1


def _build_callbacks():
    """Assemble the Keras callbacks used by the training run.

    A CSV logger and a best-only model checkpoint are added when the
    corresponding path in ``constants`` is set (truthy); early stopping on
    ``val_loss`` is always added.

    Returns:
        list: the callback instances to pass to ``model.fit``.
    """
    callbacks = []

    # Logging
    if constants.LOG_PATH:
        # CSVLogger opens its file directly, so the folder has to exist
        os.makedirs(constants.LOG_PATH, exist_ok=True)
        callbacks.append(tf.keras.callbacks.CSVLogger(
            os.path.join(constants.LOG_PATH, f'training_id_{constants.MODEL_TYPE}_{LEARNING_RATE_START}.csv')
        ))

    # Checkpointing (only the best model seen so far is kept)
    if constants.CHECKPOINT_PATH:
        os.makedirs(constants.CHECKPOINT_PATH, exist_ok=True)
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(constants.CHECKPOINT_PATH, f'training_id_{constants.MODEL_TYPE}_scaled_{constants.FORECAST_HORIZON}_{LEARNING_RATE_START}_{LOSS}.h5'),
            verbose=1,
            save_best_only=True,
        ))

    # Early stopping. restore_best_weights makes the in-memory model (which is
    # evaluated and logged to MLflow afterwards) carry the weights of the best
    # validation epoch instead of those from `patience` epochs later.
    callbacks.append(tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
    ))

    # No learning-rate scheduler is attached; Adam runs with the constant base
    # rate LEARNING_RATE_START.
    return callbacks


with mlflow.start_run(run_name=f'run_{RUN_ID:03d}_{constants.MODEL_TYPE}_scaled_15, lr: {LEARNING_RATE_START}, loss: {LOSS}'):

    # Hyper-parameters recorded with the run; taken from the constants defined
    # above so the logged values cannot drift from the ones actually used
    params = {
        'forecast_horizon': constants.FORECAST_HORIZON,
        'elevation_threshold': constants.ELEVATION_THRESHOLD,
        'model_type': constants.MODEL_TYPE,
        'learning_rate': LEARNING_RATE_START,
        'beta_1': BETA_1,
        'beta_2': BETA_2,
        'loss': LOSS,
    }

    mlflow.log_params(params)

    callbacks_list = _build_callbacks()

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=LEARNING_RATE_START,
        beta_1=BETA_1,
        beta_2=BETA_2,
        amsgrad=False,
    )

    model.compile(
        optimizer=optimizer,
        loss=LOSS,
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )

    history = model.fit(
        train_generator,
        epochs=constants.EPOCHS,
        validation_data=val_generator,
        callbacks=callbacks_list,
    )

    # Number of model parameters; computed once, logged and printed per test case
    n_model_params = model.count_params()
    mlflow.log_param("model_params", n_model_params)

    # Evaluate every selected test day separately
    for i_test, df_t in enumerate(test_cases):

        # A requested day may be absent from df_test; without this guard the
        # .iloc[0] lookup below would raise IndexError
        if df_t.empty:
            print(f'Test case {i_test}: no samples found in df_test, skipped')
            continue

        test_generator = sequence_img_generator.DataGeneratorGHI_SCNN(df_t, constants.PATH_OUTPUT_FOLDER, **constants.test_params)

        # Test ghi: the network output is multiplied by the Target_GHICS column
        y_test = model.predict(test_generator) * df_t.Target_GHICS.values.reshape(-1, 1)
        y_true = df_t.Target_GHIr.values
        # Reference forecast taken from the ghi1 column (labelled 'pers' below)
        y_pers = df_t.ghi1.values

        # Mean absolute error, matching the 'mae_*' names used for the printed
        # and logged values (the previous code computed the mean squared error)
        mae_test = metrics.mean_absolute_error(y_true, y_test)
        mae_per = metrics.mean_absolute_error(y_true, y_pers)

        # Skill relative to the reference: positive when the model error is lower
        FS = 1 - mae_test / mae_per

        print(f'Test case: {df_t.date.dt.date.iloc[0]}')
        print(f"model_params: {n_model_params}")
        print(f"mae_test: {mae_test}")
        print(f"mae_pers {mae_per}")
        print(f"FS: {FS}")

        mlflow.log_metric(f'mae_test_{i_test}', mae_test)
        mlflow.log_metric(f"mae_pers_{i_test}", mae_per)
        mlflow.log_metric(f"FS_{i_test}", FS)

    # Store the trained model as an artifact of this run
    mlflow.tensorflow.log_model(model, f'{constants.MODEL_TYPE}_{LEARNING_RATE_START}_{LOSS}')