In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from itertools import product

from sklearn.svm import SVR
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    train_test_split,
    HalvingGridSearchCV,
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
)

import sys, os
sys.path.append(os.path.abspath(os.path.join("..")))

In [2]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *

In [3]:
def get_results_df(path):
    if os.path.isfile(path):
        return pd.read_parquet(path)
    return pd.DataFrame()

def store_results(new_results, path):
    if os.path.isfile(path):
        results = pd.read_parquet(path)
        results = pd.concat([results, new_results], ignore_index=True)
        results.to_parquet(path)
    else:
        new_results.to_parquet(path)

In [4]:
# this method will get model data for a specific h3 resolution and time interval length
def get_model_data(h3_res, time_interval_length):
    model_data = pd.read_feather(os.path.join(MODEL_DATA_DIR_PATH, f"{h3_res}_{time_interval_length}.feather"))
    return model_data

In [5]:
def split_and_scale_data(model_data):
    y = model_data["demand"]
    X = model_data.drop(columns=["demand"])

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

In [30]:
def train_model(param_grid, X_train, y_train):
    svr = SVR()
    models = HalvingGridSearchCV(svr, param_grid, n_jobs=-1, scoring="neg_mean_squared_error", random_state=42)
    # print(X_train)
    models.fit(X_train, y_train)
    return models

In [31]:
def mean_average_percentage_error(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred) / y_true.mean()


def root_mean_squared_error(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) ** 0.5


def get_results(models, h3_res, time_interval_length, do_evaluate_model, X_test, y_test):
    results = pd.DataFrame(models.cv_results_)
    results['n_iter'] = 0
    results.loc[0, 'n_iter'] = models.best_estimator_.n_iter_
    results['h3_res'] = h3_res
    results['time_interval_length'] = time_interval_length

    if do_evaluate_model:
        y_pred = models.best_estimator_.predict(X_test)
        results['mse'] = mean_squared_error(y_test, y_pred)
        results['mae'] = mean_absolute_error(y_test, y_pred)
        results['mape'] = mean_average_percentage_error(y_test, y_pred)
        results['rmse'] = root_mean_squared_error(y_test, y_pred)
        
    return results

In [32]:
def get_svm_metas():
    return [
        {'kernel': ['linear'], 'C': [1, 10, 100], 'gamma': [-1],            'degree': [-1],         'max_iter': [100000]},
        {'kernel': ['rbf'],    'C': [1, 10, 100], 'gamma': [0.001, 0.0001], 'degree': [-1],         'max_iter': [100000]},
        {'kernel': ['poly'],   'C': [1, 10, 100], 'gamma': [-1],            'degree': [2, 3, 4, 5], 'max_iter': [100000]}
    ]


def check_if_model_result_empty(meta, results, h3_res, time_interval_length):
    return results[
        (results['h3_res'] == h3_res) &
        (results['time_interval_length'] == time_interval_length) &
        (results['param_kernel'] == meta[0]) &
        (results['param_C'] == meta[1]) &
        ((results['param_gamma'] == meta[2]) | (pd.isnull(results['param_gamma']))) &
        ((results['param_degree'] == meta[3]) | (pd.isnull(results['param_degree']))) 
    ]['mean_test_score'].empty


def get_param_grid(model_meta):
    param_grid = {
        'kernel': [model_meta[0]],
        'C': [model_meta[1]],
        'max_iter': [model_meta[4]]
    }
    if model_meta[2] > 0: param_grid = {**param_grid, 'gamma': [model_meta[2]]}
    if model_meta[3] > 0: param_grid = {**param_grid, 'degree': [model_meta[3]]}
    return param_grid

In [33]:
def get_availabe_models_metas_first_stage(h3_res, time_interval_length):
    results = get_results_df(SVM_FIRST_STAGE_RESULTS_PATH)    
    all_metas = get_svm_metas()

    # the following code will create all possible combinations of parameters for all models
    metas = [list(product(*meta.values())) for meta in all_metas]
    metas = [item for sublist in metas for item in sublist]
    available_metas = metas
    if not results.empty:
        available_metas = [meta for meta in metas if check_if_model_result_empty(meta, results, h3_res, time_interval_length)]

    # group_by h3 and time, put other params in param grid
    metas_grouped = []
    for kernel in ['linear', 'rbf', 'poly']:
        param_grid = [get_param_grid(meta) for meta in available_metas if (meta[0] == kernel)]
        if len(param_grid) == 0: continue
        metas_grouped.append(param_grid)

    return metas_grouped

def get_availabe_models_metas_second_stage(h3_res, time_interval_length):
    results = get_results_df(SVM_FIRST_STAGE_RESULTS_PATH)    
    best_model = results.sort_values(by=['mean_train_score'], ascending=False)
    meta = [
        h3_res,
        time_interval_length,
        best_model['param_kernel'].iloc[0],
        best_model['param_C'].iloc[0],
        best_model['param_gamma'].iloc[0],
        best_model['param_degree'].iloc[0]
    ]
    
    if ((not results.empty) & (check_if_model_result_empty(meta, results, h3_res, time_interval_length))):
        return [[{
            'kernel': [best_model['param_kernel'].iloc[0]],
            'C': [best_model['param_C'].iloc[0]],
            'gamma': [best_model['param_gamma'].iloc[0]],
            'degree': [best_model['param_degree'].iloc[0]],
            'max_iter': [best_model['param_max_iter'].iloc[0]]
        }]]

    return []

In [38]:
def execute_stage(path, h3_res, time_interval_length, get_metas_func, do_evaluate_model):
    metas = get_metas_func(h3_res, time_interval_length)
    
    for param_grid in tqdm(metas):
        feedback = f"h3: {h3_res} | t:{time_interval_length} | - " + param_grid[0]["kernel"][0]
        tqdm.write(feedback, end="\r")
        
        model_data = get_model_data(h3_res, time_interval_length)
        # print(model_data.isna().sum())
        print("len of nan")
        print(len(model_data.columns[model_data.isna().any()]))
        model_data = model_data.iloc[:1000] # to be deleted

        X_train, X_test, y_train, y_test = split_and_scale_data(model_data)
        models = train_model(param_grid, X_train, y_train)

        results = get_results(models, h3_res, time_interval_length, do_evaluate_model, X_test, y_test)
        store_results(results, path)  
        tqdm.write(feedback + " ✓")

In [39]:
execute_stage(SVM_FIRST_STAGE_RESULTS_PATH, TUNE_H3_RESOLUTION, TUNE_TIME_INTERVAL_LENGTH, get_availabe_models_metas_first_stage, False)

0it [00:00, ?it/s]

In [40]:
# results = pd.read_parquet(SVM_FIRST_STAGE_RESULTS_PATH)
# results.sort_values(by=['mean_train_score'], ascending=False).head(2)

In [41]:
for h3_res in PREDICTIVE_H3_RESOLUTIONS:
    for time_interval_length in CALC_TIME_INTERVAL_LENGTHS:
        execute_stage(SVM_SECOND_STAGE_RESULTS_PATH, h3_res, time_interval_length, get_availabe_models_metas_second_stage, True)

  0%|          | 0/1 [00:00<?, ?it/s]

len of nan1 | - poly
0


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Dev\miniconda\envs\AAA_MAGMA\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Dev\miniconda\envs\AAA_MAGMA\lib\site-packages\sklearn\svm\_base.py", line 269, in fit
    raise ValueError(
ValueError: The dual coefficients or intercepts are not finite. The input data may contain large values and need to bepreprocessed.


In [None]:
# results = pd.read_parquet(SVM_SECOND_STAGE_RESULTS_PATH)
# results