In [8]:
import datetime

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor

import warnings
warnings.filterwarnings("ignore")

In [9]:
def split(df: pd.DataFrame, n: int = 2) -> list:
    return np.array_split(df, n)


def blend_and_split(df: pd.DataFrame, n: int = 2, frac: int = 0.5, seed: int = 42) -> list:
    return list(df.sample(frac=frac, replace=False, random_state=seed) for i in range(0, n))


def init_svm():
    return SVR(kernel="rbf")


def init_sgd():
    return SGDRegressor(loss='squared_loss', alpha=0.0001)


def init_dnn():
    # TODO: implement
    raise Exception(f'Not implemented')


def init_model(model_type):
    """
    :param model_type: 'svm', 'sgd', 'neural_network'
    :return: model instance
    """
    # вибір типу та ініціалізація моделі
    if model_type == 'svm':
        model = init_svm()
    elif model_type == 'sgd':
        model = init_sgd()
    elif model_type == 'dnn':
        model = init_dnn()
    else:
        raise Exception(f'Unknown model type: {model_type}')

    return model


def init_metrics():
    return {
        'train': {
            'explained_variance_score': [],
            'max_error': [],
            'mean_absolute_error': [],
            'mean_squared_error': [],
            'root_mean_squared_error': [],
            'mean_absolute_percentage_error': [],
            'median_absolute_error': [],
            'r2_score': []
        },
        'test': {
            'explained_variance_score': [],
            'max_error': [],
            'mean_absolute_error': [],
            'mean_squared_error': [],
            'root_mean_squared_error': [],
            'mean_absolute_percentage_error': [],
            'median_absolute_error': [],
            'r2_score': []
        }
    }


def calc_train_metrics(metrics, train_y, train_pred):
    metrics['train']['explained_variance_score'].append(explained_variance_score(train_y, train_pred))
    metrics['train']['max_error'].append(max_error(train_y, train_pred))
    metrics['train']['mean_absolute_error'].append(mean_absolute_error(train_y, train_pred))
    metrics['train']['mean_squared_error'].append(mean_squared_error(train_y, train_pred, squared=True))
    metrics['train']['root_mean_squared_error'].append(mean_squared_error(train_y, train_pred, squared=False))
    metrics['train']['mean_absolute_percentage_error'].append(mean_absolute_percentage_error(train_y, train_pred))
    metrics['train']['median_absolute_error'].append(median_absolute_error(train_y, train_pred))
    metrics['train']['r2_score'].append(r2_score(train_y, train_pred))
    
    
def calc_test_metrics(metrics, test_y, test_pred):
    metrics['test']['explained_variance_score'].append(explained_variance_score(test_y, test_pred))
    metrics['test']['max_error'].append(max_error(test_y, test_pred))
    metrics['test']['mean_absolute_error'].append(mean_absolute_error(test_y, test_pred))
    metrics['test']['mean_squared_error'].append(mean_squared_error(test_y, test_pred, squared=True))
    metrics['test']['root_mean_squared_error'].append(mean_squared_error(test_y, test_pred, squared=False))
    metrics['test']['mean_absolute_percentage_error'].append(mean_absolute_percentage_error(test_y, test_pred))
    metrics['test']['median_absolute_error'].append(median_absolute_error(test_y, test_pred))
    metrics['test']['r2_score'].append(r2_score(test_y, test_pred))
    
    
def get_scaler(scaler_type):
    """
    :param scaler_type: 'max_abs_scaler',
                        'min_max_scaler',
                        'standard_scaler'
    :return: scaler instance
    """
    # вибір типу та ініціалізація скейлеру
    if scaler_type == 'max_abs_scaler':
        return MaxAbsScaler()
    elif scaler_type == 'min_max_scaler':
        return MinMaxScaler()
    elif scaler_type == 'standard_scaler':
        return StandardScaler()
    else:
        raise Exception(f'Unknown scaler type: {scaler_type}')
    
    
def scale(scaler_type, train_X, test_X):
    scaler = get_scaler(scaler_type)
    
    scaler.fit(train_X)

    train_X = pd.DataFrame(scaler.transform(train_X))
    test_X = pd.DataFrame(scaler.transform(test_X))

    return train_X, test_X


def add_poly_features(poly, train_X, test_X):
    train_X = pd.DataFrame(poly.fit_transform(train_X))
    test_X  = pd.DataFrame(poly.fit_transform(test_X))

    return train_X, test_X


def print_pretty(d, indent=0):
    for key, value in d.items():
        print('\t' * indent + str(key))
        if isinstance(value, dict):
            print_pretty(value, indent+1)
        else:
            print('\t' * (indent+2) + str(value))

In [14]:
def run_experiment(model_type:      str,  # 'svm', 'sgd', 'dnn'
                   scaler_type:     str,  # 'max_abs_scaler', 'min_max_scaler', or 'standard_scaler'
                   train_datasets:  list,
                   test_dataset:    pd.DataFrame,
                   n_pfeatures:     int,  # 0 -> do not generate polynomial features
                   export_to_excel: bool=True,
                   keep_last:       bool=True):
    
    """
    Note: number of cascades is set implicitly by the number of train data sets
    """
    if n_pfeatures > 0:
        poly = PolynomialFeatures(n_pfeatures)
    test_preds = []
    metrics = init_metrics()
    models = []

    for cascade in range(len(train_datasets)):
        print(f'cascade-{cascade}')
        train_dataset = train_datasets[cascade].copy()
        test_dataset_loc = test_dataset.copy()
        train_X = train_dataset.iloc[:, :-1]
        train_y = train_dataset.iloc[:,-1]
        test_X = test_dataset_loc.iloc[:, :-1]
        test_y = test_dataset_loc.iloc[:,-1]
        test_y_hist = test_dataset_loc.iloc[:,-1]
        model = init_model(model_type)
        
        # генеруємо і додаємо y_pred_i до фіч
        for i in range(cascade):
            train_X_alt = train_X.copy()
            test_X_alt = test_X.copy()
            
            if n_pfeatures > 0:
                train_X_alt, test_X_alt = add_poly_features(poly, train_X_alt, test_X_alt)
            train_X_alt, test_X_alt = scale(scaler_type, train_X_alt, test_X_alt)
            
            if keep_last:
                train_X['y_pred'] = models[i].predict(train_X_alt)
                test_X['y_pred'] = models[i].predict(test_X_alt)
            else:
                train_X[f'y_pred_{i+1}'] = models[i].predict(train_X_alt)
                test_X[f'y_pred_{i+1}'] = models[i].predict(test_X_alt)
        
        if n_pfeatures > 0:
            train_X, test_X = add_poly_features(poly, train_X, test_X)
        train_X, test_X = scale(scaler_type, train_X, test_X)
        
        model.fit(train_X, train_y)
        models.append(model)

        # рахуємо та зберігаємо метрики
        test_pred = model.predict(test_X)
        calc_train_metrics(metrics, train_y, model.predict(train_X))
        calc_test_metrics(metrics, test_y, test_pred)
        test_preds.append(test_pred)
    
    if export_to_excel:
        date_time_now = datetime.datetime.now()
        metrics_train_df = pd.DataFrame(data=metrics['train'])
        metrics_train_df.index.name = 'cascade'
        metrics_test_df = pd.DataFrame(data=metrics['test'])
        metrics_test_df.index.name = 'cascade'
        test_preds_df = pd.DataFrame(data=test_preds)
        
        with pd.ExcelWriter(f'experiment-run-{date_time_now}.xlsx') as writer:  
            metrics_train_df.to_excel(writer, sheet_name='train')
            metrics_test_df.to_excel(writer, sheet_name='test')
            test_preds_df.T.to_excel(writer, sheet_name='test-preds')
        
    return metrics

# Experiment 1

In [15]:
df_train = pd.read_csv('trainCO.txt', header=None)
df_test = pd.read_csv('testCO.txt', header=None)

In [16]:
train_X = df_train.iloc[:, :-1]
train_y = df_train.iloc[:,-1]
test_X = df_test.iloc[:, :-1]
test_y = df_test.iloc[:,-1]

metrics = init_metrics()
poly = PolynomialFeatures(2)
train_X, test_X = add_poly_features(poly, train_X, test_X)
train_X, test_X = scale('max_abs_scaler', train_X, test_X)
model = init_model('sgd')

model.fit(train_X, train_y)

calc_train_metrics(metrics, train_y, model.predict(train_X))
calc_test_metrics(metrics, test_y, model.predict(test_X))

print_pretty(metrics)

train
	explained_variance_score
			[0.8641072872723247]
	max_error
			[11.193751251320936]
	mean_absolute_error
			[0.2912024398238739]
	mean_squared_error
			[0.2874305608552335]
	root_mean_squared_error
			[0.5361255084914665]
	mean_absolute_percentage_error
			[0.23830794305338163]
	median_absolute_error
			[0.18285042966583626]
	r2_score
			[0.8641066412569429]
test
	explained_variance_score
			[0.885743912346448]
	max_error
			[8.465705486392427]
	mean_absolute_error
			[0.29024988998542706]
	mean_squared_error
			[0.24917312780718937]
	root_mean_squared_error
			[0.4991724429565292]
	mean_absolute_percentage_error
			[0.2127678185805919]
	median_absolute_error
			[0.1862377301771625]
	r2_score
			[0.885736063415919]


## Plain Split

In [17]:
%%time

train_split_plain = split(df_train, n=3)

print('len train_split_plain:', len(train_split_plain))

metrics = run_experiment(model_type = 'sgd', # TODO: 'dnn'
                         scaler_type = 'max_abs_scaler',
                         train_datasets = train_split_plain,
                         test_dataset = df_test,
                         n_pfeatures = 0)
print_pretty(metrics)

len train_split_plain: 3
cascade-0
cascade-1
cascade-2
train
	explained_variance_score
			[0.9316064900330141, 0.9046907339228347, 0.712472152415087]
	max_error
			[1.9168546236285504, 4.060536642367351, 11.171609093902006]
	mean_absolute_error
			[0.2451589958536199, 0.3224565780128606, 0.4260550361194906]
	mean_squared_error
			[0.10906777475897275, 0.23829204589219088, 0.6425013262017842]
	root_mean_squared_error
			[0.3302541063468746, 0.48815166279773226, 0.801561804355587]
	mean_absolute_percentage_error
			[0.18040336332097365, 0.2540939001716105, 0.39162180967196214]
	median_absolute_error
			[0.1890412375290786, 0.2259221748724669, 0.24706029695258752]
	r2_score
			[0.9316051539136831, 0.9045083909988478, 0.7124718000138446]
test
	explained_variance_score
			[0.7320473276240136, 0.8531792119822312, 0.8519342041309947]
	max_error
			[8.251200801984085, 8.371420250058634, 8.24808904859765]
	mean_absolute_error
			[0.6002981161730295, 0.36516409790515414, 0.3599220823870297]
	mea

## Blended Split

In [19]:
train_split_blended = blend_and_split(df_train, n=3, frac=.8)

print('len train_split_blended:', len(train_split_blended))

metrics = run_experiment(model_type = 'sgd',
                         scaler_type = 'max_abs_scaler',
                         train_datasets = train_split_blended,
                         test_dataset = df_test,
                         n_pfeatures = 0)
print_pretty(metrics)

len train_split_blended: 3
cascade-0
cascade-1
cascade-2
train
	explained_variance_score
			[0.8686069324115232, 0.8696635741485539, 0.8721661471808148]
	max_error
			[10.602040129740487, 10.612289851115175, 10.612374321059443]
	mean_absolute_error
			[0.33817728021416166, 0.3365323961725474, 0.3321187766289172]
	mean_squared_error
			[0.2770750883296699, 0.27484449692544444, 0.2695776720231015]
	root_mean_squared_error
			[0.5263792248271867, 0.5242561367551595, 0.519208697946309]
	mean_absolute_percentage_error
			[0.26926678356641387, 0.268022862322063, 0.26443116684119977]
	median_absolute_error
			[0.23622927977639718, 0.23661962615826382, 0.23724629343914283]
	r2_score
			[0.86860562835843, 0.8696634179912481, 0.872161048336731]
test
	explained_variance_score
			[0.862875332393821, 0.8637953235350101, 0.8661707747020093]
	max_error
			[8.392203030098235, 8.402915159160147, 8.408299862270015]
	mean_absolute_error
			[0.34398371543287853, 0.34238411463414103, 0.33792162474411136]
	

# Experiment 2

In [20]:
df_train = pd.read_csv('procom_train.csv', header=None)
df_test = pd.read_csv('procom_test.csv', header=None)

## Baseline

In [21]:
train_X = df_train.iloc[:, :-1]
train_y = df_train.iloc[:,-1]
test_X = df_test.iloc[:, :-1]
test_y = df_test.iloc[:,-1]

metrics = init_metrics()
poly = PolynomialFeatures(2)
train_X, test_X = add_poly_features(poly, train_X, test_X)
train_X, test_X = scale('max_abs_scaler', train_X, test_X)
model = init_model('sgd')

model.fit(train_X, train_y)

calc_train_metrics(metrics, train_y, model.predict(train_X))
calc_test_metrics(metrics, test_y, model.predict(test_X))

print_pretty(metrics)

train
	explained_variance_score
			[0.9436456683428726]
	max_error
			[77705.49057470186]
	mean_absolute_error
			[18583.867179830315]
	mean_squared_error
			[552319238.6434243]
	root_mean_squared_error
			[23501.4731164543]
	mean_absolute_percentage_error
			[0.04572135611756515]
	median_absolute_error
			[15780.236797051504]
	r2_score
			[0.9434624380713165]
test
	explained_variance_score
			[0.9377792882474798]
	max_error
			[101608.10019069718]
	mean_absolute_error
			[17918.881338531148]
	mean_squared_error
			[607538056.7433652]
	root_mean_squared_error
			[24648.287095523803]
	mean_absolute_percentage_error
			[0.04427424528974683]
	median_absolute_error
			[13982.254209578357]
	r2_score
			[0.9358699888451765]


## Plain Split

In [22]:
%%time

train_split_plain = split(df_train, n=3)

print('len train_split_plain:', len(train_split_plain))

metrics = run_experiment(model_type = 'sgd',
                         scaler_type = 'max_abs_scaler',
                         train_datasets = train_split_plain,
                         test_dataset = df_test,
                         n_pfeatures = 0)
print_pretty(metrics)

len train_split_plain: 3
cascade-0
cascade-1
cascade-2
train
	explained_variance_score
			[0.9501799528052095, 0.9544311687493802, 0.9662693323612902]
	max_error
			[73175.26949329441, 61369.27049331536, 70796.45578583912]
	mean_absolute_error
			[14103.713704901214, 16709.213983221936, 14405.019242923867]
	mean_squared_error
			[374374613.8874479, 456468911.8974759, 336112905.7050629]
	root_mean_squared_error
			[19348.76259318533, 21365.133088690927, 18333.382276739416]
	mean_absolute_percentage_error
			[0.039492135460351734, 0.039628875851587385, 0.03498999476179775]
	median_absolute_error
			[8849.576666942798, 12358.14030767235, 11766.982053210842]
	r2_score
			[0.9501786108264616, 0.9544311550612986, 0.9662692453040648]
test
	explained_variance_score
			[0.8851937232056044, 0.8527712229557596, 0.9255301542080799]
	max_error
			[106921.01702784572, 107394.57331118739, 112633.94981808099]
	mean_absolute_error
			[27643.603605281227, 42813.6477058302, 22098.837950500856]
	mean_squa

## Blended Split

In [23]:
train_split_blended = blend_and_split(df_train, n=4, frac=.8)

print('len train_split_blended:', len(train_split_blended))

metrics = run_experiment(model_type = 'sgd',
                         scaler_type = 'max_abs_scaler',
                         train_datasets = train_split_blended,
                         test_dataset = df_test,
                         n_pfeatures = 2)
print_pretty(metrics)

len train_split_blended: 4
cascade-0
cascade-1
cascade-2
cascade-3
train
	explained_variance_score
			[0.9401162638428308, 0.9405113014109258, 0.9384894270738083, 0.936995181458762]
	max_error
			[70855.54527165386, 69584.66947078641, 69993.51629091066, 71773.22716971562]
	mean_absolute_error
			[18244.552201973544, 18202.009012348233, 18557.410708821502, 18851.79387462468]
	mean_squared_error
			[551899720.1852795, 547488867.7208946, 565757276.3775729, 582227947.5255094]
	root_mean_squared_error
			[23492.546055829698, 23398.48003014073, 23785.652742306083, 24129.400065594447]
	mean_absolute_percentage_error
			[0.04434011200651712, 0.04420150882749385, 0.045009048200635136, 0.04557351494502158]
	median_absolute_error
			[14613.314930532564, 15061.470451993664, 15957.76921090926, 15579.257400357805]
	r2_score
			[0.9399765816088499, 0.9404562963708768, 0.938469463806852, 0.9366781492811557]
test
	explained_variance_score
			[0.9243918154924121, 0.9240177095514208, 0.9225970117862633, 