In [1]:
import os
import pickle
import time
from functools import wraps

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp, ttest_ind
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    RepeatedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# import xgboost as xgb

# NOTE(review): get_ztest()/get_ztest_2samples() reference `stests` (presumably
# statsmodels.stats.weightstats) and get_feature_importance_data() falls back to
# a global TEST_SIZE; neither is defined in this notebook - confirm that the
# calling notebook provides them.

In [2]:
# Декоратор измерване на време за работа на функция
def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged and its
    metadata (name, docstring) is preserved via ``functools.wraps``.
    """
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        started = time.perf_counter()
        outcome = func(*args, **kwargs)
        # Elapsed wall-clock time of this single call, in seconds.
        elapsed = time.perf_counter() - started
        print(f'Function took {elapsed:.4f} seconds')
        return outcome
    return timeit_wrapper

In [3]:
# Четене на данни
def read_data(col_name, path):
    """Load a comma-separated file and split it into targets and attributes.

    Parameters
    ----------
    col_name : str
        Column dropped from the attributes together with the "time" column.
    path : str
        Location of the CSV file.

    Returns
    -------
    tuple
        ``(target, attribute, df_data)``: the columns whose name contains
        'Target', the remaining columns without "time"/``col_name``, and the
        full frame as read from disk.
    """
    frame = pd.read_csv(path, sep=',')
    # Every column with 'Target' in its name is treated as a prediction target.
    target_names = list(filter(lambda name: 'Target' in name, frame.columns))
    features = frame.drop(["time", col_name], axis=1).drop(target_names, axis=1)
    return frame[target_names], features, frame

In [4]:
# Скалиране на данни
def scaled_data(target, attribute, ts, scaler):
    """Split the data into train/test parts and scale the attributes.

    The split is done first and the scaler is fitted on the training
    attributes only; the test attributes are transformed with those fitted
    statistics. Fitting on the full data set (as was done before) leaks
    information about the test rows into the training features.

    Parameters
    ----------
    target : pandas.DataFrame
        Target columns, row-aligned with ``attribute``.
    attribute : pandas.DataFrame
        Feature columns (must expose ``.columns``).
    ts : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    scaler : object
        Transformer exposing ``fit_transform`` and ``transform``; it is
        fitted in place.

    Returns
    -------
    tuple
        ``(atribute_train, atribute_test, target_train, target_test,
        attribute_col_names)``; the attribute parts are whatever the scaler
        returns.
    """
    attribute_col_names = attribute.columns
    # random_state is fixed, so the row assignment is reproducible.
    atribute_train, atribute_test, target_train, target_test = train_test_split(
        attribute, target, test_size=ts, random_state=42)

    # Learn the scaling parameters from the training rows only.
    atribute_train = scaler.fit_transform(atribute_train)
    atribute_test = scaler.transform(atribute_test)

    return atribute_train, atribute_test, target_train, target_test, attribute_col_names

In [5]:
# Запазване на модел във файл
def save_model(model, file_name):
    """Pickle ``model`` to ``saved_models/<file_name>``.

    The destination directory is created when it does not exist yet, so the
    first save in a fresh working directory no longer fails with
    ``FileNotFoundError``.

    Parameters
    ----------
    model : object
        Any picklable object.
    file_name : str
        File name (relative to ``saved_models/``) to write to.
    """
    target_path = os.path.join(r'saved_models/', file_name)
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, 'wb') as file:
        pickle.dump(model, file)


In [6]:
# Зареждане на модел от файл
def load_model(file_name):
    """Return the object unpickled from ``saved_models/<file_name>``.

    Parameters
    ----------
    file_name : str
        File name relative to the ``saved_models/`` directory.
    """
    model_path = os.path.join(r'saved_models/', file_name)
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open(model_path, 'rb') as file:
        return pickle.load(file)

In [7]:
# Плотва четирите графики за конкретен модел - прогнозирани стойности срещу наблюдавани
# Вика от тялото си plot_charts()
def get_and_plot_data(prediction, target_test, start_index, stop_index):
    """Plot the diagnostic charts for targets ``start_index`` .. ``stop_index - 1``.

    For every target index ``i`` a frame with the columns "Actual",
    f"Predicted_Target_{i+1}" and "Predicted-Actual" is built and handed to
    ``plot_charts``.

    Parameters
    ----------
    prediction : array-like
        Indexed as ``prediction[:, i]``, i.e. one column per target.
    target_test : pandas.DataFrame
        Observed values; column f"Target_{i+1}" is read for target ``i``.
    start_index, stop_index : int
        Half-open range of zero-based target indices to plot.

    Returns
    -------
    pandas.DataFrame
        The frame of the last plotted target, or an empty frame when the
        range is empty.
    """
    # Returned as-is when the range is empty (previously an UnboundLocalError).
    e_minus_o = pd.DataFrame()

    for i in range(start_index, stop_index):
        predicted = np.asarray(prediction)[:, i]
        # Positional values only: the target index is deliberately ignored so
        # rows line up with the prediction rows.
        actual = target_test[f"Target_{i+1}"].to_numpy()
        # Each column now holds what its label says; before, predictions were
        # stored under "Actual" and the difference had the opposite sign.
        e_minus_o = pd.DataFrame({
            "Actual": actual,
            f"Predicted_Target_{i+1}": predicted,
            "Predicted-Actual": predicted - actual,
        })

        plot_charts(e_minus_o, i)

    return e_minus_o

# Charts drawn by plot_charts(), left to right:
# 1. Joint histogram of Predicted and Actual
# 2. Scatter plot: x = Actual, y = Predicted
# 3. Scatter plot: x = Predicted, y = Residual
# 4. Histogram of the Residual = Predicted - Actual

In [8]:
# Ползва се за get_and_plot_data()
def plot_charts(df, i):
    """Draw a 1x4 diagnostic panel for target ``i+1`` and show it.

    Panels, left to right: overlaid histograms of the "Actual" and predicted
    columns; scatter of predicted against actual; scatter of the residual
    column against predicted; histogram of the residual column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Actual", f"Predicted_Target_{i+1}" and
        "Predicted-Actual".
    i : int
        Zero-based target index, used to pick the predicted column and in
        the titles.
    """
    title_fontsize = 16
    predicted_col = f"Predicted_Target_{i+1}"
    f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20, 5))

    # Summary statistics shown in the text boxes of panels 1 and 4.
    mu = df["Predicted-Actual"].mean()
    sigma = df["Predicted-Actual"].std()
    textstr = '\n'.join((
        r'$\mu=%.2f$' % (mu, ),
        r'$\sigma=%.2f$' % (sigma, )))
    mu_predicted_a = df["Actual"].mean()
    mu_predicted_t = df[predicted_col].mean()
    textstr_ax1 = '\n'.join((
        r'$\mu_{actual}=%.2f$' % (mu_predicted_a, ),
        r'$\mu_{predicted}=%.2f$' % (mu_predicted_t, )))

    # Panel 1: overlaid histograms with the two means as dashed lines.
    ax1.hist(df["Actual"], color="darkgreen", bins = "fd", alpha = 0.3, label='Actual')
    ax1.hist(df[predicted_col], color="midnightblue", bins = "fd", alpha = 0.5, label='Predicted')
    ax1.axvline(mu_predicted_a, color='darkgreen', linestyle='dashed', linewidth=3)
    ax1.axvline(mu_predicted_t, color='midnightblue', linestyle='dashed', linewidth=1)
    ax1.set_title(f'Histograms Actual vs Predicted for Target_{i+1}', fontsize=title_fontsize)
    ax1.text(0.05, 0.95, textstr_ax1, transform=ax1.transAxes, fontsize=14, verticalalignment='top', horizontalalignment='left')
    ax1.legend()
    ax1.set_xlabel(f"Actual  vs Predicted for Target_{i+1}")
    # hist() is called without density=True, so the bars are raw counts.
    ax1.set_ylabel("Count")

    # Panel 2: predicted against actual.
    ax2.scatter( df["Actual"], df[predicted_col], color="darkgreen", s=1, alpha = 0.5)
    ax2.set_xlabel("Actual")
    ax2.set_ylabel(f"Predicted Price for Target_{i+1}")
    ax2.set_title(f'Actual  vs Predicted for Target_{i+1}', fontsize=title_fontsize)

    # Panel 3: residual against predicted.
    ax3.scatter( df[predicted_col], df["Predicted-Actual"], color="midnightblue", alpha = 0.5, s=1)
    ax3.set_xlabel("Predicted")
    ax3.set_ylabel("Residuals")
    ax3.set_title('Residual vs Predicted', fontsize=title_fontsize)

    # Panel 4: residual histogram with its mean as a dashed line.
    ax4.hist(df["Predicted-Actual"], color="crimson", bins = "fd", alpha = 0.8)
    ax4.axvline(mu, color='midnightblue', linestyle='dashed', linewidth=1)
    ax4.text(0.05, 0.95, textstr, transform=ax4.transAxes, fontsize=14, verticalalignment='top', horizontalalignment='left')
    ax4.set_xlabel("Residuals")
    ax4.set_ylabel("Count")
    ax4.set_title('Residual plot', fontsize=title_fontsize)

    plt.tight_layout()
    plt.show()

In [9]:
def generate_score_distributions(model, atribute_train, target_train, atribute_test, target_test, bplot=True):
    """Compute per-output R2, MAE and MSE on the train and test sets.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    atribute_train, atribute_test : array-like
        Features passed to ``model.predict``.
    target_train, target_test : array-like
        Observed targets for the respective feature sets.
    bplot : bool, default True
        When true, the six score arrays are also passed to ``plot_scores``.

    Returns
    -------
    tuple
        ``(r2_train, r2_test, mae_train, mae_test, mse_train, mse_test)``,
        each computed with ``multioutput='raw_values'`` (one score per
        output).
    """
    # Predict once per data set instead of once per metric.
    prediction_train = model.predict(atribute_train)
    prediction_test = model.predict(atribute_test)

    # The metric signature is (y_true, y_pred). The order matters for R2,
    # which is normalised by the variance of y_true.
    r2_train = r2_score(target_train, prediction_train, multioutput='raw_values')
    r2_test = r2_score(target_test, prediction_test, multioutput='raw_values')

    mae_train = mean_absolute_error(target_train, prediction_train, multioutput='raw_values')
    mae_test = mean_absolute_error(target_test, prediction_test, multioutput='raw_values')

    mse_train = mean_squared_error(target_train, prediction_train, multioutput='raw_values')
    mse_test = mean_squared_error(target_test, prediction_test, multioutput='raw_values')

    if bplot:
        plot_scores(r2_train, r2_test, mae_train, mae_test, mse_train, mse_test)

    return r2_train, r2_test, mae_train, mae_test, mse_train, mse_test

In [10]:
def plot_scores(r2_train, r2_test, mae_train, mae_test, mse_train, mse_test):
    """Show train/test histograms of R2, MAE and MSE side by side.

    Each argument is an array-like of scores; the train and test arrays of a
    metric are overlaid in the same panel.
    """
    f, axes = plt.subplots(1, 3, figsize=(20, 5))

    # One entry per panel: (metric label, panel title, train scores, test scores).
    panels = (
        ("R2", "Coefficient of Determination Distribution", r2_train, r2_test),
        ("MAE", "Mean Absolute Error Distribution", mae_train, mae_test),
        ("MSE", "Mean Squared Error Distribution", mse_train, mse_test),
    )

    for ax, (metric, title, train_scores, test_scores) in zip(axes, panels):
        ax.hist(train_scores, color="darkgreen", bins="fd", alpha=0.5, label=f"Train {metric}")
        ax.hist(test_scores, color="midnightblue", bins="fd", alpha=0.5, label=f"Test {metric}")
        ax.set_title(title)
        ax.set_xlabel(metric)
        ax.set_ylabel("Count")
        ax.legend()

    plt.show()

In [11]:
# Ползва се за get_ttest() и get_ztest ()
def composite_dataframe(price_prediction, price_target_test, idx):
    """Combine predictions and observations of one target into a single frame.

    Parameters
    ----------
    price_prediction : array-like
        Indexed as ``price_prediction[:, idx]``, i.e. one column per target.
    price_target_test : pandas.DataFrame
        Observed values; column f"Target_{idx+1}" is read.
    idx : int
        Zero-based target index.

    Returns
    -------
    pandas.DataFrame
        Columns "Actual", f"Predicted_Target_{idx+1}" and "Predicted-Actual"
        on a fresh 0..n-1 index.
    """
    predicted = np.asarray(price_prediction)[:, idx]
    # Positional values only: the target index is deliberately ignored so
    # rows line up with the prediction rows.
    actual = price_target_test[f"Target_{idx+1}"].to_numpy()
    # Each column now holds what its label says; before, predictions were
    # stored under "Actual" and the difference had the opposite sign.
    return pd.DataFrame({
        "Actual": actual,
        f"Predicted_Target_{idx+1}": predicted,
        "Predicted-Actual": predicted - actual,
    })

In [12]:
# One-sample t-test: is the mean of the residuals equal to zero?
def get_ttest(price_prediction, price_target_test, idx):
    """Run a one-sample t-test of one target's residuals against zero.

    The residuals are the "Predicted-Actual" column produced by
    ``composite_dataframe`` for target ``idx``.

    Returns
    -------
    str
        The p-value line followed by the decision at the 5% level.
    """
    residuals = composite_dataframe(price_prediction, price_target_test, idx)["Predicted-Actual"]
    tset, pval = ttest_1samp(residuals, 0)

    # alpha value is 0.05 or 5%
    if pval < 0.05:
        verdict = "we are rejecting null hypothesis"
    else:
        verdict = "we are accepting null hypothesis"
    return f"p-value: {pval}\n" + verdict
# The two hypotheses for this one-sample t-test are as follows:
    # H0: µ1 = 0 
    # HA: µ1 ≠ 0 

In [13]:
def get_ttest_2samples(sample_train, sample_test, bprint = True):
    """Run a two-sample t-test (equal variances assumed) on two samples.

    Parameters
    ----------
    sample_train, sample_test : array-like
        The two samples passed to ``ttest_ind``.
    bprint : bool, default True
        When true, print the p-value and the decision at the 5% level.

    Returns
    -------
    tuple
        ``(statistic, p-value)`` as returned by ``ttest_ind``.
    """
    tset, pval = ttest_ind(sample_train, sample_test, equal_var=True)

    if bprint:
        # alpha value is 0.05 or 5%
        verdict = "we are rejecting null hypothesis" if pval < 0.05 else "fail to reject the null hypothesis"
        print(f"p-value: {pval}\n" + verdict)
    return tset, pval

# The two hypotheses for this particular two sample t-test are as follows:
    # H0: µ1 = µ2 (the two population means are equal)
    # HA: µ1 ≠ µ2 (the two population means are not equal)
# Example: if the p-value of the test (e.g. 0.53005) is greater than alpha = 0.05, we fail to reject the null hypothesis of the test.
# We then do not have sufficient evidence to say that the means of the two populations are different.
# The null hypothesis (H0) states that there is no statistically significant difference between the two population means.

In [14]:
# One-sample z-test: is the mean of the residuals equal to zero?
def get_ztest (price_prediction, price_target_test, idx):
    """Run a one-sample z-test of one target's residuals against zero.

    The residuals are the "Predicted-Actual" column produced by
    ``composite_dataframe`` for target ``idx``.

    Returns
    -------
    str
        The p-value line followed by the decision at the 5% level.
    """
    residuals = composite_dataframe(price_prediction, price_target_test, idx)["Predicted-Actual"]
    ztest, pval = stests.ztest(residuals, x2=None, value=0)

    # alpha value is 0.05 or 5%
    if pval < 0.05:
        verdict = "reject null hypothesis"
    else:
        verdict = "accept null hypothesis"
    return f"p-value: {pval}\n" + verdict

# This one-sample z-test uses the following null and alternative hypotheses:
    # H0: µ1 = 0 
    # HA: µ1 ≠ 0 

In [15]:
def get_ztest_2samples(sample_train, sample_test, bprint = True):
    """Run a two-sample z-test for a zero difference in means.

    Parameters
    ----------
    sample_train, sample_test : array-like
        The two samples passed to ``stests.ztest``.
    bprint : bool, default True
        When true, print the p-value and the decision at the 5% level.

    Returns
    -------
    tuple
        ``(statistic, p-value)`` as returned by ``stests.ztest``.
    """
    ztest, pval = stests.ztest(sample_train, sample_test, value=0)

    if bprint:
        # alpha value is 0.05 or 5%
        verdict = "reject null hypothesis" if pval < 0.05 else "fail to reject the null hypothesis"
        print(f"p-value: {pval}\n" + verdict)
    return ztest, pval

# A two sample z-test uses the following null and alternative hypotheses:

#     H0: μ1 = μ2 (the two population means are equal)
#     HA: μ1 ≠ μ2 (the two population means are not equal)


In [16]:
# Ползва при анализа на данните - за dimentionality reduction
def get_feature_importance_data(df, col_name, test_size=None):
    """Min-max scale the features and split them chronologically.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    col_name : str
        Name of the column used as ``y``.
    test_size : float, optional
        Fraction of rows (taken from the end) put into the test part. When
        omitted, the notebook-level ``TEST_SIZE`` constant is used, which was
        the only option before.

    Returns
    -------
    tuple
        ``((X_train, y_train), (X_test, y_test))`` where the first
        ``int(n_rows * (1 - test_size))`` rows form the training part.
    """
    if test_size is None:
        # Backward-compatible fallback to the global the notebook must define.
        test_size = TEST_SIZE

    y = df[col_name]
    # The first remaining column is skipped by position, as before.
    # NOTE(review): presumably that is the time column - confirm.
    X = df.drop(col_name, axis=1).iloc[:, 1:]

    # NOTE(review): min/max are taken over all rows, including the test tail.
    col_min = X.min()
    # A constant column has zero range; dividing by 1 instead maps it to 0.0
    # rather than filling it with NaN (0/0).
    col_range = (X.max() - col_min).replace(0, 1)
    X = (X - col_min) / col_range

    train_samples = int(X.shape[0] * (1 - test_size))

    X_train, X_test = X.iloc[:train_samples], X.iloc[train_samples:]
    y_train, y_test = y.iloc[:train_samples], y.iloc[train_samples:]

    return (X_train, y_train), (X_test, y_test)

In [17]:
# PCA data preprocessing
# n_comp = 0.95 ---> explained variance ratio - запазваме 95% от вариацията на данните
@timeit
def PCA_feature_preprocessing(atribute_train, atribute_test, n_comp = 0.95, col_names = None):
    """Fit PCA on the training attributes and project both data sets.

    Parameters
    ----------
    atribute_train, atribute_test : array-like
        Training and test features; PCA is fitted on the training part only.
    n_comp : int or float, default 0.95
        Forwarded to ``PCA(n_components=...)``.
    col_names : sequence of str, optional
        Names of the original features. When omitted or empty, the names are
        taken from ``atribute_train.columns`` if present, otherwise generic
        ``feature_<j>`` names are generated. Previously an empty default
        raised ``IndexError``.

    Returns
    -------
    tuple
        ``(most_important_names, PC_train, PC_test)``: for every component
        the name of the feature with the largest absolute loading, and the
        two projected data sets.
    """
    pca = PCA(n_components=n_comp)
    PC_train = pca.fit_transform(atribute_train)
    PC_test = pca.transform(atribute_test)

    # len() rather than truthiness: col_names may be a pandas Index.
    if col_names is None or len(col_names) == 0:
        if hasattr(atribute_train, "columns"):
            col_names = list(atribute_train.columns)
        else:
            col_names = [f"feature_{j}" for j in range(pca.components_.shape[1])]

    # For each component (row), the position of the largest absolute loading.
    dominant_feature = np.abs(pca.components_).argmax(axis=1)
    most_important_names = [col_names[j] for j in dominant_feature]
    draw_cut_off_threshold(most_important_names, pca, n_comp)

    print("Transformed train shape:", PC_train.shape)
    print("Transformed test shape:", PC_test.shape)

    return most_important_names, PC_train, PC_test

In [18]:
# PCA data preprocessing visualization
def draw_cut_off_threshold (most_important_names, pca, n_comp = 0.90): 
    """Plot the cumulative explained variance of a fitted PCA.

    Parameters
    ----------
    most_important_names : sequence
        Only its length is used, as the number of points on the x axis.
    pca : object
        Fitted PCA exposing ``explained_variance_ratio_``.
    n_comp : float, default 0.90
        Height of the horizontal cut-off line; also used in its label.
    """
    # Kept as a global setting: later figures without an explicit figsize
    # inherit this size.
    plt.rcParams["figure.figsize"] = (20,10)

    fig, ax = plt.subplots()
    n_points = len(most_important_names)
    component_numbers = np.arange(1, n_points + 1, step=1)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    ax.set_ylim(0.0, 1.1)
    ax.plot(component_numbers, cumulative_variance, marker='o', linestyle='--', color='darkgreen')

    ax.set_xlabel('Number of Components')
    ax.set_xticks(np.arange(0, n_points + 2, step=10))
    ax.set_ylabel('Cumulative variance (%)')
    ax.set_title('The number of components needed to explain variance')

    # Horizontal threshold line and its label (label position is in data coordinates).
    ax.axhline(y = n_comp, color='midnightblue', linestyle='-')
    ax.text(0.2, 0.85, f'{n_comp*100}% cut-off threshold', color = 'midnightblue', fontsize=16)
    ax.grid(axis='x')
    plt.show()

In [19]:
#  Плотва прогнозирани стойности срещу наблюдавани за всяко едно наблюдение
def plot_actual_vs_prediction (prediction, target_test, variable_name, start_inx = 0, end_inx = 1):    
    """Plot predicted against observed values for individual observations.

    One figure is drawn per observation (row) in ``start_inx .. end_inx - 1``,
    together with a band of one standard deviation of that row's predictions
    around the predicted line.

    Parameters
    ----------
    prediction : numpy.ndarray
        Indexed as ``prediction[i, :]``.
    target_test : pandas.DataFrame
        Observed values, read with ``.iloc[i, :]``.
    variable_name : str
        Used in the legend labels and as the y-axis label.
    start_inx, end_inx : int
        Half-open range of row positions to plot.
    """
    band_style = dict(color='lightslategray', alpha = 0.5, linestyle='dashed', linewidth=1)

    for obs in range(start_inx, end_inx):
        # Positional pairing on a fresh 0..n-1 index.
        df_char = pd.DataFrame({
            "Predicted": prediction[obs, :],
            "Actual": target_test.iloc[obs, :].to_numpy(),
        })
        sigma = df_char["Predicted"].std()
        upper_band = df_char["Predicted"] + sigma
        lower_band = df_char["Predicted"] - sigma
        steps = df_char.index

        plt.figure(figsize=(14, 3), dpi=100)
        plt.plot(steps, df_char['Actual'], label=f'Actual {variable_name}', color='darkgreen', alpha = 0.7)
        plt.plot(steps, df_char['Predicted'], label=f'Predicted {variable_name}', color='midnightblue', alpha = 0.7)
        plt.plot(steps, upper_band, label=r'Predicted + $\sigma (%.2f)$' % (sigma, ), **band_style)
        plt.plot(steps, lower_band, label=r'Predicted - $\sigma (%.2f)$' % (sigma, ), **band_style)
        plt.fill_between(steps, lower_band, upper_band, color='lightslategray', alpha=0.15)
        plt.title(f'Predicted vs Actual for observation {obs}', fontsize=14)
        plt.xlabel('Hours ahead')
        plt.ylabel(variable_name)
        plt.legend(loc='upper left')
        plt.show()

In [20]:
#  Plots the values of the independent variables (PCA components) against the residuals - for a target of choice
def plot_feature_vs_residuals_PCA(att, prediction, target_test, start_index=0, stop_index=1, target_indx =0, col_name = None): #0 to count(features)   
    """Scatter PCA components against the residuals of one target.

    Parameters
    ----------
    att : array-like
        Component matrix; its columns are named ``PCA_0``, ``PCA_1``, ...
    prediction, target_test
        Passed to ``composite_dataframe`` to obtain the residuals.
    start_index, stop_index : int
        Half-open range of component positions to plot when ``col_name`` is
        not given.
    target_indx : int, default 0
        Zero-based index of the target whose residuals are used.
    col_name : str, optional
        When given, only this component is plotted.
    """
    df = pd.DataFrame(att)
    component_names = [f"PCA_{j}" for j in range(0, att.shape[1])]
    df.columns = component_names
    df['Residuals'] = composite_dataframe(prediction, target_test, idx=target_indx)['Predicted-Actual']

    def _scatter(name):
        # One 5x5 figure: the chosen component on x, the residuals on y.
        plt.figure(figsize=(5, 5))
        plt.scatter(df[name], df["Residuals"], color="midnightblue", s=2, alpha = 0.5)
        plt.title(f'Feature {name} vs Residuals', fontsize=14)
        plt.xlabel(name)
        plt.ylabel("Residuals")

    if col_name:
        _scatter(col_name)
    else:
        for position in range(start_index, stop_index):
            _scatter(component_names[position])
    plt.show()

In [21]:
#  Plots the values of the independent variables (features) against the residuals - for a target of choice
def plot_feature_vs_residuals(att, att_col_names, prediction, target_test, start_index=0, stop_index=1, target_indx =0, col_name = None): #0 to count(features)
    """Scatter features against the residuals of one target.

    Parameters
    ----------
    att : array-like
        Feature matrix.
    att_col_names : sequence
        Column names assigned to ``att``.
    prediction, target_test
        Passed to ``composite_dataframe`` to obtain the residuals.
    start_index, stop_index : int
        Half-open range of feature positions to plot when ``col_name`` is
        not given.
    target_indx : int, default 0
        Zero-based index of the target whose residuals are used.
    col_name : str, optional
        When given, only this feature is plotted.
    """
    df = pd.DataFrame(att)
    df.columns = att_col_names
    e_minus_o = composite_dataframe(prediction, target_test, idx=target_indx)
    df['Residuals'] = e_minus_o['Predicted-Actual']
    if col_name:
        plt.figure(figsize=(5, 5))
        plt.scatter(df[col_name], df["Residuals"], color="midnightblue", s=2, alpha = 0.5)
        plt.xlabel(col_name)
        plt.ylabel("Residuals")
        plt.title(f'Feature {col_name} vs Residuals', fontsize=14)
    else:
        for i in range (start_index, stop_index):
            plt.figure(figsize=(5, 5))
            plt.scatter(df[att_col_names[i]], df["Residuals"], color="midnightblue", s=2, alpha = 0.5)
            plt.title(f'Feature {att_col_names[i]} vs Residuals', fontsize=14)
            plt.xlabel(f"{att_col_names[i]}")
            plt.ylabel("Residuals")
    # The stray no-op expression `prediction, target_test` that used to follow
    # this call has been removed.
    plt.show()

In [22]:
# изчислява и отпечатва резултат от кросвалидация на модел
@timeit
def cv_score(model, trian_atribute, train_target):
    """Cross-validate ``model`` and print/return the mean MSE, RMSE, MAE and R2.

    Uses repeated 10-fold cross-validation (3 repeats, fixed seed). All four
    metrics are collected in a single ``cross_validate`` run, so the model is
    fitted once per fold instead of once per fold and metric.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator.
    trian_atribute, train_target : array-like
        Training features and targets.

    Returns
    -------
    tuple
        ``(mean MSE, mean RMSE, mean MAE, mean R2)`` over all folds.
    """
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
    scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error', 'r2']
    scores = cross_validate(model, trian_atribute, train_target, scoring=scoring, cv=cv, n_jobs=-1)

    # The error scorers are negated by scikit-learn; flip them back.
    mae = -scores['test_neg_mean_absolute_error']
    mse = -scores['test_neg_mean_squared_error']
    rmse = -scores['test_neg_root_mean_squared_error']
    # R2 keeps its sign: a negative value means the model is worse than
    # predicting the mean, and taking the absolute value would hide that.
    r2 = scores['test_r2']

    print('Cross validated Mean Squared Error: %.3f (%.3f)' % (np.mean(mse), np.std(mse)))
    print('Cross validated Root Mean Squared Error: %.3f (%.3f)' % (np.mean(rmse), np.std(rmse)))
    print('Cross validated Mean Absolute Error: %.3f (%.3f)' % (np.mean(mae), np.std(mae)))
    print('Cross validated Coefficient of Determination: %.3f (%.3f)' % (np.mean(r2), np.std(r2)))

    return np.mean(mse), np.mean(rmse), np.mean(mae), np.mean(r2)

In [23]:
# изчислява и отпечатва резултат от обучение на модел
def evaluation_report(model, atribute, target):
    """Print MSE, RMSE, MAE, R2 and prediction statistics for a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    atribute : array-like
        Features to predict on.
    target : array-like
        Observed targets for ``atribute``.

    Returns
    -------
    tuple
        ``(mse, mae, r2)``; RMSE is printed but not returned.
    """
    prediction = model.predict(atribute)
    mse = mean_squared_error(target, prediction)
    # `squared=False` is no longer accepted by recent scikit-learn releases.
    # Take the root per output and then average, which is what that flag did.
    rmse = np.sqrt(mean_squared_error(target, prediction, multioutput='raw_values')).mean()
    mae = mean_absolute_error(target, prediction)
    r2 = r2_score(target, prediction)

    print("Score Results:")
    print("Mean squared error: %.2f" % mse)
    print("Root mean squared error: %.2f" % rmse)
    print("Mean absolute error: %.2f" % mae)
    print("Coefficient of determination: %.4f" % r2)
    print('Predicted mean: %.3f (%.3f)' % (np.mean(prediction), np.std(prediction)))

    return mse, mae, r2

In [24]:
# изчислява и връща резултат от обучение на модел
def evaluation_results(model, atribute, target):
    """Compute MSE, MAE, RMSE and R2 for a fitted model without printing.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    atribute : array-like
        Features to predict on.
    target : array-like
        Observed targets for ``atribute``.

    Returns
    -------
    tuple
        ``(mse, mae, rmse, r2)``.
    """
    prediction = model.predict(atribute)
    mse = mean_squared_error(target, prediction)
    # `squared=False` is no longer accepted by recent scikit-learn releases.
    # Take the root per output and then average, which is what that flag did.
    rmse = np.sqrt(mean_squared_error(target, prediction, multioutput='raw_values')).mean()
    mae = mean_absolute_error(target, prediction)
    r2 = r2_score(target, prediction)

    return mse, mae, rmse, r2

In [25]:
# GridSearchCV за подбор на хиперпараметри на модел
@timeit
def make_grid(estimator, params, cv, atribute, target):
    """Run a grid search and print a report for the best estimator.

    Parameters
    ----------
    estimator : estimator
        Unfitted scikit-learn compatible estimator.
    params : dict or list of dict
        Forwarded as ``param_grid``.
    cv
        Forwarded as ``cv`` to ``GridSearchCV``.
    atribute, target : array-like
        Data the search is fitted on; the printed report is computed on the
        same data.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    grid = GridSearchCV(estimator=estimator, param_grid=params, cv=cv, return_train_score=True, n_jobs=-1)
    grid.fit(atribute, target)

    print('Grid Results:')
    # Was `gred.best_estimator_`, a typo that raised NameError.
    evaluation_report(grid.best_estimator_, atribute, target)
    print(f'Best grid params: {grid.best_params_}')

    return grid

In [26]:
# GridSearchCV за подбор на хиперпараметри на модел
@timeit
def get_grid_search(estimator,  params, cv, atribute_train, target_train):
    """Fit a grid search scored by negative mean absolute error.

    Prints the best estimator found and returns the fitted ``GridSearchCV``.
    """
    search = GridSearchCV(
        estimator=estimator,
        param_grid=params,
        scoring='neg_mean_absolute_error',
        cv=cv,
        return_train_score=True,
        n_jobs=-1,
    )
    search.fit(atribute_train, target_train)
    print(search.best_estimator_)

    return search


In [27]:
# RandomizedSearchCV за подбор на хиперпараметри на модел
@timeit
def make_random_grid(estimator, params, cv, atribute_train, target_train, n_iter=100):
    """Fit a randomized hyper-parameter search with a fixed seed.

    ``n_iter`` parameter settings are sampled from ``params``. Prints the
    best estimator found and returns the fitted ``RandomizedSearchCV``.
    """
    search_options = dict(
        param_distributions=params,
        random_state=42,
        n_iter=n_iter,
        cv=cv,
        verbose=1,
        n_jobs=-1,
        return_train_score=True,
    )
    grid = RandomizedSearchCV(estimator, **search_options)
    grid.fit(atribute_train, target_train)

    print(grid.best_estimator_)

    return grid

In [28]:
# Генерира DataFrame с резултатите от всички модели
def evaluation_report_df(model, atribute_train, target_train, atribute_test, target_test, model_filename, tranformed_data = False, wrapper = None):
    """Build a one-row DataFrame summarising a fitted model on train and test data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    atribute_train, target_train, atribute_test, target_test : array-like
        Train and test features/targets.
    model_filename : str
        Stored verbatim in the ``model_filename`` column.
    tranformed_data : bool, default False
        Truthy tags the row as "_PCA_data", otherwise "_full_data".
    wrapper : optional
        When truthy, the type names of the wrapped estimator(s) are appended
        to the model name.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns Model, Data, the train metrics, the
        test metrics, four train-vs-test p-values and the model file name.
    """
    columns = ['Model', 'Data', 'mse_train', 'mae_train', 'rmse_train', 'r2_train', 'predicted_train_mean',
               'predicted_train_std', 'mse_test', 'mae_test', 'rmse_test', 'r2_test',
               'predicted_test_mean', 'predicted_test_std', 'pval_ztest_r2', 'pval_ttest_r2',
               'pval_ztest_mae', 'pval_ttest_mae', 'model_filename']

    def _split_metrics(target, prediction):
        # RMSE: root per output, then averaged. `squared=False` is no longer
        # accepted by recent scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(target, prediction, multioutput='raw_values')).mean()
        return {
            'mse': mean_squared_error(target, prediction),
            'mae': mean_absolute_error(target, prediction),
            'rmse': rmse,
            'r2': r2_score(target, prediction),
            'mean': np.mean(prediction),
            'std': np.std(prediction),
        }

    model_name = type(model).__name__
    if wrapper:
        # A wrapper exposes the inner model as `.estimator` (which may itself
        # have a `.base_estimator`) or directly as `.base_estimator`.
        try:
            inner = model.estimator
        except AttributeError:
            model_name += f"_{type(model.base_estimator).__name__}"
        else:
            model_name += f"_{type(inner).__name__}"
            if hasattr(inner, "base_estimator"):
                model_name += f"_{type(inner.base_estimator).__name__}"

    ddata = "_PCA_data" if tranformed_data else "_full_data"
    model_name += ddata

    train = _split_metrics(target_train, model.predict(atribute_train))
    test = _split_metrics(target_test, model.predict(atribute_test))

    # Per-output score arrays, compared between train and test below.
    r2_train_distribution, r2_test_distribution, mae_train_distribution, mae_test_distribution, _, _ = generate_score_distributions(
        model, atribute_train, target_train, atribute_test, target_test, bplot=False)

    _, z_pval_r2 = get_ztest_2samples(r2_train_distribution, r2_test_distribution, bprint = False)
    _, z_pval_mae = get_ztest_2samples(mae_train_distribution, mae_test_distribution, bprint = False)
    _, t_pval_r2 = get_ttest_2samples(r2_train_distribution, r2_test_distribution, bprint = False)
    _, t_pval_mae = get_ttest_2samples(mae_train_distribution, mae_test_distribution, bprint = False)

    row = {'Model': model_name, 'Data': ddata,
           'mse_train': train['mse'], 'mae_train': train['mae'], 'rmse_train': train['rmse'], 'r2_train': train['r2'],
           'predicted_train_mean': train['mean'], 'predicted_train_std': train['std'],
           'mse_test': test['mse'], 'mae_test': test['mae'], 'rmse_test': test['rmse'], 'r2_test': test['r2'],
           'predicted_test_mean': test['mean'], 'predicted_test_std': test['std'],
           'pval_ztest_r2': z_pval_r2, 'pval_ttest_r2': t_pval_r2,
           'pval_ztest_mae': z_pval_mae, 'pval_ttest_mae': t_pval_mae,
           'model_filename': model_filename}

    # DataFrame.append no longer exists in pandas 2; build the row directly.
    return pd.DataFrame([row], columns=columns)

In [29]:
# Генерира DataFrame с резултатите от текущо разглеждан модел
def get_current_model_evaluation_report(df):
    result = pd.DataFrame(columns=['Metric', 'Train', 'Test'])
    result['Metric']  =  np.array(['Mean squared error', 'Mean absolute error', 'Root mean squared error', 'Coefficient of determination', 'Predicted mean', 'Predicted_std'])   
    result['Train']  =  np.array(df.round(decimals = 4).T[2:8])  
    result['Test']  =  np.array(df.round(decimals = 4).T[8:14])  
    result = result.set_index('Metric')
    return result