In [None]:
# PREDIKCIA HODNOT FYZIKALNYCH PARAMETROV ODDELENYCH SYSTEMOV METODOU SVR S PRIDANYM SUMOM A BEZ SUMU
# VYTVORENIE A ULOZENIE MODELOV

In [1]:
# BLOK 1
# Importovanie kniznic.

import numpy as np
import pandas as pd

import pickle
import os

from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [2]:
# BLOK 2
# Funkcie pre generovanie sumu. Nastavenie pseudo-nahodneho generatora.

def generate_observation_sigma(space_obs_frac=0.5, earth_based_sigma=4e-3, space_based_sigma=2e-4):
    """
    Draws the standard deviation of the noise that is later added to a synthetic light curve.

    One of two typical noise levels (earth based or space based observation) is picked at random first. The returned
    value is then drawn from a Rayleigh distribution whose scale is the picked level, so the result is always
    non-negative and varies from call to call.

    Uses the global `np.random` state, i.e. results are repeatable only after `np.random.seed(...)`.

    :param space_obs_frac: float; probability (0 to 1) of picking the space based noise level, the earth based level
                           is picked with probability `1 - space_obs_frac`
    :param earth_based_sigma: float; Rayleigh scale used when the earth based level is picked
    :param space_based_sigma: float; Rayleigh scale used when the space based level is picked
    :return: float; standard deviation of the light curve noise
    :raises ValueError: if `space_obs_frac` does not lie between 0 and 1
    """
    # Fail early with a readable message instead of the generic probability error raised by np.random.choice.
    if not 0.0 <= space_obs_frac <= 1.0:
        raise ValueError("space_obs_frac must lie between 0 and 1, got {!r}".format(space_obs_frac))

    sigma = np.random.choice(
        [earth_based_sigma, space_based_sigma],
        p=[1 - space_obs_frac, space_obs_frac],
    )
    return np.random.rayleigh(sigma)

def stochastic_noise_generator(curve):
    """
    Introduces gaussian noise into synthetic observation provided in `curve`.

    A single noise level is drawn for the whole curve and every point is then perturbed independently with a normal
    distribution centred on the original value.

    :param curve: numpy.array or sequence of floats; normalized light curve
    :return: Tuple(numpy.array, numpy.array); normalized light curve with added noise and an array of the same shape
             filled with the standard deviation used for the noise
    """
    # Accept plain sequences as well; `.shape` below is only available on arrays.
    curve = np.asarray(curve)
    sigma = generate_observation_sigma()
    return np.random.normal(curve, sigma), np.full(curve.shape, sigma)

# Seed NumPy's global random generator so that the np.random calls that follow are repeatable.
np.random.seed(1234)

In [3]:
# BLOCK 3
# Load the sample of detached systems.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file that comes from a trusted source.

data_sample = pd.read_pickle("detached_curves_samples_svr.pkl")

In [4]:
# Number of rows in the loaded sample.
len(data_sample)

50000

In [5]:
# Keep a random subset of 400 rows. The loaded frame is overwritten, so the full sample is no longer
# available afterwards and re-running this cell draws from the already reduced frame.
# NOTE(review): no random_state is passed; the draw presumably follows NumPy's global RNG state — confirm.
data_sample = data_sample.sample(400)

In [18]:
# BLOCK 4
# Select and prepare the data. Check how the selection is distributed over the filters.

np.random.seed(1)

# Temperature ratio of the two components, used as one of the regression targets.
data_sample["t2/t1"] = data_sample["secondary__t_eff"] / data_sample["primary__t_eff"]
data_sample = data_sample.round({"mass_ratio": 14})

# Targets: one column per predicted parameter.
target_columns = [
    "t2/t1",
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
]
y = data_sample[target_columns]

# Features: one light curve per row, collected into a plain list.
X = list(data_sample["curve"])

print(data_sample["filter"].value_counts())

SLOAN_g      44
Bessell_U    39
Bessell_R    35
SLOAN_z      32
GaiaDR2      31
Kepler       30
SLOAN_r      29
SLOAN_u      28
TESS         27
Bessell_B    27
Bessell_V    26
Bessell_I    26
SLOAN_i      26
Name: filter, dtype: int64


In [19]:
# BLOCK 4 - a
# Split the data into a training and a test set in an 80:20 ratio.
# Both sets consist of light curves without added noise.
# NOTE(review): no random_state is passed, so the split presumably depends on NumPy's global RNG state — confirm.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

In [13]:
# BLOCK 4 - b
# Split the data into a training and a test set in an 80:20 ratio.
# Artificial noise is added to the light curves; every clean curve yields several noisy copies.
# The split is done before the noise is added, so copies of one curve never end up in both sets.

def _make_noisy_copies(curves, targets, n_copies=3):
    """
    Builds a noise-augmented data set from clean light curves.

    :param curves: sequence of clean light curves
    :param targets: pandas.DataFrame; one row of target values per curve, in the same order as `curves`
    :param n_copies: int; number of independent noisy realisations generated for each curve
    :return: Tuple(numpy.array, numpy.array); noisy curves and the matching target rows, each clean curve
             contributing `n_copies` consecutive entries
    """
    noisy_curves = []
    repeated_targets = []
    for i in range(len(curves)):
        for _ in range(n_copies):
            # The generator returns (noisy curve, sigma array); only the noisy curve is used as a feature.
            noisy_curve, _sigma = stochastic_noise_generator(curves[i])
            noisy_curves.append(noisy_curve)
            repeated_targets.append(targets.iloc[i])
    return np.array(noisy_curves), np.array(repeated_targets)


X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(
    X, y, test_size=0.2
)

# Training set first, test set second: keeps the order in which random numbers are consumed.
X_train, y_train = _make_noisy_copies(X_train_n, y_train_n)
X_test, y_test = _make_noisy_copies(X_test_n, y_test_n)

In [20]:
# Convert the test targets into a DataFrame.
# reset_index() keeps the previous index as an extra "index" column and renumbers the rows from 0.

y_test_columns = [
    "t2/t1",
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
]
y_test = pd.DataFrame(y_test, columns=y_test_columns).reset_index()
y_test.head()

Unnamed: 0,index,t2/t1,inclination,mass_ratio,primary__surface_potential,secondary__surface_potential
0,300001,0.8,1.556795,10.0,110.00005,996.5005
1,862041,0.222222,1.327042,0.7,8.400832,5.278068
2,225466,0.75,1.228534,0.3,3.073046,3.119347
3,105000,1.0,1.522997,10.0,17.701814,27.351609
4,1168798,0.333333,1.183098,0.7,3.475571,3.313406


In [13]:
# BLOCK 5
# Search for the best hyper-parameter values of the model and show the best model found.

# One SVR per target, wrapped in a single-step pipeline named 'reg'.
pipe_svr = Pipeline(steps=[('reg', MultiOutputRegressor(estimator=SVR()))])

# Candidate values; the key prefix addresses the SVR inside the wrapper inside the pipeline step.
grid_param_svr = {
    "reg__estimator__C": range(3, 6),
    "reg__estimator__gamma": [3.4, 3.6, 3.8, 4.0],
}

gs_svr = GridSearchCV(estimator=pipe_svr, param_grid=grid_param_svr)
gs_svr = gs_svr.fit(X_train, y_train)

best_params = gs_svr.best_params_
gs_svr.best_estimator_

In [21]:
# BLOCK 6
# Build the model, train it on the training set and show the parameters of the SVR it is built from.

svr_settings = {
    'kernel': 'rbf',
    'cache_size': 200,
    'C': 5,
    'gamma': 3.8,
    'epsilon': 0.05,
}
svr_model = SVR(**svr_settings)

# The wrapper handles the multi-column target.
regr_model = MultiOutputRegressor(svr_model)
regr_model.fit(X_train, y_train)

svr_model.get_params()

{'C': 5,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.05,
 'gamma': 3.8,
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [22]:
# BLOCK 7
# Predict on the test set and put the predicted values into a DataFrame.

prediction_columns = ["T2/T1", "Inc", "mass_ratio", "PSP", "SSP"]

y_pred = regr_model.predict(X_test)
df_y_pred = pd.DataFrame(data=y_pred, columns=prediction_columns)
df_y_pred.head()

Unnamed: 0,T2/T1,Inc,mass_ratio,PSP,SSP
0,0.611525,1.530339,1.245752,22.461194,27.117091
1,0.311634,1.343245,2.221697,8.881535,9.588908
2,0.865132,1.243845,1.002594,4.932487,5.332868
3,0.805127,1.31388,1.61473,6.776146,5.476783
4,0.287644,1.321247,1.027968,5.797593,5.27026


In [23]:
# BLOCK 8
# Prediction error on the test curves, evaluated separately for every parameter.

def _prediction_errors(actual, predicted):
    """
    Evaluates the three error measures used in this notebook for one parameter.

    :param actual: true values of the parameter
    :param predicted: predicted values of the parameter
    :return: Tuple(float, float, float); mean squared error, mean absolute error, mean absolute percentage error
    """
    return (
        mean_squared_error(actual, predicted),
        mean_absolute_error(actual, predicted),
        mean_absolute_percentage_error(actual, predicted),
    )


mse_t, mae_t, mape_t = _prediction_errors(y_test["t2/t1"], df_y_pred['T2/T1'])
mse_inc, mae_inc, mape_inc = _prediction_errors(y_test["inclination"], df_y_pred['Inc'])
mse_mass, mae_mass, mape_mass = _prediction_errors(y_test["mass_ratio"], df_y_pred['mass_ratio'])
mse_psp, mae_psp, mape_psp = _prediction_errors(y_test["primary__surface_potential"], df_y_pred['PSP'])
mse_ssp, mae_ssp, mape_ssp = _prediction_errors(y_test["secondary__surface_potential"], df_y_pred['SSP'])

# MAPE is returned as a fraction, hence the factor 100 for the percentage shown.
print("MSE T2/T1: ", mse_t, " MAE T2/T1: ", mae_t, " MAPE T2/T1: ", mape_t * 100, "%", sep="")
print("MSE INC: ", mse_inc, " MAE INC: ", mae_inc, " MAPE INC: ", mape_inc * 100, "%", sep="")
print("MSE MASS: ", mse_mass, " MAE MASS: ", mae_mass, " MAPE MASS: ", mape_mass * 100, "%", sep="")
print("MSE PSP: ", mse_psp, " MAE PSP: ", mae_psp, " MAPE PSP: ", mape_psp * 100, "%", sep="")
print("MSE SSP: ", mse_ssp, " MAE SSP: ", mae_ssp, " MAPE SSP: ", mape_ssp * 100, "%", sep="")

MSE T2/T1: 0.02567271040511119 MAE T2/T1: 0.1258568555671594 MAPE T2/T1: 32.04557146750492%
MSE INC: 0.015480920578780288 MAE INC: 0.09041495431531629 MAPE INC: 6.979264668516863%
MSE MASS: 6.1763167427517685 MAE MASS: 1.4879528097667345 MAPE MASS: 122.15526820638583%
MSE PSP: 790.517197033381 MAE PSP: 12.154156348762417 MAPE PSP: 48.00999822759513%
MSE SSP: 12697.58801807669 MAE SSP: 23.577828426305103 MAPE SSP: 64.92672210673305%


In [21]:
# BLOCK 9
# Save the trained model.

model_path = "models/svr_detached_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# The context manager closes the file even if pickling fails, so the data is flushed
# to disk and no file handle is left open in the kernel.
with open(model_path, 'wb') as model_file:
    pickle.dump(regr_model, model_file)