In [7]:
# PREDIKCIA HODNOT FYZIKALNYCH PARAMETROV ODDELENYCH SYSTEMOV METODOU k-NN S PRIDANYM SUMOM A BEZ SUMU
# VYTVORENIE A ULOZENIE MODELOV

In [1]:
# BLOK 1
# Importovanie kniznic.

import numpy as np
import pandas as pd

import pickle
import os

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [2]:
# BLOK 2
# Funkcie pre generovanie sumu. Nastavenie pseudo-nahodneho generatora.

def generate_observation_sigma(space_obs_frac=0.5):
    """
    Sample the noise standard deviation used for one synthetic light curve.

    A characteristic noise level is first picked at random from two fixed values
    (4e-3 for earth based, 2e-4 for space based observations). The returned sigma
    is then drawn from a Rayleigh distribution whose scale is the picked level.

    :param space_obs_frac: float; probability of picking the space based noise level
                           (the earth based level is picked with probability 1 - space_obs_frac)
    :return: float; standard deviation of the light curve noise
    """
    noise_levels = [4e-3, 2e-4]  # [earth based, space based]
    level_probabilities = [1 - space_obs_frac, space_obs_frac]
    scale = np.random.choice(noise_levels, p=level_probabilities)
    return np.random.rayleigh(scale)

def stochastic_noise_generator(curve):
    """
    Add gaussian noise to the synthetic observation provided in `curve`.

    One noise standard deviation is drawn per call via `generate_observation_sigma`
    and applied to every point of the curve.

    :param curve: array-like; normalized light curve (a numpy.array or any sequence
                  accepted by numpy.asarray)
    :return: Tuple(numpy.array, numpy.array); normalized light curve with added noise, and an
             array of the same shape as the curve filled with the standard deviation used
    """
    # Convert up front so plain sequences work as well; `.shape` below needs an ndarray.
    curve = np.asarray(curve)
    sigma = generate_observation_sigma()
    return np.random.normal(curve, sigma), np.full(curve.shape, sigma)

# Seed NumPy's global pseudo-random generator so that the noise drawn by the
# functions above is reproducible.
# NOTE: a later cell (BLOK 3) calls np.random.seed(1), which resets this state.
np.random.seed(1234)

In [3]:
# BLOCK 2
# Load the sample of detached systems.

# NOTE: loading a pickle can execute arbitrary code; only read files from a trusted source.
data_sample = pd.read_pickle("detached_curves_samples_knn.pkl")
# reset_index() replaces the stored index with a positional one and keeps the old index as a column.
data_sample = data_sample.reset_index()
data_sample.shape[0]

500000

In [4]:
# BLOCK 3
# Select and prepare the data. Check the selection by filter.
np.random.seed(1)

# Optional subsampling of the data set, currently disabled:
# data_sample=data_sample.sample(n=120000, random_state=1234)

# Derived target: ratio of the secondary to the primary `t_eff` column.
temperature_ratio = data_sample["secondary__t_eff"] / data_sample["primary__t_eff"]
data_sample["t2/t1"] = temperature_ratio
data_sample = data_sample.round({"mass_ratio": 14})

# Regression targets, one column per predicted parameter.
y = data_sample[["t2/t1", "inclination", "mass_ratio", "primary__surface_potential", "secondary__surface_potential"]]
# Model inputs: the entries of the "curve" column collected into a plain list.
X = list(data_sample["curve"])

# Number of samples per filter.
print(data_sample["filter"].value_counts())

Bessell_V    12
Kepler       11
SLOAN_g      11
SLOAN_u       9
Bessell_U     8
GaiaDR2       8
Bessell_B     7
Bessell_I     6
SLOAN_i       6
SLOAN_r       6
Bessell_R     6
SLOAN_z       6
TESS          4
Name: filter, dtype: int64


In [5]:
# BLOCK 4 - a
# Split the data into a training and a test set in an 80:20 ratio.
# The sets consist of curves without added noise.

# No random_state is passed, so the shuffle depends on NumPy's global random state.
clean_split = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = clean_split

In [5]:
# BLOCK 4 - b
# Split the data into a training and a test set in an 80:20 ratio.
# Artificial noise is added to the light curves.

# Number of independently noised copies generated from every clean curve.
N_NOISY_COPIES = 3


def _make_noisy_set(curves, targets, n_copies=N_NOISY_COPIES):
    """
    Build a noise-augmented data set from clean curves.

    For every clean curve, `n_copies` noisy realizations are generated with
    `stochastic_noise_generator`; the target row belonging to the clean curve is
    repeated once per realization, in matching order.

    :param curves: sequence of light curves
    :param targets: pandas.DataFrame; one row of target values per curve
    :param n_copies: int; number of noisy realizations per curve
    :return: Tuple(numpy.array, numpy.array); noisy curves and their target rows
    """
    noisy_curves = np.array([
        stochastic_noise_generator(curve)[0]  # keep the noisy curve, drop the sigma array
        for curve in curves
        for _ in range(n_copies)
    ])
    # Vectorized replacement for collecting `targets.iloc[i]` row by row in a Python loop.
    repeated_targets = np.repeat(targets.to_numpy(), n_copies, axis=0)
    return noisy_curves, repeated_targets


# No random_state is passed, so the shuffle depends on NumPy's global random state.
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(
    X, y, test_size=0.2
)

# Noise is added after the split, separately for each subset.
X_train, y_train = _make_noisy_set(X_train_n, y_train_n)
X_test, y_test = _make_noisy_set(X_test_n, y_test_n)

In [None]:
# Convert the test targets to a DataFrame.

target_columns = ["t2/t1", "inclination", "mass_ratio", "primary__surface_potential", "secondary__surface_potential"]
# reset_index() gives a positional index and keeps the previous index as an extra "index" column.
y_test = pd.DataFrame(y_test, columns=target_columns).reset_index()

In [None]:
# BLOCK 5
# Search for the optimal hyper-parameter values of the model and show the best combination.

# Single-step pipeline: a k-NN regressor wrapped in MultiOutputRegressor.
pipe_knn = Pipeline(steps=[("reg", MultiOutputRegressor(KNeighborsRegressor()))])

# Candidate values; the keys address the k-NN regressor nested inside the pipeline step "reg".
grid_param_knn = {
    "reg__estimator__n_neighbors": range(2, 7),
    "reg__estimator__metric": ["cosine", "euclidean", "l1"],
}

# fit() returns the fitted search object itself, so construction and fitting can be chained.
gs_knn = GridSearchCV(estimator=pipe_knn, param_grid=grid_param_knn).fit(X_train, y_train)

best_params = gs_knn.best_params_
best_params

In [6]:
# Hard-coded hyper-parameters; presumably the result of the grid search above,
# kept here so the model can be built without re-running the search (TODO confirm).
best_params = {
    'reg__estimator__metric': 'cosine',
    'reg__estimator__n_neighbors': 4,
}

In [7]:
# BLOCK 6
# Build the model, train it on the training set and show the model parameters.

# NOTE(review): weights='distance' is not among the hyper-parameters searched in BLOCK 5 —
# confirm that combining it with the searched values is intended.
knn_settings = {
    "n_neighbors": best_params['reg__estimator__n_neighbors'],
    "metric": best_params['reg__estimator__metric'],
    "weights": 'distance',
}
knn_model = KNeighborsRegressor(**knn_settings)
knn_model.fit(X_train, y_train)

knn_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'cosine',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 4,
 'p': 2,
 'weights': 'distance'}

In [8]:
# BLOCK 7
# Predict on the test set and wrap the predicted values in a DataFrame.

y_pred = knn_model.predict(X_test)

# Short column labels for the predictions, in the same order as the target columns.
prediction_columns = ["T2/T1", "Inc", "mass_ratio", "PSP", "SSP"]
df_y_pred = pd.DataFrame(data=y_pred, columns=prediction_columns)
df_y_pred.head()

Unnamed: 0,T2/T1,Inc,mass_ratio,PSP,SSP
0,0.439903,1.260368,3.228042,12.583271,13.768857
1,0.662404,1.258459,2.578563,10.895826,8.336098
2,0.755419,1.414649,3.2703,9.311629,10.969059
3,0.553959,1.370343,2.69515,11.744353,9.652769
4,0.527975,1.526631,1.251499,6.249356,9.727266


In [9]:
# BLOCK 8
# Prediction error for each parameter separately, evaluated on the test curves.

def _score_parameter(true_column, pred_column):
    """
    Compute the error metrics of one predicted parameter.

    :param true_column: str; column of `y_test` holding the true values
    :param pred_column: str; column of `df_y_pred` holding the predicted values
    :return: Tuple(float, float, float); MSE, MAPE (as a fraction, not in percent) and MAE
    """
    actual = y_test[true_column]
    predicted = df_y_pred[pred_column]
    return (
        mean_squared_error(actual, predicted),
        mean_absolute_percentage_error(actual, predicted),
        mean_absolute_error(actual, predicted),
    )


def _report_parameter(label, mse, mae, mape):
    """Print one summary line for a parameter; `mape` is a fraction and is shown in percent."""
    print("MSE " + label + ": " + str(mse) + " MAE " + label + ": " + str(mae) + " MAPE " + label + ": " + str(mape * 100) + "%")


# The individual metric names are kept so that any later cell using them still works.
mse_t, mape_t, mae_t = _score_parameter("t2/t1", "T2/T1")
mse_inc, mape_inc, mae_inc = _score_parameter("inclination", "Inc")
mse_mass, mape_mass, mae_mass = _score_parameter("mass_ratio", "mass_ratio")
mse_psp, mape_psp, mae_psp = _score_parameter("primary__surface_potential", "PSP")
mse_ssp, mape_ssp, mae_ssp = _score_parameter("secondary__surface_potential", "SSP")

_report_parameter("T2/T1", mse_t, mae_t, mape_t)
_report_parameter("INC", mse_inc, mae_inc, mape_inc)
_report_parameter("MASS", mse_mass, mae_mass, mape_mass)
_report_parameter("PSP", mse_psp, mae_psp, mape_psp)
_report_parameter("SSP", mse_ssp, mae_ssp, mape_ssp)

MSE T2/T1: 0.03354484416551359 MAE T2/T1: 0.14805782885493424 MAPE T2/T1: 47.591555885090685%
MSE INC: 0.02035557673809263 MAE INC: 0.11448295445249179 MAPE INC: 8.800371153924537%
MSE MASS: 6.703206691184773 MAE MASS: 1.7015482040688799 MAPE MASS: 329.67753670984064%
MSE PSP: 606.9149872260191 MAE PSP: 13.787857468691096 MAPE PSP: 195.91913519329088%
MSE SSP: 899.8656250905367 MAE SSP: 13.822134756947449 MAPE SSP: 128.16381874204598%


In [53]:
# BLOCK 9
# Save the model.

model_path = "models/knn_detached_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# Context manager guarantees the file handle is closed once the model is written,
# instead of leaving an open handle behind.
with open(model_path, 'wb') as model_file:
    pickle.dump(knn_model, model_file)