In [None]:
# CREATION OF DATASETS over_curves.pkl, over_curves_samples_knn.pkl, over_curves_samples_svr.pkl

In [2]:
# BLOK 1
# Importovanie kniznic.

import numpy as np
import pandas as pd
import sqlite3
import io

In [2]:
# BLOK 2
# Funkcia pre vyber svetelnej krivky z nespracovanych dat. Zadefinovanie cesty k databaze (subor .db).

def get_curve(txt):
    """Deserialize one stored curve from its raw byte representation.

    Parameters
    ----------
    txt : bytes-like
        Raw bytes in a format readable by ``np.load``
        (presumably a ``.npy`` blob read from the database -- TODO confirm).

    Returns
    -------
    Whatever ``np.load`` produces for the given bytes.
    """
    # A freshly constructed BytesIO is already positioned at offset 0,
    # so it can be handed to np.load directly.
    return np.load(io.BytesIO(txt))

# Location of the SQLite database file, relative to the notebook's working directory.
FILEPATH = "../data-upjs/overcontact.db"

In [3]:
# BLOK 3 
# Vytvorenie pripojenia na subor. Vypis vsetkych tabuliek v databaze.

# Open the SQLite database; the connection is reused by the cells that follow.
conn = sqlite3.connect(FILEPATH)

# sqlite_master is SQLite's schema catalogue; type='table' restricts it to table names.
sql_query = """SELECT name FROM sqlite_master WHERE type='table';"""
cursor = conn.cursor()
# Cursor.execute() returns the cursor itself, so the rows can be fetched in one expression.
print(cursor.execute(sql_query).fetchall())

[('parameters',), ('curves',), ('auxiliary',)]


In [4]:
# BLOK 4
# Nacitanie parametrov.

# Read the system id plus the six physical parameters used later on.
# The adjacent string literals are concatenated into a single SQL statement.
df_parameters = pd.read_sql_query(
    "SELECT id, mass_ratio, primary__surface_potential, secondary__surface_potential, "
    "primary__t_eff, secondary__t_eff, inclination FROM parameters",
    conn,
)

In [5]:
# BLOK 5
# Nacitanie svetelnych kriviek.

# Pull every column of the `curves` table into a DataFrame.
df_curves = pd.read_sql_query(sql="SELECT * FROM curves", con=conn)

In [6]:
# BLOK 6
# Vytvorenie tabulky. Jeden riadok = jeden system zakrytovych premennych hviezd v 13 filtroch.

# The 13 passband columns; each one is decoded with get_curve() below.
passbands = [
    "Bessell_U", "Bessell_B", "Bessell_V", "Bessell_R", "Bessell_I",
    "SLOAN_u", "SLOAN_g", "SLOAN_r", "SLOAN_i", "SLOAN_z",
    "Kepler", "GaiaDR2", "TESS",
]

# Keep only the id and the passband columns from the curves table.
df1 = df_curves[["id"] + passbands]

# Keep only the id and the physical parameters from the parameters table.
df2 = df_parameters[[
    "id",
    "primary__t_eff",
    "secondary__t_eff",
    "inclination",
    "mass_ratio",
    "primary__surface_potential",
    "secondary__surface_potential",
]]

# Join curves and parameters on the shared "id" column (default inner join).
df_merged = df1.merge(df2, on="id")

# Replace the stored value in every passband column by the object get_curve() loads from it.
for band in passbands:
    df_merged[band] = df_merged[band].apply(get_curve)

In [7]:
# BLOK 7
# Vytvorenie tabulky. Jeden riadok = jedna svetelna krivka podla jedneho filtra.

# Reshape from wide to long: the id and parameter columns are repeated for each
# passband, whose name goes into "filter" and whose decoded value goes into "curve".
df_final = df_merged.melt(
    id_vars=[
        "id",
        "primary__t_eff",
        "secondary__t_eff",
        "inclination",
        "mass_ratio",
        "primary__surface_potential",
        "secondary__surface_potential",
    ],
    var_name="filter",
    value_name="curve",
)

# Summary statistics of the numeric columns, shown as the cell output.
df_final.describe()

Unnamed: 0,id,primary__t_eff,secondary__t_eff,inclination,mass_ratio,primary__surface_potential,secondary__surface_potential
count,1212796.0,1212796.0,1212796.0,1212796.0,1212796.0,1212796.0,1212796.0
mean,49519990.0,6537.04,6246.543,1.222625,1.420822,3.985917,3.985917
std,19234430.0,969.1032,972.5411,0.2181624,1.389326,1.840244,1.840244
min,5525038.0,4250.0,4250.0,0.4949341,0.1,1.912756,1.912756
25%,34415750.0,5750.0,5500.0,1.048325,0.9,3.208226,3.208226
50%,55656330.0,6500.0,6250.0,1.230496,1.0,3.469409,3.469409
75%,64577600.0,7250.0,7000.0,1.409921,1.428571,4.197352,4.197352
max,74138900.0,8000.0,8000.0,1.570796,10.0,14.98052,14.98052


In [12]:
# BLOK 8
# Uzavretie pripojenia na databazu. Ulozenie dat v podobe tabulky do suboru .pkl.

# Release the SQLite connection.
conn.close()

# Persist the long-format table to a pickle file in the current working directory.
df_final.to_pickle(path="over_curves.pkl")

In [13]:
# BLOK 9
# Draw a sample of 500000 curves for KNN prediction and a sample of 40000 curves for SVR prediction.

# Draw the KNN sample (500000 rows) and the SVR sample (40000 rows) from df_final.
# NOTE(review): both draws use the same random_state, so the two samples are not
# independent of each other -- confirm this is intended.
data_sample_knn, data_sample_svr = (
    df_final.sample(n=size, random_state=1234) for size in (500000, 40000)
)

# Preview the first rows of the KNN sample as the cell output.
data_sample_knn.head()

In [16]:
# BLOK 10
# Ulozenie vzorky v podobe tabulky do suboru .pkl

# Write each sample to its own pickle file in the current working directory.
data_sample_knn.to_pickle(path="over_curves_samples_knn.pkl")
data_sample_svr.to_pickle(path="over_curves_samples_svr.pkl")