In [1]:
import os
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

%matplotlib inline

# Show the kernel's working directory; load_data resolves relative paths against it.
print(f"Current working directory: {os.getcwd()}")

Current working directory: /home/nastyaj/git_blyat


# Leakage fields


In [2]:
# Columns that build_features_from_raw drops before any feature is engineered.
# Names missing from a given table are ignored at drop time.
LEAKAGE_FIELDS = [
    "toi", "tid", "rastr", "ra",
    "decstr", "st_pmra", "st_pmdec", "pl_tranmid",
    "pl_insol", "pl_eqt", "st_dist", "toi_created",
    "rowupdate", "st_pmralim", "pl_tranmidlim", "pl_trandeplim",
    "pl_trandurh", "pl_trandurhlim", "st_tefflim", "st_distlim",
    "st_tmaglim", "st_radlim", "st_pmdeclim", "pl_orbperlim",
    "pl_radelim", "pl_insolerr1", "pl_insolerr2", "pl_insollim",
    "pl_eqterr1", "pl_eqterr2", "pl_eqtlim", "st_logglim",
]

# Feature columns

In [3]:
# Engineered columns summarised by the stability reports and exported to
# features.json: six derived features followed by five PCA components.
FEATURE_COLS = [
    "log_orbper",
    "sqrt_pl_rade",
    "log_trandep",
    "rel_radius",
    "depth_norm",
    "compactness",
    "pca_1",
    "pca_2",
    "pca_3",
    "pca_4",
    "pca_5",
]

# Load data function


In [2]:
def load_data(filename: str) -> pd.DataFrame:
    """
    Load a CSV file, trying several candidate locations in order.

    The first candidate that is an existing regular file is read; lines
    starting with ``#`` are treated as comments and skipped.

    Candidate locations, in search order:
    - ``filename`` as given (absolute, or relative to the working directory)
    - ``data/<filename>`` relative to the working directory
    - ``<cwd>/<filename>``
    - ``<cwd>/data/<filename>``
    - ``<home>/data/<filename>``

    Parameters
    ----------
    filename : str
        File name or path of the CSV to load.

    Returns
    -------
    pd.DataFrame
        Parsed contents of the first matching file.

    Raises
    ------
    FileNotFoundError
        If none of the candidates is an existing file. The message lists
        every path that was tried.
    """
    candidates = [
        Path(filename),
        Path("data") / filename,
        Path.cwd() / filename,
        Path.cwd() / "data" / filename,
        Path.home() / "data" / filename
    ]

    for path in candidates:
        # is_file() rather than exists(): a directory with a matching name must
        # not be handed to read_csv, it should fall through to the next candidate.
        if path.is_file():
            print(f"✅ Loading: {path}")
            return pd.read_csv(path, comment="#")

    raise FileNotFoundError(
        f"❌ File '{filename}' not found in any of:\n" + "\n".join(str(p) for p in candidates)
    )


NameError: name 'pd' is not defined

# Build features from raw data


In [1]:
def build_features_from_raw(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Build the engineered feature columns for a TOI or KOI table.

    Steps, in order:
    1. Rename KOI columns to the shared names used below.
    2. Drop every column listed in ``LEAKAGE_FIELDS`` (absent names are ignored).
    3. Add ``disp_3class`` (Planet / Candidate / False / Unknown) from
       ``tfopwg_disp`` when that column is present.
    4. Add up to six derived features (``log_orbper``, ``sqrt_pl_rade``,
       ``log_trandep``, ``rel_radius``, ``depth_norm``, ``compactness``),
       each only when its source columns exist.
    5. Standardise all float64/int64 columns (missing and infinite values
       replaced by 0) and append up to five PCA components ``pca_1`` … ``pca_5``.

    In total at most 11 engineered features are produced (6 derived + 5 PCA).

    The scaler and PCA are fitted on the frame passed in, so components from
    separate calls are expressed in separately fitted bases.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Raw table. It is not modified; a new frame is returned.

    Returns
    -------
    pd.DataFrame
        Copy of the input without leakage columns, plus the engineered columns.
    """

    COLUMN_ALIASES = {
        # KOI -> standard names
        "koi_period": "pl_orbper",
        "koi_prad": "pl_rade",
        "koi_depth": "pl_trandep",
        "koi_srad": "st_rad",
        "koi_disposition": "tfopwg_disp",
    }
    df_raw = df_raw.rename(columns={k: v for k, v in COLUMN_ALIASES.items() if k in df_raw.columns})

    df = df_raw.drop(columns=LEAKAGE_FIELDS, errors="ignore").copy()

    mapping = {
        "CP": "Planet", "KP": "Planet",
        "PC": "Candidate", "APC": "Candidate",
        "FP": "False", "FA": "False",
        "CONFIRMED": "Planet", "CANDIDATE": "Candidate", "FALSE POSITIVE": "False"
    }
    if "tfopwg_disp" in df.columns:
        # Labels are matched case-insensitively; anything unmapped (including NaN) becomes "Unknown".
        df["disp_3class"] = df["tfopwg_disp"].map(lambda x: mapping.get(str(x).upper(), "Unknown"))

    # A zero stellar radius makes the ratio features +/-inf; treat those as missing.
    non_finite = [np.inf, -np.inf]

    if "pl_orbper" in df.columns:
        df["log_orbper"] = np.log1p(df["pl_orbper"])
    if "pl_rade" in df.columns:
        df["sqrt_pl_rade"] = np.sqrt(df["pl_rade"])
    if "pl_trandep" in df.columns:
        # NOTE(review): the 1e6 factor presumes pl_trandep is a fraction rather
        # than already in ppm; confirm against the source tables.
        df["log_trandep"] = np.log1p(df["pl_trandep"] * 1e6)
    if {"pl_rade", "st_rad"} <= set(df.columns):
        df["rel_radius"] = (df["pl_rade"] / df["st_rad"]).replace(non_finite, np.nan)
    if {"pl_trandep", "st_rad"} <= set(df.columns):
        df["depth_norm"] = (df["pl_trandep"] / df["st_rad"]).replace(non_finite, np.nan)
    if {"pl_orbper", "st_rad"} <= set(df.columns):
        df["compactness"] = (df["pl_orbper"] / df["st_rad"]).replace(non_finite, np.nan)

    numeric_df = df.select_dtypes(include=["float64", "int64"])
    if not numeric_df.empty:
        # StandardScaler rejects inf as well as NaN, so neutralise both before scaling.
        pca_input = numeric_df.replace(non_finite, np.nan).fillna(0)
        X_scaled = StandardScaler().fit_transform(pca_input)
        # PCA cannot extract more components than min(n_samples, n_features).
        n_components = min(5, X_scaled.shape[0], X_scaled.shape[1])
        # Fixed seed: the solver PCA picks automatically may be the randomized one.
        pca = PCA(n_components=n_components, random_state=0)
        X_pca = pca.fit_transform(X_scaled)
        for i in range(X_pca.shape[1]):
            df[f"pca_{i+1}"] = X_pca[:, i]

    return df


NameError: name 'pd' is not defined

# Load raw datasets


In [6]:
# Read both raw snapshots first (TOI, then KOI), then derive a feature frame from each.
df_toi_raw, df_koi_raw = map(
    load_data,
    ("raw/TOI_raw_2025.10.01.csv", "raw/KOI_raw_2025.10.01.csv"),
)
df_toi, df_koi = map(build_features_from_raw, (df_toi_raw, df_koi_raw))

✅ Loading: data/raw/TOI_raw_2025.10.01.csv
✅ Loading: data/raw/KOI_raw_2025.10.01.csv


# Stability check


In [7]:
def stability_check(df: pd.DataFrame, feature_cols: list):
    """
    Summarise the selected feature columns, one row per feature.

    The result is the transposed ``describe()`` table of ``df[feature_cols]``
    with two extra columns: ``IQR_low`` (the 0.25 quantile) and ``IQR_high``
    (the 0.75 quantile).
    """
    selected = df[feature_cols]
    quartiles = selected.quantile([0.25, 0.75])

    summary = selected.describe().T
    summary["IQR_low"] = quartiles.loc[0.25]
    summary["IQR_high"] = quartiles.loc[0.75]
    return summary

# TOI stability report


In [8]:
# Reuse the TOI feature frame built in the "Load raw datasets" cell instead of
# re-reading the CSV and re-fitting the scaler/PCA. The alias keeps the
# `df_toi_feat` name available.
df_toi_feat = df_toi
stability_toi = stability_check(df_toi_feat, FEATURE_COLS)
print("\n✅ TOI stability report:")
print(stability_toi)

✅ Loading: data/raw/TOI_raw_2025.10.01.csv

✅ TOI stability report:
               count          mean           std        min          25%  \
log_orbper    7592.0  1.803518e+00      0.947233   0.141566     1.250438   
sqrt_pl_rade  7193.0  3.029431e+00      1.072767   0.743308     2.120281   
log_trandep   7699.0  2.207299e+01      1.322249  17.017586    21.072630   
rel_radius    7192.0  8.173140e+00      5.507944   0.503540     3.959835   
depth_norm    7192.0  9.719609e+03  33491.621104   7.742821  1132.130213   
compactness   7085.0  2.037250e+01    148.360182   0.012545     1.837035   
pca_1         7699.0  7.383221e-17      2.165230 -18.158859    -1.021984   
pca_2         7699.0  1.550476e-16      2.021264 -10.862412    -1.324300   
pca_3         7699.0  7.383221e-17      1.740942 -12.488132    -0.840514   
pca_4         7699.0  5.906577e-17      1.711439  -8.035163    -0.606886   
pca_5         7699.0  9.229026e-18      1.574622 -12.953801    -0.358502   

                   

# KOI stability report


In [9]:
# Reuse the KOI feature frame built in the "Load raw datasets" cell instead of
# re-reading the CSV and re-fitting the scaler/PCA. The alias keeps the
# `df_koi_feat` name available.
df_koi_feat = df_koi
stability_koi = stability_check(df_koi_feat, FEATURE_COLS)
print("\n✅ KOI stability report:")
print(stability_koi)

✅ Loading: data/raw/KOI_raw_2025.10.01.csv

✅ KOI stability report:
               count          mean           std        min         25%  \
log_orbper    9564.0  2.680465e+00      1.664590   0.216596    1.317395   
sqrt_pl_rade  9201.0  3.389555e+00      9.560995   0.282843    1.183216   
log_trandep   9201.0  2.045687e+01      2.372316   0.000000   18.890059   
rel_radius    9201.0  3.097423e+01    361.076381   0.145719    1.348862   
depth_norm    9201.0  2.246971e+04  82781.214053   0.000000  129.940765   
compactness   9201.0  7.628393e+01   1870.838009   0.004106    2.416181   
pca_1         9564.0  1.188696e-17      2.383752  -2.302332   -0.709810   
pca_2         9564.0 -1.188696e-17      2.264175 -28.923500   -0.275881   
pca_3         9564.0  0.000000e+00      2.143389 -36.805707   -0.958970   
pca_4         9564.0 -1.783043e-17      2.060015 -20.693096   -0.748776   
pca_5         9564.0  2.377391e-17      1.688504  -6.445059   -0.939601   

                     50%       

# Export features.json to reports/features


In [10]:
# Destination folder for the exported feature list; created (with parents) if missing.
output_dir = Path("reports") / "features"
output_dir.mkdir(parents=True, exist_ok=True)

output_file = output_dir / "features.json"

# Serialise the feature names as a JSON array with 2-space indentation.
with output_file.open("w") as f:
    f.write(json.dumps(FEATURE_COLS, indent=2))

# 🧩 Feature Contract

**Purpose:**  
This file defines a set of agreed features that are used for both TOI and KOI snapshots.

**Included engineered features (11 total):**  
- log_orbper  
- sqrt_pl_rade  
- log_trandep  
- rel_radius  
- depth_norm  
- compactness  
- pca_1 … pca_5  

**Notes:**  
- Leakage fields are always removed before building.
- The `build_features_from_raw(df_raw)` function is applied equally to both datasets.
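- Stability check (`stability_check`) reports per-feature summary statistics (count, mean, std, min/max, quartiles) so that feature ranges can be compared between the two datasets.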
