# Kepler Exobolygó ML modell 

In [None]:
# import image module 
from IPython.display import Image 
  
# get the image 
Image(url="pictures/PIA18904~large.jpg", width=300, height=300) 

A Kepler űrteleszkópot 2009-ben indította a NASA, az első olyan misszióként, amely földszerű exobolygók felfedezésére irányult a lakható zónákban. Több mint 150 000 csillag fényességét figyelte meg, és több mint 2600 megerősített bolygót azonosított. 2013-ban a küldetést kiterjesztették K2 néven, új égboltterületek megfigyelésére. A Kepler forradalmasította a bolygókutatást, megmutatva, hogy több bolygó van a galaxisunkban, mint csillag. A misszió 2018-ban véget ért, de hatalmas örökséget hagyott maga után a kutatás számára.

- Indulás Dátuma: 2009. március 7.
- Küldetés Időtartama: Kezdetben 3,5 év, majd meghosszabbítva 2018. október 30-ig
- Elsődleges Küldetés: 4 év
- Másodlagos Küldetés: 5,5 év
- Megfigyelt Csillagok Száma: Több mint 150 000
- Felfedezett Bolygók Száma: Több mint 2 600 megerősített exobolygó
- Kepler Látómezeje: 105 négyzetfok
- Fotométer Érzékenység: Akár 20 ppm (20 milliomodrész) fényességváltozást is képes észlelni
- Gyűjtött Adatok Mennyisége: Több mint 50 terabájt
- Felfedezett Bolygórendszerek Száma: Több mint 1 000

ADATKÉSZLET: Kepler Objects of Interest (KOI) táblázat a Kepler-misszió által azonosított bolygójelöltekről nyújt adatokat. Tartalmazza a jelöltek státuszát, pályaparamétereket, csillagtulajdonságokat és fotometriai adatokat. Ezek az információk segítenek az exobolygók azonosításában és megerősítésében. A részletesebb adatoszlopok a jelöltek besorolását és tulajdonságaik leírását szolgálják.

https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=cumulative

## 1. Könyvtárak és adatkészlet betöltése

In [None]:
import os
import sys
import json
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

print(f"Python version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")
print(f"Matplotlib version: {matplotlib.__version__}")

In [None]:
# The raw download is read twice:
#  - exo_column_names: parsed with malformed lines skipped; later cells extract the
#    column descriptions from it.
#  - exo_data_clean: the table itself, read after skipping the first 143 lines of the file.
# NOTE(review): 143 is tied to the layout of this particular download - confirm it
# whenever the CSV is replaced.
exo_column_names = pd.read_csv(r"data/cumulative_2024.09.03_11.45.57.csv", on_bad_lines="skip")
exo_data_clean = pd.read_csv(r"data/cumulative_2024.09.03_11.45.57.csv", skiprows=143, low_memory=False, header=0)

In [None]:
exo_column_names.head()

In [None]:
exo_data_clean.head()

In [None]:
exo_data_clean.to_csv(r"data/cumulative_2024.09.03_clean.csv")

In [None]:
print('Setup is completed')

## 2. Adatkészlet előfeldolgozása

In [None]:
exo_column_names.info()

In [None]:
# Keep everything except the first two rows and the last row in a single slice.
exo_column_names = exo_column_names.iloc[2:-1, :]

In [None]:
exo_column_names.head()

In [None]:
# Build {raw column name: cleaned description} from the header rows.
column_names = {}
for _, header_row in exo_column_names.iterrows():
    # Drop the 9-character prefix of the line, then split "raw_name: description".
    parts = header_row.values[0][9:].split(": ")
    raw_name = parts[0].lstrip()
    clean_name = parts[1].lstrip()
    # Make the description usable as an identifier-like column label.
    for old, new in ((' ', '_'), ('[', ''), (']', ''), ('.', '')):
        clean_name = clean_name.replace(old, new)
    column_names[raw_name] = clean_name
    print(f"{raw_name}:{(25 - len(raw_name)) * ' '}{clean_name}")

In [None]:
# Convert and write JSON object to file
with open("data/columns_names.json", "w") as outfile: 
    json.dump(column_names, outfile)

In [None]:
with open('data/data_types.json', 'r') as file:
    data_types = json.load(file)

In [None]:
# Reload the intermediate CSV with the explicit dtypes loaded from data/data_types.json.
# header=1 makes the second line of the file the column-name row.
# NOTE(review): the file was written by to_csv() with its default index=True, so it carries
# an extra leading index column - confirm that header=1 matches the file layout.
exo_data = pd.read_csv(r"data/cumulative_2024.09.03_clean.csv", low_memory=False, skip_blank_lines=True, header=1, dtype=data_types)

In [None]:
exo_data.shape

In [None]:
exo_data.head()

In [None]:
exo_data.describe()

In [None]:
with open('data/columns_names.json', 'r') as file:
    column_names = json.load(file)

In [None]:
exo_data = exo_data.rename(columns = column_names, inplace=False)

In [None]:
# Drop columns that hold no data at all (every value is missing).
# The earlier test `not exo_data[column].any()` also removed columns whose values
# were all zero/False, which are valid data rather than empty columns.
empty_columns = list(exo_data.columns[exo_data.isna().all(axis=0)])
exo_data = exo_data.drop(columns=empty_columns)
for column in empty_columns:
    print(f"{column} is deleted because its empty")

In [None]:
clear_exo = exo_data.dropna()

In [None]:
# Summing "Number_of_Planets" over every row counts a multi-planet system once per row,
# so keep a single row per host star (KepID) before summing.
# NOTE(review): assumes Number_of_Planets is a per-system count repeated on each row of
# that system - confirm against the archive's column definitions.
planets_per_system = clear_exo.drop_duplicates(subset="KepID")["Number_of_Planets"]
print(f"Number of found planets: {planets_per_system.sum()}")

In [None]:
sns.set(rc={'figure.figsize': (25, 9)})

# Only rows belonging to systems with at least four planets.
d1 = clear_exo[(clear_exo["Number_of_Planets"] >= 4)]

# One marker per row: colour encodes equilibrium temperature, marker area the planetary radius.
op = sns.scatterplot(data=d1,
                     x="Orbital_Period_days",
                     y="KepID",
                     hue="Equilibrium_Temperature_K",
                     hue_norm=(0, 1800),
                     size="Planetary_Radius_Earth_radii",
                     legend="brief",
                     sizes=(20, 200))
op.set_yscale("log")

plt.title("Kepler Exoplanets sizes and temperatures")
plt.xlabel("Orbital Period days")
# The y axis shows the KepID column, so label it as such
# (it was previously labelled "Number of planets in one system").
plt.ylabel("KepID")

In [None]:
d3 = clear_exo

# Planetary radius against orbital period on log-log axes; colour encodes the
# equilibrium temperature and marker area the planetary radius.
op = sns.scatterplot(data=d3,
                     x="Orbital_Period_days",
                     y="Planetary_Radius_Earth_radii",
                     hue="Equilibrium_Temperature_K",
                     hue_norm=(0, 1800),
                     size="Planetary_Radius_Earth_radii",
                     legend="brief",
                     sizes=(20, 200))

op.set_xscale("log")
op.set_yscale("log")

plt.title("Exoplanet Radius - Orbital Period")
plt.xlabel("Orbital Period days")
# The column is expressed in Earth radii (the old label read "Earth [rad]").
plt.ylabel("Planetary Radius [Earth radii]")

In [None]:
import plotly.express as px

# 3-D scatter of period, epoch and duration, coloured by transit depth.
scatter_3d_options = dict(x="Orbital_Period_days",
                          y="Transit_Epoch_BKJD",
                          z="Transit_Duration_hrs",
                          color="Transit_Depth_ppm",
                          width=900,
                          height=800)
fig = px.scatter_3d(clear_exo, **scatter_3d_options)
fig.show()

In [None]:
# Hertzsprung-Russell style diagram of the host stars.
#
# x: stellar effective temperature (axis inverted, as is conventional for HR diagrams)
# y: insolation flux on a log scale, used here as a stand-in for luminosity
# colour: effective temperature, marker area: stellar radius

fig, ax = plt.subplots(figsize=(12, 10))
hr = sns.scatterplot(data=clear_exo,
                     x='Stellar_Effective_Temperature_K',
                     y='Insolation_Flux_Earth_flux',
                     hue='Stellar_Effective_Temperature_K',
                     size='Stellar_Radius_Solar_radii',
                     sizes=(0, 200),
                     palette='plasma',
                     legend="brief",
                     ax=ax)
hr.set_yscale("log")
ax.invert_xaxis()  # hotter stars on the left
ax.set_xlabel('Effective Temperature (K)')
ax.set_ylabel('Stellar Luminosity (proxy: Incident Flux)')
ax.set_title('Hertzsprung-Russell Diagram of Stars')
plt.show()

In [None]:
# Fitted Stellar Density vs. Planet Radius (coloured by stellar temperature)
#
# The y column is 'Fitted_Stellar_Density_g/cm**3', i.e. a density fitted for the
# host star - it is not a planet mass, so the axis label and title say what is
# actually plotted (they previously read "Planet Mass" / "Mass-Radius Relationship").

import plotly.express as px

fig = px.scatter(clear_exo,
                 x='Planetary_Radius_Earth_radii',
                 y='Fitted_Stellar_Density_g/cm**3',
                 color='Stellar_Effective_Temperature_K',
                 size='Planetary_Radius_Earth_radii',
                 log_x=True,
                 log_y=True,
                 labels={'Planetary_Radius_Earth_radii': 'Planet Radius (Earth Radii)',
                         'Fitted_Stellar_Density_g/cm**3': 'Fitted Stellar Density (g/cm³)'},
                 title='Fitted Stellar Density vs. Planet Radius (by Stellar Temperature)',
                )

fig.show()

In [None]:
# Orbital Period vs. Stellar Radius
#
# Log-log scatter of each object's orbital period against the radius of its host star.
# Marker area follows the stellar radius, colour the stellar effective temperature.

fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(data=clear_exo,
                x='Orbital_Period_days',
                y='Stellar_Radius_Solar_radii',
                size='Stellar_Radius_Solar_radii',
                hue='Stellar_Effective_Temperature_K',
                legend='brief',
                ax=ax)

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Orbital Period (days)')
ax.set_ylabel('Stellar Radius (Solar Radii)')
ax.set_title('Orbital Period vs. Stellar Radius')
plt.show()

In [None]:
# Planet Radius vs. Orbital Period
#
# Interactive log-log scatter: orbital period on x, planetary radius on y.
# Colour follows the equilibrium temperature, marker size the planetary radius.

axis_labels = {'Orbital_Period_days': 'Orbital Period (days)',
               'Planetary_Radius_Earth_radii': 'Planetary Radius (Earth Radii)'}

fig = px.scatter(clear_exo,
                 x='Orbital_Period_days',
                 y='Planetary_Radius_Earth_radii',
                 color='Equilibrium_Temperature_K',
                 size='Planetary_Radius_Earth_radii',
                 log_x=True,
                 log_y=True,
                 labels=axis_labels,
                 title='Planet Radius vs. Orbital Period')

fig.show()

In [None]:
# Incident Flux vs. Orbital Period
#
# Log-log scatter of the insolation flux against the orbital period.
# Marker area follows the stellar radius, colour the stellar effective temperature.

fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(data=clear_exo,
                x='Orbital_Period_days',
                y='Insolation_Flux_Earth_flux',
                size='Stellar_Radius_Solar_radii',
                hue='Stellar_Effective_Temperature_K',
                legend='brief',
                ax=ax)

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Orbital Period (days)')
ax.set_ylabel('Incident Flux (Earth Units)')
ax.set_title('Incident Flux vs. Orbital Period')
plt.show()

In [None]:
# Fitted Stellar Density vs. Planet Radius (coloured by equilibrium temperature)
#
# The y column is 'Fitted_Stellar_Density_g/cm**3', a density fitted for the host
# star. It was previously labelled "Planetary Density", which this column is not;
# the labels and title now describe the plotted quantity.

fig = px.scatter(clear_exo,
                 x='Planetary_Radius_Earth_radii',
                 y='Fitted_Stellar_Density_g/cm**3',
                 color='Equilibrium_Temperature_K',
                 size='Fitted_Stellar_Density_g/cm**3',
                 log_x=True,
                 log_y=True,
                 labels={'Planetary_Radius_Earth_radii': 'Planetary Radius (Earth Radii)',
                         'Fitted_Stellar_Density_g/cm**3': 'Fitted Stellar Density (g/cm³)'},
                 title='Fitted Stellar Density vs. Planet Radius (by Equilibrium Temperature)')

fig.show()

In [None]:
# Transit Depth vs. Stellar Radius
#
# Log-log scatter of the transit depth against the radius of the host star.
# Marker area follows the stellar radius, colour the stellar effective temperature.

fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(data=clear_exo,
                x='Stellar_Radius_Solar_radii',
                y='Transit_Depth_ppm',
                size='Stellar_Radius_Solar_radii',
                hue='Stellar_Effective_Temperature_K',
                legend='brief',
                ax=ax)

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Stellar Radius (Solar Radii)')
ax.set_ylabel('Transit Depth (ppm)')
ax.set_title('Transit Depth vs. Stellar Radius')
plt.show()

In [None]:
exo_data.head(10)

In [None]:
for column in exo_data.columns:
    print(column)

In [None]:
# Bookkeeping / identifier columns that are removed before modelling.
drop_names = [
    "Vetting_Status",
    "rowid",
    "Date_of_Last_Parameter_Update",
    "Disposition_Using_Kepler_Data",
    "Disposition_Provenance",
    "Link_to_DV_Report",
    "Link_to_DV_Summary",
    "KOI_Name",
    "Kepler_Name",
    "Disposition_Score",
]

exo_data = exo_data.drop(columns=drop_names)

In [None]:
exo_data.head()

In [None]:
exo_data.to_csv(r"data/cumulative_2024.09.03_new.csv")

In [None]:
print(f"The dataframe have {exo_data.shape[0]} row and {exo_data.shape[1]} columns")

In [None]:
exo_data.isnull().sum()

In [None]:
# The first positional argument of DataFrame.info() is `verbose`; the literal 113
# was only ever interpreted as a truthy flag, so spell that out.
exo_data.info(verbose=True)

## 3. Adatok elemzése

In [None]:
print(exo_data.RA_decimal_degrees.isnull().sum())
print(exo_data.Dec_decimal_degrees.isnull().sum())

In [None]:
exo_data.Exoplanet_Archive_Disposition.value_counts()

In [None]:
plt.figure(figsize=(8, 4))
exo_data.Exoplanet_Archive_Disposition.value_counts().plot(kind='bar', color=["red", "green", "blue"])
plt.show()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Extract data
Candidates = exo_data[exo_data.Exoplanet_Archive_Disposition == "CANDIDATE"]
Confirmed = exo_data[exo_data.Exoplanet_Archive_Disposition == "CONFIRMED"]

# RA and DEC for Candidates and Confirmed
ra = exo_data.RA_decimal_degrees
dec = exo_data.Dec_decimal_degrees
ra_c = Confirmed.RA_decimal_degrees
dec_c = Confirmed.Dec_decimal_degrees

# Plot
fig, ax = plt.subplots(figsize=(10, 10))

ax.scatter(ra, dec, s=3, label="Candidates", alpha=0.7)
ax.scatter(ra_c, dec_c, s=3, label="Confirmed", alpha=0.7)

ax.set_xlabel("Right Ascension (RA)")
ax.set_ylabel("Declination (DEC)")
ax.set_title("Scatter Plot of Exoplanet Candidates and Confirmed Exoplanets")

ax.legend()
ax.set_aspect('equal', adjustable='box')
plt.tight_layout()

In [None]:
# Separating the categorical (object-typed) features

categorical = exo_data.select_dtypes(include="object")
n_categorical = categorical.shape[1]
print("Categorical Features in DataSet:", n_categorical)
print(categorical.columns)

In [None]:
# from sklearn.preprocessing import LabelEncoder

# # Initialize the LabelEncoder
# enc = LabelEncoder()

# # Apply Label Encoding to koi_disposition
# exo_data["Exoplanet_Archive_Disposition_Encoded"] = enc.fit_transform(exo_data["Exoplanet_Archive_Disposition"])

# # Show the encoded column
# print(exo_data[["Exoplanet_Archive_Disposition", "Exoplanet_Archive_Disposition_Encoded"]].head())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit a separate encoder for every categorical column and keep each one, so the
# integer codes can be mapped back to the original labels later, e.g.
# label_encoders["Exoplanet_Archive_Disposition"].classes_.
# A single shared encoder is refitted on every column and only remembers the last one.
label_encoders = {}

for column in ['Exoplanet_Archive_Disposition', 'Comment', 'Planetary_Fit_Type',
               'Limb_Darkening_Model', 'Parameters_Provenance', 'TCE_Delivery',
               'Quarters', 'Transit_Model', 'Stellar_Parameter_Provenance']:
    le = LabelEncoder()
    exo_data[column + "_Encoded"] = le.fit_transform(exo_data[column])
    label_encoders[column] = le

## 4. Adatok tisztítása

In [None]:
### Handle Missing Values: Identify columns with missing values and decide how to handle them (e.g., remove rows, fill with mean/median, or use more advanced imputation methods).
### Outlier Detection: Look for outliers that could negatively impact your model and decide whether to remove or transform them.
### Normalize/Scale Data: Depending on the model, scaling features (e.g., using StandardScaler or MinMaxScaler in sklearn) might be necessary.

In [None]:
# Numbered listing of the missing-value count per column.
nulls = exo_data.isnull().sum()
for count, (key, value) in enumerate(nulls.items(), start=1):
    padding = (70 - len(key)) * ' '
    print(f"{count}  {key}{padding}{value}")

In [None]:
exo_data.drop(categorical.columns, inplace=True, axis=1)

In [None]:
exo_data.dropna(subset=["Orbital_Period_Upper_Unc_days"], inplace=True)

In [None]:
# Same numbered listing, repeated after the categorical columns and incomplete rows were dropped.
nulls = exo_data.isnull().sum()
for count, (key, value) in enumerate(nulls.items(), start=1):
    padding = (70 - len(key)) * ' '
    print(f"{count}  {key}{padding}{value}")

In [None]:
exo_data.shape

In [None]:
inf_values = exo_data[np.isinf(exo_data).any(axis=1)]
print(inf_values)

In [None]:
exo_data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
for i in exo_data.columns:
    print(i)

In [None]:
exo_data.describe()

## 5. Feature selection

In [None]:
### Feature Selection: Identify the most important features for your model. You can do this through domain knowledge, statistical tests, or feature importance scores.
### Categorical Variables: Convert categorical features into numerical format if needed (e.g., using one-hot encoding or label encoding).

In [None]:
classification_features = [ "Orbital_Period_days",
                            "Orbital_Period_days_weight",
                            "Transit_Epoch_BKJD",
                            "Transit_Epoch_BKJD_weight",
                            "Transit_Epoch_BJD",
                            "Transit_Epoch_BJD_weight",
                            "Impact_Parameter",
                            "Impact_Parameter_weight",
                            "Transit_Duration_hrs",
                            "Transit_Duration_hrs_weight",
                            "Transit_Depth_ppm",
                            "Transit_Depth_ppm_weight",
                            "Planet-Star_Radius_Ratio",
                            "Planet-Star_Radius_Ratio_weight",
                            "Fitted_Stellar_Density_g/cm**3",
                            "Fitted_Stellar_Density_g/cm**3_weight",
                            "Planetary_Radius_Earth_radii",
                            "Planetary_Radius_Earth_radii_weight",
                            "Orbit_Semi-Major_Axis_au",
                            "Inclination_deg",
                            "Equilibrium_Temperature_K",
                            "Insolation_Flux_Earth_flux",
                            "Insolation_Flux_Earth_flux_weight",
                            "Planet-Star_Distance_over_Star_Radius",
                            "Planet-Star_Distance_over_Star_Radius_weight",
                            "Stellar_Effective_Temperature_K",
                            "Stellar_Surface_Gravity_log10(cm/s**2)",
                            "Stellar_Metallicity_dex",
                            "Stellar_Radius_Solar_radii",
                            "Stellar_Mass_Solar_mass",
                            "Comment_Encoded",
                            "Planetary_Fit_Type_Encoded",
                            "Parameters_Provenance_Encoded",
                            "TCE_Delivery_Encoded",
                            "Quarters_Encoded",
                            "Stellar_Parameter_Provenance_Encoded",
                            "Exoplanet_Archive_Disposition_Encoded",
                          ]

In [None]:
def create_weight(feature, feature_err1, feature_err2, data=None):
    """Add a ``<feature>_weight`` column that is large when the measurement is precise.

    The weight is ``1 / (relative_uncertainty + epsilon)``, where the relative
    uncertainty is ``|err1 - err2| / (|feature| + epsilon)``.

    Parameters
    ----------
    feature : str
        Name of the measured column.
    feature_err1, feature_err2 : str
        Names of the two uncertainty columns; their absolute difference is used
        as the absolute uncertainty.
    data : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``exo_data``.

    Returns
    -------
    None
        The weight column is written into ``data``.
    """
    if data is None:
        data = exo_data

    # Small constant that keeps both divisions away from zero.
    epsilon = 1e-6

    abs_uncertainty = np.abs(data[feature_err1] - data[feature_err2])

    # Divide by the magnitude of the value: with the signed value a negative
    # measurement would yield a negative relative uncertainty and weight.
    rel_uncertainty = abs_uncertainty / (np.abs(data[feature]) + epsilon)

    # Only the weight is stored; the intermediate series stay local, so no
    # temporary columns have to be created and dropped again.
    data[feature + "_weight"] = 1 / (rel_uncertainty + epsilon)
    

In [None]:
create_weight("Orbital_Period_days", "Orbital_Period_Upper_Unc_days", "Orbital_Period_Lower_Unc_days")
print(exo_data[["Orbital_Period_days", "Orbital_Period_days_weight"]].head())

In [None]:
# (feature, upper-uncertainty column, lower-uncertainty column) for every remaining weighted feature.
weighted_features = [
    ("Transit_Epoch_BKJD", "Transit_Epoch_Upper_Unc_BKJD", "Transit_Epoch_Lower_Unc_BKJD"),
    ("Transit_Epoch_BJD", "Transit_Epoch_Upper_Unc_BJD", "Transit_Epoch_Lower_Unc_BJD"),
    ("Impact_Parameter", "Impact_Parameter_Upper_Unc", "Impact_Parameter_Lower_Unc"),
    ("Transit_Duration_hrs", "Transit_Duration_Upper_Unc_hrs", "Transit_Duration_Lower_Unc_hrs"),
    ("Transit_Depth_ppm", "Transit_Depth_Upper_Unc_ppm", "Transit_Depth_Lower_Unc_ppm"),
    ("Planet-Star_Radius_Ratio", "Planet-Star_Radius_Ratio_Upper_Unc", "Planet-Star_Radius_Ratio_Lower_Unc"),
    ("Fitted_Stellar_Density_g/cm**3", "Fitted_Stellar_Density_Upper_Unc_g/cm**3", "Fitted_Stellar_Density_Lower_Unc_g/cm**3"),
    ("Planetary_Radius_Earth_radii", "Planetary_Radius_Upper_Unc_Earth_radii", "Planetary_Radius_Lower_Unc_Earth_radii"),
    ("Insolation_Flux_Earth_flux", "Insolation_Flux_Upper_Unc_Earth_flux", "Insolation_Flux_Lower_Unc_Earth_flux"),
    ("Planet-Star_Distance_over_Star_Radius", "Planet-Star_Distance_over_Star_Radius_Upper_Unc", "Planet-Star_Distance_over_Star_Radius_Lower_Unc"),
]

for weighted_feature, upper_column, lower_column in weighted_features:
    create_weight(weighted_feature, upper_column, lower_column)

In [None]:
exo_data.head()

In [None]:
new_exo_data = exo_data[classification_features]

In [None]:
new_exo_data.shape

In [None]:
new_exo_data.isnull().sum()

In [None]:
new_exo_data = new_exo_data.dropna()

In [None]:
np.isinf(new_exo_data).sum()

In [None]:
new_exo_data.head(10)

In [None]:
import sweetviz as sv
from ydata_profiling import ProfileReport

# Report mappa létrehozása
directory_name = "EDA_report"
if not os.path.exists(directory_name):
    os.makedirs(directory_name)

In [None]:
# Ydata report elkészítése
report_ydata = ProfileReport(new_exo_data.head(100).copy(), title="My Data")
report_ydata.to_file("EDA_report/ydata_report.html")

In [None]:
plt.subplots(figsize=(20,20))
sns.heatmap(new_exo_data.corr(), square=True, annot=True)
plt.show()

In [None]:
new_exo_data.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
new_exo_data.describe()

In [None]:
# Set the last rows aside as a reserve set and keep the rest for modelling.
reserve_size = 100
reserve_exo_data = new_exo_data.iloc[-reserve_size:]
new_exo_data = new_exo_data.iloc[:-reserve_size]

In [None]:
reserve_exo_data.shape

In [None]:
new_exo_data.shape

## 6. Classification

In [None]:
### Predict whether a candidate is a confirmed exoplanet or a false positive.

### 6.1 KNN

In [None]:
# Outliers

from scipy import stats

# Z-scores are computed on the feature columns only. Scoring the encoded target as
# well would let the class label itself decide which rows are thrown away.
feature_columns = new_exo_data.columns.drop("Exoplanet_Archive_Disposition_Encoded")
z_scores = np.abs(stats.zscore(new_exo_data[feature_columns]))
print("Z-scores calculated.")

# A value more than `threshold` standard deviations from its column mean is an outlier.
threshold = 3

# Boolean table: True where a single value is an outlier.
outliers = (z_scores > threshold)

# Positional indices of the rows containing at least one outlying value.
outlier_indices = np.where(outliers.any(axis=1))[0]
print(f"Number of outlier rows: {len(outlier_indices)}")
print("Outlier rows indices:", outlier_indices)

In [None]:
new_exo_data = new_exo_data.reset_index(drop=True)
cleaned_data = new_exo_data.drop(outlier_indices, axis=0)
print(f"Number of rows after removing outliers: {len(cleaned_data)}")

In [None]:
# Add log(x + 1) versions of the two columns listed below; the raw columns are kept.
log_feature_sources = (
    ('Log_Orbital_Period', 'Orbital_Period_days'),
    ('Log_Transit_Depth', 'Transit_Depth_ppm'),
)
for log_column, raw_column in log_feature_sources:
    cleaned_data[log_column] = np.log(cleaned_data[raw_column] + 1)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Target vector and feature matrix.
y = cleaned_data["Exoplanet_Archive_Disposition_Encoded"]
X = cleaned_data.drop(columns="Exoplanet_Archive_Disposition_Encoded")

# Standardise every feature. The scaler is fitted on all rows, i.e. before any
# train/test split is made.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X))

In [None]:
def get_accuracy(model, X_train, X_test, y_train, y_test):
    """Return the accuracy of an already fitted ``model`` on the test split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_train, y_train : unused
        Accepted only so existing calls keep working. The training predictions
        and training accuracy used to be computed here and then discarded, which
        cost an extra ``predict`` pass without affecting the result.
    X_test, y_test : test features and true labels

    Returns
    -------
    float
        ``accuracy_score(y_test, model.predict(X_test))``.
    """
    return accuracy_score(y_test, model.predict(X_test))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33/2, random_state=42)

In [None]:
print(X_train.shape)
print(y_train.shape)

In [None]:
print(X_test.shape)
print(y_test.shape)

In [None]:
# Fit a 3-nearest-neighbour classifier and print train / test accuracy in percent.
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X_train, y_train)

y_train_pred = KNN.predict(X_train)
y_test_pred = KNN.predict(X_test)

for true_labels, predicted_labels in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(f"{round(accuracy_score(true_labels, predicted_labels)* 100, 2)}")

In [None]:
# Compare true and predicted classes on the first ten rows of the reserve set.
# The earlier loop printed reserve labels next to predictions made for X_test,
# i.e. for unrelated rows.
reserve_target = "Exoplanet_Archive_Disposition_Encoded"
reserve_sample = reserve_exo_data.head(10)

# Rebuild the model inputs exactly like X: same log features, same column order,
# same fitted scaler.
reserve_features = reserve_sample.drop(columns=reserve_target).assign(
    Log_Orbital_Period=np.log(reserve_sample["Orbital_Period_days"] + 1),
    Log_Transit_Depth=np.log(reserve_sample["Transit_Depth_ppm"] + 1),
)[X.columns]
reserve_pred = KNN.predict(scaler.transform(reserve_features))

for actual, predicted in zip(reserve_sample[reserve_target].values, reserve_pred):
    print(f"{actual} - {predicted}")

### 6.2 KNN with Cross Validation

In [None]:
# Carve a test split off the full data, then a validation split off the remaining training data.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33/2, random_state=42)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.33/2, random_state=20)

KNN_cv = KNeighborsClassifier(n_neighbors=3)
KNN_cv.fit(X_train, y_train)

y_train_pred = KNN_cv.predict(X_train)
y_test_pred = KNN_cv.predict(X_test)
y_cv_pred = KNN_cv.predict(X_cv)

# Accuracy in percent for each split.
train_score = round(accuracy_score(y_train, y_train_pred)* 100, 2)
test_score = round(accuracy_score(y_test, y_test_pred)* 100, 2)
cv_score = round(accuracy_score(y_cv, y_cv_pred)* 100, 2)

print(f"Accuracy score Train: {train_score}")
print(f"Accuracy score Test: {test_score}")
print(f"Accuracy score Cross Validation: {cv_score}")

### 6.3 K-Fold Validation

In [None]:
# Target and features for the K-Fold experiments.
y_KF = cleaned_data["Exoplanet_Archive_Disposition_Encoded"]
X_KF = cleaned_data.drop(columns="Exoplanet_Archive_Disposition_Encoded")

# Scaling (fitted on all rows before splitting)
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_KF))

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_KF, test_size=0.33, random_state=500)

# Positional fold indices are used below, so give both objects a clean 0..n-1 index.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

kf = KFold(n_splits=5)
print(kf)

accuracies = []
for i, (train_index, cv_index) in enumerate(kf.split(X_train)):
    fold_X, fold_y = X_train.iloc[train_index], y_train.iloc[train_index]
    held_out_X, held_out_y = X_train.iloc[cv_index], y_train.iloc[cv_index]

    KNN_KF = KNeighborsClassifier(n_neighbors=10)
    KNN_KF.fit(fold_X, fold_y)

    y_train_pred = KNN_KF.predict(fold_X)
    y_cv_pred = KNN_KF.predict(held_out_X)
    train_accuracy = accuracy_score(fold_y, y_train_pred)
    cv_accuracy = accuracy_score(held_out_y, y_cv_pred)

    print("CV Accuracy:", cv_accuracy)
    accuracies.append(cv_accuracy)
print("Mean CV accuracy: ", np.mean(accuracies))

In [None]:
train_index

### 6.4 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same split as the KNN K-Fold experiment (identical random_state).
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_KF, test_size=0.33, random_state=500)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

kf2 = KFold(n_splits=5)
print(kf2)

accuracies = []
for i, (train_index, cv_index) in enumerate(kf2.split(X_train)):
    fit_X, fit_y = X_train.iloc[train_index], y_train.iloc[train_index]
    val_X, val_y = X_train.iloc[cv_index], y_train.iloc[cv_index]

    # A fresh forest per fold; the model of the last fold stays available as rf_KF.
    rf_KF = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=0)
    rf_KF.fit(fit_X, fit_y)

    y_train_pred = rf_KF.predict(fit_X)
    y_cv_pred = rf_KF.predict(val_X)
    train_accuracy = accuracy_score(fit_y, y_train_pred)
    cv_accuracy = accuracy_score(val_y, y_cv_pred)

    print("CV Accuracy:", cv_accuracy)
    accuracies.append(cv_accuracy)
print("Mean CV accuracy: ", np.mean(accuracies))

In [None]:
from sklearn.metrics import confusion_matrix

y_test_pred = rf_KF.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_test_pred)
print(f"Confusion Matrix for the Test Set:\n {conf_matrix}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

feature_importances = rf_KF.feature_importances_
feature_names = X_train.columns

sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = np.array(feature_importances)[sorted_indices]
sorted_feature_names = np.array(feature_names)[sorted_indices]

top_n = 40
top_n_indices = sorted_indices[:top_n]
top_n_feature_importances = sorted_feature_importances[:top_n]
top_n_feature_names = sorted_feature_names[:top_n]

plt.figure(figsize=(20, 6))
plt.title("Top 10 Feature Importances in Random Forest")
plt.bar(top_n_feature_names, top_n_feature_importances, width=0.5)
plt.xlabel("Features")
plt.ylabel("Importance")
plt.xticks(rotation=45)
plt.show()

### 6.5 Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Same split as the other K-Fold experiments (identical random_state).
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_KF, test_size=0.33, random_state=500)
accuracies = []

# Positional fold indices are used below, so reset both indexes to 0..n-1.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

kf3 = KFold(n_splits=5)
# Print the splitter used in this cell (kf2 from the Random Forest cell was printed before).
print(kf3)

for i, (train_index, cv_index) in enumerate(kf3.split(X_train)):
    # Fixed seed so repeated runs give the same scores, as in the Random Forest cell.
    gbc = GradientBoostingClassifier(max_depth=5, n_estimators=100, learning_rate=0.2, random_state=0)
    gbc.fit(X_train.iloc[train_index], y_train.iloc[train_index])
    y_train_pred = gbc.predict(X_train.iloc[train_index])
    y_cv_pred = gbc.predict(X_train.iloc[cv_index])
    train_accuracy = accuracy_score(y_train.iloc[train_index], y_train_pred)
    cv_accuracy = accuracy_score(y_train.iloc[cv_index], y_cv_pred)
    print("CV Accuracy:", cv_accuracy)
    accuracies.append(cv_accuracy)
print("Mean CV accuracy: ", np.mean(accuracies))

### 6.6 LightGBM

In [None]:
from lightgbm import LGBMClassifier

# Same split as the other K-Fold experiments (identical random_state).
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_KF, test_size=0.33, random_state=500)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

kf4 = KFold(n_splits=5)
print(kf4)

# Hyper-parameters shared by the model of every fold.
lgbm_params = dict(boosting_type='gbdt',
                   colsample_bytree=1.0,
                   learning_rate=0.1,
                   max_depth=5,
                   min_child_samples=10,
                   n_estimators=100,
                   num_leaves=20,
                   objective='multiclass',
                   subsample=1.0)

accuracies = []
for i, (train_index, cv_index) in enumerate(kf4.split(X_train)):
    fit_X, fit_y = X_train.iloc[train_index], y_train.iloc[train_index]
    val_X, val_y = X_train.iloc[cv_index], y_train.iloc[cv_index]

    lgbm = LGBMClassifier(**lgbm_params)
    lgbm.fit(fit_X, fit_y)

    y_train_pred = lgbm.predict(fit_X)
    y_cv_pred = lgbm.predict(val_X)
    train_accuracy = accuracy_score(fit_y, y_train_pred)
    cv_accuracy = accuracy_score(val_y, y_cv_pred)

    print("CV Accuracy:", cv_accuracy)
    accuracies.append(cv_accuracy)
print("Mean CV accuracy: ", np.mean(accuracies))

### 6.7 Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam


# Split the unscaled features into train / validation / test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33/2, random_state=42)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.33/2, random_state=20)

# Fit the scaler on the training part only and apply it to the other two parts.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_cv = scaler.transform(X_cv)
X_test = scaler.transform(X_test)

# One-hot encode the labels for multiclass classification
y_train = to_categorical(y_train, num_classes=3)
y_cv = to_categorical(y_cv, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

# Fully connected network with three outputs, one per class.
# The output layer uses softmax: categorical_crossentropy expects the three
# outputs to form one probability distribution over mutually exclusive classes,
# which independent sigmoid units do not provide.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')  # 3 output neurons, one for each class
])

# Compile the model for multiclass classification
model.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])

# Halve the learning rate after 3 epochs without improvement.
lr_scheduler = ReduceLROnPlateau(factor=0.5, patience=3)

# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=50, validation_data=(X_cv, y_cv), batch_size=64,
                    callbacks=[lr_scheduler, early_stopping], verbose=1)


In [None]:
import matplotlib.pyplot as plt


# Per-epoch metrics recorded by model.fit
metrics_by_name = history.history

training_accuracy = metrics_by_name['accuracy']
validation_accuracy = metrics_by_name['val_accuracy']
training_loss = metrics_by_name['loss']
validation_loss = metrics_by_name['val_loss']

# One x position per recorded epoch
epochs_range = range(len(metrics_by_name['loss']))

plt.figure(figsize=(12, 6))

# Left panel: accuracy curves
plt.subplot(1, 2, 1)
plt.plot(epochs_range, training_accuracy, label='Training Accuracy')
plt.plot(epochs_range, validation_accuracy, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend(loc='lower right')

# Right panel: loss curves
plt.subplot(1, 2, 2)
plt.plot(epochs_range, training_loss, label='Training Loss')
plt.plot(epochs_range, validation_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.legend(loc='upper right')

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc
from sklearn.preprocessing import label_binarize

# Predicted class scores for the test split, one column per class
y_pred_prob = model.predict(X_test, verbose=0)

# Indicator matrix of the true labels, one column per class.
# NOTE(review): y_test appears to be one-hot encoded already by the training
# cell, so this call presumably just passes it through - confirm.
y_test_binarized = label_binarize(y_test, classes=[0, 1, 2])

# Per-class curve data, keyed by class index
precision, recall, auc_score = {}, {}, {}

# One-vs-rest precision-recall curve and the area under it for each class
for class_id in range(3):
    true_column = y_test_binarized[:, class_id]
    score_column = y_pred_prob[:, class_id]
    precision[class_id], recall[class_id], _ = precision_recall_curve(true_column, score_column)
    auc_score[class_id] = auc(recall[class_id], precision[class_id])

# Draw all three curves on a single set of axes
plt.figure(figsize=(8, 6))
for class_id, class_auc in auc_score.items():
    plt.plot(recall[class_id], precision[class_id], label=f'Class {class_id} PR-AUC = {class_auc:.2f}')

plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

### 6.8 Comparison of the Models

In [None]:
# Score table for the K-Neighbors and Random Forest classifiers, shown best first.
# get_accuracy, KNN_KF and rf_KF are defined in earlier cells.
# NOTE(review): when the notebook runs top to bottom, X_train / X_test / y_train /
# y_test are the arrays rebound by the neural-network cell (standardised features,
# one-hot labels) - confirm these are the splits get_accuracy is meant to receive.
models = pd.DataFrame({
    'Model': ['K-Neighbors Classifier', 'RandomForestClassifier'],
    'Score': [get_accuracy(KNN_KF, X_train, X_test, y_train, y_test),
              get_accuracy(rf_KF, X_train, X_test, y_train, y_test),
              # get_accuracy(gbc, X_train, X_test, y_train, y_test),
              # get_accuracy(lgbm, X_train, X_test, y_train, y_test),
             ]})
models.sort_values(by='Score', ascending=False)

## 7. Regression

In [None]:
### Predict the size or other attributes of an exoplanet.

### 7.1 Linear Regression

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Columns used in the regression section: four predictors followed by the
# target ("Planet-Star_Distance_over_Star_Radius").
regression_features = [
    "Orbital_Period_days",
    "Stellar_Radius_Solar_radii",
    "Stellar_Effective_Temperature_K",
    "Stellar_Mass_Solar_mass",
    "Planet-Star_Distance_over_Star_Radius",
]

In [None]:
# Keep only the regression columns (predictors plus target) from exo_data
reg_exo_data = exo_data[regression_features]

In [None]:
# (rows, columns) of the regression subset
reg_exo_data.shape

In [None]:
# Missing values per regression column
reg_exo_data.isnull().sum()

In [None]:
# One 50-bin histogram per regression column to inspect the distributions
reg_exo_data.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Outliers

In [None]:
import numpy as np
from scipy import stats

# A value is treated as an outlier when its absolute z-score exceeds this limit
threshold = 3

# Absolute z-score of every cell, computed per column
z_scores = np.abs(stats.zscore(reg_exo_data))
print("Z-scores calculated.")

# Cell-wise outlier mask, then collapse it to one flag per row
outliers = z_scores > threshold
rows_with_outlier = outliers.any(axis=1)

# Positional indices of the rows holding at least one outlying value
outlier_indices = np.where(rows_with_outlier)[0]
print(f"Number of outlier rows: {len(outlier_indices)}")
print("Outlier rows indices:", outlier_indices)

In [None]:
# outlier_indices are positional, so renumber the index 0..n-1 first; after that
# the positions can be passed to drop() as labels.
reg_exo_data = reg_exo_data.reset_index(drop=True)
# NOTE(review): this rebinds `cleaned_data` to the regression columns only, while
# section 9 reads cleaned_data["Exoplanet_Archive_Disposition_Encoded"], which is
# not in regression_features - presumably an earlier classification frame is being
# shadowed here; confirm and consider a distinct name.
cleaned_data = reg_exo_data.drop(outlier_indices, axis=0)
print(f"Number of rows after removing outliers: {len(cleaned_data)}")

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler


# Separate the regression target from its predictors.
# NOTE(review): this reads reg_exo_data, i.e. the frame that still contains the
# rows flagged as outliers; cleaned_data is not used here - confirm that is intended.
lr_target_column = "Planet-Star_Distance_over_Star_Radius"
y_LR = reg_exo_data[lr_target_column]
X_LR = reg_exo_data.drop(columns=[lr_target_column])

# Standardise the predictors and wrap the result in a DataFrame again
scaler = StandardScaler()
X_LR_scaled = pd.DataFrame(scaler.fit_transform(X_LR))

# Distribution of each standardised predictor
X_LR_scaled.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Target as an (n, 1) column array, the 2-D layout the scaler expects
y_orig = reg_exo_data[["Planet-Star_Distance_over_Star_Radius"]].to_numpy()

# Rescale the target with MinMaxScaler.
# Note: `scaler` and `y` are rebound here, replacing the objects of the same
# name created by earlier cells.
scaler = MinMaxScaler()
scaler.fit(y_orig)
y = scaler.transform(y_orig)

In [None]:
# Hold out 16.5% of the rows for testing, then 16.5% of the remaining rows for
# validation (X_cv / y_cv). Fixed seeds keep both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_LR_scaled, y, test_size=0.33/2, random_state=42)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.33/2, random_state=20)

In [None]:
import matplotlib.pyplot as plt

# Ordinary least squares fitted on the training split
reg = LinearRegression().fit(X_train, y_train)

# Test-set predictions and their residuals (true minus predicted)
y_test_pred = reg.predict(X_test)
residuals = y_test - y_test_pred

# Panel 1: predictions against true values; the dashed diagonal marks a perfect fit
diagonal = [y_test.min(), y_test.max()]
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_test_pred, alpha=0.5)
plt.plot(diagonal, diagonal, 'r--', lw=2)
plt.title('True Values vs Predictions')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()

# Panel 2: residuals against predictions; the dashed line marks zero error
plt.figure(figsize=(10, 6))
plt.scatter(y_test_pred, residuals, alpha=0.5)
plt.axhline(0, color='r', linestyle='--')
plt.title('Residuals vs Predictions')
plt.xlabel('Predictions')
plt.ylabel('Residuals')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Work on a copy of cleaned_data so this analysis cannot modify it
data = cleaned_data.copy()

# Z-Score Method: flag cells more than 3 standard deviations from the column mean
z_scores = np.abs(stats.zscore(data))
outliers_z_score = np.where(z_scores > 3)
# np.where on a 2-D mask yields one entry per offending cell, so a row with
# several outlying columns appears several times; keep each row position once
outlier_rows_z_score = np.unique(outliers_z_score[0])

# IQR Method: flag cells outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of their column
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
outliers_iqr = ((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR)))
# A row is an outlier when ANY of its columns is flagged
outlier_rows_iqr = outliers_iqr.any(axis=1)

# Plotting
plt.figure(figsize=(14, 7))

# Box Plot for each feature
plt.subplot(1, 2, 1)
data.boxplot()
plt.title('Box Plot of Features')

# Scatter plot of one predictor (orbital period) against the regression target
plt.subplot(1, 2, 2)
plt.scatter(data["Orbital_Period_days"], data["Planet-Star_Distance_over_Star_Radius"], alpha=0.5)
plt.xlabel('Orbital_Period_days')
plt.ylabel('Planet-Star Distance over Star Radius')
plt.title('Orbital Period vs Planet-Star Distance over Star Radius')

plt.tight_layout()
plt.show()

# Show outliers
print("Outliers detected using Z-score:")
print(data.iloc[outlier_rows_z_score])

# Select whole rows with the row-level mask. Masking with the cell-level frame
# and then calling dropna() would keep only rows in which every column is an
# outlier, which is almost always empty.
print("\nOutliers detected using IQR:")
print(data[outlier_rows_iqr])

### 7.2 Ridge

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

# Ridge (L2-penalised) linear regression, fitted on the training split
ridge_model = Ridge(alpha=0.1).fit(X_train, y_train)

# Evaluate on the test split
y_pred_ridge = ridge_model.predict(X_test)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
ridge_test_score = ridge_model.score(X_test, y_test)

# Report the fitted parameters, then the test-set metrics
print("Model coefficients:", ridge_model.coef_)
print("Intercept:", ridge_model.intercept_)

print("Mean Absolute Error (MAE) with Ridge: ", mae_ridge)
print("Ridge Score: ", ridge_test_score)

In [None]:
# Same fitted Ridge model, now evaluated on the validation split.
# y_pred_ridge and mae_ridge are rebound to the validation results.
y_pred_ridge = ridge_model.predict(X_cv)
mae_ridge = mean_absolute_error(y_cv, y_pred_ridge)
ridge_cv_score = ridge_model.score(X_cv, y_cv)

print("Model coefficients:", ridge_model.coef_)
print("Intercept:", ridge_model.intercept_)

print("Mean Absolute Error (MAE) with Ridge: ", mae_ridge)
print("Ridge Score: ", ridge_cv_score)

### 7.3 Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error

# Lasso (L1-penalised) linear regression, fitted on the training split.
# NOTE(review): the target was min-max scaled in an earlier cell, so alpha=0.1
# may be strong enough to shrink every coefficient to zero - check the printed
# coefficients.
lasso_model = Lasso(alpha=0.1).fit(X_train, y_train)

# Evaluate on the test split
y_pred_lasso = lasso_model.predict(X_test)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
lasso_test_score = lasso_model.score(X_test, y_test)

# Report the fitted parameters, then the test-set metrics
print("Model coefficients:", lasso_model.coef_)
print("Intercept:", lasso_model.intercept_)

print("Mean Absolute Error (MAE) with Lasso: ", mae_lasso)
print("Lasso Score: ", lasso_test_score)

In [None]:
# Same fitted Lasso model, now evaluated on the validation split.
# y_pred_lasso and mae_lasso are rebound to the validation results.
y_pred_lasso = lasso_model.predict(X_cv)
mae_lasso = mean_absolute_error(y_cv, y_pred_lasso)
lasso_cv_score = lasso_model.score(X_cv, y_cv)

print("Model coefficients:", lasso_model.coef_)
print("Intercept:", lasso_model.intercept_)

print("Mean Absolute Error (MAE) with Lasso: ", mae_lasso)
print("Lasso Score: ", lasso_cv_score)

### 7.4 RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 50 trees of depth <= 10; the fixed seed makes the fit repeatable
rfr = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=0)
# y_train is an (n, 1) column vector (output of the target scaler); the forest
# expects a 1-D target, so flatten it instead of relying on the implicit
# conversion and its warning
rfr.fit(X_train, np.ravel(y_train))

# Predictions for the test split
y_pred_rfr = rfr.predict(X_test)

mae_rfr = mean_absolute_error(y_test, y_pred_rfr)

print("Mean Absolute Error (MAE) with Random Forest Regressor: ", mae_rfr)

### 7.5 Comparison of the Models

In [None]:
def get_accuracy_reg(model, X_tr, y_tr):
    """Score an already-fitted model on the given features and targets.

    Args:
        model: Any object exposing ``score(X, y)``.
        X_tr: Feature matrix forwarded to ``model.score``.
        y_tr: Target values forwarded to ``model.score``.

    Returns:
        Whatever ``model.score(X_tr, y_tr)`` returns. The model is not fitted
        or modified by this helper itself.
    """
    score = model.score(X_tr, y_tr)
    return score

In [None]:
# Fitted regressors keyed by the label shown in the comparison table
fitted_regressors = {
    'LinearRegression': reg,
    'Ridge': ridge_model,
    'Lasso': lasso_model,
    'RandomForestRegressor': rfr,
}

# One test-set score per model, listed best first
models = pd.DataFrame({
    'Model': list(fitted_regressors),
    'Score': [get_accuracy_reg(estimator, X_test, y_test)
              for estimator in fitted_regressors.values()],
})
models.sort_values(by='Score', ascending=False)

## 8. Clustering

### 8.1 Features

In [None]:
### Group similar exoplanet candidates together.

In [None]:
# Planetary, transit and stellar columns fed to the clustering section
clustering_features = [
    "Orbital_Period_days",
    "Planetary_Radius_Earth_radii",
    "Stellar_Radius_Solar_radii",
    "Stellar_Surface_Gravity_log10(cm/s**2)",
    "Stellar_Effective_Temperature_K",
    "Transit_Depth_ppm",
    "Transit_Duration_hrs",
    "Insolation_Flux_Earth_flux",
    "Stellar_Mass_Solar_mass",
    "Stellar_Metallicity_dex",
]

In [None]:
# Keep only the clustering columns from exo_data
cluster_exo_data = exo_data[clustering_features]

In [None]:
# (rows, columns) of the clustering subset before missing values are removed
cluster_exo_data.shape

In [None]:
# One 50-bin histogram per clustering column to inspect the distributions
cluster_exo_data.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Missing values per clustering column
cluster_exo_data.isnull().sum()

In [None]:
# Drop every row with a missing value in any clustering column.
# The index is not reset, so the surviving rows keep their previous labels.
cluster_exo_data = cluster_exo_data.dropna()

In [None]:
# Re-check missing values after dropna(); every count should now be zero
cluster_exo_data.isnull().sum()

In [None]:
# (rows, columns) left after the incomplete rows were removed
cluster_exo_data.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the clustering columns; the result is a plain array in the same
# row order as cluster_exo_data.
# Note: `scaler` and `X_scaled` are rebound again by the fine-tuning cells in section 9.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(cluster_exo_data)

### 8.2 KMeans

In [None]:
from sklearn.cluster import KMeans

# Apply K-Means clustering with an initial guess of 5 clusters
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)

# Get the cluster labels (one per row of X_scaled, in the same order)
labels = kmeans.labels_

# Add the labels back to the DataFrame for analysis. assign() builds a new frame
# rather than writing into one derived from exo_data, which avoids pandas'
# SettingWithCopyWarning; the labels are attached positionally.
cluster_exo_data = cluster_exo_data.assign(Cluster=labels)

In [None]:
import matplotlib.pyplot as plt
# Candidate numbers of clusters for the elbow method
cluster_counts = range(1, 11)

# Fit one throw-away model per k and record its inertia. The models are not
# bound to `kmeans`, so the 5-cluster model fitted earlier (the one `labels`
# came from) is not overwritten by the k=10 fit.
inertia = [
    KMeans(n_clusters=k, random_state=42).fit(X_scaled).inertia_
    for k in cluster_counts
]

# Plot inertia to see the 'elbow'
plt.plot(cluster_counts, inertia, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method: Inertia vs Number of Clusters')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the cluster assignment held in `labels`, computed on the
# standardised features
score = silhouette_score(X_scaled, labels)
print(f'Silhouette Score: {score}')

### 8.3 PCA

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto their first 5 principal components
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

# Only the first two components are drawn; points are coloured by cluster label
first_component, second_component = X_pca[:, 0], X_pca[:, 1]
plt.scatter(first_component, second_component, c=labels, cmap='viridis')
plt.title('Clusters Visualization')
plt.show()

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA

# Apply PCA to reduce the data to 5 components (the first 3 are plotted below)
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

# Create a DataFrame with the PCA results and cluster labels
df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
# df_pca has a fresh 0..n-1 index, whereas cluster_exo_data keeps the labels left
# over from dropna(). Assigning the Series directly would align on those labels
# and could misplace clusters or leave NaN, so copy the values positionally.
df_pca['Cluster'] = cluster_exo_data['Cluster'].to_numpy()

# Plot the clusters in 3D using Plotly
fig = px.scatter_3d(df_pca, x='PC1', y='PC2', z='PC3', color='Cluster',
                    title='3D Cluster Visualization with PCA',
                    labels={'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2', 'PC3': 'Principal Component 3'},
                    opacity=0.7)

# Show the interactive plot
fig.show()


## 9. Fine-tuning and Optimization

In [None]:
### Fine-Tuning Gradient Boosting

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold


# Target and features for the classification task.
# NOTE(review): cleaned_data must be the frame that contains
# "Exoplanet_Archive_Disposition_Encoded"; the regression section rebinds
# cleaned_data to a frame without that column - confirm which frame is live here.
y_KF = cleaned_data["Exoplanet_Archive_Disposition_Encoded"]
X_KF = cleaned_data.drop(columns="Exoplanet_Archive_Disposition_Encoded")

# Scaling
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_KF))

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_KF, test_size=0.33, random_state=500)

# Renumber the training rows 0..n-1 and hand the estimator a plain 1-D label array
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True).to_numpy()

kf_gb = KFold(n_splits=5)

# Only parameters that scikit-learn's GradientBoostingClassifier accepts.
# LightGBM-only names (boosting_type, objective, num_leaves, min_child_samples,
# colsample_bytree) make GridSearchCV fail with "Invalid parameter".
# min_samples_leaf and max_features are the closest scikit-learn counterparts of
# min_child_samples and colsample_bytree.
param_grid_gb_expanded = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [30, 50, 70, 100],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [5, 10, 20],
    'subsample': [0.8, 0.9, 1.0],
    'max_features': [0.8, 1.0],
}

# random_state keeps the subsampled fits repeatable between runs
grid_search_gb = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                              param_grid=param_grid_gb_expanded,
                              cv=kf_gb, scoring='accuracy',
                              verbose=1, n_jobs=-1, return_train_score=True)

grid_search_gb.fit(X_train, y_train)

# cv_results_ holds one score per grid candidate, so it cannot be plotted directly
# against the 4 n_estimators values. Average the scores over all other
# hyper-parameters to get one point per n_estimators.
cv_results_gb = pd.DataFrame(grid_search_gb.cv_results_)
scores_by_n_estimators = (
    cv_results_gb
    .assign(n_estimators=cv_results_gb['param_n_estimators'].astype(int))
    .groupby('n_estimators')[['mean_test_score', 'mean_train_score']]
    .mean()
)
n_estimators_range = scores_by_n_estimators.index.to_numpy()
mean_test_scores = scores_by_n_estimators['mean_test_score'].to_numpy()
mean_train_scores = scores_by_n_estimators['mean_train_score'].to_numpy()

best_params_gb = grid_search_gb.best_params_
best_score_gb = grid_search_gb.best_score_

plt.figure(figsize=(10, 6))
plt.plot(n_estimators_range, mean_test_scores, label='Validation Accuracy', marker='o')
plt.plot(n_estimators_range, mean_train_scores, label='Train Accuracy', marker='x')
plt.xlabel('Number of Estimators (n_estimators)')
plt.ylabel('Accuracy')
plt.title('Train and Validation Accuracy vs Number of Estimators')
plt.legend()
plt.grid(True)
plt.show()

print(f"Best Parameters: {best_params_gb}")
print(f"Best Score (Accuracy): {best_score_gb}")

In [None]:
### Fine-Tuning LGBM

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold


# Target and features for the classification task.
# NOTE(review): cleaned_data must be the frame that contains
# "Exoplanet_Archive_Disposition_Encoded"; the regression section rebinds
# cleaned_data to a frame without that column - confirm which frame is live here.
y_KF = cleaned_data["Exoplanet_Archive_Disposition_Encoded"]
X_KF = cleaned_data.drop(columns="Exoplanet_Archive_Disposition_Encoded")

# Scaling
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_KF))

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_KF, test_size=0.33, random_state=500)

# Renumber the training rows 0..n-1 and hand the estimator a plain 1-D label array
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True).to_numpy()

kf_lgb = KFold(n_splits=5)

# NOTE(review): in LightGBM `subsample` presumably has no effect unless
# subsample_freq is set above 0 - confirm, otherwise this axis only triples the
# search time.
param_grid_lgb_expanded = {
    'boosting_type': ['gbdt', 'dart'],
    'objective': ['multiclass'],
    'num_leaves': [20, 31, 40],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [30, 50, 70, 100],
    'max_depth': [3, 5, 7],
    'min_child_samples': [5, 10, 20],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

grid_search_lgb = GridSearchCV(estimator=LGBMClassifier(), param_grid=param_grid_lgb_expanded,
                               cv=kf_lgb, scoring='accuracy',
                               verbose=1, n_jobs=-1, return_train_score=True)

grid_search_lgb.fit(X_train, y_train)

# cv_results_ holds one score per grid candidate, so it cannot be plotted directly
# against the 4 n_estimators values. Average the scores over all other
# hyper-parameters to get one point per n_estimators.
cv_results_lgb = pd.DataFrame(grid_search_lgb.cv_results_)
scores_by_n_estimators = (
    cv_results_lgb
    .assign(n_estimators=cv_results_lgb['param_n_estimators'].astype(int))
    .groupby('n_estimators')[['mean_test_score', 'mean_train_score']]
    .mean()
)
n_estimators_range = scores_by_n_estimators.index.to_numpy()
mean_test_scores = scores_by_n_estimators['mean_test_score'].to_numpy()
mean_train_scores = scores_by_n_estimators['mean_train_score'].to_numpy()

best_params_lgb = grid_search_lgb.best_params_
best_score_lgb = grid_search_lgb.best_score_

plt.figure(figsize=(10, 6))
plt.plot(n_estimators_range, mean_test_scores, label='Validation Accuracy', marker='o')
plt.plot(n_estimators_range, mean_train_scores, label='Train Accuracy', marker='x')
plt.xlabel('Number of Estimators (n_estimators)')
plt.ylabel('Accuracy')
plt.title('Train and Validation Accuracy vs Number of Estimators')
plt.legend()
plt.grid(True)
plt.show()

print(f"Best Parameters: {best_params_lgb}")
print(f"Best Score (Accuracy): {best_score_lgb}")

In [None]:
### Fine-Tuning Neural Network

## 10. Deployment

In [None]:
### Once the model is performing well, consider deploying it as an API or a web app to make predictions on new data.