In [10]:
# Dataset: "K2 Planets and Candidates" table from the NASA Exoplanet Archive.
# https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=k2pandc

In [19]:
import pandas as pd
# Load the raw table. Rows the parser cannot handle are skipped instead of
# raising, so the row count may be lower than the line count of the file.
df = pd.read_csv('sample_data/exoplanet_dataset.csv', on_bad_lines='skip')

# Partition by disposition:
#   * `train` - rows whose disposition is already settled (usable as labels)
#   * `test`  - rows still marked CANDIDATE (no label to evaluate against)
is_labeled = df['disposition'].isin(['CONFIRMED', 'FALSE POSITIVE'])
is_candidate = df['disposition'] == 'CANDIDATE'
train = df[is_labeled]
test = df[is_candidate]

# Report how many rows landed in each partition.
print(f"Train dataset size: {len(train)}")
print(f"Test dataset size: {len(test)}")

Train dataset size: 2402
Test dataset size: 1370


In [22]:
# Display every column name available in the labeled subset.
train.columns

Index(['pl_name', 'hostname', 'default_flag', 'disposition', 'disp_refname',
       'sy_snum', 'sy_pnum', 'discoverymethod', 'disc_year', 'disc_facility',
       'soltype', 'pl_controv_flag', 'pl_refname', 'pl_orbper',
       'pl_orbpererr1', 'pl_orbpererr2', 'pl_orbperlim', 'pl_orbsmax',
       'pl_orbsmaxerr1', 'pl_orbsmaxerr2', 'pl_orbsmaxlim', 'pl_rade',
       'pl_radeerr1', 'pl_radeerr2', 'pl_radelim', 'pl_radj', 'pl_radjerr1',
       'pl_radjerr2', 'pl_radjlim', 'pl_bmasse', 'pl_bmasseerr1',
       'pl_bmasseerr2', 'pl_bmasselim', 'pl_bmassj', 'pl_bmassjerr1',
       'pl_bmassjerr2', 'pl_bmassjlim', 'pl_bmassprov', 'pl_orbeccen',
       'pl_orbeccenerr1', 'pl_orbeccenerr2', 'pl_orbeccenlim', 'pl_insol',
       'pl_insolerr1', 'pl_insolerr2', 'pl_insollim', 'pl_eqt', 'pl_eqterr1',
       'pl_eqterr2', 'pl_eqtlim', 'ttv_flag', 'st_refname', 'st_spectype',
       'st_teff', 'st_tefferr1', 'st_tefferr2', 'st_tefflim', 'st_rad',
       'st_raderr1', 'st_raderr2', 'st_radlim', 'st_mas

In [74]:
# Columns carried forward into modelling. Each measurement is listed on one
# line together with its companion `err1` / `err2` / `lim` columns where the
# table provides them. The order here is the column order of both subsets.
selected_columns = [
    'disposition', 'pl_name', 'hostname',
    'sy_snum', 'sy_pnum', 'discoverymethod', 'pl_controv_flag',
    'pl_orbper', 'pl_orbpererr1', 'pl_orbpererr2', 'pl_orbperlim',
    'pl_orbsmax', 'pl_orbsmaxerr1', 'pl_orbsmaxerr2', 'pl_orbsmaxlim',
    'pl_rade', 'pl_radeerr1', 'pl_radeerr2', 'pl_radelim',
    'pl_radj', 'pl_radjerr1', 'pl_radjerr2', 'pl_radjlim',
    'pl_bmasse', 'pl_bmasseerr1', 'pl_bmasseerr2', 'pl_bmasselim',
    'pl_bmassj', 'pl_bmassjerr1', 'pl_bmassjerr2', 'pl_bmassjlim', 'pl_bmassprov',
    'pl_orbeccen', 'pl_orbeccenerr1', 'pl_orbeccenerr2', 'pl_orbeccenlim',
    'pl_insol', 'pl_insolerr1', 'pl_insolerr2', 'pl_insollim',
    'pl_eqt', 'pl_eqterr1', 'pl_eqterr2', 'pl_eqtlim',
    'ttv_flag', 'st_spectype',
    'st_teff', 'st_tefferr1', 'st_tefferr2', 'st_tefflim',
    'st_rad', 'st_raderr1', 'st_raderr2', 'st_radlim',
    'st_mass', 'st_masserr1', 'st_masserr2', 'st_masslim',
    'st_met', 'st_meterr1', 'st_meterr2', 'st_metlim', 'st_metratio',
    'st_logg', 'st_loggerr1', 'st_loggerr2', 'st_logglim',
    'rastr', 'ra', 'decstr', 'dec',
    'sy_dist', 'sy_disterr1', 'sy_disterr2',
    'sy_vmag', 'sy_vmagerr1', 'sy_vmagerr2',
    'sy_kmag', 'sy_kmagerr1', 'sy_kmagerr2',
    'sy_gaiamag', 'sy_gaiamagerr1', 'sy_gaiamagerr2',
]

# Apply the same projection to both subsets so their columns line up.
selected_unlabeled = test[selected_columns]
selected_labeled = train[selected_columns]

In [75]:
import pandas as pd

# Split the selected columns by dtype: numeric columns become the model
# features; everything else (including the string-valued `disposition` label)
# is set aside.

# 'number' matches every numeric dtype, not only float64/int64, so a column
# parsed at a different width is not silently dropped from the feature set.
numeric_columns = selected_labeled.select_dtypes(include='number').columns.tolist()

# All remaining columns, in their original order. No encoding is applied here.
non_numeric_columns = [col for col in selected_labeled.columns if col not in numeric_columns]


# Feature matrix, leftover non-numeric frame, and target for the labeled rows.
# NOTE(review): missing values are replaced with 0, which is indistinguishable
# from a genuine zero measurement - confirm this imputation is intended.
x_labeled = selected_labeled[numeric_columns].fillna(0)
non_numeric_df = selected_labeled[non_numeric_columns]
y_labeled = selected_labeled['disposition']

# Same columns and same imputation for the unlabeled rows, so this frame has
# exactly the feature layout of `x_labeled`.
x_unlabeled = selected_unlabeled[numeric_columns].fillna(0)

In [76]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# One seed shared by the split and the classifier so a fresh kernel run
# reproduces the same partition and the same fitted model.
RANDOM_STATE = 42

# Hold out 20% of the labeled rows for evaluation. Stratifying on the target
# keeps the class proportions the same in the training and held-out parts.
x_train, x_test, y_train, y_test = train_test_split(
    x_labeled,
    y_labeled,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_labeled,
)

# Initialize the Gradient Boosting Classifier with a fixed seed; without it
# the fitted trees are not guaranteed to be identical between runs.
gradient_boosting = GradientBoostingClassifier(random_state=RANDOM_STATE)

# Fit on the training part only.
gradient_boosting.fit(x_train, y_train)

# Predict on the held-out part.
predictions = gradient_boosting.predict(x_test)

# Fraction of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")
# NOTE(review): the recorded run printed 1.0. A perfect score may mean some
# feature encodes the label (directly, or via which values were missing and
# zero-filled) - verify before trusting predictions on unlabeled rows.


Accuracy: 1.0


In [77]:
# Label every unlabeled row with the fitted classifier.
predictions_unlabeled = gradient_boosting.predict(x_unlabeled)

# Work on a copy of `selected_unlabeled` (all selected columns, not only the
# numeric features) so that frame itself keeps its original columns.
selected_unlabeled_with_predictions = selected_unlabeled.copy()

# Put the predicted label in the first column so it is the first field of
# every exported row.
selected_unlabeled_with_predictions.insert(
    loc=0,
    column='prediction',
    value=predictions_unlabeled,
)

# Write the annotated rows to disk without the DataFrame index.
selected_unlabeled_with_predictions.to_csv('predictions.csv', index=False)
