# Machine Learning project
- Validate exoplanets Kepler disposition with machine learning methods

In [402]:
%config Completer.use_jedi = False  # enable code auto-completion
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  #data visualization library
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix  # evaluation metrics

from sklearn.datasets import fetch_openml 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## Dataset


In [403]:
df = pd.read_csv('testData.csv')
df.columns =['KepID',"KOIName","KeplerName","ExoplanetArchiveDisposition",
"ExoplanetDispositionKepler","DispositionScore","NotTransit-LikeFalsePositiveFlag",
"koi_fpflag_ss","CentroidOffsetFalsePositiveFlag","EphemerisMatchIndicatesContaminationFalsePositiveFlag",
"OrbitalPeriod.[Days]","OrbitalPeriodErr1.[Days]","OrbitalPeriodErr2.[Days]",
"TransitEpoch-bk","TransitEpoch-bkErr1","TransitEpoch-bkErr2",
"ImpactParameter","ImpactParameterErr1","ImpactParameterErr2",
"TransitDuration.[Hours]","TransitDurationErr1.[Hours]","TransitDurationErr2.[Hours]",
"TransitDepth.[ppm]","TransitDepthErr1.[ppm]","TransitDepthErr2.[ppm]",
"PlanetaryRadius","PlanetaryRadiusErr1","PlanetaryRadiusErr2",
"EquilibriumTemperature.[K]","EquilibriumTemperatureErr1.[K]","EquilibriumTemperatureErr2.[K]",
"InsolationFlux","InsolationFluxErr1","InsolationFlux2",
"TransitSignalToNoise",
"TCEPlanetNumber","TCEDeliveryName",
"StellarEffectiveTemperature.[K]","StellarEffectiveTemperatureErr1.[K]","StellarEffectiveTemperatureErr2.[K]",
"StellarSurfaceGravity","StellarSurfaceGravityErr1","StellarSurfaceGravityErr2",
"StellarRadius","StellarRadiusErr1","StellarRadiusErr2",
"RA.[deg]","Dec.[deg]","KeplerMagnitude.[mag]"]


df.head()
#df['EquilibriumTemperatureErr2.[K]']

Unnamed: 0,KepID,KOIName,KeplerName,ExoplanetArchiveDisposition,ExoplanetDispositionKepler,DispositionScore,NotTransit-LikeFalsePositiveFlag,koi_fpflag_ss,CentroidOffsetFalsePositiveFlag,EphemerisMatchIndicatesContaminationFalsePositiveFlag,...,StellarEffectiveTemperatureErr2.[K],StellarSurfaceGravity,StellarSurfaceGravityErr1,StellarSurfaceGravityErr2,StellarRadius,StellarRadiusErr1,StellarRadiusErr2,RA.[deg],Dec.[deg],KeplerMagnitude.[mag]
0,11446443,K00001.01,Kepler-1 b,CONFIRMED,CANDIDATE,0.811,0,0,0,0,...,-78.0,4.457,0.024,-0.024,0.964,0.038,-0.038,286.80847,49.316399,11.338
1,10666592,K00002.01,Kepler-2 b,CONFIRMED,CANDIDATE,1.0,0,1,0,0,...,-89.0,4.019,0.033,-0.027,1.952,0.099,-0.11,292.24728,47.969521,10.463
2,10748390,K00003.01,Kepler-3 b,CONFIRMED,CANDIDATE,0.913,0,0,0,0,...,-95.0,4.591,0.015,-0.036,0.763,0.028,-0.028,297.70935,48.080853,9.174
3,3861595,K00004.01,Kepler-1658 b,CONFIRMED,CANDIDATE,1.0,0,1,0,0,...,-114.0,3.657,0.27,-0.09,2.992,0.416,-0.971,294.35654,38.94738,11.432
4,8554498,K00005.01,,CANDIDATE,CANDIDATE,1.0,0,0,0,0,...,-65.0,4.012,0.03,-0.03,1.787,0.132,-0.076,289.73972,44.647419,11.665


# Modify dispositions
Modify KOI pdisposition and KOI disposition into binary form

In [404]:
df['ExoplanetCandidate'] = df['ExoplanetArchiveDisposition'].apply(lambda x: 1 if x == 'CANDIDATE' else 0)
df['ExoplanetConfirmed'] = df['ExoplanetDispositionKepler'].apply(lambda x: 2 if x == 'CONFIRMED' else 1 if x == 'CANDIDATE' else 0 )

In [405]:
df.drop(columns=['KeplerName','KOIName','EquilibriumTemperatureErr1.[K]', 'StellarEffectiveTemperatureErr2.[K]',
                 'KepID','ExoplanetArchiveDisposition','ExoplanetDispositionKepler',
                 'NotTransit-LikeFalsePositiveFlag','koi_fpflag_ss','CentroidOffsetFalsePositiveFlag',
                 'EphemerisMatchIndicatesContaminationFalsePositiveFlag','TCEDeliveryName', 'EquilibriumTemperatureErr2.[K]'], inplace=True)
df.head()

Unnamed: 0,DispositionScore,OrbitalPeriod.[Days],OrbitalPeriodErr1.[Days],OrbitalPeriodErr2.[Days],TransitEpoch-bk,TransitEpoch-bkErr1,TransitEpoch-bkErr2,ImpactParameter,ImpactParameterErr1,ImpactParameterErr2,...,StellarSurfaceGravityErr1,StellarSurfaceGravityErr2,StellarRadius,StellarRadiusErr1,StellarRadiusErr2,RA.[deg],Dec.[deg],KeplerMagnitude.[mag],ExoplanetCandidate,ExoplanetConfirmed
0,0.811,2.470613,2.7e-08,-2.7e-08,122.763305,9e-06,-9e-06,0.818,0.001,-0.001,...,0.024,-0.024,0.964,0.038,-0.038,286.80847,49.316399,11.338,0,1
1,1.0,2.204735,4.3e-08,-4.3e-08,121.358542,1.6e-05,-1.6e-05,0.224,0.159,-0.216,...,0.033,-0.027,1.952,0.099,-0.11,292.24728,47.969521,10.463,0,1
2,0.913,4.887803,4.66e-07,-4.66e-07,124.813081,7.5e-05,-7.5e-05,0.054,0.145,-0.054,...,0.015,-0.036,0.763,0.028,-0.028,297.70935,48.080853,9.174,0,1
3,1.0,3.849372,2.344e-06,-2.344e-06,157.526686,0.000488,-0.000488,0.915,0.02,-0.056,...,0.27,-0.09,2.992,0.416,-0.971,294.35654,38.94738,11.432,0,1
4,1.0,4.780328,8.52e-07,-8.52e-07,132.974086,0.000148,-0.000148,0.952,0.001,-0.002,...,0.03,-0.03,1.787,0.132,-0.076,289.73972,44.647419,11.665,1,1


In [406]:
df.isna().any()
df.shape


(8054, 38)

In [407]:
df_cleaned = df.dropna()

In [408]:
df_cleaned.shape

(7818, 38)

# Features and labels

In [409]:
# Create the feature matrix X and label vector y. 
y = df_cleaned['ExoplanetCandidate'].to_numpy()
X = df_cleaned.drop(columns=['ExoplanetCandidate', 'ExoplanetConfirmed']).to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=1000)

In [411]:
clf = LogisticRegression(C=100, max_iter=1000, class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)



0.484