In [1]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade
import sklearn
print("sklearn version:", sklearn.__version__)   #sklearn version: 0.23.2

sklearn version: 0.24.1


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib
import numpy as np


# Read the CSV and Perform Basic Data Cleaning

In [3]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()  # empty values, 
df.describe()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
count,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,...,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0,6991.0
mean,0.157059,0.244743,0.202975,0.125018,56.191248,0.001851122,-0.001851122,164.48882,0.00934,-0.00934,...,-161.20698,4.305049,0.121091,-0.14048,1.740749,0.35271,-0.388568,292.082406,43.812143,14.271508
std,0.363882,0.429966,0.402243,0.330763,117.570962,0.007184503,0.007184503,67.020475,0.021989,0.021989,...,71.448481,0.439238,0.132048,0.08199,5.903415,0.839017,1.907797,4.762908,3.606167,1.350802
min,0.0,0.0,0.0,0.0,0.25982,1.1e-08,-0.1568,120.515914,9e-06,-0.569,...,-1733.0,0.047,0.0,-1.007,0.109,0.0,-103.825,279.85608,36.577381,6.966
25%,0.0,0.0,0.0,0.0,2.620126,5.005e-06,-0.0002401,132.683917,0.001145,-0.01,...,-197.0,4.209,0.044,-0.195,0.829,0.128,-0.252,288.70473,40.79776,13.455
50%,0.0,0.0,0.0,0.0,8.947426,3.3e-05,-3.3e-05,136.73923,0.00399,-0.00399,...,-159.0,4.436,0.07,-0.127,0.999,0.248,-0.111,292.31476,43.679661,14.534
75%,0.0,0.0,0.0,0.0,34.282605,0.0002401,-5.005e-06,169.937005,0.01,-0.001145,...,-112.0,4.543,0.149,-0.088,1.357,0.357,-0.069,295.88855,46.693659,15.322
max,1.0,1.0,1.0,1.0,1071.232624,0.1568,-1.1e-08,1472.522306,0.569,-9e-06,...,0.0,5.364,1.472,0.0,180.013,25.956,0.0,301.72076,52.33601,19.065


# Select features (columns)
Decision Trees were used to select the features for X

In [4]:
# X are for x axis and key features
X = df.drop(['koi_disposition','koi_srad','koi_slogg', 'koi_insol_err2','koi_slogg_err1'
            ,'koi_prad_err2','koi_tce_plnt_num','koi_teq','koi_insol','koi_impact_err2'
            ,'koi_depth','koi_srad_err2','koi_kepmag','koi_period_err1','koi_duration_err1'
            , 'koi_time0bk_err1', 'koi_period_err2','koi_steff', 'koi_prad_err1', 'koi_impact_err1'
            , 'koi_depth_err2', 'koi_slogg_err2', 'koi_depth_err1', 'koi_duration_err2', 'koi_duration'
             ], axis = 1)   # 'koi_steff_err1', 'koi_time0bk_err2', 'koi_period', 'koi_srad_err1','dec'
X = X.values.reshape(X.shape[0],X.shape[1])

# Target values
y = df['koi_disposition']

# Label encode y set to 0, 1 or 2
label_encoder = LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)   # label_encoder.classes_ # the variables

## Use the below code to convert the target from 3 variables to 2 variables by combining Confirmed and Candidate
# y = np.where(y== 1, 0, y)
# y = np.where(y== 2,1, y)

print(y)

[1 2 2 ... 0 2 2]


# Create a Train Test Split

Use `koi_disposition` for the y values

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [6]:
# Scale your data  May or may not affect the accuracy of the model
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model Logistic Regression


In [7]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)   #(max_iter=100)
model.fit(X_train_scaled, y_train)

print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8313942399389662
Testing Data Score: 0.8220823798627003


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters
Source: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
sklearn Logisics: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [8]:
# Create the GridSearchCV model   
from sklearn.model_selection import GridSearchCV  # algorythm based on light look for best possible accuracy
param_grid = {"C": [0.01, 0.1, 1, 10, 100, 150],    # adjustments, Note that regularization is applied by default.
             'penalty': ['l1', 'l2', 'elasticnet', 'none'],
             'solver':['liblinear'],
             'max_iter':[1200]}
        
model = LogisticRegression(max_iter=1000)  # solver='liblinear'
print(model.get_params())
grid = GridSearchCV(model, param_grid, verbose = 3)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [9]:
# Train the model with GridSearch fit the model using the grid search extimator
grid.fit(X_train_scaled, y_train)  # replaced grid with model2

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END C=0.01, max_iter=1200, penalty=l1, solver=liblinear; total time=   0.0s
[CV 2/5] END C=0.01, max_iter=1200, penalty=l1, solver=liblinear; total time=   0.0s
[CV 3/5] END C=0.01, max_iter=1200, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END C=0.01, max_iter=1200, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END C=0.01, max_iter=1200, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END C=0.01, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END C=0.01, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 3/5] END C=0.01, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 4/5] END C=0.01, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 5/5] END C=0.01, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 1/5] END C=0.01, max_iter=1200, penalty=elasticnet, solver=liblinear; total time=   

Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 451, in _check_solver
    " got solver={}.".format(solver))
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit


[CV 3/5] END C=0.1, max_iter=1200, penalty=l1, solver=liblinear; total time=   0.0s
[CV 4/5] END C=0.1, max_iter=1200, penalty=l1, solver=liblinear; total time=   0.0s
[CV 5/5] END C=0.1, max_iter=1200, penalty=l1, solver=liblinear; total time=   0.0s
[CV 1/5] END C=0.1, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END C=0.1, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 3/5] END C=0.1, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 4/5] END C=0.1, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 5/5] END C=0.1, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 1/5] END C=0.1, max_iter=1200, penalty=elasticnet, solver=liblinear; total time=   0.0s


Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 451, in _check_solver
    " got solver={}.".format(solver))
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit


[CV 2/5] END C=0.1, max_iter=1200, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV 3/5] END C=0.1, max_iter=1200, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV 4/5] END C=0.1, max_iter=1200, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV 5/5] END C=0.1, max_iter=1200, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV 1/5] END C=0.1, max_iter=1200, penalty=none, solver=liblinear; total time=   0.0s
[CV 2/5] END C=0.1, max_iter=1200, penalty=none, solver=liblinear; total time=   0.0s
[CV 3/5] END C=0.1, max_iter=1200, penalty=none, solver=liblinear; total time=   0.0s
[CV 4/5] END C=0.1, max_iter=1200, penalty=none, solver=liblinear; total time=   0.0s
[CV 5/5] END C=0.1, max_iter=1200, penalty=none, solver=liblinear; total time=   0.0s
[CV 1/5] END C=1, max_iter=1200, penalty=l1, solver=liblinear; total time=   2.7s
[CV 2/5] END C=1, max_iter=1200, penalty=l1, solver=liblinear; total time=   2.5s
[CV 3/5] END C=1, max_iter=1200, penal

Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 451, in _check_solver
    " got solver={}.".format(solver))
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit


[CV 1/5] END C=10, max_iter=1200, penalty=l1, solver=liblinear; total time=  10.0s
[CV 2/5] END C=10, max_iter=1200, penalty=l1, solver=liblinear; total time=  10.3s
[CV 3/5] END C=10, max_iter=1200, penalty=l1, solver=liblinear; total time=  10.5s
[CV 4/5] END C=10, max_iter=1200, penalty=l1, solver=liblinear; total time=  12.0s
[CV 5/5] END C=10, max_iter=1200, penalty=l1, solver=liblinear; total time=  10.8s
[CV 1/5] END C=10, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END C=10, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 3/5] END C=10, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 4/5] END C=10, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 5/5] END C=10, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 1/5] END C=10, max_iter=1200, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV 2/5] END C=10, max_iter=1200, penalty=elasticnet, solver=liblinear; total t

Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 451, in _check_solver
    " got solver={}.".format(solver))
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit


[CV 1/5] END C=100, max_iter=1200, penalty=l1, solver=liblinear; total time=  12.9s
[CV 2/5] END C=100, max_iter=1200, penalty=l1, solver=liblinear; total time=  13.4s
[CV 3/5] END C=100, max_iter=1200, penalty=l1, solver=liblinear; total time=  13.3s
[CV 4/5] END C=100, max_iter=1200, penalty=l1, solver=liblinear; total time=  16.0s
[CV 5/5] END C=100, max_iter=1200, penalty=l1, solver=liblinear; total time=  13.1s
[CV 1/5] END C=100, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END C=100, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 3/5] END C=100, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 4/5] END C=100, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 5/5] END C=100, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 1/5] END C=100, max_iter=1200, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV 2/5] END C=100, max_iter=1200, penalty=elasticnet, solver=liblin

Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 451, in _check_solver
    " got solver={}.".format(solver))
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit


[CV 1/5] END C=150, max_iter=1200, penalty=l1, solver=liblinear; total time=  13.5s
[CV 2/5] END C=150, max_iter=1200, penalty=l1, solver=liblinear; total time=  13.9s
[CV 3/5] END C=150, max_iter=1200, penalty=l1, solver=liblinear; total time=  13.9s
[CV 4/5] END C=150, max_iter=1200, penalty=l1, solver=liblinear; total time=  16.2s
[CV 5/5] END C=150, max_iter=1200, penalty=l1, solver=liblinear; total time=  13.8s
[CV 1/5] END C=150, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 2/5] END C=150, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 3/5] END C=150, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 4/5] END C=150, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 5/5] END C=150, max_iter=1200, penalty=l2, solver=liblinear; total time=   0.0s
[CV 1/5] END C=150, max_iter=1200, penalty=elasticnet, solver=liblinear; total time=   0.0s
[CV 2/5] END C=150, max_iter=1200, penalty=elasticnet, solver=liblin

Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 451, in _check_solver
    " got solver={}.".format(solver))
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

Traceback (most recent call last):
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\paule\anaconda3\envs\PythonData\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit


GridSearchCV(estimator=LogisticRegression(max_iter=1000),
             param_grid={'C': [0.01, 0.1, 1, 10, 100, 150], 'max_iter': [1200],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                         'solver': ['liblinear']},
             verbose=3)

In [10]:
#Make predictions with the hpyertuned model
predictions = grid.predict(X_test_scaled)

#List the best score
print(grid.best_params_) #List the best parameters for this dataset
print(grid.best_score_)

{'C': 100, 'max_iter': 1200, 'penalty': 'l1', 'solver': 'liblinear'}
0.8582825155182325


In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions)) #, target_names=['']))

              precision    recall  f1-score   support

           0       0.72      0.62      0.66       411
           1       0.71      0.78      0.75       484
           2       0.98      1.00      0.99       853

    accuracy                           0.85      1748
   macro avg       0.81      0.80      0.80      1748
weighted avg       0.85      0.85      0.85      1748



# Save the Model

In [12]:
# import joblib
# filename = 'planet_logistic.sav'
# joblib.dump(your_model, filename)