In [1]:
# Update sklearn to prevent version mismatches
# !pip3 install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip3 install joblib

In [3]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [4]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [6]:
# Set features. This will also be used as your x values.
# selected_features = df[['names', 'of', 'selected', 'features', 'here']]

In [7]:
target = df["koi_disposition"]

data = df.drop("koi_disposition", axis=1)
feature_names = data.columns

data.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Create a Train Test Split

Use `koi_disposition` for the y values

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [9]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model - Random Forest



In [10]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [11]:
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
# Train Model 

model.fit(X_train_scaled, y_train)

print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.996185390043868
Testing Data Score: 0.8884439359267735


In [13]:
# sort the features by their importance

sorted(zip(model.feature_importances_, feature_names), reverse=True)

[(0.10837766444148203, 'koi_fpflag_nt'),
 (0.10130725567388273, 'koi_fpflag_co'),
 (0.09905803092622907, 'koi_fpflag_ss'),
 (0.07977042326487702, 'koi_prad_err1'),
 (0.05369798205826469, 'koi_model_snr'),
 (0.049774553938080834, 'koi_duration_err2'),
 (0.03296127375094372, 'koi_fpflag_ec'),
 (0.027752110300453964, 'koi_steff_err1'),
 (0.026078066099155483, 'koi_time0bk_err1'),
 (0.02425387076767618, 'koi_duration'),
 (0.022999346660941382, 'koi_duration_err1'),
 (0.022815125593303974, 'koi_teq'),
 (0.02169527677906597, 'koi_prad'),
 (0.021478330248396786, 'koi_period'),
 (0.019831424251064423, 'koi_insol_err2'),
 (0.019150109261641377, 'koi_steff_err2'),
 (0.019147184211576, 'koi_period_err1'),
 (0.018060712340466193, 'koi_period_err2'),
 (0.016197342500867386, 'koi_time0bk_err2'),
 (0.015163888590862835, 'koi_srad_err1'),
 (0.014589654842132826, 'koi_impact'),
 (0.014143107641983546, 'koi_depth'),
 (0.013146780135805326, 'ra'),
 (0.012271175397576467, 'koi_insol'),
 (0.011739581249361

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [14]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust

from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [250,300,350],
              'max_depth': [175, 190, 200]}

grid = GridSearchCV(model, param_grid, verbose=3)

In [15]:
# Train the model with GridSearch

grid.fit(X_train_scaled, y_train)


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] max_depth=175, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..... max_depth=175, n_estimators=250, score=0.906, total=   3.0s
[CV] max_depth=175, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s remaining:    0.0s


[CV] ..... max_depth=175, n_estimators=250, score=0.889, total=   2.9s
[CV] max_depth=175, n_estimators=250 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.9s remaining:    0.0s


[CV] ..... max_depth=175, n_estimators=250, score=0.883, total=   3.0s
[CV] max_depth=175, n_estimators=300 .................................
[CV] ..... max_depth=175, n_estimators=300, score=0.906, total=   3.6s
[CV] max_depth=175, n_estimators=300 .................................
[CV] ..... max_depth=175, n_estimators=300, score=0.890, total=   3.5s
[CV] max_depth=175, n_estimators=300 .................................
[CV] ..... max_depth=175, n_estimators=300, score=0.882, total=   3.8s
[CV] max_depth=175, n_estimators=350 .................................
[CV] ..... max_depth=175, n_estimators=350, score=0.902, total=   4.9s
[CV] max_depth=175, n_estimators=350 .................................
[CV] ..... max_depth=175, n_estimators=350, score=0.888, total=   4.9s
[CV] max_depth=175, n_estimators=350 .................................
[CV] ..... max_depth=175, n_estimators=350, score=0.883, total=   4.5s
[CV] max_depth=190, n_estimators=250 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  1.7min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [16]:
print(f"Best_Grid_Params: {grid.best_params_}")
print(f"Best_Grid_Score: {grid.best_score_}")

Best_Grid_Params: {'max_depth': 175, 'n_estimators': 300}
Best_Grid_Score: 0.8926187297348847


In [17]:
print(f"Training Data Grid Score: {grid.score(X_train_scaled, y_train)}")
print(f"Testing Data Grid Score: {grid.score(X_test_scaled, y_test)}")

Training Data Grid Score: 1.0
Testing Data Grid Score: 0.898741418764302


In [19]:
# Update Model with New Params

model_new = RandomForestClassifier(n_estimators=350, max_depth=190)
model_new.fit(X_train_scaled, y_train)

print(f"New Training Data Score: {model_new.score(X_train_scaled, y_train)}")
print(f"New Testing Data Score: {model_new.score(X_test_scaled, y_test)}")

New Training Data Score: 1.0
New Testing Data Score: 0.9016018306636155


In [20]:
# Make Prediction and Save Variable for Classification Report

predictions = model_new.predict(X_test_scaled)

In [21]:
# Print Classification Report with Predictions

from sklearn.metrics import classification_report

print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.83      0.76      0.80       411
     CONFIRMED       0.84      0.85      0.85       484
FALSE POSITIVE       0.97      1.00      0.98       853

      accuracy                           0.90      1748
     macro avg       0.88      0.87      0.87      1748
  weighted avg       0.90      0.90      0.90      1748



# Save the Model

In [22]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'model_1_RandomForest.sav'
joblib.dump(model, filename)

['model_1_RandomForest.sav']