In [1]:
from sklearn import tree
import pandas as pd
import os
import numpy as np


In [2]:
df = pd.read_csv("processed.csv").drop('Unnamed: 0',axis=1).dropna(how='any')
df.columns

Index(['kepoi_name', 'koi_disposition', 'koi_pdisposition', 'rowid', 'kepid',
       'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_score', 'koi_period_err1', 'koi_period_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_prad',
       'koi_insol', 'koi_model_snr'],
      dtype='object')

In [3]:
target = df['koi_disposition']
print(target)

0            CONFIRMED
1            CONFIRMED
2       FALSE POSITIVE
3       FALSE POSITIVE
4            CONFIRMED
             ...      
9559    FALSE POSITIVE
9560    FALSE POSITIVE
9561         CANDIDATE
9562    FALSE POSITIVE
9563    FALSE POSITIVE
Name: koi_disposition, Length: 7904, dtype: object


In [4]:
features = df.drop(['kepoi_name','koi_disposition','koi_pdisposition','rowid','kepid'],axis=1)
feature_names = features.columns
feature_names

Index(['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_score', 'koi_period_err1', 'koi_period_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_prad',
       'koi_insol', 'koi_model_snr'],
      dtype='object')

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=420)

In [6]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8395748987854251

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.882085020242915

In [8]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.3256226110078029, 'koi_score'),
 (0.08644426802675177, 'koi_model_snr'),
 (0.0684554990826965, 'koi_fpflag_ss'),
 (0.06607397846072767, 'koi_fpflag_co'),
 (0.06520280638318961, 'koi_prad'),
 (0.04501352938830773, 'koi_depth'),
 (0.04118730015644557, 'koi_duration_err2'),
 (0.040500663371761766, 'koi_duration_err1'),
 (0.03780036932190961, 'koi_duration'),
 (0.03650276743779996, 'koi_impact'),
 (0.034723746316213146, 'koi_fpflag_nt'),
 (0.031044933035740477, 'koi_insol'),
 (0.027043710142928817, 'koi_period_err1'),
 (0.026350308387461503, 'koi_period_err2'),
 (0.024351304447749737, 'koi_fpflag_ec'),
 (0.02262825372553169, 'koi_impact_err1'),
 (0.021053951306981726, 'koi_impact_err2')]

In [9]:
print('Parameters currently in use:\n')
print(rf.get_params())

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [11]:
# Create the parameter grid based on the results of random search 
feature_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
print(feature_grid)


{'bootstrap': [True], 'max_depth': [80, 90, 100, 110], 'n_estimators': [100, 200, 300, 1000]}


In [13]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(rf, feature_grid, verbose=3, n_jobs=-1)

In [14]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  6.2min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=200, n_jobs=None,
                                              oob_score=False,
                                              ra

In [16]:
print(grid.best_params_)
print(grid.best_score_)
predictions = grid.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

{'bootstrap': True, 'max_depth': 110, 'n_estimators': 300}
0.8891714161703703
                precision    recall  f1-score   support

     CANDIDATE       0.77      0.69      0.73       438
     CONFIRMED       0.79      0.83      0.81       576
FALSE POSITIVE       0.99      1.00      0.99       962

      accuracy                           0.88      1976
     macro avg       0.85      0.84      0.84      1976
  weighted avg       0.88      0.88      0.88      1976



In [18]:
import joblib
joblib.dump(rf, 'model_RandomForest.sav')

['model_RandomForest.sav']