In [None]:
from sklearn import tree
import pandas as pd
import os
import numpy as np


In [4]:
df = pd.read_csv("processed.csv").drop('Unnamed: 0',axis=1).dropna(how='any')
df.columns

Index(['kepoi_name', 'koi_disposition', 'koi_pdisposition', 'rowid', 'kepid',
       'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_score', 'koi_period_err1', 'koi_period_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_prad',
       'koi_insol', 'koi_model_snr'],
      dtype='object')

In [5]:
target = df['koi_disposition']
print(target)

0            CONFIRMED
1            CONFIRMED
2       FALSE POSITIVE
3       FALSE POSITIVE
4            CONFIRMED
             ...      
9559    FALSE POSITIVE
9560    FALSE POSITIVE
9561         CANDIDATE
9562    FALSE POSITIVE
9563    FALSE POSITIVE
Name: koi_disposition, Length: 7904, dtype: object


In [6]:
features = df.drop(['kepoi_name','koi_disposition','koi_pdisposition','rowid','kepid'],axis=1)
feature_names = features.columns
feature_names

Index(['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_score', 'koi_period_err1', 'koi_period_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_prad',
       'koi_insol', 'koi_model_snr'],
      dtype='object')

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=420)

In [8]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8395748987854251

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.8810728744939271

In [10]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.3267905765486819, 'koi_score'),
 (0.08139544698083025, 'koi_model_snr'),
 (0.0705435715294289, 'koi_fpflag_co'),
 (0.06760628715080565, 'koi_prad'),
 (0.06277681944740927, 'koi_fpflag_ss'),
 (0.047450276561673506, 'koi_depth'),
 (0.045210119788588236, 'koi_duration_err2'),
 (0.04258634223139488, 'koi_duration_err1'),
 (0.03817510921072573, 'koi_duration'),
 (0.03721403735435645, 'koi_impact'),
 (0.03125403665706858, 'koi_fpflag_nt'),
 (0.029639400308047667, 'koi_insol'),
 (0.027721195656447598, 'koi_period_err2'),
 (0.02691390721472833, 'koi_period_err1'),
 (0.022973061367746878, 'koi_impact_err1'),
 (0.02124392323836542, 'koi_fpflag_ec'),
 (0.020505888753700748, 'koi_impact_err2')]

In [11]:
print('Parameters currently in use:\n')
print(rf.get_params())

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [12]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
feature_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(feature_grid)


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [13]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(rf, feature_grid, verbose=1, n_jobs=-1)

In [None]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 4320 candidates, totalling 21600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
