## Modeling

This notebook begins the modeling process for our data set. I will work through several different machine learning algorithms and choose which one is the best based on several metrics including precision and recall.

In [162]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keplerutils

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from imblearn.over_sampling import SMOTE

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
df = pd.read_csv('objects-of-interest.csv', index_col = 0)

In [3]:
df = df.drop(labels=['rowid', 'kepoi_name', 'pdisposition', 'tce_delivname', 'kepid'], axis=1)

In [4]:
df = df.drop(labels=['tce_plnt_num'], axis=1)

In [5]:
df.head()

Unnamed: 0,disposition,period,time0bk,impact,duration,depth,prad,teq,insol,model_snr,steff,slogg,srad,ra,dec,kepmag
0,CONFIRMED,9.488036,170.53875,0.146,2.9575,615.8,2.26,793.0,93.59,35.8,5455.0,4.467,0.927,291.93423,48.141651,15.347
1,CONFIRMED,54.418383,162.51384,0.586,4.507,874.8,2.83,443.0,9.11,25.8,5455.0,4.467,0.927,291.93423,48.141651,15.347
2,FALSE POSITIVE,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638.0,39.3,76.3,5853.0,4.544,0.868,297.00482,48.134129,15.436
3,FALSE POSITIVE,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,891.96,505.6,5805.0,4.564,0.791,285.53461,48.28521,15.597
4,CONFIRMED,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406.0,926.16,40.9,6031.0,4.438,1.046,288.75488,48.2262,15.509


## K Nearest Neighbors

In [165]:
def knnCV(df):
    '''Splits data in to test and training data, 10-fold cross-validates on training data,
       calculates scoring metric, and returns the mean of that scoring metric
       ----Parameters----
       df: Pandas dataframe with response in 0th column, features in rest of columns
       ----Returns----
       np.mean(scores): the mean of the scores calculated by cross_val_score
    ''' 
    X_train_res, X_test, y_train_res, y_test = keplerutils.split_and_upsample(df)
    
    pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())
    
    tuning_params = {'kneighborsclassifier__n_neighbors': [i for i in range(2, 20)],
                     'kneighborsclassifier__weights': ['distance', 'uniform']}
    
    g = RandomizedSearchCV(pipe, tuning_params, scoring='recall', cv=5)
    
    g.fit(X_train_res, y_train_res)
        
    y_preds = g.predict(X_test)
    
    
    print('Accuracy:', accuracy_score(y_test, y_preds))
        
    print('Best parameters: ', g.best_params_)
    
    print('Mean grid scores: ', g.cv_results_['mean_test_score'])
    
    print('Recall score of predictions:', recall_score(y_preds, y_test))

In [166]:
knnCV(df)

Accuracy: 0.801762114537445
Best parameters:  {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__n_neighbors': 8}
Mean grid scores:  [0.95615142 0.93028391 0.9555205  0.93564669 0.93217666 0.95457413
 0.88864353 0.93659306 0.95362776 0.93943218]
Recall score of predictions: 0.6497890295358649


## Logistic Regression

In [131]:
def logisticCV(df):
    '''
        Performs logistic regression with 5-fold cross validation on training data and returns 
        f1 macro score.
       ----Parameters----
       df: Pandas dataframe with response in 0th column, features in rest of columns
       ----Returns----
       np.mean(scores): the mean of the scores calculated by GridSearchCV
    '''
    X_train_res, X_test, y_train_res, y_test = split_and_upsample(df)
    
    pipe = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear', max_iter=1000))
        
    tuning_params = {'logisticregression__C': [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7]}
    
    g = GridSearchCV(pipe, tuning_params, scoring='f1_macro', cv=5)
    
    g.fit(X_train_res, y_train_res)
    
    y_preds = g.predict(X_test)
    
    
    
    print('Best parameters: ', g.best_params_)
    
    print('Mean grid scores: ', g.cv_results_['mean_test_score'])
    
    y_test = list(map(keplerutils.encode_response, y_test))
    y_preds = list(map(keplerutils.encode_response, y_preds))
    
    print('Accuracy score:', accuracy_score(y_test, y_preds))
    
    print('F1 score of predictions:', f1_score(y_test, y_preds))

In [132]:
logisticCV(df)

Best parameters:  {'logisticregression__C': 10000.0}
Mean grid scores:  [0.81229695 0.83325926 0.84806775 0.85236257 0.85431071 0.85746323
 0.85789924 0.85740406 0.85740417 0.85757661]
Accuracy score: 0.8360254527655409
F1 score of predictions: 0.7848426461143224


## Random Forest

In [157]:
def RFCCV(df):
    '''
        Performs Random Forest Classificiation with 5 fold cross validation on training data and
        returns f1 macro score.
        ----Parameters----
       df: Pandas dataframe with response in 0th column, features in rest of columns
       ----Returns----
       np.mean(scores): the mean of the scores calculated by GridSearchCV
    '''
    X_train_res, X_test, y_train_res, y_test = keplerutils.split_and_upsample(df)
                
    pipe = make_pipeline(StandardScaler(), RandomForestClassifier())
        
    tuning_params = {'randomforestclassifier__max_depth': [50, 100, 150, 200], 
                     'randomforestclassifier__n_estimators': [50, 100, 150, 200]}
    
    g = RandomizedSearchCV(pipe, tuning_params, scoring='f1_macro', cv=5)
    
    g.fit(X_train_res, y_train_res)
    
    y_preds = g.predict(X_test)
    
    
    
    print('Best parameters: ', g.best_params_)
    
    print('Mean grid scores: ', g.cv_results_['mean_test_score'])
    
    print('Accuracy score:', accuracy_score(y_test, y_preds))
    
    print('F1 score of predictions:', f1_score(y_test, y_preds))

In [159]:
RFCCV(df)

Best parameters:  {'randomforestclassifier__n_estimators': 100, 'randomforestclassifier__max_depth': 100}
Mean grid scores:  [0.93924386 0.94098129 0.9393992  0.9403477  0.94209396 0.94145864
 0.94098197 0.9406682  0.9411421  0.94161415]
Accuracy score: 0.922173274596182
F1 score of predictions: 0.8853640951694305


## Naive Bayes