## Modeling

This notebook begins the modeling process for our data set. I will work through several different machine learning algorithms and choose which one is the best based on several metrics including precision and recall.

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keplerutils

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from imblearn.over_sampling import SMOTE

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
df = pd.read_csv('objects-of-interest.csv', index_col = 0)

In [3]:
df = df.drop(labels=['rowid', 'kepoi_name', 'pdisposition', 'tce_delivname', 'kepid'], axis=1)

In [4]:
df = df.drop(labels=['tce_plnt_num'], axis=1)

In [5]:
df.head()

Unnamed: 0,disposition,period,time0bk,impact,duration,depth,prad,teq,insol,model_snr,steff,slogg,srad,ra,dec,kepmag
0,CONFIRMED,9.488036,170.53875,0.146,2.9575,615.8,2.26,793.0,93.59,35.8,5455.0,4.467,0.927,291.93423,48.141651,15.347
1,CONFIRMED,54.418383,162.51384,0.586,4.507,874.8,2.83,443.0,9.11,25.8,5455.0,4.467,0.927,291.93423,48.141651,15.347
2,FALSE POSITIVE,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638.0,39.3,76.3,5853.0,4.544,0.868,297.00482,48.134129,15.436
3,FALSE POSITIVE,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,891.96,505.6,5805.0,4.564,0.791,285.53461,48.28521,15.597
4,CONFIRMED,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406.0,926.16,40.9,6031.0,4.438,1.046,288.75488,48.2262,15.509


In [17]:
def split_and_upsample(df):
    '''
        Does a train test split on response and features, uses SMOTE to upsample imputed values
        from minority class to achieve 1:1 ratio of classes, returns the upsampled training response
        and features.
        ----Parameters----
        df: Pandas dataframe with response in 0th column, features in rest of columns
        ----Returns----
        X_train_res: upsampled imputed features
        y_train_res: upsampled imputed response
        X_test: unchanged test features
        y_test: unchanged test response
    '''
    
    y = df.iloc[:, 0]
    X = df.iloc[:, 1:]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    sm = SMOTE()
    
    X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())
    
    return X_train_res, X_test, y_train_res, y_test

In [94]:
def knnCV(df):
    '''Splits data in to test and training data, 10-fold cross-validates on training data,
       calculates scoring metric, and returns the mean of that scoring metric
       ----Parameters----
       df: Pandas dataframe with response in 0th column, features in rest of columns
       ----Returns----
       np.mean(scores): the mean of the scores calculated by cross_val_score
    ''' 
    X_train_res, X_test, y_train_res, y_test = split_and_upsample(df)
    
    y_train_res = list(map(encode_))
        
    pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())
    
    tuning_params = {'kneighborsclassifier__n_neighbors': [i for i in range(2, 20)],
                     'kneighborsclassifier__weights': ['distance', 'uniform']}
    
    g = GridSearchCV(pipe, tuning_params, scoring='f1', cv=5)
    
    g.fit(X_train_res, y_train_res)
        
    y_preds = g.predict(X_test)
    
    y_test = list(map(keplerutils.encode_response, y_test))
    y_preds = list(map(keplerutils.encode_response, y_preds))
        
    print('Best parameters: ', g.best_params_)
    
    print('Mean grid scores: ', g.cv_results_['mean_test_score'])
    
    print('F1 score of predictions:', f1_score(y_test, y_preds))

In [95]:
knnCV(df)

  if pos_label not in present_labels:


ValueError: pos_label=1 is not a valid label: array(['CONFIRMED', 'FALSE POSITIVE'], dtype='<U14')

In [None]:
def logisticCV(df):
    '''
        Performs logistic regression with 5-fold cross validation on training data and returns 
        f1 macro score.
       ----Parameters----
       df: Pandas dataframe with response in 0th column, features in rest of columns
       ----Returns----
       np.mean(scores): the mean of the scores calculated by GridSearchCV
    '''
    X_train_res, X_test, y_train_res, y_test = split_and_upsample(df)
    
    pipe = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))
        
    tuning_params = {'logisticregression__C': [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7]}
    
    g = GridSearchCV(pipe, tuning_params, scoring='f1_macro', cv=5)
    
    g.fit(X_train_res, y_train_res)
    
    y_preds = g.predict(X_test)
    
    print('Best parameters: ', g.best_params_)
    
    print('Mean grid scores: ', g.cv_results_['mean_test_score'])
    
    y_test = list(map(encode_response, y_test))
    y_preds = list(map(encode_response, y_preds))
    
    print('F1 score of predictions:', f1_score(y_test, y_preds))

In [84]:
logisticCV(df)

Best parameters:  {'logisticregression__C': 1000.0}
Mean grid scores:  [0.81403963 0.83781706 0.85073907 0.85677794 0.8586777  0.85913294
 0.85866169 0.8585007  0.85865571 0.85866169]
F1 score of predictions: 0.7936305732484077


In [None]:
def RFCCV(df):
    '''
        Performs Random Forest Classificiation with 5 fold cross validation on training data and
        returns f1 macro score.
        ----Parameters----
       df: Pandas dataframe with response in 0th column, features in rest of columns
       ----Returns----
       np.mean(scores): the mean of the scores calculated by GridSearchCV
    '''
    X_train_res, X_test, y_train_res, y_test = split_and_upsample(df)
    
    pipe = make_pipeline(StandardScaler(), RandomForestClassifier())
    
    print(pipe.get_params_().keys())
    
    tuning_params = {}

In [None]:
RFCCV(df)