In [5]:
import pandas as pd
import numpy as np
from sklearn import svm 
import matplotlib.pyplot as plt

from matplotlib import style
style.use("ggplot")

# Read the CSV and Perform Basic Data Cleaning

In [6]:
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Create a Train Test Split

Use `koi_disposition` for the y values

In [7]:
# Y VALUES
target = df["koi_disposition"]
target_names = ["CONFIRMED", "FALSE POSITIVE"]

In [8]:
# X VALUES
data = df.drop("koi_disposition", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                    train_size=80,
                                                    test_size=20,
                                                    random_state=42)

In [10]:
target.head()

0         CONFIRMED
1         CONFIRMED
2    FALSE POSITIVE
3    FALSE POSITIVE
4         CONFIRMED
Name: koi_disposition, dtype: object

In [11]:
y_target = target.copy()

data_binary_encoded = pd.get_dummies(y_target, columns=["koi_disposition"])
data_binary_encoded.head()

Unnamed: 0,CANDIDATE,CONFIRMED,FALSE POSITIVE
0,0,1,0
1,0,1,0
2,0,0,1
3,0,0,1
4,0,1,0


In [12]:
# I wanted to reshape and distribute these values so there was an equal amount of values but didn't figure it out
df["koi_disposition"].value_counts()

FALSE POSITIVE    4358
CONFIRMED         2272
CANDIDATE         2114
Name: koi_disposition, dtype: int64

# Pre-processing

Scale the data using the MinMaxScaler

Pre-processing
Scale the data using the MinMaxScaler

### Sample code using StandardScaler
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Sample Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

### Sample Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

### Sample code using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler()
print(scaler.fit(data))
MinMaxScaler(copy=True, feature_range=(0, 1))
print(scaler.data_max_)
[ 1. 18.]
print(scaler.transform(data))
[[0.   0.  ]
[0.25 0.25]
[0.5  0.5 ]
[1.   1.  ]]
print(scaler.transform([[2, 2]]))
[[1.5 0. ]]

In [13]:
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
scaled_Xtrain = scaler.fit_transform(X_train)
scaled_Xtest = scaler.fit_transform(X_test)

print(scaler)
print(scaled_Xtrain)

MinMaxScaler(copy=True, feature_range=(0, 1))
[[0.         1.         0.         ... 0.59724069 0.34694501 0.30067633]
 [0.         0.         0.         ... 0.65132413 0.76237121 0.93777778]
 [0.         1.         1.         ... 0.83697557 0.42877287 0.91246377]
 ...
 [0.         0.         0.         ... 0.12160206 0.68339223 0.81352657]
 [0.         0.         0.         ... 0.77916261 0.305052   0.78028986]
 [0.         0.         1.         ... 1.         0.58921787 0.77256039]]


In [44]:
#LabelEncoder()
#Encode classes for target data // not using thooo

#turning the target into a nurmerical value

le = preprocessing.LabelEncoder()
le.fit(target)

y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

print(le.classes_)
print(y_test_encoded)
print(y_train_encoded)

['CANDIDATE' 'CONFIRMED' 'FALSE POSITIVE']
[2 0 1 1 0 2 0 1 1 2 2 2 1 1 1 2 2 2 2 2]
[2 1 2 0 0 2 1 0 2 2 0 0 2 1 2 1 2 2 2 1 1 2 2 1 2 2 1 0 1 1 0 0 2 0 2 1 2
 2 0 0 2 2 2 2 0 0 0 1 2 1 1 1 0 2 2 0 1 2 2 2 0 2 2 2 2 2 0 1 1 2 2 1 1 2
 1 2 1 0 0 2]


# Train the Support Vector Machine

In [15]:
from sklearn.svm import SVC 
from sklearn.metrics import classification_report

In [16]:
model_SVC = SVC(kernel='linear')
model_SVC.fit(scaled_Xtrain, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [17]:
# SVM score 
print(f"Training Data Score: {model_SVC.score(scaled_Xtrain, y_train)}")
print(f"Testing Data Score: {model_SVC.score(scaled_Xtest, y_test)}")

Training Data Score: 0.8625
Testing Data Score: 0.8


# Logistic Regression Model

In [18]:
# classifier is our model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
classifier.fit(scaled_Xtrain, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
print(f"Training Data Score: {classifier.score(scaled_Xtrain, y_train)}")
print(f"Testing Data Score: {classifier.score(scaled_Xtest, y_test)}")

Training Data Score: 0.9
Testing Data Score: 0.8


### Analysis comments
The SVM and Logistic Regression both had the same score on the Testing Data but they both had the same score for the testing data. 

# Deep Learning

In [None]:
from keras.utils import to_categorical

# One-hot encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [52]:
# first, create a normal neural network with 2 inputs, 6 hidden nodes, and 2 outputs
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(units=6, activation='relu', input_dim=40))
model.add(Dense(units=3, activation='softmax'))

In [53]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 6)                 246       
_________________________________________________________________
dense_8 (Dense)              (None, 3)                 21        
Total params: 267
Trainable params: 267
Non-trainable params: 0
_________________________________________________________________


In [54]:
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [55]:
# Fit the model to the training data
model.fit(
    scaled_Xtrain,
    y_train_encoded,
    epochs=40,
    shuffle=True,
    verbose=2
)

Epoch 1/40
 - 2s - loss: 1.1920 - acc: 0.3000
Epoch 2/40
 - 0s - loss: 1.1738 - acc: 0.3125
Epoch 3/40
 - 0s - loss: 1.1601 - acc: 0.3000
Epoch 4/40
 - 0s - loss: 1.1472 - acc: 0.2875
Epoch 5/40
 - 0s - loss: 1.1381 - acc: 0.3000
Epoch 6/40
 - 0s - loss: 1.1305 - acc: 0.3000
Epoch 7/40
 - 0s - loss: 1.1239 - acc: 0.3000
Epoch 8/40
 - 0s - loss: 1.1165 - acc: 0.2875
Epoch 9/40
 - 0s - loss: 1.1115 - acc: 0.2750
Epoch 10/40
 - 0s - loss: 1.1067 - acc: 0.2500
Epoch 11/40
 - 0s - loss: 1.1018 - acc: 0.2375
Epoch 12/40
 - 0s - loss: 1.0979 - acc: 0.2750
Epoch 13/40
 - 0s - loss: 1.0943 - acc: 0.2875
Epoch 14/40
 - 0s - loss: 1.0914 - acc: 0.3000
Epoch 15/40
 - 0s - loss: 1.0883 - acc: 0.3125
Epoch 16/40
 - 0s - loss: 1.0859 - acc: 0.3000
Epoch 17/40
 - 0s - loss: 1.0833 - acc: 0.3750
Epoch 18/40
 - 0s - loss: 1.0808 - acc: 0.4125
Epoch 19/40
 - 0s - loss: 1.0784 - acc: 0.4750
Epoch 20/40
 - 0s - loss: 1.0756 - acc: 0.5125
Epoch 21/40
 - 0s - loss: 1.0734 - acc: 0.5375
Epoch 22/40
 - 0s - lo

<keras.callbacks.History at 0x1a33972470>

**I would love feedback on what this means lol**

# Hyperparameter Tuning 

Use `GridSearchCV` to tune the `C` and `gamma` parameters (¬‿¬)

In [96]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model_SVC, param_grid, verbose=3)

In [97]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.571, total=   6.9s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.9s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.519, total=  30.8s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   37.8s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.440, total=   0.8s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.571, total=  10.0s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.519, total=  35.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.440, total=   0.5s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.571, total=   8.9s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.519, total=  30.5s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.440, total=   0.5s
[CV] C=1, gamma=0.005 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 15.5min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [100]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 5, 'gamma': 0.0001}
0.525
