In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
smaller_df = df[['koi_disposition','koi_fpflag_nt','koi_fpflag_ss','koi_fpflag_co','koi_fpflag_ec']].copy()
smaller_df.head()


Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec
0,CONFIRMED,0,0,0,0
1,CONFIRMED,0,0,0,0
2,FALSE POSITIVE,0,1,0,0
3,FALSE POSITIVE,0,1,0,0
4,CONFIRMED,0,0,0,0


# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
X = smaller_df.drop("koi_disposition", axis=1)
y = smaller_df["koi_disposition"]
print(X.shape, y.shape)

(8744, 4) (8744,)


In [5]:
# Split the data using train_test_split
# YOUR CODE HERE
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [6]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec
5964,1,0,0,0
9410,1,0,0,0
4204,0,0,1,0
5933,0,0,0,0
6996,1,0,1,0


# Pre-processing

Scale the data using the MinMaxScaler

In [7]:
# activity 3-5
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Step 1: Label-encode data set
# label_encoder = LabelEncoder()
# label_encoder.fit(y_train)
# encoded_y_train = label_encoder.transform(y_train)
# encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
# y_train_categorical = to_categorical(encoded_y_train)
# y_test_categorical = to_categorical(encoded_y_test)

# Train the Support Vector Machine

In [8]:
# activity 2-8
# Support vector machine linear classifier
from sklearn.svm import SVC 
model2 = SVC(kernel='linear')
model2.fit(X_train_scaled, y_train)
predictions = model2.predict(X_test)

In [9]:
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 0.7514486123818237
Testing Data Score: 0.7502287282708143


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [16]:
# activity 2-9 / 2-10
# Create the GridSearchCV model
# Create the GridSearch estimator along with a parameter object containing the values to adjust
# Try adjusting `C` with values of 1, 5, and 10. Adjust `gamma` using .0001, 0.001, and 0.01
# YOUR CODE HERE
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 15, 20],
              'gamma': [0.0001, 0.001, 0.01, 0.1, 1]}
grid = GridSearchCV(model2, param_grid, verbose=3)

In [17]:
# Train the model with GridSearch

# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
# YOUR CODE HERE
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.751, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.749, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.754, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.751, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.749, total=   0.2s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.754, total=   0.2s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.751, total=   0.2s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.749, total=   0.2s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.754, total=   0.2s
[CV] C=1, gamma=0.1 ..................................................
[CV] .

[CV] .................. C=20, gamma=0.0001, score=0.749, total=   0.1s
[CV] C=20, gamma=0.0001 ..............................................
[CV] .................. C=20, gamma=0.0001, score=0.754, total=   0.1s
[CV] C=20, gamma=0.001 ...............................................
[CV] ................... C=20, gamma=0.001, score=0.751, total=   0.2s
[CV] C=20, gamma=0.001 ...............................................
[CV] ................... C=20, gamma=0.001, score=0.749, total=   0.2s
[CV] C=20, gamma=0.001 ...............................................
[CV] ................... C=20, gamma=0.001, score=0.754, total=   0.2s
[CV] C=20, gamma=0.01 ................................................
[CV] .................... C=20, gamma=0.01, score=0.751, total=   0.2s
[CV] C=20, gamma=0.01 ................................................
[CV] .................... C=20, gamma=0.01, score=0.749, total=   0.2s
[CV] C=20, gamma=0.01 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   13.4s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10, 15, 20],
                         'gamma': [0.0001, 0.001, 0.01, 0.1, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [18]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'gamma': 0.0001}
0.7514486123818237


In [13]:
# not on homework
# Make predictions with the hypertuned model
# YOUR CODE HERE
predictions = grid.predict(X_test)

In [14]:
# Calculate classification report
# YOUR CODE HERE
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["positive", "candidate", "negative"]))

              precision    recall  f1-score   support

    positive       0.00      0.00      0.00       528
   candidate       0.51      0.97      0.67       568
    negative       0.98      1.00      0.99      1090

    accuracy                           0.75      2186
   macro avg       0.50      0.66      0.55      2186
weighted avg       0.62      0.75      0.67      2186



  'precision', 'predicted', average, warn_for)
