In [None]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [1]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib

In [1]:
import pandas as pd
import os

# Read the CSV and Perform Basic Data Cleaning

In [12]:
df = pd.read_csv(os.path.join('..', 'InputData', 'cumulative.csv'))
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()

# Select your features (columns)

In [13]:
# Set features. This will also be used as your x values.
selected_features = df[['koi_disposition', 'koi_score', 'koi_period', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad', 
                        'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_steff', 'koi_slogg', 'koi_srad']]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [22]:
X = selected_features.drop("koi_disposition", axis=1)
y = selected_features["koi_disposition"]
selected_features["koi_disposition"].unique()

array(['CONFIRMED', 'FALSE POSITIVE'], dtype=object)

In [15]:
# Split data into training and testing, use default split
from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [17]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Scale your data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
print(X_test_scaled.shape,  X_train_scaled.shape, y_train.shape, y_test.shape )

(568, 12) (1701, 12) (1701,) (568,)
6526    CONFIRMED
2241    CONFIRMED
5046    CONFIRMED
683     CONFIRMED
2452    CONFIRMED
3458    CONFIRMED
435     CONFIRMED
2318    CONFIRMED
4055    CONFIRMED
9178    CONFIRMED
4005    CONFIRMED
149     CONFIRMED
258     CONFIRMED
418     CONFIRMED
2304    CONFIRMED
228     CONFIRMED
2607    CONFIRMED
2486    CONFIRMED
3222    CONFIRMED
1165    CONFIRMED
1410    CONFIRMED
1930    CONFIRMED
2485    CONFIRMED
1048    CONFIRMED
1724    CONFIRMED
125     CONFIRMED
3129    CONFIRMED
2847    CONFIRMED
9060    CONFIRMED
904     CONFIRMED
          ...    
1020    CONFIRMED
1185    CONFIRMED
777     CONFIRMED
1366    CONFIRMED
2354    CONFIRMED
554     CONFIRMED
3599    CONFIRMED
2461    CONFIRMED
558     CONFIRMED
1814    CONFIRMED
4423    CONFIRMED
5830    CONFIRMED
1936    CONFIRMED
3529    CONFIRMED
2913    CONFIRMED
1402    CONFIRMED
1052    CONFIRMED
1265    CONFIRMED
407     CONFIRMED
639     CONFIRMED
2321    CONFIRMED
2378    CONFIRMED
2138    CO

In [18]:
# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)

label_encoder.fit(y_test)
encoded_y_test = label_encoder.transform(y_test)

# Convert encoded labels to one-hot-encoding
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
print(y_train_categorical.shape, X_train_scaled.shape)
print(y_train_categorical)

(1701, 1) (1701, 12)
[[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [1.]]


# Train the Model



In [19]:
# create a normal neural network with 12 inputs, 12 hidden nodes, and 3 outputs
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(units=12, activation='relu', input_dim=12))
model.add(Dense(units=3, activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 12)                156       
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 39        
Total params: 195
Trainable params: 195
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [21]:
#Fit the model to the training data
model.fit( X_train_scaled, y_train_categorical, epochs=10, shuffle=True, verbose=2 )

ValueError: You are passing a target array of shape (1701, 1) while using as loss `categorical_crossentropy`. `categorical_crossentropy` expects targets to be binary matrices (1s and 0s) of shape (samples, classes). If your targets are integer classes, you can convert them to the expected format via:
```
from keras.utils import to_categorical
y_binary = to_categorical(y_int)
```

Alternatively, you can use the loss function `sparse_categorical_crossentropy` instead, which does expect integer targets.

In [72]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

AttributeError: 'Sequential' object has no attribute 'score'

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model

In [None]:
# Train the model with GridSearch

In [None]:
print(grid2.best_params_)
print(grid2.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)