### Lab 6 - Return of the Exoplanets!
#### Authors: Chandra Adhikari and Naomi Hoffman

### Re-use code from Lab 2

In [None]:
# Python >= 3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn >= 0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) pair: comparing the raw version strings
# is lexicographic, so e.g. "0.9" would wrongly rank above "0.20".
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())) >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# rc means runtime configuration
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: File name without extension.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed as the output format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    target = os.path.join(IMAGES_PATH, "".join((fig_id, ".", fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [None]:
import pandas as pd

# Path of the CSV file holding the Kepler table (read from the working directory)
DOWNLOAD_ROOT = "cumulative.csv"

raw_data = pd.read_csv(DOWNLOAD_ROOT)

In [None]:
# Show how many rows fall into each koi_disposition class.
raw_data["koi_disposition"].value_counts()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single 80/20 split, stratified on koi_disposition so both sets keep the
# same class proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(raw_data, raw_data["koi_disposition"]):
    # split() yields positional row indices, so select with .iloc; .loc only
    # gives the same rows while the frame has its default 0..n-1 index.
    strat_train_set = raw_data.iloc[train_index]
    strat_test_set = raw_data.iloc[test_index]

In [None]:
def visualize_correlation(data):
    """Plot the pairwise correlation matrix of the numeric columns of *data*.

    Args:
        data: pandas DataFrame. Columns that are neither numeric nor boolean
            are left out of the plot.
    """
    # Keep only numeric/boolean columns: DataFrame.corr() raises on string
    # columns in recent pandas releases, while older ones skipped them.
    corr_matrix = data.select_dtypes(include=["number", "bool"]).corr()
    fig = plt.figure(num=None, figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k')
    # Draw into the figure created above instead of assuming it is figure 1.
    corr_plot = plt.matshow(corr_matrix, fignum=fig.number)
    labels = corr_matrix.columns
    positions = range(len(labels))
    plt.xticks(positions, labels, rotation=90)
    plt.yticks(positions, labels)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corr_plot)
    plt.title('Correlation Matrix for kepler data', fontsize=15)
    plt.show()
    
# Correlation plot of the training split before any columns are dropped.
visualize_correlation(strat_train_set)

In [None]:
def drop_unnecessary_col(data):
    """Return *data* without the identifier, error and false-positive-flag columns.

    A fixed list of columns is dropped, plus every column whose name contains
    'err' or 'fpflag'. The dropped names and the column counts before and
    after are printed. The input frame is not modified.
    """
    to_drop = [
        "koi_pdisposition", "kepid", "kepler_name", "kepoi_name",
        "koi_tce_plnt_num", "koi_tce_delivname",
    ]
    # Thanks kirktobridge (on Kaggle) for this nice way of dropping error categories :)
    for column in data.columns:
        to_drop.extend(column for marker in ('err', 'fpflag') if marker in column)
    updated_data = data.drop(columns=to_drop)
    print("Dropped:\n\n", to_drop)
    print(f"\nYour dataset had {data.shape[1]} columns.\nIt now has {updated_data.shape[1]} columns.")
    return updated_data

In [None]:
# Apply the same column pruning to both splits so their columns stay aligned.
strat_train_set=drop_unnecessary_col(strat_train_set)
strat_test_set=drop_unnecessary_col(strat_test_set)

In [None]:
# koi_score must go: we are not allowed to use it as a feature
def drop_forbidden_col(data):
    """Return *data* without the ``koi_score`` column; the input is left untouched."""
    return data.drop(columns=["koi_score"])

In [None]:
# Remove koi_score from both splits.
strat_train_set = drop_forbidden_col(strat_train_set)
strat_test_set = drop_forbidden_col(strat_test_set)

In [None]:
def drop_missing_data(missing_data):
    """Return *missing_data* without the rows that contain any missing value.

    Prints the row counts before and after, and the percentage of rows kept.

    Args:
        missing_data: pandas DataFrame that may contain NaN values.

    Returns:
        A new DataFrame holding only the complete rows.
    """
    data = missing_data.dropna()
    rows_before = missing_data.shape[0]
    rows_after = data.shape[0]
    # An empty input loses nothing; report 100% instead of dividing by zero.
    percent_kept = round((rows_after / rows_before) * 100, 2) if rows_before else 100.0
    print(f"The dataset had {rows_before} rows. It now has {rows_after} rows.\n({rows_before-rows_after} rows were dropped, leaving you with {percent_kept}% of the original number of entries.)")
    return data

In [None]:
# Drop incomplete rows from both splits.
strat_train_set = drop_missing_data(strat_train_set)
strat_test_set = drop_missing_data(strat_test_set)

In [None]:
# Summary statistics of the cleaned training split.
strat_train_set.describe()

In [None]:
# The %matplotlib magic below only works in a Jupyter notebook.
%matplotlib inline
import matplotlib.pyplot as plt
# One 50-bin histogram per column of the training split.
strat_train_set.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill gaps with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# The label column is kept out of the features; every other column goes
# through the numeric pipeline.
strat_cat = ["koi_disposition"]
strat_num = strat_train_set.columns.drop(strat_cat).tolist()

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, strat_num),
])



In [None]:
from sklearn.model_selection import train_test_split

# Fit the preprocessing on the training split only, then build the binary
# target: True for CONFIRMED planets, False for everything else.
X_training_data = full_pipeline.fit_transform(strat_train_set)
y_training_data = (strat_train_set["koi_disposition"] == "CONFIRMED")

# Hold out the last 12% of the training rows for validation (no reshuffling).
x_train, x_val, y_train, y_val = train_test_split(X_training_data, y_training_data, test_size=0.12, shuffle=False)

# Apply the already-fitted pipeline to the test split (transform only, never
# fit). The model must be evaluated on features scaled the same way as the
# ones it was trained on, not on the raw DataFrame.
X_test = full_pipeline.transform(strat_test_set)
y_test = (strat_test_set["koi_disposition"] == "CONFIRMED")

In [None]:
# Correlation plot of the training split after the column and row clean-up.
visualize_correlation(strat_train_set)

### Add Neural Network stuff!

10. Train a deep MLP on the MNIST dataset and see if you can get over 98% precision.
Try adding all the bells and whistles (i.e., save checkpoints, use early stopping,
plot learning curves using TensorBoard, and so on).

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Display the installed TensorFlow version.
tf.__version__

In [None]:
# Display the version of the Keras bundled with TensorFlow.
keras.__version__

In [None]:
K = keras.backend

class ExponentialLearningRate(keras.callbacks.Callback):
    """Grow the optimizer's learning rate by a constant factor after every batch.

    The learning rate in effect and the loss reported for each batch are
    recorded in ``rates`` and ``losses``, so the loss can be plotted against
    the learning rate afterwards.

    Args:
        factor: Multiplier applied to the learning rate after each batch.
    """
    def __init__(self, factor):
        # Let the base Callback set up its own state first.
        super().__init__()
        self.factor = factor
        self.rates = []
        self.losses = []

    def on_batch_end(self, batch, logs):
        optimizer = self.model.optimizer
        # Read the rate once as a plain number and reuse it for both the log
        # and the update, instead of handing set_value a tensor expression.
        current_rate = K.get_value(optimizer.learning_rate)
        self.rates.append(current_rate)
        self.losses.append(logs["loss"])
        K.set_value(optimizer.learning_rate, current_rate * self.factor)

In [None]:
# Clear Keras' global state and fix the NumPy and TensorFlow seeds
# (presumably so the next cells give repeatable results).
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Number of feature columns in the preprocessed training matrix.
x_train.shape[1]

In [None]:
# Two hidden ReLU layers and a 2-way softmax output (not confirmed / confirmed).
# The input width is taken from the training matrix rather than hard-coded, so
# the model keeps working if the set of feature columns changes.
model = keras.models.Sequential([
    keras.layers.Dense(300, input_shape=x_train.shape[1:], activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(2, activation="softmax")
])

In [None]:
# List the layer objects of the model.
model.layers

In [None]:
model.summary() # needs a built model; the input_shape given to the first layer takes care of that

We will start with a small learning rate of 1e-3, and grow it by 0.5% at each iteration:

In [None]:
# Integer class labels with a softmax output -> sparse categorical cross-entropy.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=1e-3),
              metrics=["accuracy"])
# Grow the learning rate by 0.5% after every batch.
expon_lr = ExponentialLearningRate(factor=1.005)

Now let's train the model for just 1 epoch:

In [None]:
# One epoch is enough here: the callback records the learning rate and the
# loss after every batch while increasing the rate.
history = model.fit(x_train, y_train, epochs=1,
                    validation_data=(x_val, y_val),
                    callbacks=[expon_lr])

We can now plot the loss as a function of the learning rate:

In [None]:
# Loss recorded after each batch against the learning rate used for it.
plt.plot(expon_lr.rates, expon_lr.losses)
# The rate grows geometrically, so use a log-scaled x axis.
plt.gca().set_xscale('log')
# Horizontal reference line at the lowest recorded loss.
plt.hlines(min(expon_lr.losses), min(expon_lr.rates), max(expon_lr.rates))
# Limit y to [0, first recorded loss] and x to the range of rates tried.
plt.axis([min(expon_lr.rates), max(expon_lr.rates), 0, expon_lr.losses[0]])
plt.grid()
plt.xlabel("Learning rate")
plt.ylabel("Loss")

In [None]:
# Start from a clean Keras state with fixed seeds before building the final
# model (presumably for repeatable results).
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Same architecture as in the learning-rate search, rebuilt with fresh weights.
# The input width is taken from the training matrix rather than hard-coded, so
# the model keeps working if the set of feature columns changes.
model = keras.models.Sequential([
    keras.layers.Dense(300, input_shape=x_train.shape[1:], activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(2, activation="softmax")
])

In [None]:
# Train with plain SGD at a learning rate of 3e-1.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=3e-1),
              metrics=["accuracy"])

In [None]:
run_index = 1 # increment this at every run
# One TensorBoard log directory per run, e.g. ./my_kepler_logs/run_001
run_logdir = os.path.join(os.curdir, "my_kepler_logs", f"run_{run_index:03d}")
run_logdir

In [None]:
# Stop training early with a patience of 20 epochs.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=20)
# Keep only the best model seen so far in my_kepler_model.h5.
checkpoint_cb = keras.callbacks.ModelCheckpoint("my_kepler_model.h5", save_best_only=True)
# Write TensorBoard logs into this run's directory.
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)

# Up to 100 epochs; the callbacks above checkpoint, stop early and log.
history = model.fit(x_train, y_train, epochs=100,
                    validation_data=(x_val, y_val),
                    callbacks=[checkpoint_cb, early_stopping_cb, tensorboard_cb])

In [None]:
# Reload the checkpoint saved during training and score it on the test split.
# NOTE(review): X_test must be preprocessed the same way as x_train for this
# score to be meaningful - confirm where X_test is built.
model = keras.models.load_model("my_kepler_model.h5") # rollback to best model
model.evaluate(X_test, y_test)

We got 75% accuracy! We can do better than that.

In [None]:
%tensorboard --logdir=./my_kepler_logs --port=6006 # gets UsageError: Line magic function `%tensorboard` not found.

### Progress Notes
#### Problems
+ Problem 1: Installing TensorFlow took us a very long time. We followed the tutorial in the book and several tutorials online, none of which worked. We solved it by digging into conda issue #5219 on GitHub and manually installing with pip inside conda.
#### Investigations