In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd
import mglearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import time
from sklearn.metrics import classification_report
from scipy import stats
import matplotlib
from collections import Counter
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Term Project
## Imports
I like to start off all notebooks with a cell at the beginning that has every used import so that I don't need to check all over to ensure that I've met all dependencies.

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from matplotlib import colors

from scipy import stats
import pandas as pd
import numpy as np

## Data Preparation
For this project I'm using the exoplanet catalogue data from the Week 4 notebook. The first cell below loads the catalogue, builds the binary habitability label, and cleans up the data (dropping rows with missing values and extreme outliers).

Note that in the original notebook, some features were manually selected to focus on. Here, we're replacing that with PCA.

In [None]:
df = pd.read_csv('/blue/ast4930/share/phl_exoplanet_catalog.csv', sep = ',')

# Collapse the 'conservative' (1) and 'optimistic' (2) habitability labels into a
# single binary P_HABITABLE column: 1 = habitable, 0 = not habitable.
data_original = df.drop('P_HABITABLE', axis = 1)
data_original['P_HABITABLE'] = np.logical_or(df.P_HABITABLE == 1, df.P_HABITABLE == 2).astype(int)

# Drop rows with missing values. This line used to call `bindf.dropna(...)`, but no
# `bindf` exists in this notebook, so the cell raised NameError on a fresh kernel.
data_original = data_original.dropna(axis = 0)

# Every column except the label is a candidate feature. Keeping the label out of the
# feature matrix matters twice over:
#   * a classifier that is handed P_HABITABLE as an input can trivially "predict" it;
#   * a rare positive class has a large z-score of its own, so including the label in
#     the outlier filter below can throw away the habitable planets themselves.
feature_columns = data_original.columns.drop('P_HABITABLE')

# Remove rows in which any feature lies 5 or more standard deviations from its mean.
# NOTE(review): stats.zscore needs numeric input; if the catalogue still carries text
# columns at this point, select the numeric feature columns first - confirm against the CSV.
inlier_rows = (np.abs(stats.zscore(data_original[feature_columns])) < 5).all(axis=1)

# Filter and renumber in one step so that features and labels share a clean 0..n-1 index.
data_original = data_original[inlier_rows].reset_index(drop=True)

# Habitability labels, aligned row-for-row with data_original
targets = data_original['P_HABITABLE']

# Train/test split on the features only (the label column is excluded)
X_train, X_test, y_train, y_test = train_test_split(
    data_original[feature_columns], targets, random_state = 0)

Let's get a look at our data before starting:

In [None]:
# Scatter of stellar mass against orbital period, coloured by habitability label.
fig, ax = plt.subplots(figsize=(6,4))

# Two-colour map: label 0 (not habitable) -> teal, label 1 (habitable) -> magenta
cmap = colors.LinearSegmentedColormap.from_list("", ['#20B2AA','#FF00FF'])

points = ax.scatter(data_original['S_MASS'], data_original['P_PERIOD'], marker = 'o',
                    c = targets, s = 20, cmap=cmap)

# Hollow markers: with the face set to 'none' only the coloured outline is drawn,
# which keeps overlapping points distinguishable.
points.set_facecolor('none')

ax.set_yscale('log')
ax.set_xlabel('Mass of Parent Star (Solar Mass Units)')
ax.set_ylabel('Period of Orbit (days)')

# The scatter is coloured through a colormap, so it has no per-class legend entries;
# build the legend from explicit colour patches instead.
bluepatch = mpatches.Patch(color='#20B2AA', label='Not Habitable')
magentapatch = mpatches.Patch(color='#FF00FF', label='Habitable')

ax.legend(handles=[magentapatch, bluepatch],
          loc = 'lower right', fontsize = 14);

The data are split *before* the scaling is applied. Since PCA needs standard scaling, we will be doing that here.

In [None]:
# Fit the scaler on the training split only, then apply the same fitted transform to
# both splits so that no information from the test set influences the scaling.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# applying PCA
# A float n_components keeps the smallest number of components that together explain
# at least 90% of the variance. As with the scaler, PCA is fitted once on the training
# data and that fitted projection is reused for the test data.
pca = PCA(n_components=0.9)
X_pca = pca.fit_transform(X_train_s)
X_test_p = pca.transform(X_test_s)

## Decision Tree
First, we will see how the DT fares in classifying our exoplanets as habitable or not.

In [None]:
# Tune the tree depth (1 through 10) with 5-fold cross-validation.
param_grid = {'max_depth': np.arange(10)+1}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=21), #I picked 21 because I'm 21
                           param_grid, cv=5, return_train_score=True, verbose=1)

# Search on the PCA-transformed training data: these are the features the model is
# evaluated on below, so the depth must be tuned in the same feature space.
grid_search.fit(X_pca, y_train)

print("Best parameters: {}".format(grid_search.best_params_))

# GridSearchCV refits the best configuration on the full training set by default,
# so best_estimator_ is ready to use as the tuned decision tree.
model = grid_search.best_estimator_

print("Test accuracy: {:.3f}".format(model.score(X_test_p, y_test)))
print(classification_report(y_test, model.predict(X_test_p)))

## RandomForest
Let's see how RF changes our scores.

In [None]:
# Tune the forest's maximum tree depth (1 through 10) with 5-fold cross-validation
# on the PCA-transformed training data.
param_grid = {'max_depth': np.arange(10)+1}
grid_search = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, cv=5, return_train_score=True,
                           verbose=1)
grid_search.fit(X_pca, y_train)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best model: {}".format(grid_search.best_estimator_))

# Report on the tuned random forest itself. GridSearchCV refits the best configuration
# on the full training set by default, so best_estimator_ can predict directly.
rf_model = grid_search.best_estimator_
print(classification_report(y_test, rf_model.predict(X_test_p)))

## kNN
