In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the CSV and Perform Basic Data Cleaning

In [None]:
# Load the raw table, discard columns that hold no data at all, then
# discard every row that still contains at least one missing value.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)

# Preview the first rows of the cleaned frame
df.head()

In [None]:
# Show the distinct class labels present in the target column
print(pd.unique(df["koi_disposition"]))

# Select features (columns)

In [None]:
# Provisional split used for feature ranking: the first column is taken as
# the target and every remaining column as a candidate feature.
# NOTE(review): this relies on column order — presumably the first column
# is koi_disposition; confirm against the CSV header.
y = df[df.columns[0]]
X = df[df.columns[1:]]

In [None]:
# Rank every candidate feature with a tree ensemble so that only the ten
# most informative ones are kept for the final classifier.
from sklearn.ensemble import ExtraTreesClassifier

# A fixed seed keeps the importance ranking (and therefore the selected
# feature set) identical across "Restart Kernel -> Run All" executions;
# without it the randomized trees can reorder closely ranked features.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)

# NOTE(review): the ranking is fitted on the full dataset, before the
# train/test split, so held-out rows influence which features are chosen.
# Consider ranking on the training rows only.
model.feature_importances_

In [None]:
# Pair each importance score with its column name and keep the ten largest
feat_imp = (
    pd.Series(data=model.feature_importances_, index=X.columns)
    .nlargest(n=10)
)
feat_imp

In [None]:
# Narrow the feature matrix to the columns with the highest importances
X = df.loc[:, feat_imp.index]

# The prediction target is the koi_disposition label
y = df.loc[:, 'koi_disposition']

# Create a Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on y so both partitions keep the same class proportions, and
# pin the seed so the split (and every score computed from it) is
# reproducible on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Pre-processing

Scale the data using the MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training partition only and scale
# that partition in the same step.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)

# Apply the training-derived scaling to the held-out partition
X_test_scaled = X_scaler.transform(X_test)

# Train the Model
Using logistic regression


In [None]:
from sklearn.linear_model import LogisticRegression

# Build a logistic-regression classifier with the library defaults and fit
# it on the scaled training features; fit() returns the estimator itself,
# so the fitted model is bound directly to `classifier`.
classifier = LogisticRegression().fit(X_train_scaled, y_train)
classifier

In [None]:
# Evaluate the fitted classifier on the data it was trained on and on the
# held-out data, then report both figures.
train_score = classifier.score(X_train_scaled, y_train)
test_score = classifier.score(X_test_scaled, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for the search: 20 regularization strengths spaced
# logarithmically between 1e-4 and 1e4, each tried with L1 and L2 penalties.
param_grid = {
    'C': np.logspace(-4, 4, 20),
    'penalty': ['l1', 'l2'],
    # LogisticRegression's default solver (lbfgs) rejects penalty='l1', so
    # every L1 candidate would fail to fit and be scored NaN. 'saga'
    # supports both penalties, which makes the whole grid searchable.
    'solver': ['saga'],
    # saga typically needs more iterations than the default to converge.
    'max_iter': [1000],
}

# Wrap the existing classifier; verbose=3 logs the score of every fit.
grid = GridSearchCV(classifier, param_grid, verbose=3)

In [None]:
# Run the search: each parameter combination is cross-validated on the
# scaled training data.
grid.fit(X=X_train_scaled, y=y_train)

In [None]:
# Report the winning parameter combination and its cross-validated score,
# one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

# Save the Model

In [None]:
import joblib

# Persist the tuned model. The previous placeholder name `my_model` was
# never defined, so this cell raised NameError on a fresh kernel; the
# estimator refit by the grid search with the best parameters is saved
# instead.
# NOTE(review): the saved model expects inputs scaled with X_scaler, which
# is not persisted here — consider saving the scaler alongside it.
filename = 'name.sav'
joblib.dump(grid.best_estimator_, filename)