In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\osun\anaconda3\envs\pythonadv\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
import joblib

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the raw table, drop columns in which every value is null, then drop
# any row that still contains a null.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)

# Convert integer columns to float with one astype call, instead of copying
# the whole frame once per integer column.
int_to_float = {
    column: 'float64' for column in df.columns if df[column].dtype == 'int64'
}
df = df.astype(int_to_float)

# Select your features (columns)

In [5]:
# Separate the target column from the predictor columns.
y = df["koi_disposition"]
x = df.drop(columns=["koi_disposition"])

# Hold out a test split, stratified on the target, with a fixed seed so the
# split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=42,
)

In [6]:
# Fit the min-max scaler on the training split only, then apply that same
# fitted scaling to the test split.
x_scaler = MinMaxScaler()
x_train_scaled = x_scaler.fit_transform(x_train)
x_test_scaled = x_scaler.transform(x_test)

In [8]:
# Baseline model: logistic regression fitted on the scaled training data.
model_one = LogisticRegression(multi_class='auto', solver='newton-cg').fit(
    x_train_scaled, y_train
)

# Accuracy on each split as a percentage, rounded to three decimals.
model_one_training_score = round(model_one.score(x_train_scaled, y_train) * 100, 3)
test_score = round(model_one.score(x_test_scaled, y_test) * 100, 3)

print(f"Training Data Score: {model_one_training_score} %")
print(f"Testing Data Score: {test_score} %")

Training Data Score: 85.504 %
Testing Data Score: 86.213 %


In [9]:
# Recursive feature elimination with 5-fold cross-validation, dropping one
# feature per step and using the baseline model as the estimator.
feature_names = list(x.columns)
selector = RFECV(estimator=model_one, step=1, cv=5)
_ = selector.fit(x_train_scaled, y_train)

# Pair every feature name with its ranking, ordered by ranking (ties broken
# by name), and tabulate the pairs in a frame indexed by feature name.
selected_features = sorted(zip(selector.ranking_, feature_names))
rank = pd.DataFrame(
    selected_features, columns=['Ranking', 'Feature']
).set_index('Feature')


In [10]:
# Keep only the names of features whose ranking is 12 or lower.
final_selected_features = [
    feature for ranking, feature in selected_features if ranking < 13
]


In [14]:
# Restrict both splits to the selected feature columns.
x_train_selected = x_train[final_selected_features]
x_test_selected = x_test[final_selected_features]

# Refit the scaler on the reduced training split. This rebinds x_scaler,
# x_train_scaled and x_test_scaled, so every cell run after this one sees
# the selected-feature versions.
x_scaler = MinMaxScaler()
x_train_scaled = x_scaler.fit_transform(x_train_selected)
x_test_scaled = x_scaler.transform(x_test_selected)

# Logistic regression trained on the reduced, rescaled feature set.
model_two = LogisticRegression(multi_class='auto', solver='newton-cg').fit(
    x_train_scaled, y_train
)

# Accuracy on each split as a percentage, rounded to three decimals.
model_two_training_score = round(model_two.score(x_train_scaled, y_train) * 100, 3)
test_score_two = round(model_two.score(x_test_scaled, y_test) * 100, 3)

print(f"Training Data Score: {model_two_training_score} %")
print(f"Testing Data Score: {test_score_two} %")

Training Data Score: 85.504 %
Testing Data Score: 86.213 %


In [16]:
# Base estimator whose hyperparameters the grid search will vary.
model_three = LogisticRegression(multi_class='auto', solver='newton-cg')

# Candidate settings: ten log-spaced values of C from 1 to 10,000, with the
# L2 penalty only.
param_grid = dict(
    C=np.logspace(0, 4, 10),
    penalty=['l2'],
)
grid = GridSearchCV(estimator=model_three, param_grid=param_grid, cv=5, verbose=0)

# Run the 5-fold cross-validated search on the scaled training data.
_ = grid.fit(x_train_scaled, y_train)

In [17]:
# Pull the winning hyperparameters out of the completed grid search.
best_params = grid.best_params_
C = best_params['C']
penalty = best_params['penalty']

# Fit a fresh logistic regression with those hyperparameters.
model_four = LogisticRegression(
    C=C,
    penalty=penalty,
    solver='newton-cg',
    multi_class='auto',
).fit(x_train_scaled, y_train)

# Note: these score variables carry a "three" suffix although they are
# computed from model_four.
model_three_training_score = round(model_four.score(x_train_scaled, y_train) * 100, 3)
test_score_three = round(model_four.score(x_test_scaled, y_test) * 100, 3)

print(f"Training Data Score: {model_three_training_score} %")
print(f"Testing Data Score: {test_score_three} %")

Training Data Score: 88.709 %
Testing Data Score: 89.474 %


In [20]:
# Predict the test split with the tuned model and list the distinct labels
# present in the test targets.
predictions = model_four.predict(x_test_scaled)
classifications = y_test.unique().tolist()

# Put true labels and predictions side by side. Moving 'Actual' into the
# index and back out again replaces y_test's row labels with a fresh 0..n-1
# index.
prediction_actual = dict(Actual=y_test, Prediction=predictions)
new_df = pd.DataFrame(prediction_actual).set_index('Actual').reset_index()
new_df.head(5)

Unnamed: 0,Actual,Prediction
0,CANDIDATE,CANDIDATE
1,FALSE POSITIVE,FALSE POSITIVE
2,FALSE POSITIVE,FALSE POSITIVE
3,FALSE POSITIVE,FALSE POSITIVE
4,CANDIDATE,CANDIDATE


In [22]:
# Summarise test accuracy for the three models, one row per model, indexed
# by the model label.
evaluations = {
    '': ['Base Model', 'Select Features Model', 'Tuned Model'],
    'Accuracy': [
        f"{score}%" for score in (test_score, test_score_two, test_score_three)
    ],
}

evaluations_df = pd.DataFrame(evaluations).set_index('')

evaluations_df

Unnamed: 0,Accuracy
,
Base Model,86.213%
Select Features Model,86.213%
Tuned Model,89.474%


In [23]:
# Persist the tuned model (model_four) to disk with joblib.
# The file name is the submission name; change it to your own name if you
# reuse this cell, and be sure to turn the saved file in to BCS.
# If joblib fails to import, install it from a terminal/git-bash instead.

filename = 'olivesun.sav'
_ = joblib.dump(model_four, filename)