# Machine Learning Assignment

**Dataset**:       NASA NEO


## Imports

Add imports here as needed.

Remember to **re-run the cell whenever you add imports**, so that the new imports are loaded into the notebook environment!

In [1]:
# Data and Datasets
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler

# Clustering
from sklearn.cluster import DBSCAN

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter optimisation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature selection & feature engineering
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA

# Stats
from scipy.stats import randint as sp_randint
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import shapiro     # Shapiro Wilk
from scipy.stats import normaltest  # D’Agostino’s K^2
from scipy.stats import anderson    # Anderson-Darling
from scipy.stats import ttest_ind    # independent student t-test; assumes normality
from scipy.stats import mannwhitneyu # non-parametric; doesn't assume normality

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.tree import export_graphviz

# Utils
import pprint
import numpy as np
from time import time
import openpyxl

## Loading the dataset

In [2]:
# Load the raw dataset (pandas is already imported as `pd` in the imports cell above)
df = pd.read_csv('data/nasa_nearest_earth_objects_dataset.csv')

# Print the dataset schema: column names, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly -- wrapping it in print() would append a stray
# "None" line after the report.
df.info()

# Preview the first rows as the cell's displayed output
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131416 entries, 0 to 131415
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   name               130220 non-null  object 
 1   abs_magnitude      104251 non-null  float64
 2   min_diameter       104010 non-null  float64
 3   max_diameter       130258 non-null  float64
 4   orbiting_body      104083 non-null  object 
 5   relative_velocity  91002 non-null   float64
 6   miss_distance      130228 non-null  float64
 7   is_hazardous       131416 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 8.0+ MB
None


Unnamed: 0,name,abs_magnitude,min_diameter,max_diameter,orbiting_body,relative_velocity,miss_distance,is_hazardous
0,349507 (2008 QY),,,1.185878,,109949.757148,55801050.0,1
1,(2011 DW),22.7,0.076658,0.171412,Earth,,63118630.0,0
2,(2014 BT8),25.0,0.02658,0.059435,,47477.649832,42905210.0,0
3,(2008 OX1),21.5,0.133216,0.297879,Earth,57853.295346,27279080.0,1
4,(2018 YH),,0.121494,0.271669,Earth,37424.548382,5657727.0,1


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,abs_magnitude,min_diameter,max_diameter,relative_velocity,miss_distance,is_hazardous
count,104251.0,104010.0,130258.0,91002.0,130228.0,131416.0
mean,22.330584,0.188924,0.4012,54075.64711,41447800.0,0.328438
std,2.854634,0.322836,0.692532,27296.143139,20783590.0,0.469647
min,10.41,0.000511,0.0,439.938963,6745.533,0.0
25%,20.31,0.035039,0.062236,33331.934742,24537160.0,0.0
50%,21.77,0.118183,0.246627,50501.496533,43189020.0,0.0
75%,24.4,0.22938,0.494356,70004.838289,59333380.0,1.0
max,33.58,22.006703,49.208483,275303.183496,74798650.0,1.0


## Exploratory data analysis

Section for exploratory data analysis, to address **Tasks 1.1 and 1.2**.

**Note:** You may need to do some data cleaning before you do your full exploratory data analysis, though you will find that some functions we'll cover in this unit are able to handle things like missing values and non-numeric data.

**Create more cells as needed!**

## Task 2.1 - ML Workflow to Critically Evaluate

In [4]:
# Keep only fully populated rows: any row containing a missing value is discarded
df = df.dropna()

# Integer-encode every text (object-dtype) column in a single pass:
# each column is cast to 'category' and then replaced by its category codes
cat_columns = df.select_dtypes(['object']).columns
df[cat_columns] = df[cat_columns].apply(lambda col: col.astype('category').cat.codes)

In [5]:
# Split the DataFrame into a feature matrix X (every column except the last)
# and a label vector y (the last column).
# NOTE(review): X therefore still contains the integer-encoded text columns
# produced by the previous cell -- confirm that they are meant to be used as features.
X = df.iloc[:,:len(df.columns)-1]
y = df.iloc[:,len(df.columns)-1]

# Hold-out validation: a single train/test split of the data
X_train, X_test, y_train, y_test = train_test_split(X, # feature matrix
                                                    y, # label vector
                                                    test_size=0.2,   # 20% of the rows are held out for testing
                                                    random_state=1,  # fixed seed so the split is repeatable
                                                    stratify=None    # no stratification on the class label
                                                   )

# Creating a Random Forest classifier with default hyper-parameters
# NOTE(review): no random_state is passed here, so the fitted forest (and the
# scores printed below) may differ from run to run -- confirm whether that is intended.
model = RandomForestClassifier()

# Training the model on the training split
model.fit(X_train, y_train)

# Predicting labels for the held-out test split
y_pred = model.predict(X_test)

# Printing out the confusion matrix and accuracy on the test split
# (arguments are passed as true labels first, predicted labels second)
print(metrics.confusion_matrix(y_test, y_pred))
print("\nAccuracy (Testing):  %0.2f " % (metrics.accuracy_score(y_test, y_pred)))

[[5625  561]
 [ 310 2781]]

Accuracy (Testing):  0.91 


## Task 2.3 - Evaluation of Improved ML Workflow

Add code for running your **improved** machine learning experiments below.


In [6]:
# Loading the dataset again so that you can do your own pre-processing from the
# raw data: the `df` used above was overwritten by the dropna() and integer-encoding
# steps of Task 2.1.
df = pd.read_csv('data/nasa_nearest_earth_objects_dataset.csv')