<a href="https://colab.research.google.com/github/mohityadav11a/asteroid_spectra/blob/main/6_search_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 6. ML - Parameter Search / Optimization
Here, we conduct a grid search for a binary classification task, using the F1 score as the evaluation metric.

In [1]:
# Importing libraries
import os

import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Mount the Google Drive and set core_path, the prefix that is prepended to the data paths
# used below
try:
    # Presumably this import only succeeds inside a Google Colab runtime
    from google.colab import drive
    drive.mount('/gdrive')
    core_path = "/gdrive/MyDrive/colab/asteroid_taxonomy/"
except ModuleNotFoundError:
    # google.colab could not be imported: use an empty prefix, so that the data paths are
    # resolved relative to the current working directory
    core_path = ""

Mounted at /gdrive


In [3]:
# Read the level 2 asteroid data from the pickle file below core_path.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
asteroids_path = os.path.join(core_path, "data/lvl2/", "asteroids.pkl")
asteroids_df = pd.read_pickle(asteroids_path)

In [4]:
# Binary classification schema: 1 for spectra whose main group is "X", 0 for all other groups
asteroids_df["Class"] = asteroids_df["Main_Group"].eq("X").astype("int64")

In [5]:
# Feature array: the normalized reflectance values of every spectrum, one entry per asteroid
# (assumes all spectra have the same number of samples, so the result is a 2D array)
asteroids_X = np.array(
    [spectrum_df["Reflectance_norm550nm"].tolist() for spectrum_df in asteroids_df["SpectrumDF"]]
)

# Label array: the binary class of every asteroid, in the same order as the feature array
class_labels = asteroids_df["Class"].to_list()
asteroids_y = np.array(class_labels)

In [6]:
# Create a single stratified train / test split with a ratio of 0.8 / 0.2. The random state is
# fixed so that the split (and every score derived from it) is reproducible across runs.
SPLIT_RANDOM_STATE = 42
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SPLIT_RANDOM_STATE)

# n_splits=1, so the split generator yields exactly one pair of index arrays
train_index, test_index = next(sss.split(asteroids_X, asteroids_y))

X_train, X_test = asteroids_X[train_index], asteroids_X[test_index]
y_train, y_test = asteroids_y[train_index], asteroids_y[test_index]

# Class weight of the positive (X) class: the inverse of its share in the training data,
# truncated to an integer and computed with exact integer arithmetic
n_positive = int(np.sum(y_train))
if n_positive == 0:
    raise ValueError(
        "y_train contains no positive (X class) samples; cannot compute a class weight"
    )
positive_class_weight = len(y_train) // n_positive

In [7]:
# Parameter grid of the search: the same C values are tried for a linear and for an RBF kernel
# (all linear candidates first, then all RBF candidates)
param_grid = [{'C': [1, 10, 100], 'kernel': [kernel]} for kernel in ('linear', 'rbf')]

# Standardize the spectra (mean 0, standard deviation 1). The scaler is fitted once on the
# complete training set, i.e. before the cross-validation folds are drawn, and is re-used
# later to transform the test data.
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# SVM classifier that weights the positive (X) class by the class weight computed above
svc = svm.SVC(class_weight={1: positive_class_weight})

# Grid search with 5-fold cross-validation, ranking the candidates by their F1 score
wclf = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='f1', cv=5, verbose=3)

# Run the search on the scaled training data
wclf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ................C=1, kernel=linear;, score=0.510 total time=   0.1s
[CV 2/5] END ................C=1, kernel=linear;, score=0.490 total time=   0.1s
[CV 3/5] END ................C=1, kernel=linear;, score=0.478 total time=   0.1s
[CV 4/5] END ................C=1, kernel=linear;, score=0.497 total time=   0.1s
[CV 5/5] END ................C=1, kernel=linear;, score=0.538 total time=   0.1s
[CV 1/5] END ...............C=10, kernel=linear;, score=0.518 total time=   0.3s
[CV 2/5] END ...............C=10, kernel=linear;, score=0.500 total time=   0.6s
[CV 3/5] END ...............C=10, kernel=linear;, score=0.493 total time=   0.4s
[CV 4/5] END ...............C=10, kernel=linear;, score=0.538 total time=   0.7s
[CV 5/5] END ...............C=10, kernel=linear;, score=0.547 total time=   0.6s
[CV 1/5] END ..............C=100, kernel=linear;, score=0.515 total time=   3.6s
[CV 2/5] END ..............C=100, kernel=linear;,

In [8]:
# Take the best estimator found by the grid search (wclf.best_estimator_) as the final
# classifier and print it to show its parameters
final_clf = wclf.best_estimator_
print(final_clf)

SVC(C=100, class_weight={1: 5})


In [9]:
# Scale the testing data with the scaler that was fitted on the training data
X_test_scaled = scaler.transform(X_test)

# Perform a prediction on the scaled testing data with the final classifier
y_test_pred = final_clf.predict(X_test_scaled)

In [10]:
# Compute the confusion matrix of the test-set predictions. The labels are passed explicitly
# so that the matrix is always 2x2 (and the unpacking below cannot fail), even if one of the
# two classes is missing from both y_test and y_test_pred.
conf_mat = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

print(conf_mat)

# Rows are the true classes and columns the predicted classes, both in the order [0, 1].
# The flattened (row-major) order of the confusion matrix therefore is:
#     - true negative (top left, tn)
#     - false positive (top right, fp)
#     - false negative (bottom left, fn)
#     - true positive (bottom right, tp)
tn, fp, fn, tp = conf_mat.ravel()

[[216   5]
 [  0  47]]


In [11]:
# Score the test-set predictions for the X Class, each rounded to 3 decimals:
#     - Recall: ratio of correctly classified X Class spectra, considering the false
#       negatives (recall = tp / (tp + fn))
#     - Precision: ratio of correctly classified X Class spectra, considering the false
#       positives (precision = tp / (tp + fp))
#     - F1: a combined score of recall and precision
recall_score, precision_score, f1_score = (
    round(score_function(y_test, y_test_pred), 3)
    for score_function in (
        sklearn.metrics.recall_score,
        sklearn.metrics.precision_score,
        sklearn.metrics.f1_score,
    )
)

print(f"Recall Score: {recall_score}")
print(f"Precision Score: {precision_score}")
print(f"F1 Score: {f1_score}")

Recall Score: 1.0
Precision Score: 0.904
F1 Score: 0.949
