<a href="https://www.kaggle.com/code/lorresprz/stellar-classification-ann-cnn-v-xgbclassifier?scriptVersionId=144950155" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Stellar classification: DL (ANN/CNN) vs ML (XGBClassifier, RandomForestClassifier)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Install a global "ignore" filter so that no warnings are printed in the
# cell outputs for the rest of the session.
# NOTE(review): this hides every warning category, including ones that may
# be worth seeing while fitting the models below — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

#DL import
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, Reshape, Flatten, Dropout, Conv1D
from tensorflow.keras.models import Sequential

#ML import
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# EDA

The objective of this classification is to determine whether an object is a star, a galaxy or a quasar (the target column is 'class' with 3 values: QSO, GALAXY, STAR), based on a set of 17 observed features.  The meaning of each column in the dataset is explained in the 'Data Card' section, but is included here for clarity (and self-sufficiency):

- 1. obj_ID = Object Identifier, the unique value that identifies the object in the image catalog used by the CAS
- 2. alpha = Right Ascension angle (at J2000 epoch)
- 3. delta = Declination angle (at J2000 epoch)
- 4. u = Ultraviolet filter in the photometric system
- 5. g = Green filter in the photometric system
- 6. r = Red filter in the photometric system
- 7. i = Near Infrared filter in the photometric system
- 8. z = Infrared filter in the photometric system
- 9. run_ID = Run Number used to identify the specific scan
- 10. rerun_ID = Rerun Number to specify how the image was processed
- 11. cam_col = Camera column to identify the scanline within the run
- 12. field_ID = Field number to identify each field
- 13. spec_obj_ID = Unique ID used for optical spectroscopic objects (this means that 2 different observations with the same spec_obj_ID must share the same output class)
- 14. class = object class (galaxy, star or quasar object): This is the target column.
- 15. redshift = redshift value based on the increase in wavelength
- 16. plate = plate ID, identifies each plate in SDSS
- 17. MJD = Modified Julian Date, used to indicate when a given piece of SDSS data was taken
- 18. fiber_ID = fiber ID that identifies the fiber that pointed the light at the focal plane in each observation


In [None]:
# Load the SDSS17 stellar classification table into `df` and preview five
# randomly sampled rows.
# NOTE(review): the absolute /kaggle/input path presumably only resolves
# inside a Kaggle kernel with this dataset attached — adjust when running
# elsewhere.
df = pd.read_csv('/kaggle/input/stellar-classification-dataset-sdss17/star_classification.csv')
df.sample(5)

In [None]:
# Descriptive statistics of the numeric columns, as a quick sanity check of
# value ranges before modelling.
df.describe()

In [None]:
# Count plot of the target column: one bar per object class, drawn on a
# freshly created 6x4 figure.
plt.figure(figsize=(6, 4))
class_hist_ax = sns.histplot(x='class', data=df)
class_hist_ax.set_title('Number of instances for each class of stellar objects')

In [None]:
# 2x2 grid of kernel-density estimates.
# Top row: overall distributions of 'redshift' and the 'r' band.
# Bottom row: 'delta' and 'alpha', split by target class via `hue`.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

kde_panels = [
    ((0, 0), {'x': 'redshift'}),
    ((0, 1), {'x': 'r'}),
    ((1, 0), {'x': 'delta', 'hue': 'class'}),
    ((1, 1), {'x': 'alpha', 'hue': 'class'}),
]
for grid_pos, kde_kwargs in kde_panels:
    sns.kdeplot(data=df, ax=ax[grid_pos], **kde_kwargs)


In [None]:
# Encode the 'class' target in numeric format.
# The column is overwritten in place, so from here on df['class'] holds
# integer codes instead of the original strings. According to the mapping
# documented in the ANN section of this notebook the codes are
# GALAXY -> 0, QSO -> 1, STAR -> 2. The fitted encoder stays available as
# `LE` for mapping codes back to names.
LE = LabelEncoder()
df['class'] = LE.fit_transform(df['class'])

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
# (including the numerically encoded target), leaving out 'rerun_ID'.
# NOTE(review): 'rerun_ID' is presumably excluded because it carries no
# variation — confirm against the data.
plt.figure(figsize=(15, 12))
corr_matrix = df.drop(columns=['rerun_ID']).corr()
sns.heatmap(corr_matrix, annot=True)

# Overview of ML methods: XGBClassifier, RandomForestClassifier, etc.

This section (and only this section) exploring various ML methods is based on the following notebook:
https://www.kaggle.com/code/geromeandrewducduc/stellar-entity-classification

Based on the results from the above notebook (section 'Feature Importance'), I used a reduced feature set consisting only of 'u, g, i, r, z, redshift' as the basis for the classification task. Also, no oversampling was performed (despite the class imbalance). The results obtained show that the various ML methods are able to classify the target reasonably well.

In [None]:
# Reduced feature matrix (five photometric bands plus redshift) and the
# integer-encoded target, both converted to plain NumPy arrays.
reduced_features = ['u', 'g', 'i', 'z', 'r', 'redshift']
Xr = np.array(df[reduced_features])
yr = np.array(df['class'])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible.
xr_train, xr_test, yr_train, yr_test = train_test_split(
    Xr, yr, test_size=0.2, random_state=210
)

# Show the shapes of the four resulting arrays.
tuple(part.shape for part in (xr_train, xr_test, yr_train, yr_test))

In [None]:
# Candidate estimators to benchmark. Each one is instantiated with its
# library-default hyper-parameters, in the order listed here.
classifier_types = (
    LogisticRegression,
    SVC,
    RandomForestClassifier,
    KNeighborsClassifier,
    GaussianNB,
    DecisionTreeClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    MLPClassifier,
    XGBClassifier,
)
classifiers = [make_estimator() for make_estimator in classifier_types]

In [None]:
# Benchmark every candidate: fit it behind a StandardScaler on the training
# split, score accuracy on the test split, and remember the best performer.
best_accuracy = 0.0
acc_list = []
classifier_list = []

for classifier in classifiers:
    model_name = type(classifier).__name__

    # Scaling lives inside the pipeline, so it is fitted on the training
    # data only and then re-applied to the test data at predict time.
    pipeline = make_pipeline(StandardScaler(), classifier)
    yr_pred = pipeline.fit(xr_train, yr_train).predict(xr_test)
    accuracy = accuracy_score(yr_test, yr_pred)

    classifier_list.append(model_name)
    acc_list.append(accuracy)

    if accuracy > best_accuracy:
        best_accuracy, best_model = accuracy, model_name

# Accuracy table indexed by model name, used by the plot and table below.
d = {'Model': classifier_list, 'Accuracy': acc_list}
cl_df = pd.DataFrame(d).set_index('Model')


In [None]:
# Bar chart of the benchmark accuracies, best model first.
plt.figure(figsize=(8, 6))
ranked_accuracy = cl_df['Accuracy'].sort_values(ascending=False)
ranked_accuracy.plot(kind='bar')
plt.ylabel('Accuracy')
plt.title('Accuracy of each classifier in descending order');

In [None]:
# Display the accuracy table (one row per model, indexed by model name).
cl_df

# RandomForestClassifier: Tuning for best model

In [None]:
# Hyper-parameter search for the random forest.
# The forest sits behind a StandardScaler in a pipeline, so grid keys carry
# the auto-generated step name as prefix ('randomforestclassifier__...').
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())

param_grid = dict(
    randomforestclassifier__n_estimators=[50, 100, 200],
    randomforestclassifier__max_depth=[None, 10, 20],
)

# 5-fold cross-validation over all 3 x 3 combinations, using every core.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(xr_train, yr_train)

best_params = grid_search.best_params_
print(best_params)

In [None]:
# Best model after tuning.
# Take the winning hyper-parameters straight from the grid search instead of
# hard-coding them: the forest is randomised, so the best combination can
# differ between runs and literal values would silently go stale.
# The search ran on a pipeline, so each key is prefixed with the step name
# ('randomforestclassifier__'); strip it to get plain constructor arguments.
rf_best_kwargs = {
    key.split('__', 1)[1]: value
    for key, value in grid_search.best_params_.items()
}
model = RandomForestClassifier(**rf_best_kwargs)
model.fit(xr_train, yr_train)

In [None]:
# Class predictions of the tuned random forest on the held-out test features.
preds = model.predict(xr_test)

In [None]:
# Confusion matrix of the random forest on the test set.
# from_predictions() creates its own figure unless it is handed an Axes, so
# a preceding bare plt.figure(figsize=...) only leaves an empty figure behind
# and the requested size is never applied. Create the sized Axes explicitly
# and pass it in.
# Label order follows the integer encoding: 0 = GALAXY, 1 = QSO, 2 = STAR.
fig, ax = plt.subplots(figsize=(6, 6))
ConfusionMatrixDisplay.from_predictions(yr_test, preds,
                                        display_labels=['GALAXY', 'QUASAR', 'STAR'],
                                        ax=ax)

In [None]:
# Test-set accuracy of the random forest; kept in `rf_acc` for the final
# DL-vs-ML comparison and echoed as the cell output.
rf_acc = accuracy_score(yr_test, preds)
rf_acc

# XGBClassifier: Tuning for best model

In [None]:
# Hyper-parameter grid for XGBClassifier.
# The candidates must be explicit lists: range(5, 10, 20) means
# start=5, stop=10, step=20 and therefore yields only the single value 5
# (likewise range(50, 100, 200) yields only 50), which would reduce the
# search to the learning rate alone.
parameters = {
    'max_depth': [5, 10, 20],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.01, 0.05]
}

# 5-fold cross-validation over all 3 x 3 x 3 = 27 combinations (135 fits),
# run on every available core. The classifier is searched on the raw
# features, without a scaling step.
grid_search_2 = GridSearchCV(
    XGBClassifier(),
    param_grid=parameters,
    cv=5, n_jobs=-1,
    verbose=True
)

grid_search_2.fit(xr_train, yr_train)

best_params_2 = grid_search_2.best_params_
print(best_params_2)

In [None]:
# Best model after tuning.
# Build the classifier from the hyper-parameters the grid search actually
# selected rather than from hard-coded literals, so this cell cannot drift
# out of sync with the search. The search ran on a bare XGBClassifier, so
# the keys of best_params_ are already valid constructor arguments.
model_2 = XGBClassifier(**grid_search_2.best_params_)
model_2.fit(xr_train, yr_train)

In [None]:
# Class predictions of the tuned XGBClassifier on the held-out test features.
preds_2 = model_2.predict(xr_test)

In [None]:
# Confusion matrix of the XGBClassifier on the test set.
# from_predictions() creates its own figure unless it is handed an Axes, so
# a preceding bare plt.figure(figsize=...) only leaves an empty figure behind
# and the requested size is never applied. Create the sized Axes explicitly
# and pass it in.
# Label order follows the integer encoding: 0 = GALAXY, 1 = QSO, 2 = STAR.
fig, ax = plt.subplots(figsize=(6, 6))
ConfusionMatrixDisplay.from_predictions(yr_test, preds_2,
                                        display_labels=['GALAXY', 'QUASAR', 'STAR'],
                                        ax=ax)

In [None]:
# Test-set accuracy of the XGBClassifier; kept in `xgb_acc` for the final
# DL-vs-ML comparison and echoed as the cell output.
xgb_acc = accuracy_score(yr_test, preds_2)
xgb_acc

# Classification with ANN

We need to re-encode the 'class' target in a format usable by the ANN/CNN built for the classification task.

The encoding is as follows:

    'GALAXY' -> [1,0,0] -> 0,  'STAR' -> [0,0,1] -> 2,  'QSO' -> [0,1,0] -> 1

In [None]:
# One-hot encode the integer class labels for the softmax networks below.
# to_categorical is imported from the public tensorflow.keras.utils
# namespace, matching the other Keras imports in this notebook; the private
# keras.utils.np_utils module is not available in newer Keras releases.
from tensorflow.keras.utils import to_categorical

# `yr` was built from the already label-encoded 'class' column, so this
# second encoder only re-derives the same integer codes; it is kept so that
# `l_encode` remains available.
l_encode = LabelEncoder()
l_encode.fit(yr)
Yr = l_encode.transform(yr)

# One row per sample, one column per class.
Yr = to_categorical(Yr)

In [None]:
# Re-split using the one-hot targets `Yr`, with the same test fraction and
# random_state as the earlier split of (Xr, yr).
# NOTE: this rebinds xr_train / xr_test / yr_train / yr_test, so from here on
# yr_train and yr_test hold one-hot rows rather than integer labels;
# re-running the earlier ML cells after this one would feed them one-hot
# targets.
xr_train, xr_test, yr_train, yr_test = train_test_split(Xr, Yr, test_size = 0.2, random_state = 210)
xr_train.shape, xr_test.shape, yr_train.shape, yr_test.shape

In [None]:
# Fully connected classifier: three ReLU hidden layers (64, 32, 15 units)
# followed by a 3-way softmax output, one unit per object class.
model_3 = Sequential()
model_3.add(Dense(64, input_dim=xr_train.shape[1], activation='relu'))
for hidden_units in (32, 15):
    model_3.add(Dense(hidden_units, activation="relu"))
model_3.add(Dense(3, activation='softmax'))

# Categorical cross-entropy pairs with the one-hot targets.
model_3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train silently for 40 epochs; the test split doubles as validation data,
# so the val_* curves are measured on the same rows used for the final score.
model_3.fit(xr_train, yr_train, validation_data=(xr_test, yr_test), epochs=40, verbose=0)

In [None]:
# Per-epoch training history of the ANN as a table, then one figure for the
# loss curves and one for the accuracy curves (training vs validation).
model_loss_acc = pd.DataFrame(model_3.history.history)
for curve_columns in (['loss', 'val_loss'], ['accuracy', 'val_accuracy']):
    model_loss_acc[curve_columns].plot()

In [None]:
# Softmax class probabilities of the ANN for the test set.
# Keep the raw probabilities: rounding them before argmax turns every row
# whose largest probability is <= 0.5 into all zeros, and argmax of an
# all-zero row is always 0, i.e. such samples would be counted as GALAXY
# no matter which class the network actually favoured.
preds_3 = model_3.predict(xr_test)

# Confusion matrix of arg-max predictions against arg-max of the one-hot
# truth. from_predictions() creates its own figure unless it is given an
# Axes, so the sized Axes is created here and passed in explicitly.
fig, ax = plt.subplots(figsize=(6, 6))
ConfusionMatrixDisplay.from_predictions(yr_test.argmax(axis=1), preds_3.argmax(axis=1),
                                        display_labels=['GALAXY', 'QUASAR', 'STAR'],
                                        ax=ax)

In [None]:
# Test-set accuracy of the ANN: both the one-hot truth and the network
# output are reduced to class indices with argmax before comparison.
# Kept in `ann_acc` for the final DL-vs-ML comparison.
ann_acc = accuracy_score(yr_test.argmax(axis = 1), preds_3.argmax(axis =1))
ann_acc 

# Classification with CNN

In [None]:
# 1-D convolutional classifier over the feature vector.
# Conv1D expects input laid out as (steps, channels), so each sample is
# reshaped from (n_features,) to (n_features, 1): the features form the
# sequence axis and there is a single channel. The input shape is declared
# on the first layer, which is where Sequential reads it; declaring it on a
# later layer has no effect, and a leading singleton axis is unnecessary.
n_features = xr_train.shape[1]

model_4 = Sequential(
    [Reshape((n_features, 1), input_shape=(n_features,)),
     # Two stacked width-2 convolutions with 10 filters each.
     Conv1D(filters=10, kernel_size=2, activation='relu'),
     Conv1D(filters=10, kernel_size=2, activation='relu'),
     Flatten(),
     # 3-way softmax, one unit per object class.
     Dense(3, activation='softmax')
     ])

# Categorical cross-entropy pairs with the one-hot targets.
model_4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train silently for 40 epochs; the test split doubles as validation data.
model_4.fit(xr_train, yr_train, validation_data=(xr_test, yr_test), epochs=40, verbose=0)

In [None]:
# Per-epoch training history of the CNN as a table, then one figure for the
# loss curves and one for the accuracy curves (training vs validation).
model4_loss_acc = pd.DataFrame(model_4.history.history)
for curve_columns in (['loss', 'val_loss'], ['accuracy', 'val_accuracy']):
    model4_loss_acc[curve_columns].plot()

In [None]:
# Softmax class probabilities of the CNN for the test set.
# Keep the raw probabilities: rounding them before argmax turns every row
# whose largest probability is <= 0.5 into all zeros, and argmax of an
# all-zero row is always 0, i.e. such samples would be counted as GALAXY
# no matter which class the network actually favoured.
preds_4 = model_4.predict(xr_test)

# Confusion matrix of arg-max predictions against arg-max of the one-hot
# truth. from_predictions() creates its own figure unless it is given an
# Axes, so the sized Axes is created here and passed in explicitly.
fig, ax = plt.subplots(figsize=(6, 6))
ConfusionMatrixDisplay.from_predictions(yr_test.argmax(axis=1), preds_4.argmax(axis=1),
                                        display_labels=['GALAXY', 'QUASAR', 'STAR'],
                                        ax=ax)

In [None]:
# Test-set accuracy of the CNN: both the one-hot truth and the network
# output are reduced to class indices with argmax before comparison.
# Kept in `cnn_acc` for the final DL-vs-ML comparison.
cnn_acc = accuracy_score(yr_test.argmax(axis = 1), preds_4.argmax(axis =1))
cnn_acc

# DL vs ML

In [None]:
# Collect the four headline accuracies into one table indexed by method name.
method_names = ['RandomForestClassifier', 'XGBClassifier', 'ANN', 'CNN']
method_scores = [rf_acc, xgb_acc, ann_acc, cnn_acc]

dc = {'Methods': method_names, 'Accuracy': method_scores}
dcf = pd.DataFrame(data=dc).set_index('Methods')

In [None]:
# Line plot of the four accuracies sorted from best to worst, with a dashed
# green vertical guide requested for each method name.
# NOTE(review): axvline is given string x-values; whether each guide lands on
# the tick carrying the same label depends on how matplotlib maps these
# strings to positions on this axis — verify visually.
f, ax = plt.subplots(figsize=(7, 5))
dcf['Accuracy'].sort_values(ascending=False).plot(ax=ax)

for method_label in ("CNN", "ANN", "XGBClassifier", "RandomForestClassifier"):
    ax.axvline(method_label, color="green", linestyle="dashed")

plt.title('Accuracy of various methods in descending order');