## Classifying Stellar Objects by spectral properties using supervised ML models

In [27]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import time 

In [28]:
# Import input dataset
stellar_df = pd.read_csv('Stellar_Classification/star_classification.csv')
stellar_df.head()

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842


In [29]:
# Check datatypes for variables
stellar_df.dtypes

obj_ID         float64
alpha          float64
delta          float64
u              float64
g              float64
r              float64
i              float64
z              float64
run_ID           int64
rerun_ID         int64
cam_col          int64
field_ID         int64
spec_obj_ID    float64
class           object
redshift       float64
plate            int64
MJD              int64
fiber_ID         int64
dtype: object

In [30]:
# Check dist of classifications
stellar_df["class"].value_counts()

GALAXY    59445
STAR      21594
QSO       18961
Name: class, dtype: int64

In [31]:
# Define features set

X = stellar_df.copy()
X = X.drop("class", axis=1)
X.head()

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,0.116123,6121,56187,842


In [32]:
# Define target vector

y = stellar_df["class"].values.ravel() # returns a contiguous flattened array
y[:5]

array(['GALAXY', 'GALAXY', 'GALAXY', 'GALAXY', 'GALAXY'], dtype=object)

In [33]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [34]:
# Check shapes of train and test sets
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(75000, 17)
(25000, 17)
(75000,)
(25000,)


In [35]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [36]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [37]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Random Forest Model

In [38]:
# Create classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [40]:
start = time.time()

# Fit model
rf_model = rf_model.fit(X_train_scaled, y_train)

## Calculate time run
end = time.time()
print(end - start)

97.4527645111084


In [42]:
start = time.time()

# Make predictions from testing set
predictions = rf_model.predict(X_test_scaled)
predictions

## Calculate time run
end = time.time()
print(end - start)

1.1072814464569092


In [43]:
# Evaluate Model

## Calculate confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["GALAXY Actual", "STAR Actual", "QSO Actual"], columns=["GALAXY Predicted", "STAR Predicted", "QSO Predicted"]
)

cm_df

Unnamed: 0,GALAXY Predicted,STAR Predicted,QSO Predicted
GALAXY Actual,14609,158,44
STAR Actual,326,4364,0
QSO Actual,3,0,5496


In [44]:
# Calculate Accuracy Score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.97876