In [1]:
import pandas as pd
import numpy as np
from MLP import *
from sklearn.preprocessing import MinMaxScaler

The dataset was obtained from [Kaggle](https://www.kaggle.com/datasets/brsdincer/star-type-classification)

## Data Preparation

In [2]:
# Read the raw star catalogue from disk, then print its schema
# (column names, non-null counts and dtypes).
df = pd.read_csv(filepath_or_buffer='stars.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Temperature     240 non-null    int64  
 1   L               240 non-null    float64
 2   R               240 non-null    float64
 3   A_M             240 non-null    float64
 4   Color           240 non-null    object 
 5   Spectral_Class  240 non-null    object 
 6   Type            240 non-null    int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 13.3+ KB


I will first standardize the column names into snake case so we don't get any unexpected errors from accessing a column titled with weird spacing/characters.

In [3]:
# Normalise every header: trim surrounding whitespace, lower-case it and
# turn inner spaces into underscores.
df = df.rename(columns=lambda header: header.strip().lower().replace(' ', '_'))
df.head()

Unnamed: 0,temperature,l,r,a_m,color,spectral_class,type
0,3068,0.0024,0.17,16.12,Red,M,0
1,3042,0.0005,0.1542,16.6,Red,M,0
2,2600,0.0003,0.102,18.7,Red,M,0
3,2800,0.0002,0.16,16.65,Red,M,0
4,1939,0.000138,0.103,20.06,Red,M,0


The data appears to be sorted, so let's shuffle the rows now.

In [4]:
# Draw every row exactly once in a seeded random order, then renumber the
# index so it runs from 0 again.
shuffled_rows = df.sample(frac=1, random_state=42)
df = shuffled_rows.reset_index(drop=True)
df.head()

Unnamed: 0,temperature,l,r,a_m,color,spectral_class,type
0,16500,0.013,0.014,11.89,Blue White,B,2
1,2637,0.00073,0.127,17.22,Red,M,0
2,4980,0.357,1.13,4.78,Yellowish,K,3
3,33421,352000.0,67.0,-5.79,Blue,O,4
4,11096,112000.0,12.0,-5.91,Blue,O,4


Next, we want to turn the qualitative columns `color` and `spectral_class` into quantitative variables. Starting off with the `color` column, I'll take a look at the values just to get an idea.

In [5]:
# List the distinct colour labels in order of first appearance.
pd.unique(df['color'])

array(['Blue White', 'Red', 'Yellowish', 'Blue', 'Orange', 'Blue-white',
       'White', 'Orange-Red', 'Yellowish White', 'yellow-white',
       'Pale yellow orange', 'white', 'Whitish', 'Blue white',
       'White-Yellow', 'yellowish', 'Blue-White'], dtype=object)

We have some values like 'Blue White' and 'Blue white' getting flagged as different values because of the formatting. I'll go ahead and standardize them.

In [6]:
# Normalise the colour labels so spelling variants collapse into one value.
# Whitespace is stripped *before* capitalising: `str.capitalize` upper-cases
# only the very first character, so a value with leading whitespace
# (e.g. ' blue') would otherwise stay lower-case once stripped and survive
# as a separate category.
df['color'] = (
    df['color']
    .str.strip()
    .str.capitalize()
    .str.replace(' ', '-', regex=False)  # literal space, not a regex pattern
)
df['color'].unique()

array(['Blue-white', 'Red', 'Yellowish', 'Blue', 'Orange', 'White',
       'Orange-red', 'Yellowish-white', 'Yellow-white',
       'Pale-yellow-orange', 'Whitish', 'White-yellow'], dtype=object)

We take a look at `spectral_class` to see if we need to do the same. It turns out that we don't.

In [7]:
# List the distinct spectral-class labels in order of first appearance.
pd.unique(df['spectral_class'])

array(['B', 'M', 'K', 'O', 'A', 'F', 'G'], dtype=object)

Next, we add a new column `spectral_class_num` to assign the `spectral_class` numerical values.

In [8]:
# Encode each spectral class as its integer category code.
# NOTE: the codes follow the sorted (alphabetical) order of the labels,
# not any physical ordering of the classes.
df['spectral_class_num'] = pd.Categorical(df['spectral_class']).codes
df.head()

Unnamed: 0,temperature,l,r,a_m,color,spectral_class,type,spectral_class_num
0,16500,0.013,0.014,11.89,Blue-white,B,2,1
1,2637,0.00073,0.127,17.22,Red,M,0,5
2,4980,0.357,1.13,4.78,Yellowish,K,3,4
3,33421,352000.0,67.0,-5.79,Blue,O,4,6
4,11096,112000.0,12.0,-5.91,Blue,O,4,6


We do the same for `color`.

In [9]:
# Encode each normalised colour label as its integer category code
# (codes follow the sorted order of the labels).
df['color_num'] = pd.Categorical(df['color']).codes
df.head()

Unnamed: 0,temperature,l,r,a_m,color,spectral_class,type,spectral_class_num,color_num
0,16500,0.013,0.014,11.89,Blue-white,B,2,1,1
1,2637,0.00073,0.127,17.22,Red,M,0,5,5
2,4980,0.357,1.13,4.78,Yellowish,K,3,4,10
3,33421,352000.0,67.0,-5.79,Blue,O,4,6,0
4,11096,112000.0,12.0,-5.91,Blue,O,4,6,0


Next, we perform one-hot encoding for the target variable `type`.

In [10]:
# Expand the integer class label into one float indicator column per class
# (`type_0`, `type_1`, ...) and append those columns to the frame.
one_hot = pd.get_dummies(
    df['type'],
    prefix='type',
    dtype=float,
)
df = pd.concat(objs=[df, one_hot], axis='columns')
df.head()

Unnamed: 0,temperature,l,r,a_m,color,spectral_class,type,spectral_class_num,color_num,type_0,type_1,type_2,type_3,type_4,type_5
0,16500,0.013,0.014,11.89,Blue-white,B,2,1,1,0.0,0.0,1.0,0.0,0.0,0.0
1,2637,0.00073,0.127,17.22,Red,M,0,5,5,1.0,0.0,0.0,0.0,0.0,0.0
2,4980,0.357,1.13,4.78,Yellowish,K,3,4,10,0.0,0.0,0.0,1.0,0.0,0.0
3,33421,352000.0,67.0,-5.79,Blue,O,4,6,0,0.0,0.0,0.0,0.0,1.0,0.0
4,11096,112000.0,12.0,-5.91,Blue,O,4,6,0,0.0,0.0,0.0,0.0,1.0,0.0


Having created quantitative columns out of the qualitative features and one-hot encoded the target, we can now drop the original `color`, `spectral_class`, and `type` columns.

In [11]:
# The encoded replacements now exist, so remove the original columns.
df = df.drop(columns=['color', 'spectral_class', 'type'])

In [12]:
# Preview the first five rows of the fully numeric frame.
df.head(n=5)

Unnamed: 0,temperature,l,r,a_m,spectral_class_num,color_num,type_0,type_1,type_2,type_3,type_4,type_5
0,16500,0.013,0.014,11.89,1,1,0.0,0.0,1.0,0.0,0.0,0.0
1,2637,0.00073,0.127,17.22,5,5,1.0,0.0,0.0,0.0,0.0,0.0
2,4980,0.357,1.13,4.78,4,10,0.0,0.0,0.0,1.0,0.0,0.0
3,33421,352000.0,67.0,-5.79,6,0,0.0,0.0,0.0,0.0,1.0,0.0
4,11096,112000.0,12.0,-5.91,6,0,0.0,0.0,0.0,0.0,1.0,0.0


Finally, we extract the features and target variables, then convert them to 2D numpy arrays.

In [13]:
# Feature matrix: every column from `temperature` through `color_num`
# (label slices include both end points), as a 2-D float array.
feature_frame = df.loc[:, 'temperature':'color_num']
X = feature_frame.to_numpy(dtype=float)
X

array([[ 1.6500e+04,  1.3000e-02,  1.4000e-02,  1.1890e+01,  1.0000e+00,
         1.0000e+00],
       [ 2.6370e+03,  7.3000e-04,  1.2700e-01,  1.7220e+01,  5.0000e+00,
         5.0000e+00],
       [ 4.9800e+03,  3.5700e-01,  1.1300e+00,  4.7800e+00,  4.0000e+00,
         1.0000e+01],
       ...,
       [ 4.0770e+03,  8.5000e-02,  7.9500e-01,  6.2280e+00,  4.0000e+00,
         1.0000e+01],
       [ 2.4490e+04,  2.4849e+05,  1.1345e+03, -8.2400e+00,  1.0000e+00,
         1.0000e+00],
       [ 2.3000e+04,  1.2700e+05,  3.6000e+01, -5.7600e+00,  6.0000e+00,
         0.0000e+00]])

In [14]:
# Target matrix: the one-hot columns, i.e. everything from `type_0` to the
# last column of the frame, as a 2-D float array.
target_frame = df.loc[:, 'type_0':]
Y = target_frame.to_numpy(dtype=float)
Y

array([[0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.]])

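We then scale the feature variables because their ranges differ too much. For example, temperature is in the thousands while relative luminosity can be in the thousandths. We first compute how many rows make up a 70% training split, so that the scaler can be fit on the training rows only and then applied to the test rows.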

In [15]:
# Number of rows that make up 70% of the dataset.
0.7 * len(X)

168.0

In [16]:
# Use the first 70% of the (already shuffled) rows for training and the rest
# for testing. The split index is derived from the data instead of being
# hard-coded, so it stays correct if the number of rows changes.
TRAIN_FRACTION = 0.7
n_train = int(round(X.shape[0] * TRAIN_FRACTION))

# Fit the scaler on the training rows only and reuse the fitted scaler on the
# test rows, so no test-set statistics leak into training.
scaler = MinMaxScaler()
X_train, X_test = scaler.fit_transform(X[:n_train]), scaler.transform(X[n_train:])
Y_train, Y_test = Y[:n_train], Y[n_train:]

## Training

In [17]:
# Seed NumPy's global RNG before the network is constructed
# (presumably the layers draw their initial weights from it; confirm in MLP.py).
np.random.seed(15)

# Layers are built in the same order and with the same positional arguments.
# NOTE(review): with 6 features and 6 classes the two integers look like
# (units, inputs); confirm against the Layer signature.
layer_stack = [
    Layer(15, 6, activation='relu'),
    Layer(15, 15, activation='relu'),
    Layer(15, 15, activation='relu'),
    Layer(6, 15, activation='softmax'),
]
mlp = MLP(layer_stack, 0.0001, lossFunction='ce')

# Train on 70% of the dataset
mlp.train(X_train, Y_train, 800)

# Evaluate on the held-out rows.
test_acc = mlp.test(X_test, Y_test)

print(f"Testing Data Accuracy: {test_acc}")

Testing Data Accuracy: 0.9861111111111112


Our trained model achieves 98.61% classification accuracy on the test set.