In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Read the star-type dataset; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='star_type_.csv')

In [4]:
# Target labels: the 'Star type' column on its own.
y = df['Star type']

# Feature matrix: every column except the target.
x = df.drop(columns=['Star type'])

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The rows are shuffled first, and the
# fixed seed keeps that shuffle identical from run to run.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

**with MinMaxScaler**

In [7]:
# Min-max scaling followed by L2-penalised logistic regression fitted with the
# SAG solver. The stopping tolerance is very small and is paired with a
# generous iteration cap.
steps = [
    ('scaler', MinMaxScaler()),
    (
        'classifier',
        LogisticRegression(
            solver='sag',
            penalty='l2',
            tol=1e-13,
            max_iter=5000,
            random_state=42,
        ),
    ),
]

pipeline = Pipeline(steps=steps)

In [8]:
# Fit the scaler and the classifier on the training split.
pipeline.fit(X=x_train, y=y_train)

In [9]:
from sklearn.metrics import classification_report

# Predict on the training set
y_pred_train = pipeline.predict(x_train)

# Convert the true labels to a NumPy array. np.asarray accepts both a pandas
# Series and an existing ndarray, so this cell can be re-run safely; the
# previous `y_train.values` raised AttributeError on the second execution
# because `y_train` had already been replaced by an ndarray.
y_train = np.asarray(y_train)

# Classification report
print('Training Classification Report:-')
print(classification_report(y_train, y_pred_train))

Training Classification Report:-
               precision    recall  f1-score   support

  Brown Dwarf       0.74      1.00      0.85        32
   Hypergiant       1.00      0.94      0.97        32
Main Sequence       0.93      0.84      0.89        32
    Red Dwarf       0.83      0.76      0.79        33
   Supergiant       1.00      1.00      1.00        29
  White Dwarf       1.00      0.91      0.95        34

     accuracy                           0.91       192
    macro avg       0.92      0.91      0.91       192
 weighted avg       0.92      0.91      0.91       192



**with StandardScaler**

In [12]:
# Standardisation (StandardScaler) followed by the same L2-penalised,
# SAG-solved logistic regression used with the min-max variant.
steps = [
    ('scaler', StandardScaler()),
    (
        'classifier',
        LogisticRegression(
            solver='sag',
            penalty='l2',
            tol=1e-13,
            max_iter=5000,
            random_state=42,
        ),
    ),
]

pipeline = Pipeline(steps=steps)

In [17]:
# Fit the scaler and the classifier on the training split.
pipeline.fit(X=x_train, y=y_train)

**Without any Scaling**

In [29]:
# Baseline without feature scaling: the pipeline holds only the classifier,
# configured with the same hyper-parameters as the scaled variants.
steps = [
    (
        'classifier',
        LogisticRegression(
            solver='sag',
            penalty='l2',
            tol=1e-13,
            max_iter=5000,
            random_state=42,
        ),
    ),
]

pipeline = Pipeline(steps=steps)

In [30]:
# Fit the classifier directly on the raw (unscaled) training features.
pipeline.fit(X=x_train, y=y_train)



In [32]:
from sklearn.metrics import classification_report

# Predict on the training set with the most recently fitted pipeline
y_pred_train = pipeline.predict(x_train)

# Classification report.
# Some classes may never be predicted by this model, which makes their
# precision undefined. zero_division=0 reports those scores as 0.0 explicitly
# instead of emitting an UndefinedMetricWarning for each affected average.
print('Training Classification Report:-')
print(classification_report(y_train, y_pred_train, zero_division=0))

Training Classification Report:-
               precision    recall  f1-score   support

  Brown Dwarf       0.00      0.00      0.00        32
   Hypergiant       0.86      0.94      0.90        32
Main Sequence       0.85      0.34      0.49        32
    Red Dwarf       0.00      0.00      0.00        33
   Supergiant       1.00      0.83      0.91        29
  White Dwarf       0.28      1.00      0.44        34

     accuracy                           0.52       192
    macro avg       0.50      0.52      0.46       192
 weighted avg       0.49      0.52      0.45       192



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


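**Highest training accuracy is obtained with StandardScaler (0.96, versus 0.91 with MinMaxScaler and 0.52 without scaling) ==> evaluate that pipeline on the held-out validation data**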

In [18]:
from sklearn.metrics import classification_report

# Re-create and re-fit the StandardScaler pipeline before evaluating it.
# When the notebook is run top to bottom, `pipeline` would otherwise still be
# the unscaled model fitted in the preceding section, so this report (and the
# validation report that follows) would describe the wrong model.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(penalty='l2',
                                      tol=1e-13,
                                      random_state=42,
                                      solver='sag',
                                      max_iter=5000)),
])
pipeline.fit(x_train, y_train)

# Predict on the training set
y_pred_train = pipeline.predict(x_train)

# Convert the true labels to a NumPy array. np.asarray is a no-op when
# `y_train` is already an ndarray (an earlier cell converts it), whereas
# `y_train.values` raised AttributeError in that situation.
y_train = np.asarray(y_train)

# Classification report
print('Training Classification Report:-')
print(classification_report(y_train, y_pred_train))

Training Classification Report:-
               precision    recall  f1-score   support

  Brown Dwarf       0.94      1.00      0.97        32
   Hypergiant       0.97      0.94      0.95        32
Main Sequence       0.93      0.88      0.90        32
    Red Dwarf       0.91      0.94      0.93        33
   Supergiant       1.00      1.00      1.00        29
  White Dwarf       1.00      1.00      1.00        34

     accuracy                           0.96       192
    macro avg       0.96      0.96      0.96       192
 weighted avg       0.96      0.96      0.96       192



In [23]:
# Score the held-out validation split with whichever pipeline was fitted most
# recently. The header now says "Validation": it previously read "Training"
# even though the report is computed on x_val / y_val.
y_pred_val = pipeline.predict(x_val)
print('Validation Classification Report:-')
print(classification_report(y_val, y_pred_val))


Training Classification Report:-
               precision    recall  f1-score   support

  Brown Dwarf       1.00      1.00      1.00         8
   Hypergiant       0.89      1.00      0.94         8
Main Sequence       1.00      0.75      0.86         8
    Red Dwarf       0.88      1.00      0.93         7
   Supergiant       1.00      1.00      1.00        11
  White Dwarf       1.00      1.00      1.00         6

     accuracy                           0.96        48
    macro avg       0.96      0.96      0.96        48
 weighted avg       0.96      0.96      0.96        48

