In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import warnings 
import pickle 

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_csv('Stars.csv')

In [3]:
df.head(5)

Unnamed: 0,Temperature (K),Luminosity (L/Lo),Radius (R/Ro),Absolute magnitude (Mv),Star type,Star category,Star color,Spectral Class
0,3068,0.0024,0.17,16.12,0,Brown Dwarf,Red,M
1,3042,0.0005,0.1542,16.6,0,Brown Dwarf,Red,M
2,2600,0.0003,0.102,18.7,0,Brown Dwarf,Red,M
3,2800,0.0002,0.16,16.65,0,Brown Dwarf,Red,M
4,1939,0.000138,0.103,20.06,0,Brown Dwarf,Red,M


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Temperature (K)          240 non-null    int64  
 1   Luminosity (L/Lo)        240 non-null    float64
 2   Radius (R/Ro)            240 non-null    float64
 3   Absolute magnitude (Mv)  240 non-null    float64
 4   Star type                240 non-null    int64  
 5   Star category            240 non-null    object 
 6   Star color               240 non-null    object 
 7   Spectral Class           240 non-null    object 
dtypes: float64(3), int64(2), object(3)
memory usage: 15.1+ KB


In [5]:
df.sum().isnull()

Temperature (K)            False
Luminosity (L/Lo)          False
Radius (R/Ro)              False
Absolute magnitude (Mv)    False
Star type                  False
Star category              False
Star color                 False
Spectral Class             False
dtype: bool

In [6]:
df.describe()

Unnamed: 0,Temperature (K),Luminosity (L/Lo),Radius (R/Ro),Absolute magnitude (Mv),Star type
count,240.0,240.0,240.0,240.0,240.0
mean,10497.4625,107188.361635,237.157781,4.382396,2.5
std,9552.425037,179432.24494,517.155763,10.532512,1.711394
min,1939.0,8e-05,0.0084,-11.92,0.0
25%,3344.25,0.000865,0.10275,-6.2325,1.0
50%,5776.0,0.0705,0.7625,8.313,2.5
75%,15055.5,198050.0,42.75,13.6975,4.0
max,40000.0,849420.0,1948.5,20.06,5.0


In [7]:
df.columns

Index(['Temperature (K)', 'Luminosity (L/Lo)', 'Radius (R/Ro)',
       'Absolute magnitude (Mv)', 'Star type', 'Star category', 'Star color',
       'Spectral Class'],
      dtype='object')

In [8]:
df['Spectral Class'].value_counts()

Spectral Class
M    111
B     46
O     40
A     19
F     17
K      6
G      1
Name: count, dtype: int64

In [9]:
df.replace({'Spectral Class':{'M':0, 'A':1, 'B':1, 'F':1, 'O':1, 'K':1, 'G':1 }}, inplace=True)

In [10]:
# Number of categories
df['Star type'].value_counts()

Star type
0    40
1    40
2    40
3    40
4    40
5    40
Name: count, dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Temperature (K)          240 non-null    int64  
 1   Luminosity (L/Lo)        240 non-null    float64
 2   Radius (R/Ro)            240 non-null    float64
 3   Absolute magnitude (Mv)  240 non-null    float64
 4   Star type                240 non-null    int64  
 5   Star category            240 non-null    object 
 6   Star color               240 non-null    object 
 7   Spectral Class           240 non-null    int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 15.1+ KB


In [12]:
df['Star category'].unique()

array(['Brown Dwarf', 'Red Dwarf', 'White Dwarf', 'Main Sequence',
       'Supergiant', 'Hypergiant'], dtype=object)

In [13]:
df['Star color'].unique()

array(['Red', 'Blue White', 'White', 'Yellowish White', 'Blue white',
       'Pale yellow orange', 'Blue', 'Blue-white', 'Whitish',
       'yellow-white', 'Orange', 'White-Yellow', 'white', 'Blue ',
       'yellowish', 'Yellowish', 'Orange-Red', 'Blue white ',
       'Blue-White'], dtype=object)

In [14]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)  # Dùng sparse_output thay vì sparse
encoded_features = encoder.fit_transform(df[['Star category', 'Star color']])

# Chuyển về DataFrame và gộp vào df gốc
df_encoded = df.drop(columns=['Star category', 'Star color'])
df_encoded = pd.concat([df_encoded, pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())], axis=1)

df_encoded.head()


Unnamed: 0,Temperature (K),Luminosity (L/Lo),Radius (R/Ro),Absolute magnitude (Mv),Star type,Spectral Class,Star category_Brown Dwarf,Star category_Hypergiant,Star category_Main Sequence,Star category_Red Dwarf,...,Star color_Pale yellow orange,Star color_Red,Star color_White,Star color_White-Yellow,Star color_Whitish,Star color_Yellowish,Star color_Yellowish White,Star color_white,Star color_yellow-white,Star color_yellowish
0,3068,0.0024,0.17,16.12,0,0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3042,0.0005,0.1542,16.6,0,0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2600,0.0003,0.102,18.7,0,0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2800,0.0002,0.16,16.65,0,0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1939,0.000138,0.103,20.06,0,0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Temperature (K)                240 non-null    int64  
 1   Luminosity (L/Lo)              240 non-null    float64
 2   Radius (R/Ro)                  240 non-null    float64
 3   Absolute magnitude (Mv)        240 non-null    float64
 4   Star type                      240 non-null    int64  
 5   Spectral Class                 240 non-null    int64  
 6   Star category_Brown Dwarf      240 non-null    float64
 7   Star category_Hypergiant       240 non-null    float64
 8   Star category_Main Sequence    240 non-null    float64
 9   Star category_Red Dwarf        240 non-null    float64
 10  Star category_Supergiant       240 non-null    float64
 11  Star category_White Dwarf      240 non-null    float64
 12  Star color_Blue                240 non-null    flo

In [16]:
## Define X, y feature 
X = df_encoded.drop(columns=['Spectral Class'])
y = df_encoded['Spectral Class']

In [17]:
### Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Select model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [19]:
# Train model
model.fit(X_train,y_train)

In [20]:
# Predict
y_pred = model.predict(X_test)

In [21]:
y_pred

array([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1])

In [22]:
# Import function
from sklearn.metrics import confusion_matrix, classification_report
cm=confusion_matrix(y_test,y_pred)
cr=classification_report(y_test,y_pred)
print(cm)
print(cr)

[[21  0]
 [ 1 26]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       1.00      0.96      0.98        27

    accuracy                           0.98        48
   macro avg       0.98      0.98      0.98        48
weighted avg       0.98      0.98      0.98        48



In [23]:
### Apply multiple model in classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [24]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

for name, model in models.items():
    try:
        print(f"Training model: {name}")
        model.fit(X_train, y_train)  # Train model

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        # Training set performance
        model_train_accuracy = accuracy_score(y_train, y_train_pred)
        model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
        model_train_precision = precision_score(y_train, y_train_pred, average='weighted')
        model_train_recall = recall_score(y_train, y_train_pred, average='weighted')
        model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)
        
        # Test set performance
        model_test_accuracy = accuracy_score(y_test, y_test_pred)
        model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
        model_test_precision = precision_score(y_test, y_test_pred, average='weighted')
        model_test_recall = recall_score(y_test, y_test_pred, average='weighted')
        model_test_rocauc_score = roc_auc_score(y_test, y_test_pred)
        
        print(f"Model: {name}")
        print(f"Train Accuracy: {model_train_accuracy:.4f} | Test Accuracy: {model_test_accuracy:.4f}")
        print(f"Train F1-score: {model_train_f1:.4f} | Test F1-score: {model_test_f1:.4f}")
        print(f"Train Precision: {model_train_precision:.4f} | Test Precision: {model_test_precision:.4f}")
        print(f"Train Recall: {model_train_recall:.4f} | Test Recall: {model_test_recall:.4f}")
        print(f"Train ROC-AUC: {model_train_rocauc_score:.4f} | Test ROC-AUC: {model_test_rocauc_score:.4f}")
        print("=" * 50)

    except Exception as e:
        print(f"Model {name} error: {e}")
        print("-" * 50)

Training model: Logistic Regression
Model: Logistic Regression
Train Accuracy: 0.9844 | Test Accuracy: 0.9792
Train F1-score: 0.9844 | Test F1-score: 0.9792
Train Precision: 0.9849 | Test Precision: 0.9801
Train Recall: 0.9844 | Test Recall: 0.9792
Train ROC-AUC: 0.9853 | Test ROC-AUC: 0.9815
Training model: Random Forest
Model: Random Forest
Train Accuracy: 1.0000 | Test Accuracy: 1.0000
Train F1-score: 1.0000 | Test F1-score: 1.0000
Train Precision: 1.0000 | Test Precision: 1.0000
Train Recall: 1.0000 | Test Recall: 1.0000
Train ROC-AUC: 1.0000 | Test ROC-AUC: 1.0000
Training model: Gradient Boosting
Model: Gradient Boosting
Train Accuracy: 1.0000 | Test Accuracy: 1.0000
Train F1-score: 1.0000 | Test F1-score: 1.0000
Train Precision: 1.0000 | Test Precision: 1.0000
Train Recall: 1.0000 | Test Recall: 1.0000
Train ROC-AUC: 1.0000 | Test ROC-AUC: 1.0000
Training model: SVM
Model: SVM
Train Accuracy: 0.5938 | Test Accuracy: 0.6458
Train F1-score: 0.5778 | Test F1-score: 0.6373
Train Pre

In [25]:
## Hyperparameter Training

# Random Forest
rf_params = {
    "n_estimators": [100, 200, 500],
    "max_depth": [5, 10, None],
    "min_samples_split": [2, 5, 10],
    "criterion": ["gini", "entropy"]
}

# Logistic Regression
lr_params = {
    "penalty": ["l1", "l2"],
    "C": [0.1, 1, 10],
    "solver": ["liblinear", "lbfgs"]
}

# Support Vector Machine (SVC)
svc_params = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf", "poly"],
    "gamma": ["scale", "auto"]
}

# K-Nearest Neighbors (KNN)
knn_params = {
    "n_neighbors": [3, 5, 7, 9],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"]
}

In [26]:
# Models list for Hyperparameter tuning
randomcv_models = [
    ("RandomForest", RandomForestClassifier(), rf_params),
    ("LogisticRegression", LogisticRegression(), lr_params),
    ("SVM", SVC(), svc_params),
    ("KNN", KNeighborsClassifier(), knn_params),
]

In [27]:
randomcv_models

[('RandomForest',
  RandomForestClassifier(),
  {'n_estimators': [100, 200, 500],
   'max_depth': [5, 10, None],
   'min_samples_split': [2, 5, 10],
   'criterion': ['gini', 'entropy']}),
 ('LogisticRegression',
  LogisticRegression(),
  {'penalty': ['l1', 'l2'],
   'C': [0.1, 1, 10],
   'solver': ['liblinear', 'lbfgs']}),
 ('SVM',
  SVC(),
  {'C': [0.1, 1, 10],
   'kernel': ['linear', 'rbf', 'poly'],
   'gamma': ['scale', 'auto']}),
 ('KNN',
  KNeighborsClassifier(),
  {'n_neighbors': [3, 5, 7, 9],
   'weights': ['uniform', 'distance'],
   'metric': ['euclidean', 'manhattan']})]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 54 candidates, totalling 162 fits


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 18 candidates, totalling 54 fits
