In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("Stars.csv")

In [3]:
df.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.0024,0.17,16.12,Red,M,0
1,3042,0.0005,0.1542,16.6,Red,M,0
2,2600,0.0003,0.102,18.7,Red,M,0
3,2800,0.0002,0.16,16.65,Red,M,0
4,1939,0.000138,0.103,20.06,Red,M,0


In [4]:
def color_name_fix(color):
    if "orange" in color.lower():
        color = "Orange"
    elif "yellow" in color.lower():
        color = "Yellowish"
    elif color.lower() == "whitish":
        color = "White"
    else:
        color = color.replace("-"," ")
        color = color.title()
        color = color.replace(" ","-")
    return color
df["Color"] = df["Color"].apply(color_name_fix)

In [5]:
non_numerical = []
for col in df.columns:
    if df[col].dtypes == "object":
        non_numerical.append(col)

In [6]:
class customEncoder:
    
    _type_decoder = {
        0 : "Red Dwarf",
        1 : "Brown Dwarf", 
        2 : "White Dwarf", 
        3 : "Main Sequence",
        4 : "Super Giants" ,
        5 : "Hyper Giants" 
    }
    
    def __init__(self):
        self.enc_map = dict()
        self.decoded_data = None
        self._encoded = False
    def fit(self,data):
        self.data = data
    def encode(self):
        if not self._encoded:
            self._encoded = True
            for i,col in enumerate(self.data.columns):
                if self.data[col].dtypes == "object":
                    n_ins = self.data[col].nunique()
                    dict_to_map = dict(zip(self.data[col].unique(), range(n_ins)))
                    self.enc_map[col] = dict_to_map
                    self.data[col] = self.data[col].map(dict_to_map)
                    
    def fit_encode(self,data):
        self.fit(data)
        self.encode()
        
    def decode_target(self):
        if not self.decoded_data:
            self.decoded_data = self.data.copy()
            self.decoded_data["Type"] = self.decoded_data["Type"].map(self._type_decoder)
    
    def decode(self):
        if self._encoded:
            for col in self.enc_map.keys():
                decode_map = dict(zip(self.enc_map[col].values(),self.enc_map[col].keys()))
                self.data[col] = self.data[col].map(decode_map)
            self._encoded = False

In [7]:
cenc = customEncoder()
cenc.fit_encode(df)

In [8]:
df.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.0024,0.17,16.12,0,0,0
1,3042,0.0005,0.1542,16.6,0,0,0
2,2600,0.0003,0.102,18.7,0,0,0
3,2800,0.0002,0.16,16.65,0,0,0
4,1939,0.000138,0.103,20.06,0,0,0


In [9]:
cenc.decode()

In [10]:
df.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.0024,0.17,16.12,Red,M,0
1,3042,0.0005,0.1542,16.6,Red,M,0
2,2600,0.0003,0.102,18.7,Red,M,0
3,2800,0.0002,0.16,16.65,Red,M,0
4,1939,0.000138,0.103,20.06,Red,M,0


In [11]:
cenc.decode_target()
cenc.decoded_data.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.0024,0.17,16.12,Red,M,Red Dwarf
1,3042,0.0005,0.1542,16.6,Red,M,Red Dwarf
2,2600,0.0003,0.102,18.7,Red,M,Red Dwarf
3,2800,0.0002,0.16,16.65,Red,M,Red Dwarf
4,1939,0.000138,0.103,20.06,Red,M,Red Dwarf


In [12]:
cenc.encode()

In [13]:
df.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.0024,0.17,16.12,0,0,0
1,3042,0.0005,0.1542,16.6,0,0,0
2,2600,0.0003,0.102,18.7,0,0,0
3,2800,0.0002,0.16,16.65,0,0,0
4,1939,0.000138,0.103,20.06,0,0,0


In [14]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score
from sklearn.preprocessing import StandardScaler

In [15]:
std = StandardScaler()
df[["Temperature","L","R","A_M"]] = std.fit_transform(df[["Temperature","L","R","A_M"]])

In [16]:
df.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,-0.779382,-0.598624,-0.45921,1.116745,0,0,0
1,-0.78211,-0.598624,-0.459241,1.162414,0,0,0
2,-0.828477,-0.598624,-0.459342,1.362213,0,0,0
3,-0.807496,-0.598624,-0.459229,1.167171,0,0,0
4,-0.897819,-0.598624,-0.45934,1.491607,0,0,0


In [17]:
x_train, x_test, y_train, y_test = train_test_split(df[df.columns[:-1]],df["Type"], test_size = 0.3)

In [18]:
models = [("Random Forest Classifier",RandomForestClassifier()), 
    ("Ada Boost Classifier",AdaBoostClassifier()),
    ("Bagging Classifier",BaggingClassifier()),
    ("Gradient Boosting Classifier",GradientBoostingClassifier()),
    ("Decision Tree Classifier",DecisionTreeClassifier())]

In [19]:
y_test.head()

86     2
174    5
239    5
16     1
158    3
Name: Type, dtype: int64

In [20]:
trained_models = dict()
for name,model in models:
    model.fit(x_train, y_train)
    trained_models[name] = model
    preds = model.predict(x_test)
    print("----------------------------------")
    print(name)
    print(classification_report(y_test,preds))
    print("----------------------------------")
    print(confusion_matrix(y_test,preds))

----------------------------------
Random Forest Classifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        11
           3       1.00      1.00      1.00        12
           4       1.00      1.00      1.00        11
           5       1.00      1.00      1.00        11

    accuracy                           1.00        72
   macro avg       1.00      1.00      1.00        72
weighted avg       1.00      1.00      1.00        72

----------------------------------
[[13  0  0  0  0  0]
 [ 0 14  0  0  0  0]
 [ 0  0 11  0  0  0]
 [ 0  0  0 12  0  0]
 [ 0  0  0  0 11  0]
 [ 0  0  0  0  0 11]]
----------------------------------
Ada Boost Classifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.00      0.00      0.00        14
           2    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


----------------------------------
Gradient Boosting Classifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        11
           3       1.00      1.00      1.00        12
           4       1.00      1.00      1.00        11
           5       1.00      1.00      1.00        11

    accuracy                           1.00        72
   macro avg       1.00      1.00      1.00        72
weighted avg       1.00      1.00      1.00        72

----------------------------------
[[13  0  0  0  0  0]
 [ 0 14  0  0  0  0]
 [ 0  0 11  0  0  0]
 [ 0  0  0 12  0  0]
 [ 0  0  0  0 11  0]
 [ 0  0  0  0  0 11]]
----------------------------------
Decision Tree Classifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        14
        