# Importing the dataset

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input directory,
# in the same directory-by-directory order that os.walk yields them.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns

# Analysis of the problem statement

Let's first understand the dataset provided; using that dataset, we will try to frame the problem above in machine learning terms.

In [None]:
# Location of the star dataset CSV, relative to the notebook's working directory.
data_path = '../input/star-dataset/6 class csv.csv'
# Load the CSV into a DataFrame; every later cell works from `dataset`.
dataset = pd.read_csv(data_path)


In [None]:
# Preview the first few rows of the loaded table.
dataset.head()

Let's understand the data above. It has 7 fields:

* **Temperature (K)**: This column consists of the Surface temperatures of several stars
* **Luminosity**: This column consists of the Luminosity of several stars calculated with respect to sun
* **Radius**: This column consists of the Radius of several stars calculated with respect to sun
* **Absolute magnitude**: This column consists of the Absolute Visual magnitude(Mv) of several stars
* **Star type**: This column is the output class (6 classes ranging from 0-5).
* **Star color**: This column contains the info about the colors of each star after Spectral Analysis
* **Spectral Class**: This column contains info about the spectral classes of each star (O, B, A, F, G, K, M).

The output is discrete, so this is a **classification problem**; and since there are more than two possible discrete outputs, we can call it a **multi-class** classification problem.

In [None]:
# Print a concise summary of the DataFrame (per-column dtype and non-null count).
dataset.info()

In [None]:
# Show descriptive statistics (count, mean, spread, quantiles) for the table.
dataset.describe()

In [None]:
# Check the dimensions of the data as (rows, columns)
dataset.shape

In [None]:
# Check the column names in the above dataset
dataset.columns

In [None]:
# List the distinct values present in the 'Spectral Class' column.
dataset['Spectral Class'].unique()

In [None]:
# List the distinct values present in the 'Star color' column.
dataset['Star color'].unique()

In [None]:
# List the distinct values present in the 'Star type' (target) column.
dataset['Star type'].unique()

In [None]:
# Work on a fresh copy of the frame before rewriting one of its columns.
dataset = dataset.copy()

# Canonical colour name -> raw spellings that should collapse onto it
# (differences are only in case, hyphenation, word order or trailing spaces).
canonical_colors = {
    'Blue': ['Blue '],
    'Blue White': ['Blue white', 'Blue-white', 'Blue white ', 'Blue-White'],
    'White': ['white'],
    'Yellowish White': ['yellow-white', 'White-Yellow'],
    'Yellowish': ['yellowish'],
}

# Invert into the {raw spelling: canonical name} lookup that Series.replace expects.
color_replacing = {
    variant: canonical
    for canonical, variants in canonical_colors.items()
    for variant in variants
}

# Values without an entry in the lookup are left unchanged by replace().
star_color = dataset['Star color']
dataset['Star color'] = star_color.replace(color_replacing)
 

In [None]:
# Re-check the distinct 'Star color' values to confirm the spelling variants were merged.
dataset['Star color'].unique()

It's very important to look for missing values; otherwise they can cause problems in the final analysis.

In [None]:
# Show every row that contains at least one missing value
# (an empty result means no row has a missing value).
dataset[dataset.isnull().any(axis=1)]

Split off the `Star type` column as the label vector `y`; the remaining columns form the feature matrix `X`.

In [None]:
# Select the target by name rather than by position, so a reordered CSV cannot
# silently turn a different column into the label vector.
target_column = 'Star type'
y = dataset[target_column].values

# Derive the feature matrix from a new frame and leave `dataset` untouched.
# Overwriting `dataset` made this cell non-idempotent: a second run would first
# rebind `y` to whatever column had moved into position 4 and then raise KeyError
# on the drop.
features = dataset.drop(columns=[target_column])
# Remaining columns keep their original order, so 'Star color' and
# 'Spectral Class' are the last two columns of X.
X = features.values

# Encoding categorical data

In [None]:
# One-hot encode the two categorical feature columns in two passes.
#
# sparse_threshold=0 forces a dense ndarray result. With the default threshold the
# transformer may return a SciPy sparse matrix when many one-hot columns are
# produced, and np.array(<sparse matrix>) does not yield a usable 2-D array.

# Pass 1: 'Star color' sits at index 4 of the raw feature matrix.
# NOTE(review): assumes X still has the raw 6-column layout (color at 4, spectral
# class last) — this cell must run exactly once after X is built.
star_color_col = 4
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [star_color_col])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

# Pass 2: ColumnTransformer emits the encoded columns first and then the
# passthrough columns in their original order, so 'Spectral Class' (the last raw
# column) is now the last column. Deriving its index from the matrix replaces the
# previous hard-coded 14, which was only correct for exactly 10 distinct colours.
spectral_class_col = X.shape[1] - 1
ct_1 = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [spectral_class_col])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct_1.fit_transform(X))


In [None]:
# Print the feature matrix to inspect its current contents.
print(X)

In [None]:
# Dimensions of the feature matrix as (samples, features).
X.shape

# Splitting the dataset into train and test sets

In [None]:
# Hold out 25% of the samples as a test set; the fixed random_state keeps the
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only,
# so no information from the test split leaks into the scaling parameters.
sc = StandardScaler().fit(X_train)

# Apply that same fitted scaling to both splits.
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Training XGBoost on the Training set


In [None]:
# Build an XGBoost classifier with its default hyperparameters and fit it on the
# training split.
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predicting on the Test set

In [None]:
# Predict a class label for every sample in the held-out test split.
y_pred = classifier.predict(X_test)

# Applying k-Fold Cross Validation

In [None]:
# Score the classifier with 10-fold cross-validation on the training split only;
# `accuracies` holds one score per fold.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)

# Report the mean and spread of the fold scores as percentages.
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

# Evaluating Metrics and Plotting Confusion Matrix

In [None]:
# Display names for the star types.
# NOTE(review): assumes the names are listed in class-id order 0..5 and that
# y_test / y_pred hold those integer ids — confirm against the dataset description.
labels = ['Brown Dwarf','Red Dwarf','White Dwarf','Main Sequence','Supergiant ','Hypergiant']

# Pin the class ids explicitly so the matrix is always len(labels) x len(labels).
# Without `labels=`, a class missing from this test split would shrink the matrix
# and the tick labels would silently land on the wrong rows/columns.
class_ids = list(range(len(labels)))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

sns.set(font_scale=2)
plt.figure(figsize=(20,7))
# Cells are integer counts, so annotate them as integers instead of "x.000".
sns.heatmap(cm, annot=True, cmap="YlGnBu", fmt="d", xticklabels=labels, yticklabels=labels)
# Rows of the matrix are the true classes, columns the predicted ones.
plt.xlabel('Predicted Class')
plt.ylabel('Original Class')
plt.show()

In [None]:
# Fraction of test samples whose predicted class equals the true class.
accuracy_score(y_test, y_pred)