# Import de pacotes

In [28]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
import pydotplus 
from IPython.display import Image


# Conjunto de dados

## Dicionário das variáveis:

 **Temperature (K)}**:

**Luminosity(L/Lo)**:

**Radius(R/Ro)**:

**Absolute magnitude(Mv)**:

**Star Type**: Brown Dwarf = 0, Red Dwarf = 1, White Dwarf = 2, Main Sequence = 3, Supergiant = 4, Hypergiant = 5

**Star color**:

**Spectral Class**:

The Luminosity and radius of each star is calculated w.r.t. that of the values of Sun.

Lo = 3.828 x 10^26 Watts

Ro = 6.9551 x 10^8 m

## Carregamento dos dados

In [2]:
dados = pd.read_csv('stars_corrigido.csv')

In [3]:
dados.head()

Unnamed: 0,Temperature_(K),Luminosity(L/Lo),Radius(R/Ro),Absolute_magnitude(Mv),Star_type,Star_color,Spectral_Class
0,3068,0.0024,0.17,16.12,0,red,M
1,3042,0.0005,0.1542,16.6,0,red,M
2,2600,0.0003,0.102,18.7,0,red,M
3,2800,0.0002,0.16,16.65,0,red,M
4,1939,0.000138,0.103,20.06,0,red,M


In [4]:
dados.shape

(240, 7)

In [5]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature_(K)         240 non-null    int64  
 1   Luminosity(L/Lo)        240 non-null    float64
 2   Radius(R/Ro)            240 non-null    float64
 3   Absolute_magnitude(Mv)  240 non-null    float64
 4   Star_type               240 non-null    int64  
 5   Star_color              240 non-null    object 
 6   Spectral_Class          240 non-null    object 
dtypes: float64(3), int64(2), object(2)
memory usage: 13.2+ KB


## Preparando os dados

### Removendo espaços dos nomes das variáveis

In [6]:
dados.columns

Index(['Temperature_(K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)',
       'Absolute_magnitude(Mv)', 'Star_type', 'Star_color', 'Spectral_Class'],
      dtype='object')

In [7]:
dados.columns = ['Temperature_(K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)',
       'Absolute_magnitude(Mv)', 'Star_type', 'Star_color', 'Spectral_Class']

In [8]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature_(K)         240 non-null    int64  
 1   Luminosity(L/Lo)        240 non-null    float64
 2   Radius(R/Ro)            240 non-null    float64
 3   Absolute_magnitude(Mv)  240 non-null    float64
 4   Star_type               240 non-null    int64  
 5   Star_color              240 non-null    object 
 6   Spectral_Class          240 non-null    object 
dtypes: float64(3), int64(2), object(2)
memory usage: 13.2+ KB


### Variável resposta (alvo)

In [9]:
dados['Star_type'].unique()

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [10]:
dados['label'] = dados['Star_type'].astype('category')
dados_tree = dados.drop(['Star_type'], axis = 1)

In [11]:
dados_tree.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Temperature_(K)         240 non-null    int64   
 1   Luminosity(L/Lo)        240 non-null    float64 
 2   Radius(R/Ro)            240 non-null    float64 
 3   Absolute_magnitude(Mv)  240 non-null    float64 
 4   Star_color              240 non-null    object  
 5   Spectral_Class          240 non-null    object  
 6   label                   240 non-null    category
dtypes: category(1), float64(3), int64(1), object(2)
memory usage: 11.8+ KB


### Corrigindo type das variáveis categóricas

In [12]:
dados_tree['Star_color'] = dados_tree['Star_color'].astype('category')
dados_tree['Spectral_Class'] = dados_tree['Spectral_Class'].astype('category')

### Encoding data

In [13]:
dados_tree.head()

Unnamed: 0,Temperature_(K),Luminosity(L/Lo),Radius(R/Ro),Absolute_magnitude(Mv),Star_color,Spectral_Class,label
0,3068,0.0024,0.17,16.12,red,M,0
1,3042,0.0005,0.1542,16.6,red,M,0
2,2600,0.0003,0.102,18.7,red,M,0
3,2800,0.0002,0.16,16.65,red,M,0
4,1939,0.000138,0.103,20.06,red,M,0


In [14]:
list(dados_tree['Star_color'].unique())

['red',
 'blue-white',
 'white',
 'yellowish-white',
 'pale-yellow-orange',
 'blue',
 'whitish',
 'yellow-white',
 'orange',
 'yellowish',
 'orange-red']

In [15]:
star_color_dummies = pd.get_dummies(dados_tree['Star_color'])
star_color_dummies

Unnamed: 0,blue,blue-white,orange,orange-red,pale-yellow-orange,red,white,whitish,yellow-white,yellowish,yellowish-white
0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
235,1,0,0,0,0,0,0,0,0,0,0
236,1,0,0,0,0,0,0,0,0,0,0
237,0,0,0,0,0,0,1,0,0,0,0
238,0,0,0,0,0,0,1,0,0,0,0


In [16]:
dados_tree['Spectral_Class'].unique()

['M', 'B', 'A', 'F', 'O', 'K', 'G']
Categories (7, object): ['M', 'B', 'A', 'F', 'O', 'K', 'G']

In [17]:
spectral_class_dummies = pd.get_dummies(dados_tree['Spectral_Class'])
spectral_class_dummies

Unnamed: 0,A,B,F,G,K,M,O
0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...
235,0,0,0,0,0,0,1
236,0,0,0,0,0,0,1
237,1,0,0,0,0,0,0
238,1,0,0,0,0,0,0


In [18]:
dados_tree_encoded = pd.concat([dados_tree.drop(['Star_color','Spectral_Class'], axis = 1),star_color_dummies,spectral_class_dummies], axis = 1) 
dados_tree_encoded.to_csv('stars_encoded.csv', index = False)
dados_tree_encoded

Unnamed: 0,Temperature_(K),Luminosity(L/Lo),Radius(R/Ro),Absolute_magnitude(Mv),label,blue,blue-white,orange,orange-red,pale-yellow-orange,...,yellow-white,yellowish,yellowish-white,A,B,F,G,K,M,O
0,3068,0.002400,0.1700,16.12,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,3042,0.000500,0.1542,16.60,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2600,0.000300,0.1020,18.70,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2800,0.000200,0.1600,16.65,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1939,0.000138,0.1030,20.06,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,38940,374830.000000,1356.0000,-9.93,5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
236,30839,834042.000000,1194.0000,-10.63,5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
237,8829,537493.000000,1423.0000,-10.73,5,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
238,9235,404940.000000,1112.0000,-11.23,5,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


### Separando os dados entre teste e treino

In [19]:
random.seed(0)
X, y = dados_tree_encoded.drop('label', axis = 1), dados_tree_encoded['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [20]:
X_train.shape, X_test.shape

((216, 22), (24, 22))

In [21]:
random.seed(0)
X_2, y_2 = dados_tree_encoded[['Radius(R/Ro)','Absolute_magnitude(Mv)']], dados_tree_encoded['label']
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.1, random_state=1)

In [22]:
X_train_2.shape, X_test_2.shape

((216, 2), (24, 2))

# Treinando uma árvore de decisão

In [23]:
clf = tree.DecisionTreeClassifier(max_depth = 5, criterion='gini')

In [31]:
clf = clf.fit(X_train,y_train)

dot_data = StringIO()
dot_data = tree.export_graphviz(clf, feature_names=X.columns, class_names=['0','1','2','3','4','5'], filled=True, 
                                out_file=None) 
graph = pydotplus.graph_from_dot_data(dot_data) 
Image(graph.create_png())
#tree.export_graphviz(clf, out_file=dot_data,  
#                filled=True, rounded=True,
#                special_characters=True,feature_names = X_train.columns,class_names=['0','1','2','3','4','5'])
#graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
#graph.write_png('estrelas.png')
#Image(graph.create_png())

InvocationException: GraphViz's executables not found

In [None]:
clf_2 = tree.DecisionTreeClassifier(max_depth = 5, criterion='gini')
clf_2 = clf_2.fit(X_train_2,y_train_2)
dot_data = StringIO()
export_graphviz(clf_2, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,feature_names = ['Radius(R/Ro)','Absolute_magnitude(Mv)'],class_names=['0','1','2','3','4','5'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('estrelas_2.png')
Image(graph.create_png())

In [None]:
tree.plot_tree(clf)
plt.show()

In [None]:
tree.plot_tree(clf_2)
plt.show()

# Classificando dados de teste

In [None]:
y_pred = clf.predict(X_test)

In [None]:
pd.DataFrame({'y_test':y_test,'y_pred': y_pred})

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
y_pred_2 = clf_2.predict(X_test_2)

In [None]:
pd.DataFrame({'y_test':y_test_2,'y_pred': y_pred_2})

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test_2, y_pred_2))

# Classificando dados do treino

In [None]:
y_pred_train = clf.predict(X_train)

In [None]:
pd.DataFrame({'y_train':y_train,'y_pred_train': y_pred_train})

In [None]:
print("Accuracy:",metrics.accuracy_score(y_train, y_pred_train))