# Import de pacotes

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint


# Funções para treinar uma árvore de decisão

Importante! A variável resposta deve ser a última coluna do data frame.

## Funções auxiliares

### Função para separar dados em treino e teste

In [2]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))
    
    indices = df.index.tolist()
    test_indices = random.sample(population = indices, k = test_size)
    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df

### Dados com só uma resposta?

In [3]:
def check_purity(data):
    label_column = data[:,-1]
    unique_classes = np.unique(label_column)

    if len(unique_classes) == 1:
        return True
    else: 
        return False

### Classificador

In [4]:
def classify_data(data):
    
    label_column = data[:,-1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts = True)
    
    index = counts_unique_classes.argmax()
    
    classification = unique_classes[index]
    
    return classification

### Potencial split

In [5]:
def get_potential_splits(data):
    
    potential_splits = {}
    _, n_columns = data.shape
    for column_index in range(n_columns - 1):
        
        values = data[:, column_index]
        unique_values = np.unique(values)
        
        type_of_feature = feature_types[column_index]
        if type_of_feature == 'countinuous':
            potential_splits[column_index] = []
            for index in range(len(unique_values)):
                if index != 0:
                    current_value = unique_values[index]
                    previous_value = unique_values[index - 1]
                    potential_split = (current_value + previous_value) / 2

                    potential_splits[column_index].append(potential_split)
                    
        else:
            potential_splits[column_index] = unique_values
                
    return potential_splits

### Split dos dados

In [6]:
def split_data(data, split_column, split_value):
    
    split_column_values = data[:, split_column]
    
    type_of_feature = feature_types[split_column]
    if type_of_feature == 'countinuous':
        data_below = data[split_column_values <= split_value]
        data_above = data[split_column_values > split_value]
    else:
        data_below = data[split_column_values == split_value]
        data_above = data[split_column_values != split_value]
        
    return data_below, data_above

### Menor impureza. 

Foram criadas as funções podendo ser usadas as medidas de impureza Gini ou entropia.

In [7]:
def calculate_entropy(data):
    
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts = True)

    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
    return entropy

In [8]:
def calculate_gini(data):
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts = True)
    
    probabilities = (counts / counts.sum())**2
    
    gini = 1 - sum(probabilities)
    
    return gini

In [9]:
def calculate_overall_entropy(data_below, data_above):
    n_data_points = len(data_below) + len(data_above)
    
    p_data_below = len(data_below) / n_data_points
    p_data_above = len(data_above) / n_data_points
    
    overall_entropy = (p_data_below * calculate_entropy(data_below)) + (p_data_above * calculate_entropy(data_above))
    
    return overall_entropy

In [10]:
def calculate_overall_gini(data_below, data_above):
    n_data_points = len(data_below) + len(data_above)
    
    p_data_below = len(data_below) / n_data_points
    p_data_above = len(data_above) / n_data_points
    
    overall_gini = (p_data_below * calculate_gini(data_below)) + (p_data_above * calculate_gini(data_above))
    
    return overall_gini

### Melhor split

In [11]:
def determine_best_split(data, potential_splits, impurity = 'gini'):
    
    overall_impurity = 999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column = column_index, split_value = value)
            if impurity == 'entropy':
                current_overall_impurity = calculate_overall_entropy(data_below, data_above)
            else: # impurity == 'gini'
                current_overall_impurity = calculate_overall_gini(data_below, data_above)
            
            if current_overall_impurity < overall_impurity:
                overall_impurity = current_overall_impurity
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value

### Determinar o tipo da variável explicativa

In [12]:
def determine_type_of_feature(df):
    feature_types = []
    n_unique_values_threshold = 15
    
    for column in df.columns:
        unique_values = df[column].unique()
        example_value = unique_values[0]
        
        if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_threshold):
            feature_types.append('catgorical')
        else:
            feature_types.append('continuous')
            
    return feature_types

## Função principal 

Parâmetros da função:
 - df: conjunto de dados
 - min_samples: quantidade máxima de observações nas folhas (finais)
 - max_depth: quantidade máxima de ramificações (splits)
 - impurity: qual a medida de impureza usada

In [31]:
def decision_tree_algorithm(df, counter = 0, min_samples = 2,max_depth = 5, impurity = 'gini'):
    
    # 
    if counter == 0:
        global column_headers, feature_types
        column_headers = df.columns
        feature_types = determine_type_of_feature(df)
        data = df.values
    else:
        data = df
        
    #base case
    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        classification = classify_data(data)
        return classification
    
    #recursive part
    else:
        counter += 1
        
        potential_splits = get_potential_splits(data)
        split_column, split_value = determine_best_split(data, potential_splits, impurity = 'gini')
        data_below, data_above = split_data(data,split_column, split_value)
        
        # instantiate sub-tree
        feature_name = column_headers[split_column]
        type_of_feature = feature_types[split_column]
        if type_of_feature == 'countinuous':
            question = "{} <= {:.6f}".format(feature_name, split_value)
        else:
            question = "{} = {}".format(feature_name, split_value)    
        sub_tree = {question: []}
    
        #find answers (recursion)
        yes_answer = decision_tree_algorithm(data_below, counter, min_samples, impurity, max_depth)
        no_answer = decision_tree_algorithm(data_above, counter, min_samples, impurity, max_depth)
        
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        
    return sub_tree

# Funções para classificação de novos dados

## Classificação

In [39]:
def classify_example(example, tree):
    
    question = list(tree.keys())[0]
    feature_name, comparison_operator,value = question.split()
    
    if comparison_operator == '<=':
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else: 
            answer = tree[question][1]
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else: 
            answer = tree[question][1]
        
    if not isinstance(answer, dict):
        return answer
    else:
        residual_tree = answer
        return classify_example(example, answer)

## Acurácia

In [15]:
def calculate_accuracy(df, tree):
    
    df['classification'] = df.apply(classify_example, axis=1, args = (tree,))
    df['classification_correct'] = df.classification == df.label
    
    accuracy = df.classification_correct.mean()
    
    return accuracy

# Conjunto de dados

## Dicionário das variáveis:

 **Temperature (K)}**:

**Luminosity(L/Lo)**:

**Radius(R/Ro)**:

**Absolute magnitude(Mv)**:

**Star Type**: Brown Dwarf = 0, Red Dwarf = 1, White Dwarf = 2, Main Sequence = 3, Supergiant = 4, Hypergiant = 5

**Star color**:

**Spectral Class**:

The Luminosity and radius of each star is calculated w.r.t. that of the values of Sun.

Lo = 3.828 x 10^26 Watts

Ro = 6.9551 x 10^8 m

## Carregamento dos dados

In [16]:
dados = pd.read_csv('stars.csv')

In [17]:
dados.head()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.0024,0.17,16.12,0,Red,M
1,3042,0.0005,0.1542,16.6,0,Red,M
2,2600,0.0003,0.102,18.7,0,Red,M
3,2800,0.0002,0.16,16.65,0,Red,M
4,1939,0.000138,0.103,20.06,0,Red,M


In [18]:
dados.shape

(240, 7)

In [19]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature (K)         240 non-null    int64  
 1   Luminosity(L/Lo)        240 non-null    float64
 2   Radius(R/Ro)            240 non-null    float64
 3   Absolute magnitude(Mv)  240 non-null    float64
 4   Star type               240 non-null    int64  
 5   Star color              240 non-null    object 
 6   Spectral Class          240 non-null    object 
dtypes: float64(3), int64(2), object(2)
memory usage: 13.2+ KB


## Preparando os dados

### Removendo espaços dos nomes das variáveis

In [46]:
dados.columns

Index(['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)',
       'Absolute magnitude(Mv)', 'Star type', 'Star color', 'Spectral Class',
       'label'],
      dtype='object')

In [48]:
dados.columns = ['Temperature_(K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)',
       'Absolute_magnitude(Mv)', 'Star_type', 'Star_color', 'Spectral_Class',
       'label']

### Variável resposta (alvo)

In [20]:
dados['Star_type'].unique()

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [49]:
dados['label'] = dados['Star_type'].astype('category')
dados_tree = dados.drop(['Star_type'], axis = 1)

In [50]:
dados_tree.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Temperature_(K)         240 non-null    int64   
 1   Luminosity(L/Lo)        240 non-null    float64 
 2   Radius(R/Ro)            240 non-null    float64 
 3   Absolute_magnitude(Mv)  240 non-null    float64 
 4   Star_color              240 non-null    object  
 5   Spectral_Class          240 non-null    object  
 6   label                   240 non-null    category
dtypes: category(1), float64(3), int64(1), object(2)
memory usage: 11.8+ KB


### Corrigindo type das variáveis categóricas

In [51]:
dados_tree['Star_color'] = dados_tree['Star_color'].astype('category')
dados_tree['Spectral_Class'] = dados_tree['Spectral_Class'].astype('category')

### Separando os dados entre teste e treino

In [52]:
random.seed(0)
treino, teste = train_test_split(dados_tree,0.1)

In [53]:
treino.shape, teste.shape

((216, 7), (24, 7))

# Treinando uma árvore de decisão

In [62]:
tree = decision_tree_algorithm(treino)

In [63]:
pprint(tree)

{'Star_color = Red': [{'Radius(R/Ro) = 1324.0': [5,
                                                 {'Radius(R/Ro) = 0.24': [1,
                                                                          {'Luminosity(L/Lo) = 200000.0': [{'Temperature_(K) = 3615': [5,
                                                                                                                                       4]},
                                                                                                           {'Spectral_Class = M': [{'Temperature_(K) = 3324': [1,
                                                                                                                                                               {'Temperature_(K) = 3598': [1,
                                                                                                                                                                                           {'Temperature_(K) = 3607': [1,
                       

# Classificando dados de teste

In [64]:
teste['classification'] = teste.apply(classify_example, axis=1, args = (tree,))

In [65]:
teste

Unnamed: 0,Temperature_(K),Luminosity(L/Lo),Radius(R/Ro),Absolute_magnitude(Mv),Star_color,Spectral_Class,label,classification,classification_correct
216,9320,29.0,1.91,1.236,Blue-white,A,3,3,True
98,12098,689.0,7.01,0.02,Blue-white,A,3,3,True
194,3523,0.0054,0.319,12.43,Red,M,1,0,False
227,10930,783930.0,25.0,-6.224,Blue,O,4,4,True
107,12893,184000.0,36.0,-6.34,Blue,O,4,4,True
10,3600,0.0029,0.51,10.69,Red,M,1,5,False
66,2945,0.00032,0.093,18.34,Red,M,0,0,True
130,3095,0.00019,0.492,10.87,Red,M,1,0,False
124,3511,0.00064,0.109,17.12,Red,M,0,0,True
103,17120,235000.0,83.0,-6.89,Blue,O,4,4,True


In [66]:
accuracy = calculate_accuracy(teste,tree)

In [67]:
pprint(accuracy)

0.7083333333333334
