# Bibliotecas

In [713]:
import pandas as pd
import numpy  as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# Metadata

In [714]:
dataFolder = "./data"

# Carregando a base de dados

In [715]:
cars = pd.read_csv(f"{dataFolder}/car.csv")

In [716]:
cars.shape

(1728, 7)

In [717]:
cars.sample(10)

Unnamed: 0,Buying_Price,Maintenance_Price,No_of_Doors,Person_Capacity,Size_of_Luggage,Safety,Car_Acceptability
759,high,low,2,2,med,low,unacc
1399,low,vhigh,5more,more,med,med,acc
989,med,high,2,4,big,high,acc
1569,low,med,4,2,med,low,unacc
1179,med,med,5more,more,small,low,unacc
429,vhigh,low,5more,more,big,low,unacc
81,vhigh,vhigh,5more,2,small,low,unacc
1725,low,low,5more,more,big,low,unacc
855,high,low,5more,more,small,low,unacc
408,vhigh,low,5more,2,med,low,unacc


# Explorando os atributos do dataset

In [718]:
cars['Buying_Price'].value_counts()

Buying_Price
vhigh    432
high     432
med      432
low      432
Name: count, dtype: int64

In [719]:
cars['Maintenance_Price'].value_counts()

Maintenance_Price
vhigh    432
high     432
med      432
low      432
Name: count, dtype: int64

In [720]:
cars['No_of_Doors'].value_counts()

No_of_Doors
2        432
3        432
4        432
5more    432
Name: count, dtype: int64

In [721]:
cars['Size_of_Luggage'].value_counts()

Size_of_Luggage
small    576
med      576
big      576
Name: count, dtype: int64

In [722]:
cars['Safety'].value_counts()

Safety
low     576
med     576
high    576
Name: count, dtype: int64

In [723]:
cars['Car_Acceptability'].value_counts()

Car_Acceptability
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

# Pré processando o dataset

Binarizando as classes

In [724]:
cars['Car_Acceptability'].value_counts()

Car_Acceptability
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [725]:
# Fold the 'good' and 'vgood' labels into 'acc' so the target has only two classes.
cars['Car_Acceptability'] = cars['Car_Acceptability'].replace({ 'good': 'acc', 'vgood': 'acc' })

In [726]:
cars['Car_Acceptability'].value_counts()

Car_Acceptability
unacc    1210
acc       518
Name: count, dtype: int64

Renomeando a coluna 'Car_Acceptability' para 'class'

In [727]:
cars = cars.rename(columns = { 'Car_Acceptability': 'class' })

Padronizando os nomes das colunas

In [728]:
def toCamelCase(text: str) -> str:
	"""Convert an underscore- or space-separated string to camelCase.

	The first word is lowercased, every following word is capitalized, and
	the words are joined without separators.

	Args:
		text: the name to convert, e.g. ``"buying_price"``.

	Returns:
		The camelCase form, e.g. ``"buyingPrice"``. An empty or
		separator-only input yields an empty string.
	"""
	words = text.replace('_', ' ').split()

	# No words means nothing to convert; indexing words[0] would raise IndexError.
	if not words:
		return ""

	firstWord, *otherWords = words

	return firstWord.lower() + "".join(word.capitalize() for word in otherWords)

In [729]:
# Lowercase each column name, then convert it to camelCase.
renamedColumns = [toCamelCase(column.lower()) for column in cars.columns.tolist()]

In [730]:
cars.columns = renamedColumns

In [731]:
cars.sample(10)

Unnamed: 0,buyingPrice,maintenancePrice,noOfDoors,personCapacity,sizeOfLuggage,safety,class
574,high,high,3,2,big,med,unacc
510,high,vhigh,4,more,big,low,unacc
265,vhigh,med,3,more,med,med,acc
307,vhigh,med,5more,4,small,med,unacc
1210,med,low,2,more,med,med,acc
54,vhigh,vhigh,4,2,small,low,unacc
834,high,low,4,more,big,low,unacc
1286,med,low,5more,4,big,high,acc
634,high,high,5more,4,med,med,acc
673,high,med,2,more,big,med,acc


# Definição da Árvore de Decisão

In [None]:
class Node(object):
	def __init__(self):

		self.branches   	 = {}
		self.feature    	 = None
		self.prediction 	 = None
		self.informationGain = 0

	def isLeaf(self):
		return self.prediction is not None

class DecisionTree(object):
	def __init__(self, maxDepth: int, entropy):

		self.entropy  = entropy
		self.maxDepth = maxDepth
		self.root     = None

	def informationGain(self, X: pd.DataFrame, y: pd.Series, feature: str) -> float:

		totalEntropy = self.entropy(y)

		averageEntropy = 0

		for value in X[feature].unique():
			subsetY = y[X[feature] == value]

			averageEntropy += (subsetY.size / y.size) * self.entropy(subsetY)
		
		return totalEntropy - averageEntropy

	def getBestFeature(self, X: pd.DataFrame, y: pd.Series, usedFeatures: set) -> tuple[str | None, float]:

		bestFeature = None
		bestGain    = 0

		for feature in X.columns:

			if feature in usedFeatures:
				continue

			gain = self.informationGain(X, y, feature)

			if gain > bestGain:
				bestGain    = gain
				bestFeature = feature

		return bestFeature, bestGain

	def buildTree(self, X: pd.DataFrame, y: pd.Series, usedFeatures: set[str], depth: int) -> Node:
		
		if depth == self.maxDepth or len(usedFeatures) == X.columns.size:
			node 			= Node()
			node.prediction = y.mode().iloc[0]

			return node
		
		bestFeature, bestGain = self.getBestFeature(X, y, usedFeatures)

		if bestFeature is None:
			node 			= Node()
			node.prediction = y.mode().iloc[0]

			return node
		
		node 		 		 = Node()
		node.feature 		 = bestFeature
		node.informationGain = bestGain

		usedFeatures =  usedFeatures | { bestFeature }

		for value in X[bestFeature].unique():

			subsetIndexes = X[bestFeature] == value

			subsetX = X[subsetIndexes].drop(columns=[ bestFeature ])
			subsetY = y[subsetIndexes]

			child = self.buildTree(
				X 		     = subsetX,
				y 			 = subsetY,
				usedFeatures = usedFeatures,
				depth		 = depth + 1
			)

			node.branches[value] = child

		return node

	def fit(self, X, y):
		self.root = self.buildTree(X, y, set(), 0)

	def predictOne(self, node: Node, x: pd.Series) -> str:
		
		while not node.isLeaf():
			
			feature = node.feature
			value   = x.get(feature)

			if value in node.branches:
				node = node.branches[value]

		return node.prediction

	def predict(self, X: pd.DataFrame)-> pd.Series:
		return X.apply(lambda x: self.predictOne(self.root, x), axis = 1)
	
	def totalInformationGain(self) -> float:

		def traverse(node: Node) -> float:
			if node is None or node.isLeaf():
				return 0.0

			totalGain = node.informationGain

			for child in node.branches.values():
				totalGain += traverse(child)

			return totalGain

		if self.root is None:
			return 0.0

		return traverse(self.root)

In [733]:
def entropy(y: pd.Series) -> float:
	"""Return the Shannon entropy, in bits, of the label distribution of ``y``.

	Args:
		y: Series of class labels.

	Returns:
		``-sum(p * log2(p))`` over the relative frequency ``p`` of each label
		that actually occurs in ``y``.
	"""
	probabilities = y.value_counts(normalize = True)

	# Labels with zero frequency (e.g. unused categories of a categorical
	# Series) add nothing to the entropy; keeping them yields 0 * log2(0) = NaN.
	probabilities = probabilities[probabilities > 0]

	return float(-np.sum(probabilities * np.log2(probabilities)))

# Função de cálculo das métricas

In [734]:
def metrics(yTrue, yPred, posLabel = 'acc', negLabel = 'unacc'):
	"""Compute binary classification metrics for the predictions ``yPred``.

	Args:
		yTrue: ground-truth labels.
		yPred: predicted labels.
		posLabel: label treated as the positive class.
		negLabel: label treated as the negative class.

	Returns:
		Tuple ``(accuracy, precision, recall, specificity, f1)``. Precision,
		recall, F1 and specificity are 0 when their denominator is zero.
	"""
	acc  = accuracy_score(yTrue, yPred)
	prec = precision_score(yTrue, yPred, pos_label = posLabel, zero_division = 0)
	rec  = recall_score(yTrue, yPred, pos_label = posLabel, zero_division = 0)
	f1   = f1_score(yTrue, yPred, pos_label = posLabel, zero_division = 0)

	# With labels ordered [positive, negative], row 1 holds the truly negative
	# samples: column 1 are true negatives, column 0 are false positives.
	cm = confusion_matrix(yTrue, yPred, labels = [posLabel, negLabel])
	tn = cm[1, 1]
	fp = cm[1, 0]

	# Mirror zero_division = 0 used above: no negative samples -> specificity 0
	# instead of a NaN produced by 0 / 0.
	negatives   = tn + fp
	specificity = float(tn / negatives) if negatives > 0 else 0.0

	return acc, prec, rec, specificity, f1

# Holdout do dataset

In [735]:
# Separate the target column from the predictor columns.
y = cars['class']
X = cars.drop(columns = ['class'])

# Stratified 70/30 holdout; the fixed seed keeps the split reproducible.
XTrain, XTest, yTrain, yTest = train_test_split(
	X, y,
	train_size   = 0.7,
	stratify     = y,
	random_state = 42
)

In [736]:
print("Amostras para treino:", yTrain.size)
print("Amostras para test:  ", yTest.size)

Amostras para treino: 1209
Amostras para test:   519


# Treino usando KFold estratificado com K = 10

In [737]:
kfold = StratifiedKFold(
	n_splits     = 10,
	shuffle      = True,
	random_state = 42
)

In [738]:
trainMetrics = []

In [None]:
# Start from an empty list on every execution so re-running this cell never
# appends to results left over from a previous run (or to an ndarray).
trainMetrics = []

for i, (trainIdxFold, testIdxFold) in enumerate(kfold.split(XTrain, yTrain)):

	# Positional indices from the splitter select this fold's train/validation rows.
	XTrainFold = XTrain.iloc[trainIdxFold]
	XTestFold  = XTrain.iloc[testIdxFold]

	yTrainFold = yTrain.iloc[trainIdxFold]
	yTestFold  = yTrain.iloc[testIdxFold]

	# A fresh tree per fold keeps the folds independent of each other.
	model = DecisionTree(maxDepth = 4, entropy = entropy)
	model.fit(XTrainFold, yTrainFold)

	yPred = model.predict(XTestFold)

	foldMetrics          = metrics(yTestFold, yPred)
	foldMetrics          = list(foldMetrics)
	totalInformationGain = model.totalInformationGain()

	# Row layout: [information gain, accuracy, precision, recall, specificity, f1].
	trainMetric = [totalInformationGain] + foldMetrics

	trainMetrics.append(trainMetric)

	print(f"{i} Fold")
	print(f"Ganho de informação : {trainMetric[0]:.3f}")
	print(f"Acurácia            : {trainMetric[1]:.3f}")
	print(f"Precisão            : {trainMetric[2]:.3f}")
	print(f"Sensibilidade       : {trainMetric[3]:.3f}")
	print(f"Especificidade      : {trainMetric[4]:.3f}")
	print(f"F1-score            : {trainMetric[5]:.3f}")
	print("\n")


0 Fold
Ganho de informação : 1.857
Acurácia            : 0.909
Precisão            : 0.805
Sensibilidade       : 0.917
Especificidade      : 0.906
F1-score            : 0.857


1 Fold
Ganho de informação : 1.822
Acurácia            : 0.843
Precisão            : 0.718
Sensibilidade       : 0.778
Especificidade      : 0.871
F1-score            : 0.747


2 Fold
Ganho de informação : 1.875
Acurácia            : 0.884
Precisão            : 0.775
Sensibilidade       : 0.861
Especificidade      : 0.894
F1-score            : 0.816


3 Fold
Ganho de informação : 1.879
Acurácia            : 0.851
Precisão            : 0.800
Sensibilidade       : 0.667
Especificidade      : 0.929
F1-score            : 0.727


4 Fold
Ganho de informação : 1.838
Acurácia            : 0.884
Precisão            : 0.750
Sensibilidade       : 0.917
Especificidade      : 0.871
F1-score            : 0.825


5 Fold
Ganho de informação : 1.835
Acurácia            : 0.884
Precisão            : 0.789
Sensibilidade       : 0.

Métrica de treino

In [740]:
# Aggregate from a separate array and leave the per-fold list untouched:
# rebinding trainMetrics to an ndarray would break any later .append on it.
trainMetricsArray = np.array(trainMetrics)

# Columns: 0 information gain, 1 accuracy, 2 precision, 3 recall, 4 specificity, 5 f1.
print(f"Ganho de informação : {trainMetricsArray[:, 0].mean():.3f}")
print(f"Acurácia média      : {trainMetricsArray[:, 1].mean():.3f}")
print(f"Precisão média      : {trainMetricsArray[:, 2].mean():.3f}")
print(f"Sensibilidade média : {trainMetricsArray[:, 3].mean():.3f}")
print(f"Especificidade média: {trainMetricsArray[:, 4].mean():.3f}")
print(f"F1-score médio      : {trainMetricsArray[:, 5].mean():.3f}")

Ganho de informação : 1.848
Acurácia média      : 0.873
Precisão média      : 0.774
Sensibilidade média : 0.821
Especificidade média: 0.896
F1-score médio      : 0.794


# Teste

Construindo a árvore com todos os dados de treino

In [None]:
model = DecisionTree(
    maxDepth = 4,
    entropy  = entropy
)

In [742]:
model.fit(
    X = XTrain,
    y = yTrain
)

Predição do conjunto de teste

In [743]:
yPredicted = model.predict(
    X = XTest
)

Cálculo das métricas

In [744]:
testMetrics = metrics(
    yTrue = yTest,
    yPred = yPredicted
)
testMetrics = list(testMetrics)

In [745]:
totalInformationGain = model.totalInformationGain()
testMetrics 		 = [totalInformationGain] + testMetrics

## Resultados

In [746]:
print(f"Ganho de informação : {testMetrics[0]:.3f}")
print(f"Acurácia            : {testMetrics[1]:.3f}")
print(f"Precisão            : {testMetrics[2]:.3f}")
print(f"Sensibilidade       : {testMetrics[3]:.3f}")
print(f"Especificidade      : {testMetrics[4]:.3f}")
print(f"F1-score            : {testMetrics[5]:.3f}")

Ganho de informação : 1.831
Acurácia            : 0.902
Precisão            : 0.814
Sensibilidade       : 0.872
Especificidade      : 0.915
F1-score            : 0.842
