# Árvore de Decisão e KNN

### Carregar a base TITANIC do seaborn

In [40]:
import pandas as pd
from seaborn import load_dataset

# Carregar base de dados
df = load_dataset('titanic')

### Observar a relação dos atributos "pclass" com "class"

In [41]:
# Analisando as informações do dataset

df.info()

print('--------------')
print('Shape-->',df.shape)
print('--------------')

df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
--------------
Shape--> (891, 15)
--------------


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [42]:
# comparando 'pclass' e 'class'
print(df.loc[:,['pclass', 'class']])

     pclass   class
0         3   Third
1         1   First
2         3   Third
3         1   First
4         3   Third
..      ...     ...
886       2  Second
887       1   First
888       3   Third
889       1   First
890       3   Third

[891 rows x 2 columns]


In [43]:
# Dropando colunas 'class'
df.drop('class', axis = 1, inplace = True)
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,man,True,C,Cherbourg,yes,True


### Observar a quantidade de NaN existente no atributo "deck"

In [44]:
print('Quantidade total de linhas NaN da coluna --> ', df.loc[:, ['deck']].isna().sum())

Quantidade total de linhas NaN da coluna -->  deck    688
dtype: int64


In [45]:
# Dropando colunas 'deck' por causa da quantidade de NaN, ou seja 688/891 NaN
df.drop('deck', axis = 1, inplace = True)

### Remover as linhas que tenham valores NaN

In [46]:
df = df.dropna()
print("shape depois ->", df.shape)

shape depois -> (712, 13)


### Transformar todos os atributos categóricos usando One-Hot Encoder

In [47]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

column_transformer = make_column_transformer((OneHotEncoder(), ['sex','embarked', 'who', 'adult_male', 'embark_town', 'alive', 'alone']), remainder='passthrough')

df = column_transformer.fit_transform(df)

df = pd.DataFrame(data=df)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,3.0,22.0,1.0,0.0,7.2500
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,38.0,1.0,0.0,71.2833
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,3.0,26.0,0.0,0.0,7.9250
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,35.0,1.0,0.0,53.1000
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,3.0,35.0,0.0,0.0,8.0500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,3.0,39.0,0.0,5.0,29.1250
708,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,2.0,27.0,0.0,0.0,13.0000
709,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,19.0,0.0,0.0,30.0000
710,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,26.0,0.0,0.0,30.0000


### Coluna target é a "pclass" ou "survived"

In [48]:
y = df[18]

X = df.drop(18,axis=1)

### Separar os dados de treinamento e testes

In [49]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, stratify=y) # 80% treino e 20% teste

### Treinar 2 árvores de decisão com características diferentes

In [50]:
from sklearn.tree import DecisionTreeClassifier

model1 = DecisionTreeClassifier(criterion="gini")
model2 = DecisionTreeClassifier(criterion="entropy", max_depth=3)

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)

### Treinar 2 KNNs com características diferentes

In [51]:
from sklearn.neighbors import KNeighborsClassifier

model3 = KNeighborsClassifier(n_neighbors=7, metric='euclidean', algorithm='brute')
model4 = KNeighborsClassifier(n_neighbors=5, metric='minkowski', algorithm='brute')

model3.fit(X_train, y_train)
model4.fit(X_train, y_train)

### Exibindo os resultados:

In [67]:
from sklearn import metrics

print('Resultado Geral')
print('=' * 85)
print('-' * 85)
resultado1 = model1.predict(X_test)
acuracia1 = metrics.accuracy_score(resultado1, y_test)
show1 = round(acuracia1 * 100)
print('Resultado modelo 1 - árvore de decisão - com critério gini: ', show1, '%')
print('-' * 85)

print('-' * 85)
resultado2 = model2.predict(X_test)
acuracia2 = metrics.accuracy_score(resultado2, y_test)
show2 = round(acuracia2 * 100)
print('Resultado modelo 2 - árvore de decisão - critério entropy e profundidade max 3:', show2, '%')
print('-' * 85)

print('-' * 85)
resultado3 = model3.predict(X_test)
acuracia3 = metrics.accuracy_score(resultado3, y_test)
show3 = round(acuracia3 * 100)
print('Resultado modelo 3 - KNN - 7 vizinhos - métrica: euclidean - algoritmo: brute: ', show3, '%')
print('-' * 85)

print('-' * 85)
resultado4 = model4.predict(X_test)
acuracia4 = metrics.accuracy_score(resultado4, y_test)
show4 = round(acuracia4 * 100)
print('Resultado modelo 4 - KNN - 5 vizinhos - métrica: minkowski - algoritmo: brute: ', show4, '%')
print('-' * 85)

Resultado Geral
-------------------------------------------------------------------------------------
Resultado modelo 1 - árvore de decisão - com critério gini:  89 %
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
Resultado modelo 2 - árvore de decisão - critério entropy e profundidade max 3: 82 %
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
Resultado modelo 3 - KNN - 7 vizinhos - métrica: euclidean - algoritmo: brute:  78 %
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------
Resultado modelo 4 - KNN - 5 vizinhos - métrica: minkowski - algoritmo: brute:  83 %
-------------------------------------------------------------