In [1]:
%load_ext watermark
%watermark

Last updated: 2022-01-20T18:11:34.561947-03:00

Python implementation: CPython
Python version       : 3.8.12
IPython version      : 7.29.0

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
CPU cores   : 12
Architecture: 64bit



In [2]:
import pandas as pd
import numpy as np

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [3]:
df = pd.read_csv('datasets/titanic.csv')
df.head()

Unnamed: 0,superviviente,clase_billete,genero,edad,n_hermanos_esposos,n_hijos_padres,precio_billete,puerto_salida
0,0,3,hombre,22.0,1,0,7.25,S
1,1,1,mujer,38.0,1,0,71.2833,C
2,1,3,mujer,26.0,0,0,7.925,S
3,1,1,mujer,35.0,1,0,53.1,S
4,0,3,hombre,35.0,0,0,8.05,S


In [4]:
df.shape

(891, 8)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   superviviente       891 non-null    int64  
 1   clase_billete       891 non-null    int64  
 2   genero              891 non-null    object 
 3   edad                714 non-null    float64
 4   n_hermanos_esposos  891 non-null    int64  
 5   n_hijos_padres      891 non-null    int64  
 6   precio_billete      891 non-null    float64
 7   puerto_salida       889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [6]:
arbol = tree.DecisionTreeClassifier(max_depth=4)

In [7]:
col_categ = ['genero', 'puerto_salida']

In [8]:
datos_categ = pd.get_dummies(df[col_categ])
datos_categ

Unnamed: 0,genero_hombre,genero_mujer,puerto_salida_C,puerto_salida_Q,puerto_salida_S
0,1,0,0,0,1
1,0,1,1,0,0
2,0,1,0,0,1
3,0,1,0,0,1
4,1,0,0,0,1
...,...,...,...,...,...
886,1,0,0,0,1
887,0,1,0,0,1
888,0,1,0,0,1
889,1,0,1,0,0


In [9]:
# drop col categoricas
pasajeros = pd.concat([df.drop(col_categ, axis=1), datos_categ], axis=1)
pasajeros

Unnamed: 0,superviviente,clase_billete,edad,n_hermanos_esposos,n_hijos_padres,precio_billete,genero_hombre,genero_mujer,puerto_salida_C,puerto_salida_Q,puerto_salida_S
0,0,3,22.0,1,0,7.2500,1,0,0,0,1
1,1,1,38.0,1,0,71.2833,0,1,1,0,0
2,1,3,26.0,0,0,7.9250,0,1,0,0,1
3,1,1,35.0,1,0,53.1000,0,1,0,0,1
4,0,3,35.0,0,0,8.0500,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,1,0,0,0,1
887,1,1,19.0,0,0,30.0000,0,1,0,0,1
888,0,3,,1,2,23.4500,0,1,0,0,1
889,1,1,26.0,0,0,30.0000,1,0,1,0,0


In [10]:
# eliminar Nan con la media de edad
pasajeros.edad = pasajeros.edad.fillna(pasajeros.edad.mean())

In [11]:
# split
X_train, X_test, y_train, y_test = train_test_split(pasajeros.drop('superviviente', axis=1),
                                                   pasajeros.superviviente, test_size=0.3)

In [12]:
# predicciones
predicciones = arbol.fit(X_train, y_train).predict(X_test)

In [13]:
# medir el error de la clasificacion
cross_val_score(arbol, X_test, y_test, scoring='roc_auc', cv=10).mean()

0.8236012700534759

In [14]:
arbol.feature_importances_

array([0.17171548, 0.1198447 , 0.03545006, 0.01286431, 0.12411322,
       0.52963965, 0.        , 0.00637257, 0.        , 0.        ])

In [15]:
dict(zip(pasajeros.drop('superviviente', axis=1), arbol.feature_importances_))

{'clase_billete': 0.17171547858028258,
 'edad': 0.11984469740449787,
 'n_hermanos_esposos': 0.03545006339937108,
 'n_hijos_padres': 0.012864313574783608,
 'precio_billete': 0.12411322212160342,
 'genero_hombre': 0.5296396538763646,
 'genero_mujer': 0.0,
 'puerto_salida_C': 0.006372571043096955,
 'puerto_salida_Q': 0.0,
 'puerto_salida_S': 0.0}

import graphviz

def dibujar_arbol(arb):
    graf = tree.export_graphviz(arb,
                               feature_names=pasajeros.drop('superviviente', axis=1).columns,
                               filled=True,
                               rounded=True)
    grafico = graphviz.Source(graf)
    grafico.format = 'png'
    grafico.render('arbol', view=True)

dibujar_arbol(arbol)