In [135]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Primero cargamos el conjunto de datos

In [77]:
titanic = pd.read_csv("titanic_data.csv")

In [78]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [79]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Seleccionamos la variable objetivo y los atributos a utilizar

In [80]:
objetivo = titanic["Survived"]
atributos = titanic[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]

In [81]:
print(objetivo.head())
atributos.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,3,male,35.0,0,0,8.05


### Modificamos el atributo "Sex" para que sea numérico

In [82]:
atributos["Sex"] = (atributos["Sex"] == "female").astype(int)
atributos.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,0,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,3,1,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,3,0,35.0,0,0,8.05


In [83]:
atributos = atributos.fillna(0)

# Reglas Simples

### Regla 1: Todas las mujeres sobreviven

In [105]:
def regla1(instancia):
    if instancia["Sex"] == 1:
        return 1
    return 0

In [106]:
def predecir_con_regla(atributos, regla):
    prediccion = []
    for i in range(atributos.shape[0]):
        pred = regla(atributos.ix[i,:])
        prediccion.append(pred)
    return prediccion

In [107]:
prediccion = predecir_con_regla(atributos, regla1)
exactitud = accuracy_score(objetivo, prediccion)
print("Exactitud: {:.3f}".format(exactitud))

Exactitud: 0.787


### Regla 2: Todos los niños sobreviven

In [129]:
def regla2(instancia):
    if instancia["Age"] < 18:
        return 1
    return 0

In [130]:
prediccion = predecir_con_regla(atributos, regla2)
exactitud = accuracy_score(objetivo, prediccion)
print("Exactitud: {:.3f}".format(exactitud))

Exactitud: 0.544


### Regla 3: Todas las mujeres y todos los niños sobreviven

In [132]:
def regla3(instancia):
    if instancia["Sex"] == 1:
        return 1
    if instancia["Age"] < 18:
        return 1
    return 0

In [133]:
prediccion = predecir_con_regla(atributos, regla3)
exactitud = accuracy_score(objetivo, prediccion)
print("Exactitud: {:.3f}".format(exactitud))

Exactitud: 0.670


# Árboles de Decisión

### Primero dividimos el conjunto de datos en entrenamiento y prueba

In [142]:
x_train, x_test, y_train, y_test = train_test_split(atributos, objetivo, test_size=0.1, random_state=42, stratify=objetivo)

In [143]:
print(y_train.head())
x_train.describe()

86     0
329    1
517    0
844    0
408    0
Name: Survived, dtype: int64


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
count,801.0,801.0,801.0,801.0,801.0,801.0
mean,2.313358,0.350811,23.683109,0.519351,0.394507,31.924427
std,0.833927,0.477522,17.659147,1.099972,0.827146,47.675662
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,5.0,0.0,0.0,7.8958
50%,3.0,0.0,24.0,0.0,0.0,14.4542
75%,3.0,1.0,35.0,1.0,0.0,31.0
max,3.0,1.0,80.0,8.0,6.0,512.3292


In [144]:
print(y_test.head())
x_test.describe()

530    1
401    0
433    0
241    1
773    0
Name: Survived, dtype: int64


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
count,90.0,90.0,90.0,90.0,90.0,90.0
mean,2.266667,0.366667,24.833333,0.555556,0.266667,34.694259
std,0.858533,0.484594,17.085377,1.132871,0.576701,65.239665
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,16.0,0.0,0.0,8.05
50%,3.0,0.0,25.0,0.0,0.0,14.30415
75%,3.0,1.0,34.75,1.0,0.0,27.8625
max,3.0,1.0,63.0,8.0,2.0,512.3292


### Entrenamos el árbol de decisión

In [145]:
modelo = DecisionTreeClassifier()
modelo.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [147]:
y_pred = modelo.predict(x_test)
exactitud = accuracy_score(y_test, y_pred)
print("Exactitud: {:.3f}".format(exactitud))

Exactitud: 0.800


# Bosques Aleatorios

### Entrenamos el bosque aleatorio

In [182]:
modelo = RandomForestClassifier(n_estimators=100, max_features=4, class_weight='balanced', oob_score=True, random_state=500)
modelo.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features=4,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=500, verbose=0, warm_start=False)

In [183]:
y_pred = modelo.predict(x_test)
exactitud = accuracy_score(y_test, y_pred)
print("Exactitud: {:.3f}".format(exactitud))

Exactitud: 0.800


### Importancia de las Variables

In [193]:
pd.concat([pd.Series(["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]), pd.Series(modelo.feature_importances_)], axis=1)

Unnamed: 0,0,1
0,Pclass,0.082814
1,Sex,0.280917
2,Age,0.24329
3,SibSp,0.050422
4,Parch,0.043876
5,Fare,0.298681
