In [458]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Load the training features/labels, the test features, and the reference
# survival labels used later to score the test predictions.
titanic_train = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')
titanic_gender = pd.read_csv('survival.csv')

# Data dictionary
#   Survived : survival                      (0 = No, 1 = Yes)
#   Pclass   : ticket class                  (1 = 1st, 2 = 2nd, 3 = 3rd)
#   Sex      : sex
#   Age      : age in years
#   SibSp    : number of siblings / spouses aboard the Titanic
#   Parch    : number of parents / children aboard the Titanic
#   Ticket   : ticket number
#   Fare     : passenger fare
#   Cabin    : cabin number
#   Embarked : port of embarkation           (C = Cherbourg, Q = Queenstown, S = Southampton)


In [459]:
# Keep the target aside, then remove it and the row identifier from the
# training features; the test features only need the identifier removed.
y_train = titanic_train['Survived']
titanic_train = titanic_train.drop(columns=['Survived', 'PassengerId'])
titanic_test = titanic_test.drop(columns=['PassengerId'])

In [460]:
# Preview the first five rows of the training features.
titanic_train.head(n=5)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [461]:
# Fill missing values.
#
# Each result is assigned back to its column instead of calling
# `fillna(..., inplace=True)` on `frame[col]`: that form mutates a temporary
# Series, is deprecated in recent pandas, and does not update the frame at all
# when Copy-on-Write is enabled.

# Missing cabins get the placeholder "S" (only the first letter of Cabin is
# used later on).
titanic_test["Cabin"] = titanic_test["Cabin"].fillna("S")
titanic_train["Cabin"] = titanic_train["Cabin"].fillna("S")

# Missing embarkation ports are set to "S" (Southampton in the data dictionary).
titanic_train["Embarked"] = titanic_train["Embarked"].fillna("S")

# Numeric gaps are forward-filled from the previous row. `.ffill()` replaces
# the deprecated `fillna(method='ffill')` spelling. A gap in the very first row
# has no previous value and therefore stays missing.
titanic_test["Age"] = titanic_test["Age"].ffill()
titanic_train["Age"] = titanic_train["Age"].ffill()
titanic_test["Fare"] = titanic_test["Fare"].ffill()

In [462]:
# Count the remaining missing values per column in the test features.
titanic_test.isna().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [463]:
# Count the remaining missing values per column in the training features.
titanic_train.isna().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [464]:
def first_letter(word):
    """Return the first element of ``word`` converted to ``str``.

    ``word`` must support indexing and be non-empty; indexing an empty value
    raises ``IndexError``.
    """
    return str(word[0])

In [465]:
def replace_cabin_class(dataset, column):
    """Reduce every value of ``dataset[column]`` to its first character.

    The DataFrame is modified in place and nothing is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column to rewrite.
    column : str
        Name of a string-valued column (e.g. cabin codes such as ``"C85"``).

    Notes
    -----
    Missing values stay missing, and an empty string becomes missing because
    it has no first character.
    """
    # One vectorised pass over the column. The result is assigned back to the
    # frame rather than calling an in-place method on `dataset[column]`, which
    # acts on a temporary Series and does not update `dataset` under pandas
    # Copy-on-Write.
    dataset[column] = dataset[column].str[0]

In [466]:
# Collapse the cabin codes to their leading letter in both frames.
for frame in (titanic_test, titanic_train):
    replace_cabin_class(frame, 'Cabin')

In [467]:
def replace_lastname(dataset, column):
    """Replace each name in ``dataset[column]`` by its first token, commas removed.

    For names formatted as ``"Surname, Title. Given names"`` this yields the
    surname, e.g. ``"Braund, Mr. Owen Harris"`` becomes ``"Braund"``. The
    DataFrame is modified in place and nothing is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column to rewrite.
    column : str
        Name of a string-valued column.

    Notes
    -----
    Only the first whitespace-separated token is kept, so a surname made of
    several words is truncated to its first word. Missing values stay missing,
    and a blank name becomes missing because it has no first token.
    """
    # Vectorised: split on whitespace, take the first token, strip the commas.
    first_token = dataset[column].str.split().str[0]
    # Assign back to the frame instead of using an in-place method on
    # `dataset[column]`, which acts on a temporary Series and does not update
    # `dataset` under pandas Copy-on-Write.
    dataset[column] = first_token.str.replace(',', '', regex=False)

In [468]:
# Shorten the passenger names to their first token in both frames.
for frame in (titanic_train, titanic_test):
    replace_lastname(frame, 'Name')

In [469]:
# Report how many distinct values each training column holds
# (missing values, if any, count as one value).
for column_name, values in titanic_train.items():
    n_distinct = values.nunique(dropna=False)
    print('Coluna : ', column_name, ' tem ', n_distinct, ' valores únicos.', sep='')

Coluna : Pclass tem 3 valores únicos.
Coluna : Name tem 661 valores únicos.
Coluna : Sex tem 2 valores únicos.
Coluna : Age tem 88 valores únicos.
Coluna : SibSp tem 7 valores únicos.
Coluna : Parch tem 7 valores únicos.
Coluna : Ticket tem 681 valores únicos.
Coluna : Fare tem 248 valores únicos.
Coluna : Cabin tem 9 valores únicos.
Coluna : Embarked tem 3 valores únicos.


In [470]:
# Report how many distinct values each test column holds
# (missing values, if any, count as one value).
for column_name, values in titanic_test.items():
    n_distinct = values.nunique(dropna=False)
    print('Coluna : ', column_name, ' tem ', n_distinct, ' valores únicos.', sep='')

Coluna : Pclass tem 3 valores únicos.
Coluna : Name tem 349 valores únicos.
Coluna : Sex tem 2 valores únicos.
Coluna : Age tem 79 valores únicos.
Coluna : SibSp tem 7 valores únicos.
Coluna : Parch tem 8 valores únicos.
Coluna : Ticket tem 363 valores únicos.
Coluna : Fare tem 169 valores únicos.
Coluna : Cabin tem 8 valores únicos.
Coluna : Embarked tem 3 valores únicos.


In [471]:
# Name and Ticket have hundreds of distinct values (see the counts printed
# above), so they are left out of the feature set.
high_cardinality_cols = ['Name', 'Ticket']
titanic_train = titanic_train.drop(columns=high_cardinality_cols)
titanic_test = titanic_test.drop(columns=high_cardinality_cols)

In [472]:
# Collect the names of the columns stored with dtype `object`; these are
# treated as the categorical variables.
is_object = titanic_train.dtypes == 'object'
object_cols = is_object[is_object].index.tolist()

print("Categorical variables:")
print(object_cols)

Categorical variables:
['Sex', 'Cabin', 'Embarked']


In [473]:
# One-hot encode the categorical columns. The encoder is fitted on the
# training set only; categories that appear only in the test set are encoded
# as all zeros (handle_unknown='ignore').
#
# The encoder keeps its default sparse output, which is densified with
# .toarray(). This avoids the `sparse=` keyword, which newer scikit-learn
# releases renamed to `sparse_output`.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_encoder.fit(titanic_train[object_cols])

# Name every encoded column "<feature>_<category>". Leaving the default
# integer labels (0..N) would mix integer and string column names in the final
# frames, which is hard to read and is rejected by newer scikit-learn when
# fitting on a DataFrame.
OH_col_names = [
    '{}_{}'.format(feature, category)
    for feature, categories in zip(object_cols, OH_encoder.categories_)
    for category in categories
]

# Build the encoded frames on the original row index so they line up with the
# numerical columns when concatenated.
OH_cols_train = pd.DataFrame(
    OH_encoder.transform(titanic_train[object_cols]).toarray(),
    columns=OH_col_names,
    index=titanic_train.index,
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(titanic_test[object_cols]).toarray(),
    columns=OH_col_names,
    index=titanic_test.index,
)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = titanic_train.drop(object_cols, axis=1)
num_X_valid = titanic_test.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)


In [474]:
# Fit a single decision tree (the variable is called `rf`, but this is not a
# random forest).
#
# Only the hyper-parameters that differ from the scikit-learn defaults are
# passed. The remaining keyword arguments previously listed here were defaults
# copied from the estimator's repr; two of them (`presort`,
# `min_impurity_split`) no longer exist in later scikit-learn releases.
rf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=250,
    min_samples_leaf=10,
    min_samples_split=20,
    random_state=0,  # fixed seed so the fitted tree is reproducible
)
rf.fit(OH_X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=250, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [475]:
# Predict survival for the test features and check the accuracy.
# NOTE(review): the reference labels come from survival.csv, and the variable
# name `titanic_gender` suggests a gender-based baseline rather than ground
# truth. Confirm what this file contains before reporting the score as model
# accuracy. Rows are presumably compared by position, so both inputs must be
# in the same passenger order.
y_pred = rf.predict(OH_X_valid)
reference_labels = titanic_gender['Survived']
score = accuracy_score(reference_labels, y_pred)
print('acurácia é ', score)

acurácia é  0.9066985645933014
