# Kaggle Titanic Competition 

In [None]:
#Importing the necessary modules
#Importando as bibliotecas necessarias
from pathlib import Path

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the competition CSV files into DataFrames.
# The Kaggle kernel keeps the files in "../input", a local checkout keeps them
# in "input". The first directory that actually contains train.csv is used, so
# no lines have to be commented in or out when switching environments.
DATA_DIR = next(
    (
        folder
        for folder in (Path("../input"), Path("input"))
        if (folder / "train.csv").is_file()
    ),
    # Fall back to the Kaggle location so a missing file still fails on the
    # same path as before.
    Path("../input"),
)

train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

In [None]:
# Preview the first 5 rows of the training DataFrame
train.head(n=5)

In [None]:
# Preview the test DataFrame. Note that it has no 'Survived' column:
# that is the value we want to predict later on.
test.head(n=5)

In [None]:
# Remove the 'Name', 'Ticket' and 'Cabin' columns from both datasets.
# Several columns can be dropped in a single call by passing them as a list.
# drop() returns a new DataFrame, so the result is assigned back to the same
# name; the in-place alternative would be
# train.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True).
columns_to_remove = ["Name", "Ticket", "Cabin"]

train = train.drop(columns=columns_to_remove)
test = test.drop(columns=columns_to_remove)

In [None]:
# Preview the training data again: 'Name', 'Ticket' and 'Cabin' are gone.
# The same manipulations must always be applied to both 'train' and 'test',
# otherwise later steps would fail.
train.head(n=5)

In [None]:
# Same preview for the test data after dropping the columns
test.head(n=5)

In [None]:
# One-hot encode the 'Sex' and 'Embarked' features so they become numeric.
new_data_train = pd.get_dummies(train)

# Encode the test set the same way, then align it with the training feature
# columns. Encoding the two frames independently does not guarantee identical
# columns (a category present in only one of them would add or remove a
# column), and the model needs the same features in the same order for
# prediction. Columns missing from the test set are filled with 0.
feature_columns = new_data_train.columns.drop("Survived")
new_data_test = pd.get_dummies(test).reindex(columns=feature_columns, fill_value=0)

In [None]:
# After one-hot encoding, 'Sex' and 'Embarked' appear as numerical columns
new_data_train.head(n=5)

In [None]:
# Count the missing (NaN) values per column of the training data and show the
# 10 columns with the most missing values.
train_missing_counts = new_data_train.isna().sum()
train_missing_counts.sort_values(ascending=False).head(10)

In [None]:
# Replace missing 'Age' values with the mean age of the respective dataset.
# The filled column is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# acts on a temporary object and leaves the DataFrame unchanged under pandas
# Copy-on-Write.
# NOTE(review): the test set is filled with its own mean rather than the
# training mean; kept as is so the results stay comparable.
new_data_train["Age"] = new_data_train["Age"].fillna(new_data_train["Age"].mean())
new_data_test["Age"] = new_data_test["Age"].fillna(new_data_test["Age"].mean())

In [None]:
# Count the missing (NaN) values per column of the test data and show the
# 10 columns with the most missing values.
test_missing_counts = new_data_test.isna().sum()
test_missing_counts.sort_values(ascending=False).head(10)

In [None]:
# Replace missing 'Fare' values in the test data with the mean fare.
# The filled column is assigned back to the DataFrame: calling
# fillna(..., inplace=True) on the selected column acts on a temporary object
# and leaves the DataFrame unchanged under pandas Copy-on-Write.
new_data_test["Fare"] = new_data_test["Fare"].fillna(new_data_test["Fare"].mean())

## Decision Tree Model

In [None]:
# Separate the target from the features for the model:
# y holds the 'Survived' labels, X holds every remaining column.
y = new_data_train["Survived"]
X = new_data_train.drop(columns=["Survived"])

In [None]:
# Machine learning algorithm: a decision tree classifier, limited to a depth
# of 10 and seeded so the fit is repeatable.
tree_settings = {"max_depth": 10, "random_state": 0}
tree = DecisionTreeClassifier(**tree_settings)
tree.fit(X, y)

In [None]:
#Optional check: accuracy of the tree on its own training data (uncomment to display)
#tree.score(X, y)

## Decision Tree Results
The decision tree model achieved an accuracy of 0.77990 in the competition.

## Random Forest Model

In [None]:
# Imports used by the random forest section
# (the RandomForestClassifier import was listed twice; one copy is enough)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# The features are already prepared: the 'Survived' column has been removed
# and the categorical features are one-hot encoded.
X.head(n=5)

In [None]:
# Features of the competition test data.
# The test set has no 'Survived' column, so nothing has to be dropped and the
# prepared frame is used as is.
Xtest = new_data_test
Xtest.head(n=5)

In [None]:
# Hold out 20% of the training rows for validation.
# The seed is written as the integer 1: the original passed the boolean True,
# which is interpreted as that same integer, so the split is unchanged.
Xtrain, Xvalidation, Ytrain, Yvalidation = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Random forest model, fitted on the training part of the split
forest_settings = {
    "n_estimators": 100,
    "max_leaf_nodes": 12,
    "max_depth": 12,
    "random_state": 0,
}
model = RandomForestClassifier(**forest_settings)
model.fit(Xtrain, Ytrain)

In [None]:
# Predict the held-out validation rows and measure the accuracy against
# their true labels (accuracy_score is imported in the top import cell).
Yprediction = model.predict(Xvalidation)
accuracy_score(Yvalidation, Yprediction)

In [None]:
# Submission: a new DataFrame holding the passenger ids of the test set and
# the survival predicted by the random forest for each of them.
submission = pd.DataFrame(
    {
        "PassengerId": Xtest["PassengerId"],
        "Survived": model.predict(Xtest),
    }
)

# Save the submission as a '.csv' file, without the index column
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission file contents
submission.head(n=5)

# Final Results
The random forest model achieved an accuracy of 0.80382 on the real test set of the Kaggle competition.  
If you have found this Kernel useful, feel free to use it.