In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
np.random.seed(0)

# Load the Titanic train and test splits from the Kaggle input directory.
data_train = pd.read_csv("/kaggle/input/titanic/train.csv")
data_teste = pd.read_csv("/kaggle/input/titanic/test.csv")

# Summary statistics of the training split.
data_train.describe()

## *Análise Inicial do Dataset*

In [None]:
# Preview the first rows of the training split.
data_train.head()

In [None]:
# Preview the first rows of the test split.
data_teste.head()

In [None]:
# Show dtypes and non-null counts for BOTH splits.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also emit "None").
data_train.info()
print()
data_teste.info()

In [None]:
# Print the column labels of each split so their schemas can be compared.
for caption, frame in (('Train Columns: ', data_train), ('\nTest Columns: ', data_teste)):
    print(caption, frame.columns)

- Categorical Features: Sex, Survived (variável alvo), Pclass, Embarked
- Numerical Features: Age (Contínua), Fare (Contínua), SibSp (Discreta), Parch (Discreta)
- Alphanumeric: Cabin, Ticket

In [None]:
# Count the missing values in each column of the training split.
data_train.isna().sum()

In [None]:
# Count the missing values in each column of the test split.
data_teste.isna().sum()

## *Tratando os dados*

In [None]:
# Work on copies so the frames loaded from disk are not altered.
train_data, teste_data = data_train.copy(), data_teste.copy()

In [None]:
# Columns excluded from the feature set in both splits.
cols = ['Cabin', 'Ticket', 'Name']
X_train = train_data.drop(columns=cols)
X_teste = teste_data.drop(columns=cols)

In [None]:
# Fill missing values: median for the numeric columns, mode for the categorical one.
# The statistics are computed on the training split only and reused for the
# test split, so both splits receive exactly the same preprocessing.
median_age = X_train['Age'].median()
median_fare = X_train['Fare'].median()
mode_embarked = X_train['Embarked'].mode()[0]
fill_values = {'Age': median_age, 'Fare': median_fare, 'Embarked': mode_embarked}

# Assign the filled frame back instead of calling fillna(..., inplace=True) on
# a selected column: that chained in-place form is deprecated and leaves the
# parent frame unchanged when pandas Copy-on-Write is active.
X_train = X_train.fillna(value=fill_values)
X_teste = X_teste.fillna(value=fill_values)

In [None]:
# Split the columns of each frame into categorical and numerical groups.
def _columns_by_kind(frame):
    """Return ``(categorical, numerical)`` lists of column names for ``frame``.

    Categorical: object-dtype columns with fewer than 10 distinct values.
    Numerical: columns whose dtype is int64 or float64.
    """
    categorical = []
    numerical = []
    for name in frame.columns:
        column = frame[name]
        if column.nunique() < 10 and column.dtype == "object":
            categorical.append(name)
        if column.dtype in ['int64', 'float64']:
            numerical.append(name)
    return categorical, numerical


categorical_cols, numerical_cols = _columns_by_kind(X_train)
categorical_colsT, numerical_colsT = _columns_by_kind(X_teste)

# Keep only the selected columns, categorical ones first.
my_cols = categorical_cols + numerical_cols
my_colsT = categorical_colsT + numerical_colsT
train = X_train[my_cols].copy()
teste = X_teste[my_colsT].copy()

In [None]:
def one_hot_encode_dataframe(df, categorical_columns):
    """One-hot encode the given categorical columns of a DataFrame.

    Every listed column is removed and replaced by one float indicator column
    per distinct value, named ``<column>_<value>``. The indicator columns are
    appended after the remaining columns, in the order given by
    ``categorical_columns``.

    Args:
        df: Source DataFrame; it is not modified.
        categorical_columns: Names of the columns to encode.

    Returns:
        A new DataFrame with the same row index as ``df``, the untouched
        columns first and the 0.0/1.0 indicator columns after them.

    Note:
        A row whose value is missing in an encoded column gets 0.0 in all of
        that column's indicators.
    """
    categorical_columns = list(categorical_columns)
    remaining = df.drop(columns=categorical_columns)

    # pd.get_dummies keeps the original row index, so the concatenation below
    # lines rows up one-to-one even when df's index is not 0..n-1.
    indicators = [
        pd.get_dummies(df[column], prefix=column, dtype=float)
        for column in categorical_columns
    ]
    return pd.concat([remaining, *indicators], axis=1)

In [None]:
# Replace the categorical columns of each split with one-hot indicator columns.
train = one_hot_encode_dataframe(train, categorical_cols)
teste = one_hot_encode_dataframe(teste, categorical_colsT)

In [None]:
# Bucket Age into the fixed bands (0, 16], (16, 32], (32, 50], (50, 64], (64, inf],
# labelled 1 to 5, using the same edges for both splits.
bins = [0, 16, 32, 50, 64, np.inf]
labels = [1, 2, 3, 4, 5]
for frame in (train, teste):
    frame['AgeGroup'] = pd.cut(frame["Age"], bins=bins, labels=labels)

In [None]:
# Bucket Fare into six groups holding roughly equal numbers of training rows.
# The bin edges are learned on the training split and reused for the test
# split, so a given group label covers the same fare range in both.
fare_labels = [1, 2, 3, 4, 5, 6]
train['FareGroup'], fare_bins = pd.qcut(
    train["Fare"], q=6, labels=fare_labels, retbins=True
)

# Open the outer edges so test fares below/above the training range still
# fall into the first/last group instead of becoming NaN.
fare_bins[0], fare_bins[-1] = -np.inf, np.inf
teste['FareGroup'] = pd.cut(teste["Fare"], bins=fare_bins, labels=fare_labels)

In [None]:
# Drop the raw columns that are now represented by AgeGroup and FareGroup.
binned_source_cols = ['Fare', 'Age']
train = train.drop(columns=binned_source_cols)
teste = teste.drop(columns=binned_source_cols)

## *Exibindo Gráficos de Sobrevivência*

In [None]:
# Copy of the loaded training data, used for plotting the charts.
graph_train = data_train.copy()

In [None]:
# Re-bin Age with the same edges as before, this time with string labels for the plot.
bins = [0, 16, 32, 50, 64, np.inf]
labels = ['Criança', 'Jovem', 'Adulto', 'Senhor', 'Idoso']
graph_train['AgeGroup'] = pd.cut(graph_train["Age"], bins=bins, labels=labels)

# Bar height is the mean of Survived within each age group.
fig, ax = plt.subplots()
ax.set_title('Sobrevivencia por Grupo de Idade')
sns.barplot(data=graph_train, x="AgeGroup", y="Survived", ax=ax)

In [None]:
# Re-bin Fare into six quantile groups, this time with string labels for the plot.
fare_names = ['Muito Baixo', 'Baixo', 'Médio-Baixo', 'Médio', 'Alto', 'Muito Alto']
graph_train['FareGroup'] = pd.qcut(graph_train["Fare"], q=6, labels=fare_names)

# Bar height is the mean of Survived within each fare group.
fig, ax = plt.subplots()
ax.set_title('Sobrevivencia por Tarifa')
sns.barplot(data=graph_train, x="FareGroup", y="Survived", ax=ax)

In [None]:
# Mean of Survived for each value of Parch.
sns.barplot(data=graph_train, x="Parch", y="Survived")

In [None]:
# Mean of Survived for each value of SibSp.
sns.barplot(data=graph_train, x="SibSp", y="Survived")

In [None]:
# Mean of Survived for each value of Sex.
sns.barplot(data=graph_train, x="Sex", y="Survived")

## *Treinando o modelo*

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 20% of the rows for validation.
# NOTE(review): PassengerId appears to remain among the feature columns here — confirm this is intended.
Y = train['Survived']
X = train.drop(columns='Survived')

x_train, x_val, y_train, y_val = train_test_split(
    X, Y, test_size=0.20, random_state=0
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit a random forest on the training rows and report validation accuracy in percent.
# A fixed random_state makes the fitted model independent of how much of the
# global NumPy random stream earlier cells have consumed.
randomforest = RandomForestClassifier(random_state=0)
randomforest.fit(x_train, y_train)

y_pred = randomforest.predict(x_val)
# accuracy_score takes (y_true, y_pred); the value is the same either way for
# accuracy, but the documented order is used for clarity.
accuracy_randomforest = round(accuracy_score(y_val, y_pred) * 100, 2)
print(accuracy_randomforest)


## *Gerando submissão para a competição*

In [None]:
# Predict on the competition test set and write the submission file.
teste_ids = teste['PassengerId']

# Select the features by the names the model was fitted on, so prediction does
# not depend on the test frame happening to have the same column order.
X_teste = teste[list(randomforest.feature_names_in_)]
predictions = randomforest.predict(X_teste)

submission = pd.DataFrame({"PassengerId": teste_ids, "Survived": predictions})
submission.to_csv("submission.csv", index=False)