In [134]:
import pandas as pd
import sklearn.tree as T
import sklearn.ensemble as E 
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics as M

In [135]:
def idade_media(df_train: pd.DataFrame, df_test: pd.DataFrame) -> pd.Series:
    """Return the mean 'Age' per 'Saudacao' value over both frames combined.

    The two frames are stacked, rows whose 'Age' is missing are discarded and
    the remaining ages are averaged within each 'Saudacao' group. Neither
    input frame is modified.

    Note that rows of ``df_test`` contribute to the means as well.

    Args:
        df_train: Frame providing 'Saudacao' and 'Age' columns.
        df_test: Frame providing 'Saudacao' and 'Age' columns.

    Returns:
        A Series of mean ages indexed by 'Saudacao'. A group in which every
        age is missing does not appear in the result.
    """
    # pd.concat already builds a new frame, so no defensive copies of the
    # inputs are needed, and no other column influences the grouped mean.
    combined = pd.concat([df_train, df_test], ignore_index=True)
    known_ages = combined.dropna(subset=['Age'])
    return known_ages.groupby('Saudacao')['Age'].mean()

def set_idade_media_if_null(row: pd.Series, media):
    """Return the row's 'Age', or its group's replacement age when missing.

    Intended to be used with ``DataFrame.apply(..., axis=1)``.

    Args:
        row: A row exposing 'Age' and 'Saudacao' entries.
        media: Mapping (for example a Series or dict) from a 'Saudacao' value
            to the age to use when 'Age' is missing.

    Returns:
        ``row['Age']`` when it is present, otherwise ``media[row['Saudacao']]``.

    Raises:
        KeyError: If 'Age' is missing and ``media`` has no entry for the
            row's 'Saudacao'.
    """
    age = row['Age']
    # pd.isna also recognises None / pd.NA, whereas np.isnan raises TypeError
    # for non-float values such as None inside an object-dtype row.
    if pd.isna(age):
        return media[row['Saudacao']]
    return age

def tipoIdade(x):
    """Classify an age into one of three bands.

    Returns 'Crianca' for ages below 18, 'Adulto' for ages from 18 up to (but
    not including) 50, and 'Idoso' for anything else. A NaN age fails both
    comparisons and is therefore labelled 'Idoso' as well.
    """
    if x < 18:
        return 'Crianca'
    if 18 <= x < 50:
        return 'Adulto'
    return 'Idoso'

In [136]:
# Load the data.
# With split_train = True a labelled hold-out set is carved out of train.csv
# (30% of the rows, fixed seed); otherwise train.csv and test.csv are read as
# the training set and the set to predict on.
split_train = True
if split_train:
    df = pd.read_csv("train.csv")
    train, test = train_test_split(df, test_size=0.3, random_state=100)
    # Later cells add and overwrite columns on both frames. Take explicit
    # copies so those writes go to independent frames rather than to objects
    # derived from `df` (avoids pandas' SettingWithCopyWarning).
    train = train.copy()
    test = test.copy()
else:
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")

In [137]:
# Feature engineering, applied identically to the train and test frames.

# Groups the titles extracted from 'Name' into a smaller set of categories so
# that both frames share the same values (the test set does not contain every
# title found in the training set, and vice versa).
TITLE_GROUPS = {
    'Dona': 'Miss', 'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}

for frame in (train, test):
    # Keep only the first character of 'Cabin'; missing cabins become 'C'.
    # Assigning the result back (instead of a chained ``inplace=True`` call)
    # guarantees the frame itself is updated.
    frame['Cabin'] = frame['Cabin'].str[0:1].fillna('C')

    # Family size = siblings/spouses + parents/children.
    frame['Tam_Familia'] = frame['SibSp'] + frame['Parch']

    # Title = first run of letters that is followed by a dot in 'Name'.
    frame['Saudacao'] = frame['Name'].str.extract(r"([A-Za-z]+)\.", expand=False)

# Mean age per (still ungrouped) title, computed over both frames together.
_idade_media = idade_media(train, test)

for frame in (train, test):
    # Fill missing ages with the mean age of the passenger's title, then
    # round to whole years.
    frame['Age'] = frame.apply(
        set_idade_media_if_null, axis=1, media=_idade_media
    ).round(0)

    # Collapse the rare titles only after imputing, so the age means above
    # are taken per original title.
    frame['Saudacao'] = frame['Saudacao'].replace(TITLE_GROUPS)

    # Bucket age into 'Crianca' (child), 'Adulto' (adult) or 'Idoso' (elderly).
    frame['t_idade'] = frame['Age'].apply(tipoIdade)

In [138]:
# Preprocess the data: turn the listed columns into integer codes.
#
# Each column's encoder is fitted once on the train and test values combined
# and then applied to both frames, so a given category always receives the
# same code in both. Fitting a separate encoder per frame would assign codes
# by each frame's own sorted set of values, which can differ between frames.
#
# NOTE(review): 'Age' is encoded through its string form, so its codes follow
# string order (e.g. '10.0' sorts before '2.0') rather than numeric order. It
# is kept in the list to preserve the existing column contents; 'Age' is not
# among the `features` used to fit the model in this notebook.
ENCODED_COLUMNS = ['Sex', 't_idade', 'Saudacao', 'Embarked', 'Cabin', 'Age']

for column in ENCODED_COLUMNS:
    train_values = train[column].astype(str)
    test_values = test[column].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values], ignore_index=True))

    train[column] = le.transform(train_values)
    test[column] = le.transform(test_values)

In [139]:
# Fit the model
features = ['Sex', 'Tam_Familia', 'Cabin', 't_idade']

X = train[features]
y = train['Survived']

# Alternatives: T.DecisionTreeClassifier(), E.GradientBoostingClassifier()
# random_state is pinned so that any randomised step inside the estimator is
# repeatable from run to run.
mod = E.HistGradientBoostingClassifier(random_state=100)
mod.fit(X, y)

predict = mod.predict(test[features])
df_pred = test.loc[:, ['PassengerId']]
df_pred['Survived'] = predict

# Accuracy is only reported in split mode, where `test` is the labelled
# hold-out portion of train.csv.
if split_train:
    print(M.accuracy_score(test['Survived'], predict))
print(predict)

0.832089552238806
[1 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1
 1 1 0 1 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1
 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 1 1 1 1 0 1 0 0 0 1 0 1 0 1 1 0 0
 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 1 1
 0 1 1 0 0 1 0 1 0 1 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1 0 0 1 0 0 0 1 0 1
 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 1
 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0
 0 1 1 1 1 1 0 1 0]


In [140]:
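# Plot the decision tree
# (disabled: tree.plot_tree draws a single fitted decision tree, so the code
# below only applies when `mod` is a T.DecisionTreeClassifier)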
#plt.figure(figsize=(10, 10))
#tree.plot_tree(mod, feature_names=features, class_names=['0', '1'], filled=True)
#plt.show()

In [141]:
# Write the predictions to disk as a two-column CSV: PassengerId, Survived.
df_pred = test[['PassengerId']].assign(Survived=predict)
df_pred.to_csv('predict_Classf.csv', index=False)