In [1]:
# Titanic Competition: https://www.kaggle.com/competitions/titanic/data
import pandas as pd

# Read both competition splits and key each one by its passenger id.
train = pd.read_csv('Dados/titanic/train.csv').set_index('PassengerId')
test = pd.read_csv('Dados/titanic/test.csv').set_index('PassengerId')

# Sanity check: (rows, columns) of each split.
train.shape, test.shape

((891, 11), (418, 10))

In [2]:
# Add two family features derived from the surname, i.e. the text before the
# first comma in 'Name':
#   - 'size_family': how many passengers of the same split share the surname;
#   - 'travel_with_family': 1 when that count is greater than one, else 0.
map_travel_with_family = {False: 0, True: 1}

for passengers in (train, test):
    surnames = passengers['Name'].str.split(',').str[0]
    passengers['Surname'] = surnames
    # Counts are taken per split, so train and test are sized independently.
    passengers['size_family'] = surnames.map(surnames.value_counts())
    shares_surname = passengers['size_family'] > 1
    passengers['travel_with_family'] = shares_surname.map(map_travel_with_family)

In [3]:
# Fill missing cabins in `train` by assuming that passengers sharing a surname
# shared a cabin.
# NOTE(review): only `train` is filled here; `test` keeps its missing cabins.

# Rows with no cabin that travel with family. The comparison is parenthesised
# because `&` binds tighter than `==`: without the parentheses the expression
# is evaluated as `(isnull & flag) == 1`.
missing_cabin = train['Cabin'].isna() & (train['travel_with_family'] == 1)
nan_cabin_passenger = train.loc[
    missing_cabin, ['Surname', 'travel_with_family', 'Name', 'Cabin']
]

# Rows that do have a cabin and whose surname appears among the rows above.
known_cabin = train['Cabin'].notna() & train['Surname'].isin(nan_cabin_passenger['Surname'])
cabin_family = train.loc[known_cabin, ['Name', 'Surname', 'Cabin']]

# One cabin per surname: the first occurrence wins when relatives differ.
mapping_surname_cabin = cabin_family.drop_duplicates('Surname')
mapping_surname_cabin = mapping_surname_cabin.set_index('Surname')['Cabin']

# Only missing cabins are touched; surnames without a known cabin stay NaN.
train['Cabin'] = train['Cabin'].fillna(train['Surname'].map(mapping_surname_cabin))

In [4]:
# Handle the missing cabin values and encode the cabin column
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# cabins_letter = train['Cabin'].astype(str).str.replace(r'[^a-zA-Z]', '', regex=True).str.split()
# def choose_random(val):
#     if 'nan' in val or '' in val:
#         return 'U'
    
#     if len(val) > 1:
#         choice = np.random.choice(val)
#         if len(choice) > 1:
#             return choice[0]
            
#         return choice
#     if len(val[0]) > 1:
#         return val[0][0]
        
#     return val[0]

# cabins_letter = cabins_letter.apply(choose_random)
# train['Cabin'] = cabins_letter

# cabins_letter_test = test['Cabin'].astype(str).str.replace(r'[^a-zA-Z]', '', regex=True).str.split()
# cabins_letter_test = cabins_letter_test.apply(choose_random)
# test['Cabin'] = cabins_letter_test

# enc = OrdinalEncoder()
# cabin_enc = enc.fit_transform(train[['Cabin']])
# train['Cabin'] = cabin_enc[:, 0]

# cabin_enc_test = enc.fit_transform(test[['Cabin']])
# test['Cabin'] = cabin_enc_test[:, 0]

# Encode cabins as integers with ONE encoder shared by both splits.
# Calling fit_transform separately on train and test re-fits the encoder, so
# the same cabin string would receive different codes in each split.
# Missing cabins get an explicit placeholder so the encoder only sees strings.
cabin_train = train['Cabin'].fillna('Unknown')
cabin_test = test['Cabin'].fillna('Unknown')

le = LabelEncoder()
# Fit on the combined vocabulary so a cabin seen only in `test` can be encoded.
le.fit(pd.concat([cabin_train, cabin_test]))
train['Cabin'] = le.transform(cabin_train)
test['Cabin'] = le.transform(cabin_test)


In [5]:
# Handle the missing values in 'Embarked' by filling them with the mode
# (most frequent value) of the training split, then encode the port as an
# integer.

mode_train = train['Embarked'].value_counts()
# idxmax() returns the label with the highest count, i.e. the mode, instead of
# relying on a hard-coded port letter.
embarked_mode = mode_train.idxmax()
train['Embarked'] = train['Embarked'].fillna(embarked_mode)
# Apply the same (train-derived) fill value to the test split.
test['Embarked'] = test['Embarked'].fillna(embarked_mode)

# Encoding "Embarked"
map_embarked = {'S': 0, 'C': 1, 'Q': 2}
train['Embarked'] = train['Embarked'].map(map_embarked)
test['Embarked'] = test['Embarked'].map(map_embarked)

In [6]:
# Encode 'Sex' as an integer: male -> 0, female -> 1.
# An explicit .map is used instead of .replace, whose implicit downcast from
# strings to integers is deprecated in pandas. Any value that is not a key of
# the mapping becomes NaN, so this cell should be run once on the raw strings.
map_sex = {'male': 0, 'female': 1}
train['Sex'] = train['Sex'].map(map_sex)
test['Sex'] = test['Sex'].map(map_sex)

  train['Sex'] = train['Sex'].replace(to_replace=['male', 'female'], value=[0, 1])
  test['Sex'] = test['Sex'].replace(to_replace=['male', 'female'], value=[0, 1])


In [7]:
#The 'ticket' column contains string data. The authors suggest using an encoding method for this data type. I am applying the Feature Hashing method,
#as described in Feature Engineering and Selection by Max Kuhn and Kjell Johnson

from sklearn.feature_extraction import FeatureHasher

# Number of hashed output columns (f0 .. f{features-1}) that replace 'Ticket'.
features = 3

h = FeatureHasher(n_features=features, input_type='string')


def _hash_ticket(frame):
    """Return `frame` with 'Ticket' replaced by feature-hashed columns.

    Each ticket is cast to str, '/' is turned into a space, every remaining
    character that is neither a word character nor whitespace is removed, and
    the result is split on whitespace into tokens. The tokens are hashed with
    the shared hasher `h`; the resulting columns are named f0, f1, ... and
    aligned on the index of `frame`.
    """
    tokens = (
        frame['Ticket']
        .astype(str)
        .str.replace('/', ' ', regex=False)
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.split()
    )
    hashed = h.fit_transform(tokens).toarray()
    hashed_df = pd.DataFrame(
        hashed,
        # shape[1] (rather than len(hashed[0])) also works for an empty frame.
        columns=[f'f{i}' for i in range(hashed.shape[1])],
        index=frame.index,
    )
    return pd.concat([frame, hashed_df], axis=1).drop(columns=['Ticket'])


# The same helper is applied to both splits so their columns cannot drift.
train = _hash_ticket(train)
test = _hash_ticket(test)



In [8]:
# Fraction of passengers whose age is missing: about 20% unknown vs. 80% known,
# so removing the rows with unknown age is not appropriate.
train['Age'].isna().mean()

# Solution: Impute data with KNN. Source: Feature Engineering and Selection by Max Kuhn and Kjell Johnson
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

# Impute missing ages with KNN, using the numeric columns whose absolute
# correlation with 'Age' (measured on the training split) exceeds `corr_abs`.
# 'Age' itself always qualifies (self-correlation is 1).
df_aux = train.select_dtypes(include=['number'])

# The target is excluded from the candidates: it does not exist in `test`, so
# selecting it there would raise a KeyError.
corr = df_aux.drop(columns=['Survived'], errors='ignore').corr()['Age']
corr_abs = 0.15
corr_age_feat = corr[abs(corr) > corr_abs].index

imputer = KNNImputer(n_neighbors=10)
scaler = StandardScaler()

# --- train: fit the scaler and the imputer here, and only here ---
data_to_impute = train[corr_age_feat]
data_scaled = scaler.fit_transform(data_to_impute)
imputer_df_pred = imputer.fit_transform(data_scaled)  # KNN
imputed_data_final = scaler.inverse_transform(imputer_df_pred)  # Return to normal scale

df_aux_train = pd.DataFrame(imputed_data_final, columns=corr_age_feat, index=train.index)
train['Age'] = df_aux_train['Age']

# --- test: reuse the train-fitted scaler and imputer (transform only) ---
# Re-fitting on `test` would scale and search neighbours in a different space
# from the one used for `train`, making the imputed ages inconsistent.
data_to_impute_test = test[corr_age_feat]
data_scaled_test = scaler.transform(data_to_impute_test)
imputer_df_pred_test = imputer.transform(data_scaled_test)  # KNN
imputed_data_final_test = scaler.inverse_transform(imputer_df_pred_test)  # Return to normal scale

df_aux_test = pd.DataFrame(imputed_data_final_test, columns=corr_age_feat, index=test.index)
test['Age'] = df_aux_test['Age']


In [9]:
# Drop the free-text passenger name and the surname derived from it from both
# splits.
text_columns = ['Name', 'Surname']
train = train.drop(columns=text_columns)
test = test.drop(columns=text_columns)

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


# Seed shared by every stochastic estimator so that scores and the final
# submission are reproducible across kernel restarts.
RANDOM_STATE = 42

rfc = RandomForestClassifier(max_depth=500, random_state=RANDOM_STATE)
abc = AdaBoostClassifier(n_estimators=5, random_state=RANDOM_STATE)
gbc = GradientBoostingClassifier(n_estimators=10, random_state=RANDOM_STATE)

# Separate the target from the features; from here on `train` holds only the
# feature columns.
y = train['Survived']
train = train.drop(columns=['Survived'])


In [13]:
# 5-fold cross-validated scores of the random forest; report their mean.
rfc_cvs = cross_val_score(estimator=rfc, X=train, y=y, cv=5)
print(f"Acurácia média: {rfc_cvs.mean()}")

Acurácia média: 0.8136902893729208


In [14]:
# 5-fold cross-validated scores of AdaBoost; report their mean.
abc_cvs = cross_val_score(estimator=abc, X=train, y=y, cv=5)
print(f"Acurácia média: {abc_cvs.mean()}")


Acurácia média: 0.7945891657774151


In [15]:
# 5-fold cross-validated scores of gradient boosting; report their mean.
gbc_cvs = cross_val_score(estimator=gbc, X=train, y=y, cv=5)
print(f"Acurácia média: {gbc_cvs.mean()}")


Acurácia média: 0.8125729709371665


In [16]:
# Shuffled 5-fold evaluation of the random forest: fit on four folds, score on
# the held-out fold, and collect the five accuracies.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = []
for train_idx, test_idx in kf.split(train):
    X_train_fold, X_test_fold = train.iloc[train_idx], train.iloc[test_idx]
    y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[test_idx]

    rfc.fit(X_train_fold, y_train_fold)
    score = rfc.score(X_test_fold, y_test_fold)
    fold_accuracies.append(score)

# After the loop `rfc` has only seen the training part of the last fold.
# Refit on every labelled row so that later predictions use all the data.
rfc.fit(train, y)

# Mean held-out accuracy over the folds (displayed as the cell output).
sum(fold_accuracies) / len(fold_accuracies)



In [17]:
# Predict survival for the competition test split with the fitted forest.
y_pred = rfc.predict(X=test)


In [18]:
# Build the two-column submission table (passenger id, predicted label) and
# write it to disk without the positional index.
submission = (
    pd.DataFrame({"Survived": y_pred}, index=test.index)
    .rename_axis("PassengerId")
    .reset_index()
)

submission.to_csv("submission.csv", index=False)

In [None]:
", index=False)