In [55]:
# ============================================
# 1. Importações
# ============================================
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score


# ============================================
# 2. List the files available in the Kaggle input directory
# ============================================
# Walk the input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)


# ============================================
# 3. Load the data
# ============================================
# Read the training and test CSV files into two DataFrames, in that order.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)


# ============================================
# 4. Missing-value imputation
# ============================================
# Every fill value is learned from the training set only and then applied to
# both frames, so a missing value is replaced by the same number in train and
# test and the test preprocessing does not depend on the test distribution.
fill_values = {
    "Fare": train_data["Fare"].mean(),
    "Age": train_data["Age"].mean(),
    "Embarked": train_data["Embarked"].mode()[0],
}

for frame in (train_data, test_data):
    for column, fill_value in fill_values.items():
        frame[column] = frame[column].fillna(fill_value)
    # First character of the cabin string; rows with no cabin get "Unknown".
    frame["CabinLetter"] = frame["Cabin"].str[0].fillna("Unknown")


# ============================================
# 5. Feature Engineering
# ============================================
# FareBin: decile index of the fare. The decile edges are computed on the
# training set and reused for the test set, so a given FareBin code covers the
# same fare range in both frames (running qcut on each frame separately would
# give each frame its own edges).
train_fare_bin, fare_edges = pd.qcut(
    train_data["Fare"], 10, labels=False, retbins=True
)
train_data["FareBin"] = train_fare_bin

# Open the outer edges so test fares below/above the training range still fall
# into the first/last bin instead of becoming NaN.
fare_edges = np.concatenate(([-np.inf], fare_edges[1:-1], [np.inf]))
test_data["FareBin"] = pd.cut(test_data["Fare"], bins=fare_edges, labels=False)

# Fixed age brackets; include_lowest keeps an age of exactly 0 in the first
# bin (without it the first interval is open on the left).
age_edges = [0, 12, 18, 35, 60, 80]

# The remaining features are row-wise, so the same recipe is applied to both
# the training and the test frame.
for frame in (train_data, test_data):
    frame["AgeBin"] = pd.cut(
        frame["Age"], bins=age_edges, labels=False, include_lowest=True
    )
    # Passenger plus siblings/spouses plus parents/children.
    frame["FamilySize"] = frame["SibSp"] + frame["Parch"] + 1
    frame["IsAlone"] = (frame["FamilySize"] == 1).astype(int)
    frame["FarePerPerson"] = frame["Fare"] / frame["FamilySize"]
    frame["ClassFareInteraction"] = frame["Pclass"] * frame["Fare"]


# ============================================
# 6. Categorical encoding (OneHotEncoder)
# ============================================
# Columns that are one-hot encoded vs. columns passed through unchanged.
categorical_cols = ["Sex", "Embarked", "CabinLetter", "IsAlone"]
num_features = [
    "Pclass", "SibSp", "Parch", "FareBin", "AgeBin",
    "Fare", "Age", "FamilySize", "ClassFareInteraction", "FarePerPerson",
]

# -- Train
# The encoder is fitted on the training categories only; categories that show
# up only in the test set are ignored (handle_unknown='ignore').
cat_encoder = OneHotEncoder(handle_unknown='ignore')
train_data_cat = train_data[categorical_cols]
cat_encoder.fit(train_data_cat)
onehot_columns = cat_encoder.get_feature_names_out()

train_data_cat_1hot = cat_encoder.transform(train_data_cat)
encoded_train_df = pd.DataFrame(
    train_data_cat_1hot.toarray(), columns=onehot_columns
)

# Numeric features and one-hot columns side by side; indexes are reset so the
# two pieces are joined by row position.
train_data_num = train_data[num_features]
train_data_final = pd.concat(
    [frame.reset_index(drop=True) for frame in (train_data_num, encoded_train_df)],
    axis=1,
)
y_train = train_data["Survived"].copy()

# -- Test
# Same fitted encoder, transform only.
test_data_cat = test_data[categorical_cols]
test_data_cat_1hot = cat_encoder.transform(test_data_cat)
encoded_test_df = pd.DataFrame(
    test_data_cat_1hot.toarray(), columns=onehot_columns
)

test_data_num = test_data[num_features]
test_data_final = pd.concat(
    [frame.reset_index(drop=True) for frame in (test_data_num, encoded_test_df)],
    axis=1,
)


# ============================================
# 7. Split for local validation
# ============================================
# Hold out 20% of the training rows, stratified on the target so both parts
# keep the same class proportions; the seed makes the split repeatable.
split_parts = train_test_split(
    train_data_final,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)
X_train_strat, X_val_strat, y_train_strat, y_val_strat = split_parts


# ============================================
# 8. Model
# ============================================
rfc = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=8,
    min_samples_split=8,
    min_samples_leaf=2,
    random_state=42,  # fixed seed so reruns produce the same submission
)


# ============================================
# 9. Training
# ============================================
# Local validation: fit on the training part of the split and score on the
# held-out part, so there is a quality estimate before submitting.
rfc.fit(X_train_strat, y_train_strat)
val_predictions = rfc.predict(X_val_strat)
print(f"Validation accuracy: {accuracy_score(y_val_strat, val_predictions):.4f}")
print(f"Validation F1: {f1_score(y_val_strat, val_predictions):.4f}")

# Final model: refit the same estimator on the full training set (fit()
# discards the previous forest), which is what the submission uses.
rfc.fit(train_data_final, y_train)


# ============================================
# 10. Prediction on the test set for submission
# ============================================
predictions = rfc.predict(test_data_final)

# Two-column frame: the passenger ids from the test file next to the
# predicted labels.
submission = test_data[["PassengerId"]].assign(Survived=predictions)

# Written without the index column.
submission.to_csv("submission.csv", index=False)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
