In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


import os
# Lazily build the full path of every file found below the Kaggle input
# directory, in the order os.walk visits them, and print one path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn import set_config


/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Load the Kaggle Titanic training and test splits.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# Column dtypes and non-null counts of the test split. In the recorded run
# Age, Fare and Cabin contain missing values, which the preprocessing has to
# handle. (Bare `.head()` calls that are not the last expression of a cell
# display nothing, so they were removed together with the commented-out
# exploration code.)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [3]:
# Sanity check: number of rows and columns in the test split.
# (A preceding bare `test_data.head()` was dropped: its result was discarded
# because it was not the last expression of the cell.)
print(test_data.shape)

(418, 11)


In [4]:
# Keep the passenger ids and the target before those columns are removed from
# the feature frames below.
train_ids = train_data["PassengerId"]
test_ids = test_data["PassengerId"]
y_data = train_data["Survived"]

# Honorifics that are collapsed into a single "Rare" bucket.
RARE_TITLES = [
    "Lady", "Countess", "Capt", "Col", "Don", "Dr",
    "Major", "Rev", "Sir", "Jonkheer", "Dona",
]
# Full title normalisation: rare titles -> "Rare", alternative spellings
# merged into "Miss" / "Mrs".
TITLE_ALIASES = {
    **dict.fromkeys(RARE_TITLES, "Rare"),
    "Mlle": "Miss",
    "Ms": "Miss",
    "Mme": "Mrs",
}
# Raw columns that are not passed on to the model.
NON_FEATURE_COLUMNS = ["PassengerId", "Name", "Ticket", "Cabin"]


def engineer_features(passengers):
    """Return a new feature frame derived from a raw passenger frame.

    The same transformation is applied to the training and the test split so
    the two cannot drift apart. The input frame is not modified.

    Added columns (appended in this order):
      * Title      - word followed by a dot in ``Name``, normalised with
                     ``TITLE_ALIASES``
      * FamilySize - ``SibSp + Parch + 1``
      * HasCabin   - 1 if ``Cabin`` is present, else 0
      * Sex_Pclass - ``Sex`` and ``Pclass`` joined by an underscore

    The columns listed in ``NON_FEATURE_COLUMNS`` are dropped afterwards.

    Parameters
    ----------
    passengers : pandas.DataFrame
        Frame containing at least Name, SibSp, Parch, Cabin, Sex, Pclass and
        the columns in ``NON_FEATURE_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered columns and without the dropped ones.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    title = (
        passengers["Name"]
        .str.extract(r" ([A-Za-z]+)\.", expand=False)
        .replace(TITLE_ALIASES)
    )
    return passengers.assign(
        Title=title,
        FamilySize=passengers["SibSp"] + passengers["Parch"] + 1,
        HasCabin=passengers["Cabin"].notnull().astype(int),
        Sex_Pclass=passengers["Sex"] + "_" + passengers["Pclass"].astype(str),
    ).drop(columns=NON_FEATURE_COLUMNS)


# NOTE: train_data / test_data are rebound to the feature-only frames that the
# following cells expect, so this cell has to run exactly once after the
# loading cell (a second run fails because the raw columns are gone).
train_data = engineer_features(train_data.drop(columns="Survived"))
test_data = engineer_features(test_data)

print(train_data.shape, test_data.shape)


(891, 11) (418, 11)


In [5]:
# Split the feature columns by dtype: integer/float columns are treated as
# numeric, object (string) columns as categorical.
numeric_features = train_data.select_dtypes(include=["int", "float"]).columns
categorical_features = train_data.select_dtypes(include=["object"]).columns
print(numeric_features, numeric_features.shape,categorical_features, categorical_features.shape)

# -----------------------
# Preprocessing Pipelines
# -----------------------
# Numeric columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: missing values become their own "None" level, then the
# levels are one-hot encoded with the first level of each feature dropped.
# handle_unknown="ignore" keeps transform() from raising when a level was not
# present in the data the encoder was fitted on (e.g. a level that only occurs
# in a validation fold or in the test split); per the scikit-learn docs such a
# level is encoded as all zeros, i.e. like the dropped first level, and a
# warning is emitted.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="None")),
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'HasCabin'], dtype='object') (7,) Index(['Sex', 'Embarked', 'Title', 'Sex_Pclass'], dtype='object') (4,)


In [6]:
# Full pipeline: shared preprocessing followed by a logistic regression.
model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(C=0.5, max_iter=1000)),
])

# 10-fold cross-validation on the training data, keeping the per-fold
# training scores as well so the two averages can be compared.
results = cross_validate(
    model, train_data, y_data,
    cv=10, scoring="accuracy", return_train_score=True,
)
mean_train_accuracy = results["train_score"].mean()
mean_validation_accuracy = results["test_score"].mean()
print("Accuracy Train:", mean_train_accuracy)
print("Accuracy Validation:", mean_validation_accuracy)

# -----------------------
# Predict on test set
# -----------------------
# Refit on the complete training set, then predict the test passengers.
preds = model.fit(train_data, y_data).predict(test_data)
print(preds.shape)

# Pair every transformed feature name with its fitted coefficient.
fitted_preprocessing = model.named_steps['preprocessing']
fitted_classifier = model.named_steps['classifier']
feature_names = fitted_preprocessing.get_feature_names_out()
coefficients = fitted_classifier.coef_[0]

# Build a DataFrame to make the coefficients easier to read, largest first.
df_coeffs = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
print(df_coeffs.sort_values(by='Coefficient', ascending=False))


Accuracy Train: 0.8402545446620652
Accuracy Validation: 0.833932584269663
(418,)
                     Feature  Coefficient
15  cat__Sex_Pclass_female_2     0.779654
13            cat__Title_Mrs     0.555046
6              num__HasCabin     0.400838
19    cat__Sex_Pclass_male_3     0.187307
4                  num__Fare     0.111719
11           cat__Title_Miss     0.050501
8         cat__Embarked_None     0.032852
9            cat__Embarked_Q    -0.001676
3                 num__Parch    -0.074256
5            num__FamilySize    -0.242541
2                 num__SibSp    -0.300592
1                   num__Age    -0.434251
10           cat__Embarked_S    -0.439106
18    cat__Sex_Pclass_male_2    -0.484352
17    cat__Sex_Pclass_male_1    -0.534231
16  cat__Sex_Pclass_female_3    -0.602618
0                num__Pclass    -0.687130
7              cat__Sex_male    -0.831276
14           cat__Title_Rare    -0.981209
12             cat__Title_Mr    -2.027858


In [7]:
# Two-column submission frame: the saved test passenger ids next to the
# predicted labels.
submission = pd.DataFrame({"PassengerId": test_ids}).assign(Survived=preds)

# Shapes of the submission and of both feature frames, one per line.
for checked_frame in (submission, train_data, test_data):
    print(checked_frame.shape)

print(submission.head())

# Write the file without the index column.
submission.to_csv("submission.csv", index=False)
print("Submission file created successfully!")

(418, 2)
(891, 11)
(418, 11)
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1
Submission file created successfully!
