# TAREA EXTRA: ENCODING Y FEATURE ENGINEERING CON EL DATASET TITANIC

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:

# Load the seaborn "titanic" dataset, drop rows with a missing value in any of
# the listed columns, keep only the columns used below, and add two derived
# features. Everything is built in a single chain that returns a fresh frame,
# so no column is ever assigned onto a subset of another DataFrame (the
# pattern that can raise SettingWithCopyWarning).
df = (
    sns.load_dataset("titanic")
    .dropna(subset=["age", "fare", "embarked", "sex", "class", "survived"])
    .loc[:, ["sex", "age", "fare", "embarked", "class", "alone", "survived"]]
    .assign(
        # Bucket age into five labelled, right-closed intervals:
        # (0, 12], (12, 18], (18, 40], (40, 60], (60, 80].
        age_group=lambda frame: pd.cut(
            frame["age"],
            bins=[0, 12, 18, 40, 60, 80],
            labels=["niño", "adolescente", "adulto", "maduro", "mayor"],
        ),
        # log(1 + fare), which is defined for a fare of 0.
        fare_log=lambda frame: np.log1p(frame["fare"]),
    )
)

# Feature matrix (everything except the target) and target vector.
X = df.drop("survived", axis=1)
y = df["survived"]

# Columns routed to one-hot encoding vs. standard scaling in the next cell.
# The raw "fare" column is in neither list; only its log transform is used.
cat_cols = ["sex", "embarked", "class", "age_group", "alone"]
num_cols = ["age", "fare_log"]



### Preprocesamiento y modelo 

In [3]:

# Split the raw features FIRST. Fitting the encoder/scaler before splitting
# would let the test rows influence the learned categories and the scaling
# mean/variance (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One-hot encode the categorical columns (dropping the first level of each)
# and standardize the numeric columns. Columns listed in neither group are
# discarded by ColumnTransformer's default remainder="drop".
ct = ColumnTransformer([
    ("onehot", OneHotEncoder(drop="first"), cat_cols),
    ("scaler", StandardScaler(), num_cols)
])

# Learn the preprocessing parameters from the training rows only, then apply
# the already-fitted transformer to the held-out rows.
X_train = ct.fit_transform(X_train_raw)
X_test = ct.transform(X_test_raw)

# Encoded view of the full dataset (transform only, no refit); later cells use
# it to report the number of features produced by the encoding.
X_encoded = ct.transform(X)

# Fixed seed so the forest is reproducible across runs.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

### Evaluación con métricas

In [4]:

# Accuracy of the predictions on the held-out test labels.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

# Width of the encoded matrix = number of features produced by the encoding.
_, n_encoded_features = X_encoded.shape

print(f"Accuracy del modelo: {acc:.3f}")
print(f"Número total de features después de encoding: {n_encoded_features}")


Accuracy del modelo: 0.748
Número total de features después de encoding: 12
