In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load
# The raw training data must exist at data/raw/train.csv (the path is relative
# to the kernel's working directory).
try:
    df = pd.read_csv("data/raw/train.csv")
except FileNotFoundError:
    print("‚ùå ERROR: No encuentro data/raw/train.csv")
    # Re-raise: if the error were swallowed here, the rest of the notebook would
    # either fail with a NameError on `df` or, worse, silently keep working on
    # a stale `df` left over in the kernel from an earlier run.
    raise
else:
    print("‚úÖ Datos cargados.")

# 2. Title engineering (the key to imputing Age)
# Pull the honorific (Mr, Mrs, Miss, Master, ...) out of the passenger name.
if 'Name' in df.columns:
    # Raw string: in a regular literal `\.` is an invalid escape sequence that
    # newer Python versions warn about. The pattern captures the first run of
    # letters that is preceded by a space and followed by a period.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Fold the rare titles into the common ones
    title_mapping = {
        "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
        "Lady": "Mrs", "Countess": "Mrs",
        "Capt": "Mr", "Col": "Mr", "Don": "Mr", "Dr": "Mr",
        "Major": "Mr", "Rev": "Mr", "Sir": "Mr", "Jonkheer": "Mr",
        "Dona": "Mrs"
    }
    df['Title'] = df['Title'].replace(title_mapping)

    # Impute Age with the median age of the passenger's Title group. A row whose
    # Title could not be extracted, or whose group has no known age at all, gets
    # no usable group median, so fall back to the overall median of known ages.
    age_by_title = df.groupby('Title')['Age'].transform('median')
    df['Age'] = df['Age'].fillna(age_by_title).fillna(df['Age'].median())

    # Final cleanup of columns that are no longer needed. errors='ignore' keeps
    # the cell working when Ticket / PassengerId are absent from the input
    # (only Name is checked above).
    df = df.drop(columns=['Name', 'Ticket', 'PassengerId'], errors='ignore')

# 3. Cabin and Embarked cleanup
if 'Cabin' in df.columns:
    # Binary flag: 1 when a cabin value is present, 0 when it is missing.
    # Vectorized notna() replaces a per-row Python lambda; the explicit int64
    # cast keeps the same dtype the original produced.
    df['Has_Cabin'] = df['Cabin'].notna().astype('int64')
    df = df.drop(columns='Cabin')

if 'Embarked' in df.columns:
    # Fill missing values with the most frequent one. mode() returns an empty
    # Series when the column holds no values at all; indexing it would raise,
    # and there is nothing to fill with, so the column is left as is.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

# 4. Final check
print("--- ESTADO ACTUAL ---")
df.info()

‚úÖ Datos cargados.
--- ESTADO ACTUAL ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Survived   891 non-null    int64  
 1   Pclass     891 non-null    int64  
 2   Sex        891 non-null    object 
 3   Age        891 non-null    float64
 4   SibSp      891 non-null    int64  
 5   Parch      891 non-null    int64  
 6   Fare       891 non-null    float64
 7   Embarked   891 non-null    object 
 8   Title      891 non-null    object 
 9   Has_Cabin  891 non-null    int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 69.7+ KB


In [2]:
from pathlib import Path

import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 1. Prepare the data for the model (everything numeric)
# Encode Sex as 0/1. The dtype check makes this cell safe to re-run: mapping an
# already-encoded column a second time would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    sex_encoded = df['Sex'].map({'male': 0, 'female': 1})
    if sex_encoded.isna().any():
        # Fail here with the offending values instead of letting NaN flow
        # into the training step.
        unexpected = df.loc[sex_encoded.isna(), 'Sex'].unique().tolist()
        raise ValueError(f"Valores no reconocidos en 'Sex': {unexpected}")
    df['Sex'] = sex_encoded

# One-hot encode the remaining categoricals (Embarked, Title). Only columns
# that are still present get encoded, so a second run of this cell is a no-op
# rather than a KeyError.
categorical_cols = [col for col in ('Embarked', 'Title') if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# 2. Split the frame into features X (the "questions") and target y (the "answers")
y = df['Survived']
X = df.drop(columns=['Survived'])

# 3. Train/test partition (80% / 20%); the fixed random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Train the model (fit() returns the fitted estimator itself)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# 5. Evaluate on the held-out rows
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"üéØ Precisi√≥n recuperada: {acc:.2%}")

# 6. Persist the model (overwrites any earlier file so the saved model matches
# this training run)
MODEL_PATH = 'models/titanic_logistic_v1.pkl'
# joblib.dump does not create missing directories; make sure models/ exists so
# the save does not fail on a fresh checkout.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, MODEL_PATH)
print("üíæ Modelo guardado correctamente en models/")

üéØ Precisi√≥n recuperada: 82.12%
üíæ Modelo guardado correctamente en models/
