In [1]:
import pandas as pd

In [2]:
# Uncomment the two lines below if the data is read from Google Drive (Google Colab)
# from google.colab import drive
# drive.mount('/content/gdrive')

In [3]:
# Prefer the local copy of the dataset; when it cannot be opened (for example
# when the notebook runs outside the repository) download it from GitHub.
# Only I/O failures trigger the fallback, so keyboard interrupts and genuine
# bugs are no longer silently swallowed.
try:
    data_orig = pd.read_csv("../data/titanic.csv", index_col="PassengerId")
except OSError:
    data_orig = pd.read_csv("https://raw.githubusercontent.com/pgaliana/IntroPythonCD/main/data/titanic.csv", index_col="PassengerId")

In [4]:
# Preview the first five rows of the raw table.
data_orig.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Preprocesamiento

1. Eliminar datos superfluos
2. Rellenar valores faltantes (imputación)
3. Convertir características de datos categóricos a formato numérico
4. Ingeniería de variables
5. Escalar datos numéricos

### 1. Eliminar datos superfluos

In [5]:
# Build the working frame without the Name, Ticket and Cabin columns;
# data_orig itself is left untouched.
data = data_orig.drop(columns=['Name', 'Ticket', 'Cabin'])
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


### 2. Rellenar valores faltantes (imputación)

In [6]:
# Number of missing entries in each column.
data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [7]:
from sklearn.impute import SimpleImputer
 
def get_parameters(df):
    """Build imputer settings for every column of ``df`` that has missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    dict
        Maps each column name that contains at least one null to a dict with
        two keys: ``'missing_values'`` (the first null marker found in that
        column) and ``'strategy'`` (``'mean'`` for numeric columns,
        ``'most_frequent'`` for everything else).
    """
    parameters = {}
    for col in df.columns[df.isnull().any()]:
        column = df[col]
        # Accept any numeric dtype (float32, nullable Int64, ...) instead of a
        # hard-coded list of three dtype names. Booleans are excluded so they
        # keep being filled with the most frequent value.
        is_numeric = (pd.api.types.is_numeric_dtype(column)
                      and not pd.api.types.is_bool_dtype(column))
        strategy = 'mean' if is_numeric else 'most_frequent'
        # Reuse the column's own null marker as the imputer's placeholder.
        missing_values = column[column.isnull()].values[0]
        parameters[col] = {'missing_values': missing_values, 'strategy': strategy}
    return parameters
 
# Per-column imputer settings for the columns that still contain nulls.
parameters = get_parameters(data)

# Each settings dict holds exactly the two keyword arguments the imputer is
# given (missing_values and strategy), so it is unpacked straight into it.
for col, param in parameters.items():
    imp = SimpleImputer(**param)
    data[col] = imp.fit_transform(data[[col]])

# Re-count the nulls after imputation.
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

### 3. Convertir características de datos categóricos a formato numérico

In [8]:
# Encode the categorical columns as integers.
#
# Alternative kept for reference: one-hot encoding with pandas dummies.
# cat_cols = data.select_dtypes(include=['object','category']).columns
# dummies = pd.get_dummies(data[cat_cols], drop_first=True)
# data[dummies.columns] = dummies
# data.drop(cat_cols, axis=1, inplace=True)

# Column name -> {original label: integer code}
category_codes = {
    'Sex': {'male': 1, 'female': 0},
    'Embarked': {'C': 0, 'Q': 1, 'S': 2},
}
for column, codes in category_codes.items():
    data[column] = data[column].map(codes)

data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,22.0,1,0,7.25,2
2,1,1,0,38.0,1,0,71.2833,0
3,1,3,0,26.0,0,0,7.925,2
4,1,1,0,35.0,1,0,53.1,2
5,0,3,1,35.0,0,0,8.05,2


### 4. Ingeniería de variables

In [9]:
# New feature: element-wise sum of the SibSp and Parch columns.
data['NumFam'] = data['SibSp'].add(data['Parch'])

data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,NumFam
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,1,22.0,1,0,7.25,2,1
2,1,1,0,38.0,1,0,71.2833,0,1
3,1,3,0,26.0,0,0,7.925,2,0
4,1,1,0,35.0,1,0,53.1,2,1
5,0,3,1,35.0,0,0,8.05,2,0


### 5. Escalar datos numéricos

In [10]:
# Scale numeric data
from sklearn.preprocessing import StandardScaler
 
# Only float64 columns are scaled; columns of any other dtype are left as they are.
float_features = data.select_dtypes(include='float64')
num_cols = float_features.columns

# Fit the scaler on those columns and overwrite them with the scaled values.
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(float_features)

data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,NumFam
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,1,-0.592481,1,0,-0.502445,2,1
2,1,1,0,0.638789,1,0,0.786845,0,1
3,1,3,0,-0.284663,0,0,-0.488854,2,0
4,1,1,0,0.407926,1,0,0.42073,2,1
5,0,3,1,0.407926,0,0,-0.486337,2,0


In [11]:
# Write the processed table to disk. index=False leaves the frame's index
# (PassengerId) out of the file.
data.to_csv(path_or_buf='titanic_proc_new.csv', index=False)