In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [12]:
# Read the raw Titanic CSV into `df` and preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer="/teamspace/studios/this_studio/mlopsrepo/data/raw/titanic.csv"
)
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
# Read the raw Titanic CSV again and print its (rows, columns) shape.
df = pd.read_csv(
    filepath_or_buffer="/teamspace/studios/this_studio/mlopsrepo/data/raw/titanic.csv"
)
print(df.shape)
# --- Define column lists ---
# Columns removed by preprocess_df before any feature handling.
drop_cols = ['PassengerId', 'Name', 'Ticket']
# Columns routed to the "cat" branch: most-frequent imputation, then ordinal encoding.
cat_features = ["Sex", "Cabin", "Embarked"]
# Columns routed to the "num" branch: mean imputation.
num_features = ["Age"]
# Columns left as-is by remainder="passthrough". These names are reused as labels for
# the transformed array, so they must be spelled exactly like the CSV header
# ("Pclass", not "PClass") and listed in the same order as they occur in the frame.
passthrough_features = ["Survived", "Pclass", "SibSp", "Parch", "Fare"]  # keep these as-is

# --- Custom preprocessing function: drop cols + extract first letter from Cabin ---
def preprocess_df(X, cols_to_drop=None):
    """Drop unwanted columns and reduce ``Cabin`` to its first character.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. It must contain ``Cabin`` and every column being dropped.
        The caller's frame is not modified.
    cols_to_drop : list of str, optional
        Columns to remove. When omitted, the notebook-level ``drop_cols`` list
        is used, which keeps ``FunctionTransformer(preprocess_df)`` working
        unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns, where each non-missing
        ``Cabin`` value is replaced by its first character and each missing
        ``Cabin`` value is replaced by the explicit placeholder ``"U"``.

    Raises
    ------
    KeyError
        If ``Cabin`` or one of the columns to drop is absent from ``X``.
    """
    if cols_to_drop is None:
        cols_to_drop = drop_cols

    # drop() returns a new frame, so assigning to it below leaves the input untouched.
    X = X.drop(columns=list(cols_to_drop))

    cabin = X["Cabin"]
    # A plain astype(str) can render a missing value as the text "nan", whose first
    # character "n" would then look like a genuine cabin letter. Overwrite every
    # originally-missing position with "U" so the "unknown" category is explicit and
    # does not depend on how missing values happen to be stringified.
    X["Cabin"] = cabin.astype(str).str[0].where(cabin.notna(), "U")
    return X

# --- Build full pipeline ---
# Step "custom_preprocessing": frame-level clean-up performed by preprocess_df.
# Step "preprocessing": column-wise handling
#   - "cat": most-frequent imputation followed by ordinal encoding of cat_features
#   - "num": mean imputation of num_features
#   - all remaining columns are passed through
full_pipeline = Pipeline(
    steps=[
        ("custom_preprocessing", FunctionTransformer(func=preprocess_df)),
        (
            "preprocessing",
            ColumnTransformer(
                transformers=[
                    (
                        "cat",
                        Pipeline(
                            steps=[
                                ("imputer", SimpleImputer(strategy="most_frequent")),
                                ("encoder", OrdinalEncoder()),
                            ]
                        ),
                        cat_features,
                    ),
                    (
                        "num",
                        Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))]),
                        num_features,
                    ),
                ],
                remainder="passthrough",
            ),
        ),
    ]
)

# --- Fit and transform ---
# Labels for the transformed array: categorical block, then numeric block,
# then the passthrough columns.
all_columns = [*cat_features, *num_features, *passthrough_features]

# Fit every step on the loaded frame and keep the transformed values.
processed_array = full_pipeline.fit_transform(X=df)


(891, 12)


In [17]:
# Wrap the transformed array in a DataFrame labelled with all_columns.
processed_df = pd.DataFrame(
    data=processed_array,
    columns=all_columns,
)


In [18]:
# Display summary statistics (count, mean, std, min, quartiles, max) for processed_df.
processed_df.describe()

Unnamed: 0,Sex,Cabin,Embarked,Age,Survived,PClass,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.647587,6.716049,1.536476,29.699118,0.383838,2.308642,0.523008,0.381594,32.204208
std,0.47799,2.460739,0.791503,13.002015,0.486592,0.836071,1.102743,0.806057,49.693429
min,0.0,0.0,0.0,0.42,0.0,1.0,0.0,0.0,0.0
25%,0.0,8.0,1.0,22.0,0.0,2.0,0.0,0.0,7.9104
50%,1.0,8.0,2.0,29.699118,0.0,3.0,0.0,0.0,14.4542
75%,1.0,8.0,2.0,35.0,1.0,3.0,1.0,0.0,31.0
max,1.0,8.0,2.0,80.0,1.0,3.0,8.0,6.0,512.3292


In [41]:
# Display the current contents of `df`.
# NOTE(review): the first table recorded for this cell shows numeric Sex/Cabin/Embarked
# values and no PassengerId/Name/Ticket columns, yet no visible cell assigns such a
# frame to `df`, and the execution count (41) is out of sequence. Presumably `df` was
# reassigned in a cell that is no longer shown; confirm with Restart & Run All.
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1.0,22.000000,1,0,7.2500,8.0,2.0
1,1,1,0.0,38.000000,1,0,71.2833,2.0,0.0
2,1,3,0.0,26.000000,0,0,7.9250,8.0,2.0
3,1,1,0.0,35.000000,1,0,53.1000,2.0,2.0
4,0,3,1.0,35.000000,0,0,8.0500,8.0,2.0
...,...,...,...,...,...,...,...,...,...
886,0,2,1.0,27.000000,0,0,13.0000,8.0,2.0
887,1,1,0.0,19.000000,0,0,30.0000,1.0,2.0
888,0,3,0.0,29.699118,1,2,23.4500,8.0,2.0
889,1,1,1.0,26.000000,0,0,30.0000,2.0,0.0


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [22]:
# Count the missing values in each column of `df`.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [30]:
# Display the current contents of `df`.
# NOTE(review): the recorded output shows PassengerId/Name/Ticket already removed and
# Cabin reduced to a single letter with "U" where the raw data had no cabin, but no
# visible cell assigns that result to `df`. Presumably it came from a cell that is no
# longer shown; confirm with Restart & Run All.
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,U,S
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,U,S
3,1,1,female,35.0,1,0,53.1000,C,S
4,0,3,male,35.0,0,0,8.0500,U,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,U,S
887,1,1,female,19.0,0,0,30.0000,B,S
888,0,3,female,,1,2,23.4500,U,S
889,1,1,male,26.0,0,0,30.0000,C,C
