<a href="https://colab.research.google.com/github/noorehira/ProgrammingForAI/blob/main/Lab_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Column Transformer**

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Sample DataFrame
data = pd.DataFrame({
    'Age': [22, None, 24, 22, None, 24],
    'Sex': ['male', 'female', 'female','male', 'female', 'female'],
    'Embarked': ['B', 'B', 'C', 'C', 'C','S'],
    'Fare': [7.25, 71.83, 8.05,7.25, 71.83, 8.05]
})

print("Original DataFrame:")
print(data)

# Define ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('age_imputer', SimpleImputer(strategy='mean'), ['Age']),
    ('ohe', OneHotEncoder(), ['Sex','Embarked'])
], remainder='passthrough')  # Pass through other columns like 'Fare'

# Transform the data
transformed_data = preprocessor.fit_transform(data)

# Output shape and transformed data
print("\nTransformed Data Shape:")
print(transformed_data.shape)
print(transformed_data)


transformed_df = pd.DataFrame(transformed_data)
print("\nTransformed DataFrame:")
print(transformed_df)

Original DataFrame:
    Age     Sex Embarked   Fare
0  22.0    male        B   7.25
1   NaN  female        B  71.83
2  24.0  female        C   8.05
3  22.0    male        C   7.25
4   NaN  female        C  71.83
5  24.0  female        S   8.05

Transformed Data Shape:
(6, 7)
[[22.    0.    1.    1.    0.    0.    7.25]
 [23.    1.    0.    1.    0.    0.   71.83]
 [24.    1.    0.    0.    1.    0.    8.05]
 [22.    0.    1.    0.    1.    0.    7.25]
 [23.    1.    0.    0.    1.    0.   71.83]
 [24.    1.    0.    0.    0.    1.    8.05]]

Transformed DataFrame:
      0    1    2    3    4    5      6
0  22.0  0.0  1.0  1.0  0.0  0.0   7.25
1  23.0  1.0  0.0  1.0  0.0  0.0  71.83
2  24.0  1.0  0.0  0.0  1.0  0.0   8.05
3  22.0  0.0  1.0  0.0  1.0  0.0   7.25
4  23.0  1.0  0.0  0.0  1.0  0.0  71.83
5  24.0  1.0  0.0  0.0  0.0  1.0   8.05


# **Function Transfomer**

In [8]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline

# Sample DataFrame
data = pd.DataFrame({
    'Age': [22, None, 24,22, None, 24],
    'Sex': ['male', 'female', 'female','male', 'female', 'female'],
    'Embarked': ['B', 'B', 'C', None, 'C','S'],
    'Fare': [7.25, 71.83, 8.05,7.25, 71.83, 8.05]
})

print("Original DataFrame:")
print(data)

# Custom function to impute 'Embarked'
def impute_embarked(X):
    X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])  # Fill missing values
    #print(X['Embarked'])
    return X

# Define ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('age_imputer', SimpleImputer(strategy='mean'), ['Age']),
    ('embarked_imputer', FunctionTransformer(impute_embarked), ['Embarked']),
    ('ohe', OneHotEncoder(), ['Sex','Embarked'])
], remainder='passthrough')  # Pass through other columns like 'Fare'

# Transform the data
transformed_data = preprocessor.fit_transform(data)

# Output shape and transformed data
print("\nTransformed Data Shape:")
print(transformed_data.shape)
print(transformed_data)


transformed_df = pd.DataFrame(transformed_data)
print("\nTransformed DataFrame:")
print(transformed_df)


Original DataFrame:
    Age     Sex Embarked   Fare
0  22.0    male        B   7.25
1   NaN  female        B  71.83
2  24.0  female        C   8.05
3  22.0    male     None   7.25
4   NaN  female        C  71.83
5  24.0  female        S   8.05

Transformed Data Shape:
(6, 9)
[[22.0 'B' 0.0 1.0 1.0 0.0 0.0 0.0 7.25]
 [23.0 'B' 1.0 0.0 1.0 0.0 0.0 0.0 71.83]
 [24.0 'C' 1.0 0.0 0.0 1.0 0.0 0.0 8.05]
 [22.0 'B' 0.0 1.0 0.0 0.0 0.0 1.0 7.25]
 [23.0 'C' 1.0 0.0 0.0 1.0 0.0 0.0 71.83]
 [24.0 'S' 1.0 0.0 0.0 0.0 1.0 0.0 8.05]]

Transformed DataFrame:
      0  1    2    3    4    5    6    7      8
0  22.0  B  0.0  1.0  1.0  0.0  0.0  0.0   7.25
1  23.0  B  1.0  0.0  1.0  0.0  0.0  0.0  71.83
2  24.0  C  1.0  0.0  0.0  1.0  0.0  0.0   8.05
3  22.0  B  0.0  1.0  0.0  0.0  0.0  1.0   7.25
4  23.0  C  1.0  0.0  0.0  1.0  0.0  0.0  71.83
5  24.0  S  1.0  0.0  0.0  0.0  1.0  0.0   8.05


# **Sklean Pipeline**

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline

# Sample DataFrame
data = pd.DataFrame({
    'Age': [22, None, 24,22, None, 24],
    'Sex': ['male', 'female', 'female','male', 'female', 'female'],
    'Embarked': ['B', 'B', 'C', None, 'C','S'],
    'Fare': [7.25, 71.83, 8.05,7.25, 71.83, 8.05]
})

print("Original DataFrame:")
print(data)

# Custom function to impute 'Embarked'
def impute_embarked(X):
    X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])  # Fill missing values
    print (X['Embarked'])
    return X

preprocessor = ColumnTransformer(transformers=[
    ('age_imputer', SimpleImputer(strategy='mean'), ['Age']),
    ('embarked_encoder', Pipeline(steps=[
        ('imputer', FunctionTransformer(impute_embarked)),  # Impute Embarked first
        ('onehot', OneHotEncoder())  # Then apply OneHotEncoder
    ]), ['Embarked']),
    ('ohe', OneHotEncoder(), ['Sex'])
], remainder='passthrough')  # Pass through other columns like 'Fare'


# Transform the data
transformed_data = preprocessor.fit_transform(data)

# Output shape and transformed data
print("\nTransformed Data Shape:")
print(transformed_data.shape)
print(transformed_data)

transformed_df = pd.DataFrame(transformed_data)
print("\nTransformed DataFrame:")
print(transformed_df)


Original DataFrame:
    Age     Sex Embarked   Fare
0  22.0    male        B   7.25
1   NaN  female        B  71.83
2  24.0  female        C   8.05
3  22.0    male     None   7.25
4   NaN  female        C  71.83
5  24.0  female        S   8.05
0    B
1    B
2    C
3    B
4    C
5    S
Name: Embarked, dtype: object

Transformed Data Shape:
(6, 7)
[[22.    1.    0.    0.    0.    1.    7.25]
 [23.    1.    0.    0.    1.    0.   71.83]
 [24.    0.    1.    0.    1.    0.    8.05]
 [22.    1.    0.    0.    0.    1.    7.25]
 [23.    0.    1.    0.    1.    0.   71.83]
 [24.    0.    0.    1.    1.    0.    8.05]]

Transformed DataFrame:
      0    1    2    3    4    5      6
0  22.0  1.0  0.0  0.0  0.0  1.0   7.25
1  23.0  1.0  0.0  0.0  1.0  0.0  71.83
2  24.0  0.0  1.0  0.0  1.0  0.0   8.05
3  22.0  1.0  0.0  0.0  0.0  1.0   7.25
4  23.0  0.0  1.0  0.0  1.0  0.0  71.83
5  24.0  0.0  0.0  1.0  1.0  0.0   8.05


In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the Titanic dataset
# Replace the path with the correct location of your Titanic dataset
titanic_data = pd.read_csv("titanic.csv")

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Apply MinMaxScaler on 'Age' and 'Fare' columns
titanic_data[['Age', 'Fare']] = scaler.fit_transform(titanic_data[['Age', 'Fare']])

# Display the transformed data
print(titanic_data[['Age', 'Fare']].head())
titanic_data['FamilySize'] = titanic_data['SibSp'] + titanic_data['Parch'] + 1

# Display the updated data
print(titanic_data[['Age', 'Fare', 'FamilySize']].head())


        Age      Fare
0  0.452723  0.015282
1  0.617566  0.013663
2  0.815377  0.018909
3  0.353818  0.016908
4  0.287881  0.023984
        Age      Fare  FamilySize
0  0.452723  0.015282           1
1  0.617566  0.013663           2
2  0.815377  0.018909           1
3  0.353818  0.016908           1
4  0.287881  0.023984           3


In [13]:
def add_family_size(X):
    X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
    return X

# Define transformers
scaler = MinMaxScaler()
family_size_transformer = FunctionTransformer(add_family_size)

# Create ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', scaler, ['Age', 'Fare']),
        ('family_size', family_size_transformer, ['SibSp', 'Parch'])  # We use 'SibSp' and 'Parch' to calculate 'FamilySize'
    ],
    remainder='passthrough'  # Keeps the other columns as they are
)
print(titanic_data[['Age', 'Fare', 'FamilySize']].head())


        Age      Fare  FamilySize
0  0.452723  0.015282           1
1  0.617566  0.013663           2
2  0.815377  0.018909           1
3  0.353818  0.016908           1
4  0.287881  0.023984           3


In [20]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline

# Sample DataFrame
data = pd.DataFrame({
    'Age': [22, None, 24, 22, None, 24],
    'Sex': ['male', 'female', 'female', 'male', 'female', 'female'],
    'Embarked': ['B', 'B', 'C', None, 'C', 'S'],
    'Fare': [7.25, 71.83, 8.05, 7.25, 71.83, 8.05],
    'SibSp': [1, 0, 1, 1, 0, 2],
    'Parch': [0, 1, 2, 0, 2, 1]
})

print("Original DataFrame:")
print(data)

# Custom function to impute 'Embarked'
def impute_embarked(X):
    X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])  # Fill missing values
    return X

# Custom function to calculate 'FamilySize'
def calculate_family_size(X):
    X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
    return X[['FamilySize']]

# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('age_imputer', SimpleImputer(strategy='mean'), ['Age']),
    ('embarked_encoder', Pipeline(steps=[
        ('imputer', FunctionTransformer(impute_embarked)),  # Impute Embarked first
        ('onehot', OneHotEncoder())  # Then apply OneHotEncoder
    ]), ['Embarked']),
    ('sex_encoder', OneHotEncoder(), ['Sex']),
    ('family_size', FunctionTransformer(calculate_family_size), ['SibSp', 'Parch'])  # Create FamilySize feature
], remainder='passthrough')  # Pass through other columns like 'Fare'

# Transform the data
transformed_data = print(preprocessor.fit_transform(data))

# # Convert the result to a DataFrame for readability
# transformed_df = print(pd.DataFrame(transformed_data, columns=['Age', 'Embarked_B', 'Embarked_C', 'Embarked_S', 'Sex_female', 'Sex_male', 'FamilySize', 'Fare']))
# print("\nTransformed DataFrame:")
# print(transformed_df)


Original DataFrame:
    Age     Sex Embarked   Fare  SibSp  Parch
0  22.0    male        B   7.25      1      0
1   NaN  female        B  71.83      0      1
2  24.0  female        C   8.05      1      2
3  22.0    male     None   7.25      1      0
4   NaN  female        C  71.83      0      2
5  24.0  female        S   8.05      2      1
[[22.    1.    0.    0.    0.    1.    2.    7.25]
 [23.    1.    0.    0.    1.    0.    2.   71.83]
 [24.    0.    1.    0.    1.    0.    4.    8.05]
 [22.    1.    0.    0.    0.    1.    2.    7.25]
 [23.    0.    1.    0.    1.    0.    3.   71.83]
 [24.    0.    0.    1.    1.    0.    4.    8.05]]
Empty DataFrame
Columns: [Age, Embarked_B, Embarked_C, Embarked_S, Sex_female, Sex_male, FamilySize, Fare]
Index: []


# Lab Task

**Add Scaling preprocessing step using MinMacScaler function from sklearn.preprocessing module for Age and Fare Columns in the titanic dataset.**

**Apply Feature Creation preprocessing step to create a Family Size feature which calculates Family Size for each Passenger using following Equation. FamilySize = SibSp + Parch + 1**

**Use ColumnTransformer, FunctionTransformer and Pipeline features to preprocess the following dataset**

https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease