# **Feature engineering Notebook**

## Objectives

* Engineer features for Classification

## Inputs

* outputs/datasets/cleaned/TrainSetCleaned.csv
* outputs/datasets/cleaned/TestSetCleaned.csv

## Outputs

* Generate a list of variables to use in the model



---

# Set up the Working Directory

Define and confirm the working directory.

In [None]:
import os
# Move one level up from wherever the notebook was started.
# NOTE(review): presumably the notebook lives one folder below the project
# root, so the relative "outputs/..." paths used later resolve - confirm.
# Re-running this cell moves up one more level each time.
parent_dir = os.path.dirname(os.getcwd())
os.chdir(parent_dir)

# Read the working directory back after the change and display it.
current_dir = os.getcwd()
current_dir

# Load data

Loading train set

In [None]:
import pandas as pd

# Path of the cleaned training split, relative to the working directory.
train_set_path = "outputs/datasets/cleaned/TrainSetCleaned.csv"

# Read the CSV and preview its first three rows.
TrainSet = pd.read_csv(filepath_or_buffer=train_set_path)
TrainSet.head(n=3)

Loading test set

In [None]:
# Path of the cleaned test split, relative to the working directory.
test_set_path = "outputs/datasets/cleaned/TestSetCleaned.csv"

# Read the CSV and preview its first three rows.
TestSet = pd.read_csv(filepath_or_buffer=test_set_path)
TestSet.head(n=3)

The data has already been checked and cleaned, and is ready for the feature engineering step.

---

# Custom function for feature engineering


Feature engineering is a crucial step in the data preprocessing process, and it can significantly impact the performance of the machine learning models.
We are going to use the custom function from the Code Institute walkthrough project 'Churnometer'.

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
from feature_engine import transformation as vt
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import OrdinalEncoder
sns.set(style="whitegrid")
warnings.filterwarnings('ignore')


def FeatureEngineeringAnalysis(df, analysis_type=None):
    """
    Run a quick feature-engineering comparison over every column of ``df``.

    - For each column, one copy per transformation method is added (named
      ``<column>_<method>``), the matching transformers are applied to those
      copies, and diagnostic plots are shown for the original column and
      every copy whose transformation succeeded.
    - ``analysis_type`` must be 'numerical', 'ordinal_encoder' or
      'outlier_winsorizer'; ``df`` must not contain missing values. Both
      conditions are checked up front by the validation helpers.
    - Returns the DataFrame holding the original columns plus the
      transformed copies that survived.
    """
    check_missing_values(df)
    check_user_entry_on_analysis_type(
        analysis_type, ['numerical', 'ordinal_encoder', 'outlier_winsorizer'])
    method_suffixes = define_list_column_transformers(analysis_type)

    engineered = pd.DataFrame([])
    for column in df.columns:
        source_values = df[column]

        # Append the untouched column, then one working copy per method.
        engineered = pd.concat([engineered, source_values], axis=1)
        for suffix in method_suffixes:
            engineered[f"{column}_{suffix}"] = source_values

        # Transform the working copies; failed ones are removed by the helper.
        engineered, applied = apply_transformers(
            analysis_type, engineered, column)

        # Plot the original next to every transformation that worked.
        transformer_evaluation(column, applied, analysis_type, engineered)

    return engineered


def check_user_entry_on_analysis_type(analysis_type, allowed_types):
    """
    Stop execution unless ``analysis_type`` is one of ``allowed_types``.

    Raises SystemExit with one message when the argument was not supplied
    (``None``) and a different one when it is not among the allowed options.
    Returns ``None`` when the value is acceptable.
    """
    if analysis_type is not None and analysis_type in allowed_types:
        return

    if analysis_type is None:
        message = f"You should pass analysis_type parameter as one of the following options: {allowed_types}"
    else:
        message = f"analysis_type argument should be one of these options: {allowed_types}"
    raise SystemExit(message)


def check_missing_values(df):
    """
    Stop execution if ``df`` holds at least one missing value.

    Raises SystemExit asking the user to handle missing data first;
    returns ``None`` when the data is complete.
    """
    has_missing = bool(df.isna().to_numpy().any())
    if has_missing:
        raise SystemExit(
            "There is a missing value in your dataset. Please handle that before getting into feature engineering.")


def define_list_column_transformers(analysis_type):
    """
    Return the column-name suffixes used for the given ``analysis_type``.

    One suffix exists per transformation method:
    - 'numerical'          -> log_e, log_10, reciprocal, power, box_cox,
                              yeo_johnson
    - 'ordinal_encoder'    -> ordinal_encoder
    - 'outlier_winsorizer' -> iqr

    A new list is returned on every call, so callers may modify it freely.

    Raises ValueError for any other ``analysis_type``. Without this check
    the function would fail with an UnboundLocalError that does not say
    what went wrong.
    """
    suffixes_by_type = {
        'numerical': [
            "log_e", "log_10", "reciprocal", "power", "box_cox", "yeo_johnson"],
        'ordinal_encoder': ["ordinal_encoder"],
        'outlier_winsorizer': ['iqr'],
    }

    try:
        list_column_transformers = suffixes_by_type[analysis_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments such as a list.
        raise ValueError(
            f"Unsupported analysis_type {analysis_type!r}; "
            f"expected one of {list(suffixes_by_type)}") from None

    return list(list_column_transformers)


def apply_transformers(analysis_type, df_feat_eng, column):
    """
    Apply the transformers that belong to ``analysis_type`` to the working
    copies of ``column`` inside ``df_feat_eng``.

    Parameters:
    - analysis_type: 'numerical', 'outlier_winsorizer' or 'ordinal_encoder'
    - df_feat_eng: DataFrame that already holds the ``<column>_<method>``
      copies; its 'category' columns are converted to 'object' in place
    - column: name of the original variable being analysed

    Returns a tuple ``(df_feat_eng, list_applied_transformers)`` exactly as
    produced by the matching ``FeatEngineering_*`` helper.

    Raises ValueError for an unsupported ``analysis_type``. The check runs
    before the DataFrame is touched; without it the function would end in
    an UnboundLocalError.
    """
    supported_types = ('numerical', 'outlier_winsorizer', 'ordinal_encoder')
    if analysis_type not in supported_types:
        raise ValueError(
            f"Unsupported analysis_type {analysis_type!r}; "
            f"expected one of {list(supported_types)}")

    # NOTE(review): presumably the feature_engine transformers used below
    # do not accept the 'category' dtype - confirm.
    for col in df_feat_eng.select_dtypes(include='category').columns:
        df_feat_eng[col] = df_feat_eng[col].astype('object')

    if analysis_type == 'numerical':
        df_feat_eng, list_applied_transformers = FeatEngineering_Numerical(
            df_feat_eng, column)
    elif analysis_type == 'outlier_winsorizer':
        df_feat_eng, list_applied_transformers = FeatEngineering_OutlierWinsorizer(
            df_feat_eng, column)
    else:  # 'ordinal_encoder', guaranteed by the check above
        df_feat_eng, list_applied_transformers = FeatEngineering_CategoricalEncoder(
            df_feat_eng, column)

    return df_feat_eng, list_applied_transformers


def transformer_evaluation(column, list_applied_transformers, analysis_type, df_feat_eng):
    """
    Print a short header and plot ``column`` next to each transformed copy.

    The original column is drawn with the categorical plot only when
    ``analysis_type`` is 'ordinal_encoder'. Every other column, including
    the encoded copy, gets the numerical diagnostic plots. A blank line is
    printed after each plot.
    """
    print(f"* Variable Analyzed: {column}")
    print(f"* Applied transformation: {list_applied_transformers} \n")

    encoder_analysis = analysis_type == 'ordinal_encoder'
    for name in [column, *list_applied_transformers]:
        if encoder_analysis and name == column:
            DiagnosticPlots_Categories(df_feat_eng, name)
        else:
            DiagnosticPlots_Numerical(df_feat_eng, name)

        print("\n")


def DiagnosticPlots_Categories(df_feat_eng, col):
    """
    Show a count plot of the categorical column ``col`` of ``df_feat_eng``.

    Bars are ordered by descending frequency (the order of
    ``value_counts()``), x tick labels are rotated 90 degrees, and the
    column name is the figure title. A blank line is printed afterwards.
    """
    plt.figure(figsize=(4, 3))
    # One colour for all bars: `color` replaces the single-entry `palette`,
    # because passing `palette` without `hue` is deprecated in seaborn 0.13.
    sns.countplot(data=df_feat_eng, x=col, color='#432371',
                  order=df_feat_eng[col].value_counts().index)
    plt.xticks(rotation=90)
    plt.suptitle(f"{col}", fontsize=30, y=1.05)
    plt.show()
    print("\n")


def DiagnosticPlots_Numerical(df, variable):
    """
    Show three side-by-side diagnostic plots for ``df[variable]``: a
    histogram with KDE, a QQ plot against the normal distribution, and a
    boxplot. The variable name is used as the figure title.
    """
    fig, (hist_ax, qq_ax, box_ax) = plt.subplots(1, 3, figsize=(12, 4))
    values = df[variable]

    sns.histplot(data=df, x=variable, kde=True, element="step", ax=hist_ax)
    stats.probplot(values, dist="norm", plot=qq_ax)
    sns.boxplot(x=values, ax=box_ax)

    # Titles are set after plotting so they replace any title a plotting
    # call may have put on its axes.
    for axis, title in zip((hist_ax, qq_ax, box_ax),
                           ('Histogram', 'QQ Plot', 'Boxplot')):
        axis.set_title(title)

    fig.suptitle(f"{variable}", fontsize=30, y=1.05)
    plt.tight_layout()
    plt.show()


def FeatEngineering_CategoricalEncoder(df_feat_eng, column):
    """
    Ordinal-encode the ``<column>_ordinal_encoder`` copy using arbitrary
    ordering.

    Returns ``(df_feat_eng, list_methods_worked)``. On success the list
    contains the encoded column name. If the encoder raises, the working
    copy is dropped from the DataFrame in place and the list is empty.
    """
    target = f"{column}_ordinal_encoder"

    try:
        df_feat_eng = OrdinalEncoder(
            encoding_method='arbitrary', variables=[target]
        ).fit_transform(df_feat_eng)
    except Exception:
        # Best effort: a column that cannot be encoded is simply discarded.
        df_feat_eng.drop([target], axis=1, inplace=True)
        return df_feat_eng, []

    return df_feat_eng, [target]


def FeatEngineering_OutlierWinsorizer(df_feat_eng, column):
    """
    Cap the ``<column>_iqr`` copy with a Winsorizer configured with
    ``capping_method='iqr'``, ``tail='both'`` and ``fold=1.5``.

    Returns ``(df_feat_eng, list_methods_worked)``. On success the list
    contains the capped column name. If the Winsorizer raises, the working
    copy is dropped from the DataFrame in place and the list is empty.
    """
    target = f"{column}_iqr"

    try:
        df_feat_eng = Winsorizer(
            capping_method='iqr', tail='both', fold=1.5, variables=[target]
        ).fit_transform(df_feat_eng)
    except Exception:
        # Best effort: a column that cannot be capped is simply discarded.
        df_feat_eng.drop([target], axis=1, inplace=True)
        return df_feat_eng, []

    return df_feat_eng, [target]


def FeatEngineering_Numerical(df_feat_eng, column):
    """
    Try every numerical transformer on its own ``<column>_<suffix>`` copy.

    The transformers run in this order: natural log, log base 10,
    reciprocal, power, Box-Cox and Yeo-Johnson. A transformer that raises
    (at construction or during ``fit_transform``) has its working copy
    dropped from the DataFrame in place.

    Returns ``(df_feat_eng, list_methods_worked)``, where the list holds
    the names of the copies that were transformed successfully, in the
    order above.
    """
    # (suffix, factory) pairs. The factories are only called inside the
    # ``try`` below, so a failing constructor is handled like a failing fit.
    candidates = [
        ("log_e", lambda name: vt.LogTransformer(variables=[name])),
        ("log_10", lambda name: vt.LogTransformer(variables=[name], base='10')),
        ("reciprocal", lambda name: vt.ReciprocalTransformer(variables=[name])),
        ("power", lambda name: vt.PowerTransformer(variables=[name])),
        ("box_cox", lambda name: vt.BoxCoxTransformer(variables=[name])),
        ("yeo_johnson", lambda name: vt.YeoJohnsonTransformer(variables=[name])),
    ]

    succeeded = []
    for suffix, build_transformer in candidates:
        target = f"{column}_{suffix}"
        try:
            df_feat_eng = build_transformer(target).fit_transform(df_feat_eng)
        except Exception:
            # Best effort: a transformation that does not apply is discarded.
            df_feat_eng.drop([target], axis=1, inplace=True)
        else:
            succeeded.append(target)

    return df_feat_eng, succeeded

Numerical transformation

In [None]:
# Columns submitted to the feature-engineering analysis in the next cells.
variables_engineering = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]

variables_engineering

In [None]:
# Take an independent copy of the selected columns so that the analysis
# never writes back into TrainSet, then preview the first three rows.
df_engineering = TrainSet.loc[:, variables_engineering].copy()
df_engineering.head(n=3)

In [None]:
# Run the numerical analysis: every transformation is tried on each selected
# variable and the diagnostic plots are rendered. The returned frame holds
# the original columns plus the transformed copies, and it replaces
# df_engineering for the cells that follow.
df_engineering = FeatureEngineeringAnalysis(
    df=df_engineering,
    analysis_type='numerical',
)

Performing shapiro test to assess normality of the data

In [None]:
import scipy.stats as stats

# Shapiro-Wilk normality test on df_engineering.
# NOTE(review): the whole DataFrame is passed here, not a single column, so
# the test presumably runs on all values pooled together - confirm this is
# intended. The next cell tests each column separately.
statistic, p_value = stats.shapiro(df_engineering)

# Interpret the result at the 5% significance level.
verdict = (
    "The data significantly deviates from a normal distribution."
    if p_value < 0.05
    else "The data does not significantly deviate from a normal distribution."
)
print(verdict)

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats

# Shapiro-Wilk normality test for every column of df_engineering.
#
# The per-column results are collected in a plain list and the summary
# DataFrame is built once at the end. DataFrame.append, used previously,
# was removed in pandas 2.0 and also copied the frame on every iteration.
result_columns = ["Variable", "Shapiro-Wilk p-value (Before)"]
result_rows = []

for column in df_engineering.columns:
    # Extract the data for the current column
    data = df_engineering[column].values

    # Before Transformation:
    # Perform the Shapiro-Wilk test for normality
    statistic_before, p_value_before = stats.shapiro(data)

    # Record the result for this column
    result_rows.append({
        "Variable": column,
        "Shapiro-Wilk p-value (Before)": p_value_before
    })

# One row per analysed column, in the column order of df_engineering
before_transformation_results = pd.DataFrame(result_rows, columns=result_columns)

# Print the results for the original data (before transformation)
print("Results Before Transformation:")
print(before_transformation_results)


#### Data Variable Transformations

Here's a breakdown of the variables and potential transformation methods based on the data:

1. **age**: Consider applying a logarithmic transformation (log_e, log_10), reciprocal transformation, or Box-Cox/Yeo-Johnson transformation with an estimated lambda.

2. **sex**: This binary variable may not benefit from transformations. It's typically left as-is for a classification task.

3. **cp (chest pain type)**: Depending on the nature of the data, consider a Box-Cox/Yeo-Johnson transformation with an estimated lambda.

4. **trestbps (resting blood pressure)**: Consider a logarithmic transformation (log_e, log_10), reciprocal transformation, or Box-Cox/Yeo-Johnson transformation with an estimated lambda.

5. **chol (serum cholesterol)**: Consider a logarithmic transformation (log_e, log_10), reciprocal transformation, or Box-Cox/Yeo-Johnson transformation with an estimated lambda.

6. **fbs (fasting blood sugar)**: This binary variable may not benefit from transformations. It's typically left as-is for classification tasks.

7. **restecg (resting electrocardiographic results)**: Depending on the nature of the data, consider a Box-Cox/Yeo-Johnson transformation with an estimated lambda.

8. **thalach (maximum heart rate achieved)**: Consider a logarithmic transformation (log_e, log_10), reciprocal transformation, or Box-Cox/Yeo-Johnson transformation with an estimated lambda.

9. **exang (exercise-induced angina)**: This binary variable may not benefit from transformations. It's typically left as-is for classification tasks.

10. **oldpeak (ST depression induced by exercise)**: Depending on the nature of the data, consider a Box-Cox/Yeo-Johnson transformation with an estimated lambda.

11. **slope (slope of the peak exercise ST segment)**: Depending on the nature of the data, consider a Box-Cox/Yeo-Johnson transformation with an estimated lambda.

12. **ca (number of major vessels colored by fluoroscopy)**: Depending on the nature of the data, consider a Box-Cox/Yeo-Johnson transformation with an estimated lambda.

13. **thal (thalassemia)**: Depending on the nature of the data, consider a Box-Cox/Yeo-Johnson transformation with an estimated lambda.

Based on the statistical and visual investigation, the developer chose to consider **Box-Cox/Yeo-Johnson** as the transformation method, since most of the variables could benefit from it. Because Box-Cox raises an error for non-positive values, the developer chose to use **Yeo-Johnson**.

# Yeo Johnson Transformer

In [None]:
from feature_engine import transformation as vt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Variables that may benefit from Yeo-Johnson transformation
variables_to_transform = [
    'age',
    'cp',
    'trestbps',
    'chol',
    'restecg',
    'thalach',
    'oldpeak',
    'slope',
    'ca',
    'thal'
]

# Create a ColumnTransformer to apply Yeo-Johnson only to selected variables
column_transformer = ColumnTransformer(
    transformers=[('yeo_johnson', vt.YeoJohnsonTransformer(), variables_to_transform)],
    remainder='passthrough'  # Include other columns as is
)

# Create a pipeline
pipeline = Pipeline([
    ('yeo_johnson_transform', column_transformer)
])

# Apply the Yeo-Johnson transformation to the selected variables
transformed_values = pipeline.fit_transform(df_engineering)

# ColumnTransformer does not keep the input column order: its output holds
# the transformed columns first (in the order of variables_to_transform),
# followed by the passthrough columns in their original relative order.
# The output must be labelled in that order; using df_engineering.columns
# directly would put values under the wrong column names.
passthrough_columns = [
    name for name in df_engineering.columns if name not in variables_to_transform
]

# Convert the result back to a DataFrame with the correct labels and index
df_transformed = pd.DataFrame(
    transformed_values,
    columns=variables_to_transform + passthrough_columns,
    index=df_engineering.index,
)

# Restore the original column order so both frames line up column by column
df_transformed = df_transformed[list(df_engineering.columns)]

# Display the first few rows of the transformed DataFrame
print(df_transformed.head())


In [None]:
import os
import numpy as np
import pandas as pd
from feature_engine import transformation as vt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import scipy.stats as stats
import matplotlib.pyplot as plt

# Variables that may benefit from Yeo-Johnson transformation
variables_to_transform = [
    'age',
    'cp',
    'trestbps',
    'chol',
    'restecg',
    'thalach',
    'oldpeak',
    'slope',
    'ca',
    'thal'
]

# Create a ColumnTransformer to apply Yeo-Johnson only to selected variables
column_transformer = ColumnTransformer(
    transformers=[('yeo_johnson', vt.YeoJohnsonTransformer(), variables_to_transform)],
    remainder='passthrough'  # Include other columns as is
)

# Create a pipeline
pipeline = Pipeline([
    ('yeo_johnson_transform', column_transformer)
])

# Apply the Yeo-Johnson transformation to the selected variables
transformed_values = pipeline.fit_transform(df_engineering)

# ColumnTransformer does not keep the input column order: its output holds
# the transformed columns first (in the order of variables_to_transform),
# followed by the passthrough columns in their original relative order.
# The output must be labelled in that order; using df_engineering.columns
# directly would put values under the wrong column names, and the plots
# below would then compare unrelated columns.
passthrough_columns = [
    name for name in df_engineering.columns if name not in variables_to_transform
]

# Convert the result back to a DataFrame with the correct labels and index
df_transformed = pd.DataFrame(
    transformed_values,
    columns=variables_to_transform + passthrough_columns,
    index=df_engineering.index,
)

# Restore the original column order so both frames line up column by column
df_transformed = df_transformed[list(df_engineering.columns)]

# Display the first few rows of the transformed DataFrame
print(df_transformed.head())

# Compare each variable before and after the transformation
for variable in variables_to_transform:
    original_data = df_engineering[variable]
    transformed_data = df_transformed[variable]

    # Plot histograms (left: original, right: transformed)
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.hist(original_data, bins=20, color='blue', alpha=0.7, label='Original')
    plt.xlabel(variable)
    plt.ylabel('Frequency')
    plt.legend()
    plt.title(f'Histogram for Original {variable}')

    plt.subplot(1, 2, 2)
    plt.hist(transformed_data, bins=20, color='green', alpha=0.7, label='Transformed')
    plt.xlabel(f'Transformed {variable}')
    plt.ylabel('Frequency')
    plt.legend()
    plt.title(f'Histogram for Transformed {variable}')

    plt.tight_layout()
    plt.show()

    # Plot Q-Q plots against the normal distribution
    # (left: original, right: transformed)
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    stats.probplot(original_data, dist='norm', plot=plt)
    plt.title(f'Q-Q Plot for Original {variable}')

    plt.subplot(1, 2, 2)
    stats.probplot(transformed_data, dist='norm', plot=plt)
    plt.title(f'Q-Q Plot for Transformed {variable}')

    plt.tight_layout()
    plt.show()


We can see from the graphical and statistical observations above that the transformations improved the distribution of the data, but not necessarily its normality.

The developer will keep the transformations and decide on the next steps based on the model performance.

# SmartCorrelatedSelection Variables

In [None]:
# Start again from an independent copy of the whole training set for the
# correlation analysis, and preview its first three rows.
df_engineering = TrainSet.copy(deep=True)
df_engineering.head(n=3)

In [None]:
from feature_engine.selection import SmartCorrelatedSelection
# Selector configured with Spearman correlation, a 0.6 threshold and
# variance-based selection. variables=None leaves the choice of columns to
# the selector.
corr_sel = SmartCorrelatedSelection(
    variables=None,
    method="spearman",
    threshold=0.6,
    selection_method="variance",
)

# Fit on the training data, then show the groups of correlated features.
corr_sel.fit_transform(df_engineering)
corr_sel.correlated_feature_sets_

In [None]:
# Show which features the fitted selector marked for removal.
corr_sel.features_to_drop_

Based on the SmartCorrelatedSelection results, there are no features to drop.

---

# Conclusions


The list below shows the transformations needed for feature engineering. These steps will be added to the ML Pipeline.
Both the Box-Cox and Yeo-Johnson transformers were considered, but Box-Cox raised an error when fitting the model in the ModelAndEvaluation notebook, saying that the data must be positive. The developer chose Yeo-Johnson because the two transformations perform similarly and it did not raise any error.

**Feature Engineering Transformer**
- **Yeo-Johnson numerical transformer**: Applies Yeo-Johnson transformation to the following numerical features: 'age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'oldpeak', 'slope', 'ca', 'thal'.

**Smart Correlation Detection**
- None found.
**Categorical encoding**
- Not required.