<a id="feature_engineering_title"></a>
# <p style="background-color: blue; font-family:calibri; color:white; font-size:140%; font-family:Verdana; text-align:center; border-radius:15px 50px;">Feature Engineering</p>

<a id="libraries"></a>
# <b><span style='color:lightblue'> Importing Necessary Libraries</span></b>

In [95]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.abspath('../src'))

import seaborn as sns
import matplotlib.pyplot as plt

import data_cleaning

<a id="load_dataset"></a>
# <b><span style='color:lightblue'> Load Dataset</span></b>

In [96]:
# Read the raw training CSV with pandas, then pass it through the cleaning
# pipeline exposed by the imported data_cleaning module. What the pipeline does
# is not visible in this notebook; see that module for details.

train_df = pd.read_csv('../data/train.csv')

df = data_cleaning.data_cleaning_pipeline(train_df)

df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,4223-BKEOR,Female,0,No,Yes,21,Yes,No,DSL,Yes,...,Yes,No,No,Yes,One year,No,Mailed check,64.85,1336.8,No
1,6035-RIIOM,Female,0,No,No,54,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),97.2,5129.45,No
2,3797-VTIDR,Male,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,23.45,23.45,Yes
3,2568-BRGYX,Male,0,No,No,4,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.2,237.95,Yes
5,4291-SHSBH,Male,0,No,No,7,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,69.55,521.35,No


<a id="creation_of_bins"></a>
# <p style="background-color: #4893D7; font-family:calibri; color:white; font-size:140%; font-family:Verdana; text-align:center; border-radius:15px 50px;">Step 7 | Creation of Bins</p>

In [97]:
# Reusable quantile-binning helper for numeric columns (used below for
# TotalCharges and tenure)

def create_bins(df, columns, bins=5):
    """Add a quantile-bin column for each requested column of ``df``.

    For every name in ``columns`` a new column ``"<name>_Bin"`` is written
    onto ``df`` holding the ``pd.qcut`` interval each row falls into, and the
    per-bin row counts are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place.
    columns : str or iterable of str
        Column name(s) to bin. A single string is treated as one column name.
    bins : int, default 5
        Forwarded to ``pd.qcut`` as ``q``. Because ``duplicates='drop'`` is
        used, fewer bins may result when quantile edges coincide.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be chained or reassigned.

    Notes
    -----
    A ``ValueError`` raised while binning a column is reported with ``print``
    and that column is skipped; other exceptions (for example ``KeyError`` for
    a missing column) propagate to the caller.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    for col in columns:
        bin_col_name = f"{col}_Bin"
        try:
            # Plain column assignment replaces any existing bin column
            # outright, so re-running the cell with a different `bins` value
            # does not attempt an element-wise write into the old categories.
            df[bin_col_name] = pd.qcut(df[col], q=bins, duplicates='drop')
            print(f"\nBin counts for {col}:")
            print(df[bin_col_name].value_counts().sort_index())
        except ValueError as e:
            print(f"Could not bin {col}: {e}")

    return df



In [99]:
# Columns to quantile-bin; create_bins writes a companion "<column>_Bin"
# column for each of them.
columns_to_bin = ['TotalCharges', 'tenure']
# Work on an explicit copy: create_bins assigns the new columns onto the frame
# it is given rather than returning a new one.
df = df.copy()

create_bins(df, columns=columns_to_bin, bins=5)

# Check result
df[['TotalCharges', 'TotalCharges_Bin', 'tenure', 'tenure_Bin']].head()


Bin counts for TotalCharges:
TotalCharges_Bin
(18.799, 275.12]      1125
(275.12, 949.92]      1125
(949.92, 2087.03]     1124
(2087.03, 4478.99]    1125
(4478.99, 8684.8]     1125
Name: count, dtype: int64

Bin counts for tenure:
tenure_Bin
(0.999, 6.0]    1150
(6.0, 20.0]     1132
(20.0, 40.0]    1136
(40.0, 60.0]    1096
(60.0, 72.0]    1110
Name: count, dtype: int64


Unnamed: 0,TotalCharges,TotalCharges_Bin,tenure,tenure_Bin
0,1336.8,"(949.92, 2087.03]",21,"(20.0, 40.0]"
1,5129.45,"(4478.99, 8684.8]",54,"(40.0, 60.0]"
2,23.45,"(18.799, 275.12]",1,"(0.999, 6.0]"
3,237.95,"(18.799, 275.12]",4,"(0.999, 6.0]"
5,521.35,"(275.12, 949.92]",7,"(6.0, 20.0]"


<a id="interaction_terms"></a>
# <p style="background-color: #4893D7; font-family:calibri; color:white; font-size:140%; font-family:Verdana; text-align:center; border-radius:15px 50px;">Step 8 | Interaction Terms</p>

In [100]:
def add_interaction_with_binary_condition(df, feature_1, feature_2, mask_1=None, mask_2=None):
    """Add the product of two (optionally binarised) columns to ``df``.

    For each feature, when its mask is not ``None`` the column is turned into
    a 0/1 integer indicator of ``df[feature] == mask``; otherwise the raw
    column values are used. The two resulting series are multiplied and
    stored in a new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place.
    feature_1, feature_2 : str
        Names of the columns to combine.
    mask_1, mask_2 : object, optional
        Value to compare the corresponding column against. ``None`` means
        "use the raw column".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the interaction column added. The column
        is named ``"<term_1>_X_<term_2>"``, where each term is
        ``"<feature>_<mask>"`` when a mask was given and ``"<feature>"``
        otherwise.
    """
    def _term(feature, mask):
        # Values and name are decided by the same `is None` test, so a falsy
        # mask such as 0, False or '' still shows up in the column name and
        # cannot collide with the unmasked interaction of the same features.
        if mask is None:
            return df[feature], feature
        return (df[feature] == mask).astype(int), f"{feature}_{mask}"

    col1_vals, col1_name = _term(feature_1, mask_1)
    col2_vals, col2_name = _term(feature_2, mask_2)

    interaction_col = f"{col1_name}_X_{col2_name}"

    # Assign the interaction term onto the caller's frame
    df[interaction_col] = col1_vals * col2_vals

    return df


In [101]:
# Create function to loop through the interaction terms. 

def generate_interactions_from_spec(df, interaction_specs):
    """Apply each interaction spec to ``df`` in order and return the result.

    Every entry of ``interaction_specs`` must unpack into exactly four items,
    ``(feature_1, feature_2, mask_1, mask_2)``, which are forwarded
    positionally to ``add_interaction_with_binary_condition``. With no specs,
    ``df`` is returned untouched.
    """
    result = df
    for first_feature, second_feature, first_mask, second_mask in interaction_specs:
        result = add_interaction_with_binary_condition(
            result, first_feature, second_feature, first_mask, second_mask
        )
    return result


In [102]:
# Each spec is (feature_1, feature_2, mask_1, mask_2). With both masks given,
# the interaction term is the product of the 0/1 indicators
# (feature_1 == mask_1) and (feature_2 == mask_2).
interaction_specs = [
    ('Contract', 'PaymentMethod', 'Two year', 'Electronic check'),
    ('OnlineSecurity', 'TechSupport', 'No', 'No'),
]

In [103]:
# Add one "<term_1>_X_<term_2>" column per spec; the helper assigns onto df
# and returns it.
df = generate_interactions_from_spec(df, interaction_specs)

In [104]:
# Preview the frame, now including the bin and interaction columns
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,TotalCharges_Bin,tenure_Bin,Contract_Two year_X_PaymentMethod_Electronic check,OnlineSecurity_No_X_TechSupport_No
0,4223-BKEOR,Female,0,No,Yes,21,Yes,No,DSL,Yes,...,One year,No,Mailed check,64.85,1336.8,No,"(949.92, 2087.03]","(20.0, 40.0]",0,0
1,6035-RIIOM,Female,0,No,No,54,Yes,Yes,Fiber optic,No,...,Two year,Yes,Bank transfer (automatic),97.2,5129.45,No,"(4478.99, 8684.8]","(40.0, 60.0]",0,1
2,3797-VTIDR,Male,0,Yes,No,1,No,No,DSL,No,...,Month-to-month,Yes,Electronic check,23.45,23.45,Yes,"(18.799, 275.12]","(0.999, 6.0]",0,1
3,2568-BRGYX,Male,0,No,No,4,Yes,No,Fiber optic,No,...,Month-to-month,Yes,Electronic check,70.2,237.95,Yes,"(18.799, 275.12]","(0.999, 6.0]",0,1
5,4291-SHSBH,Male,0,No,No,7,Yes,No,Fiber optic,No,...,Month-to-month,Yes,Electronic check,69.55,521.35,No,"(275.12, 949.92]","(6.0, 20.0]",0,1


In [105]:
# Collect the interaction columns by the "_X_" marker in their names
interaction_cols = [col for col in df.columns if '_X_' in col]

# Encode the target as 1 where Churn == 'Yes' and 0 for every other value
df['Churn_Binary'] = (df['Churn'] == 'Yes').astype(int)

# Correlation of each interaction term with the binary target, strongest
# positive first. The Churn_Binary entry is the target's correlation with
# itself (always 1.0), not a feature result.
correlations = df[interaction_cols + ['Churn_Binary']].corr()['Churn_Binary'].sort_values(ascending=False)
print(correlations)


Churn_Binary                                          1.000000
OnlineSecurity_No_X_TechSupport_No                    0.184318
Contract_Two year_X_PaymentMethod_Electronic check   -0.071238
Name: Churn_Binary, dtype: float64
