In [7]:
# notebooks/02_feature_engineering.ipynb

# Import necessary libraries
import pandas as pd
import numpy as np
import sys
import os

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../src')))

from data_processor import DataProcessor

# --- Section 1: Load and Clean Data ---
print("--- Section 1: Loading and Cleaning Data ---")

# Resolve the raw CSV relative to the working directory rather than a
# user-specific absolute path, so the notebook runs on any checkout of the
# project. This relies on the same assumption as the '../src' import-path
# setup at the top of the cell: the notebook is executed from `notebooks/`.
DATA_PATH = os.path.abspath(
    os.path.join(os.getcwd(), '..', 'data', 'WA_Fn-UseC_-Telco-Customer-Churn.csv')
)

# Initialize DataProcessor, then load and clean the dataset.
processor = DataProcessor(data_path=DATA_PATH)
processor.load_data()
processor.clean_data()  # Apply initial cleaning steps

# Retrieve the DataFrame held by the processor and separate features (X) from
# the target (y).
# NOTE(review): despite its name, get_raw_data() is used here to fetch the
# frame *after* clean_data(); confirm in DataProcessor that it returns the
# cleaned frame.
df_cleaned = processor.get_raw_data()
if df_cleaned is not None:
    X_raw = df_cleaned.drop('Churn', axis=1)
    y = df_cleaned['Churn']
    print(f"Cleaned data shape: {X_raw.shape}")
    print(f"Target variable shape: {y.shape}")
else:
    # Make the failure visible instead of silently skipping: X_raw and y are
    # left undefined in this case.
    print(f"No data available after loading/cleaning; check that the CSV exists at: {DATA_PATH}")

# --- Section 2: Define and Apply Preprocessing Pipeline ---
print("\n--- Section 2: Defining and Applying Preprocessing Pipeline ---")

# Ask the processor to build its preprocessing pipeline. No features are
# transformed in this section; it only sets up and reports the configuration.
# NOTE(review): the fit/transform is presumably performed later inside
# `split_data` -- confirm in DataProcessor.
processor.preprocess_data()

# Column groupings as determined by the data processor.
numerical_features, categorical_features = (
    processor.numerical_features,
    processor.categorical_features,
)

print(
    f"Numerical features: {numerical_features}",
    f"Categorical features: {categorical_features}",
    sep="\n",
)

# Summarise which transformer is reported for each column group.
print(
    "\nPreprocessor has been set up using ColumnTransformer.",
    "It will perform:",
    f"  - StandardScaler on: {numerical_features}",
    f"  - OneHotEncoder on: {categorical_features}",
    sep="\n",
)

# --- Section 3: Data Splitting and SMOTE for Imbalance Handling ---
print("\n--- Section 3: Data Splitting and SMOTE ---")

# Request a train/test split from the processor with a 20% test share and a
# fixed seed, asking it to apply SMOTE (the labels printed below describe the
# resampling as affecting the training set only).
# NOTE(review): the processed feature matrices are presumably numpy arrays;
# the class counts below rely on y_train / y_test exposing `value_counts()`.
X_train_processed, X_test_processed, y_train, y_test = processor.split_data(
    test_size=0.2,
    random_state=42,
    apply_smote=True,
)

# The report is skipped when no training matrix came back from the split.
if X_train_processed is not None:
    # Shapes of every piece of the split.
    print(
        f"\nShape of X_train (after preprocessing and SMOTE): {X_train_processed.shape}",
        f"Shape of y_train (after SMOTE): {y_train.shape}",
        f"Shape of X_test (after preprocessing): {X_test_processed.shape}",
        f"Shape of y_test: {y_test.shape}",
        sep="\n",
    )

    # Class balance of the training and test targets.
    print(
        "\nClass distribution in y_train (after SMOTE):",
        y_train.value_counts(),
        "\nClass distribution in y_test:",
        y_test.value_counts(),
        sep="\n",
    )

    # Names of the transformed columns (handy for model interpretation later).
    processed_feature_names = processor.get_feature_names_out()
    if processed_feature_names is not None:
        print(f"\nTotal number of processed features: {len(processed_feature_names)}")
        print("Example processed feature names (first 10):", processed_feature_names[:10])

# --- Section 4: Save Processed Data for Next Notebook ---
# Nothing is written to disk in this cell. The processed arrays and the
# processor stay in the kernel's memory, which only works if the next notebook
# step runs in the same live session. A kernel restart clears these variables,
# so persist them first if the notebooks are to be run independently.
#
# Optional persistence snippet (left disabled; `joblib` would need to be
# imported before enabling the last line):
# np.save('../data/X_train_processed.npy', X_train_processed)
# np.save('../data/X_test_processed.npy', X_test_processed)
# np.save('../data/y_train.npy', y_train)
# np.save('../data/y_test.npy', y_test)
# joblib.dump(processor.get_preprocessor(), '../models/preprocessor.pkl')  # Save the preprocessor

# Closing status message for the notebook.
print(
    "\nFeature Engineering Complete. The processed data (X_train_processed, X_test_processed, y_train, y_test)",
    "and the preprocessor are ready for model training in the next notebook.",
    sep="\n",
)


--- Section 1: Loading and Cleaning Data ---
Data loaded successfully from C:\Users\nicol\OneDrive\Documentos\customer-churn-prediction\data\WA_Fn-UseC_-Telco-Customer-Churn.csv
Initial shape: (7043, 21)
Starting data cleaning...
Dropped 'customerID' column.
Converted 'TotalCharges' to numeric.
Filled missing 'TotalCharges' with median: 1397.475
Replaced 'No internet service' with 'No'.
Replaced 'No phone service' with 'No' in 'MultipleLines'.
Mapped 'Churn' target variable to 1/0.
Data cleaning complete.
Cleaned data shape: (7043, 19)
Target variable shape: (7043,)

--- Section 2: Defining and Applying Preprocessing Pipeline ---
Starting data preprocessing...
Preprocessing pipeline (ColumnTransformer) created.
Numerical features: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical features: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'St

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df['TotalCharges'].fillna(median_total_charges, inplace=True)


Training data resampled with SMOTE. New shape: (8278, 38), target shape: (8278,)

Shape of X_train (after preprocessing and SMOTE): (8278, 38)
Shape of y_train (after SMOTE): (8278,)
Shape of X_test (after preprocessing): (1409, 38)
Shape of y_test: (1409,)

Class distribution in y_train (after SMOTE):
Churn
0    4139
1    4139
Name: count, dtype: int64

Class distribution in y_test:
Churn
0    1035
1     374
Name: count, dtype: int64

Total number of processed features: 38
Example processed feature names (first 10): ['num__SeniorCitizen' 'num__tenure' 'num__MonthlyCharges'
 'num__TotalCharges' 'cat__gender_Female' 'cat__gender_Male'
 'cat__Partner_No' 'cat__Partner_Yes' 'cat__Dependents_No'
 'cat__Dependents_Yes']

Feature Engineering Complete. The processed data (X_train_processed, X_test_processed, y_train, y_test)
and the preprocessor are ready for model training in the next notebook.
