# Modelling

In [13]:
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin  # Added missing imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility
np.random.seed(42)
class AddFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.autopay_methods = ['Bank transfer (automatic)', 'Credit card (automatic)']
        self.contract_durations = {'Month-to-month': 1, 'One year': 12, 'Two year': 24}
        self.output_columns_ = None
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        # X.drop(columns=['gender'], inplace=True)
        #Tenure group
        X['tenure_group'] = pd.cut(X['tenure'], bins=[0, 6, 12, 24, 48, 60, 72], 
                                  labels=['0–6', '6–12', '12–24', '24–48', '48–60', '60–72'])
        X['new_customer'] = (X['tenure_group'] == '0–6').astype(int)
        #Payment and add-on features
        X['is_autopay'] = X['PaymentMethod'].apply(lambda x: 1 if x in self.autopay_methods else 0)
        X['AddOnCount'] = (X[['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                              'TechSupport', 'StreamingTV', 'StreamingMovies']] == 'Yes').sum(axis=1)
        X['AddOnGroup'] = pd.cut(X['AddOnCount'], bins=[-1, 0, 2, 6], labels=['None', 'Low', 'High'])
        # Cost-related features
        X['ChargePerMonthRatio'] = X['MonthlyCharges'] / X['tenure'].replace(0, 1)
        X['MonthlyCharges_group'] = pd.cut(X['MonthlyCharges'], bins=[0, 40, 70, 100, np.inf], 
                                           labels=['Low', 'Medium', 'High', 'Very High'])
        # Interaction terms
        X['Fiber_NoTechSupport'] = ((X['InternetService'] == 'Fiber optic') & (X['TechSupport'] == 'No')).astype(int)
        X['FiberOptic_StreamingTV'] = ((X['InternetService'] == 'Fiber optic') & (X['StreamingTV'] == 'Yes')).astype(int)
        X['Senior_Contract'] = ((X['SeniorCitizen'] == 1) & (X['Contract'] == 'Month-to-month')).astype(int)
        X['Contract_Duration_Ratio'] = X['tenure'] / X['Contract'].map(self.contract_durations)
        X['M2M_ElectronicCheck'] = ((X['Contract'] == 'Month-to-month') & 
                                  (X['PaymentMethod'] == 'Electronic check')).astype(int)
        X['IsMonthToMonth'] = (X['Contract'] == 'Month-to-month').astype(int)

        obj_cols = X.select_dtypes(include='object').columns
        binary_cols = [col for col in obj_cols if X[col].nunique() == 2]
        for col in binary_cols:
            # Debug print to inspect values
            print(f"{col} before mapping: {X[col].unique()}")
            X[col] = X[col].replace([np.inf, -np.inf], np.nan).fillna('No')  # Handle inf and NaN
            X[col] = X[col].map({'No': 0, 'Yes': 1}).astype('Int64')  # Use nullable integer
            print(f"{col} after mapping: {X[col].unique()}")

        X.drop(columns=['TotalCharges', 'MonthlyCharges', 'tenure', 'SeniorCitizen', 'PaymentMethod', 'PhoneService', 'gender'], inplace=True)
        self.output_columns_ = X.columns.tolist()
        return X

    def get_feature_names_out(self, input_features=None):
        return self.output_columns_

# Step 1: Load the saved pipeline
pipeline = joblib.load('feature_pipeline.joblib')

# Step 2: Load the cleaned dataset
data = pd.read_csv('../data/cleaned_data.csv')

# Step 3: Transform the data
X = data.drop(columns=['Churn'])  # Features
X_transformed = pipeline.fit_transform(X)
y = data['Churn']  # Target

gender before mapping: ['Female' 'Male']
gender after mapping: <IntegerArray>
[<NA>]
Length: 1, dtype: Int64
MultipleLines before mapping: ['No' 'Yes']
MultipleLines after mapping: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64
OnlineSecurity before mapping: ['No' 'Yes']
OnlineSecurity after mapping: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64
OnlineBackup before mapping: ['Yes' 'No']
OnlineBackup after mapping: <IntegerArray>
[1, 0]
Length: 2, dtype: Int64
DeviceProtection before mapping: ['No' 'Yes']
DeviceProtection after mapping: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64
TechSupport before mapping: ['No' 'Yes']
TechSupport after mapping: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64
StreamingTV before mapping: ['No' 'Yes']
StreamingTV after mapping: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64
StreamingMovies before mapping: ['No' 'Yes']
StreamingMovies after mapping: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64


In [16]:
feature_gen = pipeline.named_steps['feature_gen']
feature_names = feature_gen.get_feature_names_out()
X_transformed_df = pd.DataFrame.sparse.from_spmatrix(X_transformed, columns=feature_names)

# Step 4: View the first few rows
print(X_transformed_df.head())

ValueError: Column length mismatch: 25 vs. 15