<a href="https://colab.research.google.com/github/mosomo82/COMP_SCI_5530/blob/main/Project_Customer_Churn/src/Project_Customer_Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive when running on Colab. The google.colab package only
# exists inside the Colab runtime, so guard the import: elsewhere the mount is
# skipped instead of aborting the notebook with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

In [56]:
from re import X
from itertools import combinations_with_replacement
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# ==============================================================================
# --- 1. EXTRACT AND DATASET DISCOVERY ---
# ==============================================================================

# Load the raw Telco churn CSV from GitHub. Reading from a URL fails with
# urllib's HTTPError/URLError rather than FileNotFoundError; all three derive
# from OSError, so catching OSError covers both remote and local-path failures.
# On failure df is set to None so the processing below is skipped.
try:
    df = pd.read_csv('https://raw.githubusercontent.com/mosomo82/COMP_SCI_5530/refs/heads/main/Project_Customer_Churn/raw_data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
except OSError as err:
    print("Dataset not found. Please check the link and add the 'Telco Customer Churn' dataset to your github.")
    print(f"Reason: {err}")
    df = None

if df is not None:
    # Verify that 'customerID' is a unique primary key; if it is not, keep only
    # the first row per ID. The column is dropped afterwards because it is an
    # identifier that is not needed for the analysis.
    if len(df) == df['customerID'].nunique():
        print("Primary key 'customerID' is unique.")
    else:
        print("Primary key 'customerID' is NOT unique.")
        df.drop_duplicates(subset=['customerID'], inplace=True)

    df = df.drop('customerID', axis=1)

    # Data type validation against schema
    print("DataFrame Info:")
    df.info()

    # Missing value check (real NaN only; placeholder strings are handled next)
    print("\nMissing Values:")
    print(df.isnull().sum())

    # Treat placeholder strings (' ' and 'N/A') as missing values
    df.replace([' ', 'N/A'], np.nan, inplace=True)
    print("\nMissing Values after replacing empty spaces:")
    print(df.isnull().sum())

    # Convert 'TotalCharges' to a number, coercing unparseable entries to NaN
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

    # Check for missing values in TotalCharges after conversion
    missing_total_charges = df['TotalCharges'].isnull().sum()
    print(f"\nMissing values in 'TotalCharges' after conversion: {missing_total_charges}")

    # Option 1: Drop every row that has a missing value in any column
    df.dropna(inplace=True)
    print(f"\nDataFrame shape after dropping missing values: {df.shape}")

    # Option 2: Impute missing values with the median
    #  median_total_charges = df['TotalCharges'].median()
    #  df['TotalCharges'].fillna(median_total_charges, inplace=True)

    # Check for duplicate rows.
    # NOTE(review): 'customerID' has already been dropped, so rows counted here
    # may be distinct customers with identical attributes - confirm that
    # removing them is intended.
    duplicate_rows = df.duplicated().sum()
    print(f"\nNumber of duplicate rows: {duplicate_rows}")

    # Remove duplicate rows
    df.drop_duplicates(inplace=True)
    print(f"\nDataFrame shape after dropping duplicates: {df.shape}")

    # Numeric range check for the main numeric columns. Expected ranges:
    # tenure: from 1 to 72 months, no negative values
    # MonthlyCharges: from $18 to $119, no negative charges
    # TotalCharges: after cleaning, from $18 to $8,700 (cumulative), no negatives
    print("\nDescriptive Statistics for Numeric Columns:")
    print(df[['tenure', 'MonthlyCharges', 'TotalCharges']].describe())

    # Check the unique values and their counts for 'SeniorCitizen'
    print("\nUnique values and their counts for 'SeniorCitizen':")
    print(df['SeniorCitizen'].value_counts())

    # Categorical (string) column validation:
    # loop through every column with 'object' dtype and list its categories
    for column in df.select_dtypes(include=['object']).columns:
        print(f"Number of unique values in '{column}': {df[column].nunique()}")
        print(df[column].value_counts())
        # Blank separator between columns (a real newline, not the literal
        # two characters backslash + n)
        print("\n")

# ==============================================================================
# --- 2. TRANSFORM ---
# ==============================================================================

    # Encode categorical variables: map the two-valued string columns to 0/1
    df['gender'] = df['gender'].map({'Female': 0, 'Male': 1})
    df['Partner'] = df['Partner'].map({'No': 0, 'Yes': 1})
    df['Dependents'] = df['Dependents'].map({'No': 0, 'Yes': 1})
    df['PhoneService'] = df['PhoneService'].map({'No': 0, 'Yes': 1})
    df['PaperlessBilling'] = df['PaperlessBilling'].map({'No': 0, 'Yes': 1})
    df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

    # Feature engineering: create a categorical tenure_group feature from tenure
    # Define the bins and labels for the tenure groups (12-month intervals,
    # closed on the right)
    bins = [0, 12, 24, 36, 48, 60, 72]
    labels = ['<12', '12-24', '24-36', '36-48', '48-60', '>60']
    # include_lowest=True keeps tenure == 0 in the first bin instead of turning
    # it into NaN (which would leave every tenure dummy column at 0)
    tenure_group = pd.cut(df['tenure'], bins=bins, labels=labels, right=True,
                          include_lowest=True)
    df['tenure_group'] = tenure_group

    # One-hot encode tenure_group into six new binary columns
    df = pd.get_dummies(df, columns=['tenure_group'], prefix='tenure', dtype='int8')

    # ---Ordinal Encoding for Contract---
    # Define the correct order for the categories
    contract_order = ['Month-to-month', 'One year', 'Two year']

    # Initialize the encoder with the specific order
    ordinal_encoder = OrdinalEncoder(categories=[contract_order])

    # Encode the single-column frame, flatten the (n, 1) result to 1-D, and
    # store it under the new column name before dropping the original
    df['Contract_Encoded'] = ordinal_encoder.fit_transform(df[['Contract']]).ravel().astype(int)
    df.drop('Contract', axis=1, inplace=True)

    # ---One-Hot Encoding for InternetService ---
    internet_dummies = pd.get_dummies(df['InternetService'], prefix='InternetService', dtype='int8')
    df = pd.concat([df, internet_dummies], axis=1)
    df.drop('InternetService', axis=1, inplace=True)

    # ---Group the payment method into automatic (1) and manual (0)---
    df['AutomaticPayment'] = df['PaymentMethod'].str.contains('automatic', case=False, na=False).astype(int)
    df.drop('PaymentMethod', axis=1, inplace=True)

    print("\nSample DataFrame after transformation:")
    print(df.head())

    # ---Feature Scaling---
    # Scale the numerical features so that features with large ranges do not
    # dominate features with smaller ranges

    # Define features (X) and target (y)
    X = df.drop('Churn', axis=1)
    y = df['Churn']

    # Split the data BEFORE scaling so no test-set statistics leak into the scaler
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Work on explicit copies: assigning columns into the frames returned by the
    # split can otherwise raise pandas' SettingWithCopyWarning
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Initialize the scaler
    scaler = StandardScaler()

    # Identify the numerical columns to scale
    numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

    # Fit ONLY on the training data
    scaler.fit(X_train[numerical_features])

    # Transform both the training and testing data with the training statistics
    X_train[numerical_features] = scaler.transform(X_train[numerical_features])
    X_test[numerical_features] = scaler.transform(X_test[numerical_features])

    # --- Display the result ---
    print("Data after Standardization:")
    print(X_train[numerical_features].head())

    # Output the processed data into a csv file.
    # NOTE(review): this writes the encoded but UNSCALED frame, with the row
    # index as the first (unnamed) column; the scaled train/test splits are not
    # persisted. Confirm this is what downstream steps expect.
    df.to_csv('clean_data.csv')

Primary key 'customerID' is unique.
DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   obj