# Customer Churn Project

## Imports

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline # Use ImbPipeline for resampling steps
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import KFold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

## Constants

In [3]:
# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = '../data/'
# Excel workbook that is read into `df` below.
DATA_FILE_NAME = 'Customer_Churn_Dataset.xlsx'

## Import and Inspect Data

In [4]:
# Read the raw workbook and print a per-column summary (non-null counts, dtypes).
source_file = f"{DATA_PATH}{DATA_FILE_NAME}"
df = pd.read_excel(source_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
df.describe()

In [None]:
df.head()

## Fix Data

### Fix Total Charges - String => Numeric

In [None]:
# errors='coerce' turns every value that cannot be parsed as a number into NaN
# instead of raising; those NaNs are counted and dropped further down.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [None]:
df.info()

## Handle Missing Values

In [None]:
# Share of rows whose TotalCharges could not be parsed, as a percentage.
total_charges = df['TotalCharges']
null_count = total_charges.isna().sum()
non_null_count = total_charges.count()
row_count = non_null_count + null_count
null_percentage = round(null_count * 100 / row_count, 2)
f" Percentage of Null Total Charges Values: {null_percentage}%"

Since fewer than 1% of the values are null, the simplest strategy is to drop those rows.

In [None]:
# Rebind instead of mutating in place: the cell stays safe to re-run and no
# other reference to the frame changes underneath the reader.
df = df.dropna(subset=['TotalCharges'])
f"New Null Count: {df['TotalCharges'].isna().sum()}"

## Encoding Categorical Values

### Get Range of Unique Values for Categorical Columns

In [None]:
df.info()

In [None]:
# Columns holding categorical labels (including the target, Churn).
category_columns = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod', 'Churn',
]

# List the distinct labels per column to decide how each one should be encoded.
for column in category_columns:
    print(f"Unique values in column '{column}': {df[column].unique()}")

### Convert Binary Values to 0/1

In [None]:
# Shared Yes/No encoding. For the add-on services, "No phone service" /
# "No internet service" also implies "No", so those labels are folded into 0.
yes_no = {'Yes': 1, 'No': 0}
yes_no_phone = {**yes_no, 'No phone service': 0}
yes_no_internet = {**yes_no, 'No internet service': 0}

binary_encodings = {
    'gender': {'Female': 1, 'Male': 0},
    'Partner': yes_no,
    'Dependents': yes_no,
    'PhoneService': yes_no,
    'PaperlessBilling': yes_no,
    'MultipleLines': yes_no_phone,
    'OnlineSecurity': yes_no_internet,
    'OnlineBackup': yes_no_internet,
    'DeviceProtection': yes_no_internet,
    'TechSupport': yes_no_internet,
    'StreamingTV': yes_no_internet,
    'StreamingMovies': yes_no_internet,
    # Target column
    'Churn': yes_no,
}

for column, encoding in binary_encodings.items():
    # replace() used to downcast the resulting object column to int silently;
    # pandas has deprecated that, so the integer dtype is pinned explicitly.
    # astype also fails loudly if a column still holds an unmapped label.
    df[column] = df[column].replace(encoding).astype('int64')

In [None]:
# Re-list the distinct values to confirm the binary columns are now 0/1.
for column in category_columns:
    print(f"Unique values in column '{column}': {df[column].unique()}")

### Convert Multi-category columns

#### Has Sensible Order, So Adaptable to Numerical Transformation

In [None]:
# Ordinal encoding: a longer commitment gets a larger value.
contract_order = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}
# astype pins the integer dtype rather than relying on replace() downcasting
# the object column, which pandas has deprecated.
df['Contract'] = df['Contract'].replace(contract_order).astype('int64')

In [None]:
# Re-list the distinct values to confirm Contract is now encoded as 0/1/2.
for column in category_columns:
    print(f"Unique values in column '{column}': {df[column].unique()}")

#### One-Hot Encoded Categorical Columns

In [None]:
# InternetService and PaymentMethod have no natural order, so each is expanded
# into one indicator column per category (named '<column>_<category>').
# NOTE(review): no category is dropped, so each group of indicators is
# perfectly collinear - consider drop_first=True for the linear model.
df_encoded = pd.get_dummies(df, columns=['InternetService', 'PaymentMethod'])

In [None]:
df_encoded.info()

### Scale Appropriate Columns

In [None]:
# Rescale each numeric column to [0, 1] independently, reading the raw values
# from `df` and writing the scaled values to `df_encoded`.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set min/max leak into training - consider scaling inside the
# model pipeline instead.
scaler = MinMaxScaler()
numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges', 'numAdminTickets', 'numTechTickets']
for numeric_column in numeric_columns:
    df_encoded[numeric_column] = scaler.fit_transform(df[[numeric_column]])


## Drop Customer ID - it carries no numerical information, and removing it keeps personal identifiers out of the modelling data.

In [None]:
# Modelling frame: everything except the customer identifier.
df_calc = df_encoded.drop(columns=['customerID'])
df_calc.head()

In [None]:
df_calc.corr()

## Prepare Data Input, Output 

In [None]:
# Split the modelling frame into the feature matrix and the churn target.
target_column = 'Churn'
y = df_calc[target_column]
X = df_calc.drop(columns=[target_column])

## Function to Test a Generic Model

In [None]:
def get_scores(model_meta, data, use_smote=False):
    """Cross-validate a classifier and print its hold-out metrics.

    The data is split 80/20, stratified on the target. Precision is estimated
    with 10-fold stratified cross-validation on the training part; the same
    pipeline is then refitted on the whole training part and evaluated once on
    the held-out test part.

    Parameters
    ----------
    model_meta : dict
        ``{'model': estimator, 'name': str}``. The estimator is the final step
        of the pipeline that gets fitted, so it is fitted by this call.
    data : dict
        ``{'X': feature matrix, 'y': binary target}``.
    use_smote : bool, default False
        When True, a SMOTE oversampling step is placed in front of the
        classifier, both during cross-validation and for the final fit.

    Returns
    -------
    None
        All results are printed.
    """
    features, target = data['X'], data['y']

    # Stratify on the target that was passed in, not on a notebook-level
    # global that merely happens to hold the same values.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    model = model_meta['model']

    steps = [('classifier', model)]
    if use_smote:
        # Inside the pipeline, resampling is applied only when fitting, so the
        # validation folds and the test set are never oversampled.
        steps.insert(0, ('smote', SMOTE(random_state=42)))
    pipeline = ImbPipeline(steps)

    # Cross-validate the whole pipeline on the training part only.
    precision_scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring='precision',  # Or 'accuracy', 'recall', 'f1'
        n_jobs=-1
    )

    print('------------------------')
    print('Results for ' + model_meta['name'])
    print('------------------------')

    print(f"Average Precision with CV: {precision_scores.mean():.4f}")

    # Fit and predict through the pipeline rather than the bare model;
    # otherwise use_smote has no effect on the hold-out metrics below.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix')
    print(cm)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)

    print('------------------------')

    print(f"Accuracy: {accuracy}")
    print(f"Precision (positive class): {precision}")
    print(f"Recall (positive class): {recall}")
    print(f"F1-Score (positive class): {f1}")


## Run Models and Get Results

In [None]:
# Baseline candidates. The random forest is seeded so that re-running the
# notebook reproduces the same numbers.
models = [
    {'model': LogisticRegression(max_iter=10000), 'name': 'Logistic Regression'},
    {'model': SVC(), 'name': 'Support Vector Classification'},
    {'model': RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42),
     'name': 'Random Forest'},
]

# One XGBoost candidate per learning rate. The label is derived from the value
# so the two cannot disagree (the second entry used to be labelled 0.01 while
# training with 0.06).
for xgb_learning_rate in (0.1, 0.06):
    models.append({
        'model': xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            random_state=42, max_depth=5, learning_rate=xgb_learning_rate
        ),
        'name': f'XG Boost - {xgb_learning_rate}',
    })

for model_dict in models:
    get_scores(model_dict, {'X': X, 'y': y})

XG Boost is promising. For low learning rate, the precision increases, but it is at the expense of low recall.

Vary the learning rate to find the optimal value.

In [None]:
# Sweep the learning rate from 0.01 to 0.10 in steps of 0.005.
# Counting in integer thousandths avoids the drift of repeatedly adding 0.005
# to a float, which produced rates (and labels) such as 0.015000000000000001.
for rate_in_thousandths in range(10, 101, 5):
    learning_rate = rate_in_thousandths / 1000
    model_dict = {'model': xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42, max_depth=5, learning_rate=learning_rate
    ), 'name': f'XG Boost  - {learning_rate}'}
    get_scores(model_dict, {'X': X, 'y': y})

Best value is about 0.03.

Vary the maximum depth of the tree for this learning rate to see if this can be further optimized.

In [None]:
# Sweep the maximum tree depth from 2 to 10 at the chosen learning rate.
for depth in range(2, 11):
    candidate = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42, max_depth=depth, learning_rate=0.03
    )
    get_scores({'model': candidate, 'name': f'XG Boost  - {depth}'}, {'X': X, 'y': y})

Optimal maximum depth is 6 before accuracy degrades.

## Add SMOTE to check for improvement.

In [None]:
# Same candidates as before, now evaluated with SMOTE oversampling.
# The random forest is seeded so the comparison is reproducible.
models = [
    {'model': LogisticRegression(max_iter=10000), 'name': 'Logistic Regression'},
    {'model': SVC(), 'name': 'Support Vector Classification'},
    {'model': RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42),
     'name': 'Random Forest'},
    {'model': xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42, max_depth=6, learning_rate=0.03
    ), 'name': 'XG Boost'},
]

for model_dict in models:
    get_scores(model_dict, {'X': X, 'y': y}, use_smote=True)

Conclusion: SMOTE made things worse! Precision decreased!

## Feature Engineering

### Combine Short-Term (Month-to-Month and Fiber)

In [None]:
df_calc_plus = df_calc.copy(deep=True)

# Month-to-month is encoded as Contract == 0, so multiplying by Contract would
# give 0 for exactly the short-term fiber customers this feature is meant to
# flag. Build the flag from explicit conditions instead.
is_month_to_month = df_calc['Contract'] == 0
has_fiber = df_calc['InternetService_Fiber optic'] == 1
df_calc_plus['ShortTermFiber'] = (is_month_to_month & has_fiber).astype(int)

### Normalization of charges to length being a customer - and rescale!!

In [None]:
# Compute the ratio from the raw values in `df`: the copies in df_calc are
# already min-max scaled, and a ratio of two scaled columns is not a charge
# per month. `df` has the same rows in the same order as df_calc_plus.
# The +1.0 keeps the denominator non-zero for a tenure of 0.
avg_charges_per_month = df['TotalCharges'] / (1.0 + df['tenure'])

# Rescale with a dedicated scaler so the shared `scaler` is not refitted.
df_calc_plus['AvgChargesPerMonth'] = (
    MinMaxScaler().fit_transform(avg_charges_per_month.to_frame()).ravel()
)

### Find New Customer (_i.e._ customers in 1st year)

In [None]:
# Compare against the raw tenure in `df`: df_calc['tenure'] has been scaled to
# [0, 1], so `<= 12` on it is true for every row and the feature is constant.
# `df` has the same index as df_calc_plus, so the assignment aligns row by row.
df_calc_plus['Is_New_Customer'] = (df['tenure'] <= 12).astype(int)

## Get New Input/Output

In [None]:
# Rebuild features and target from the frame that holds the engineered columns.
target_column = 'Churn'
y = df_calc_plus[target_column]
X = df_calc_plus.drop(columns=[target_column])

### Test Models with Feature Engineering

In [None]:
# Same candidates, evaluated on the engineered feature set.
# The random forest is seeded so the comparison is reproducible.
# Keep XGBoost at index 3: the feature-importance cell reads models[3].
models = [
    {'model': LogisticRegression(max_iter=10000), 'name': 'Logistic Regression'},
    {'model': SVC(), 'name': 'Support Vector Classification'},
    {'model': RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42),
     'name': 'Random Forest'},
    {'model': xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42, max_depth=6, learning_rate=0.03
    ), 'name': 'XG Boost'},
]

for model_dict in models:
    get_scores(model_dict, {'X': X, 'y': y})

Conclusion: Feature Engineering Did not make the overall results worse, but it also didn't really help.

In [None]:
# models[3] is the XGBoost entry fitted in the previous cell.
xgb_model = models[3]['model']

# Pair every feature name with its importance and rank from highest to lowest.
named_features = [
    {'name': feature_name, 'value': float(importance)}
    for feature_name, importance in zip(X.columns, xgb_model.feature_importances_)
]
sorted_features = sorted(named_features, key=lambda entry: entry['value'], reverse=True)
sorted_features

Feature engineering was not successful in increasing the precision, as evidenced by the low scores of the new features.

## One Last Experiment: Drop Low Impact Features

In [None]:
df_calc_slim = df_calc.copy(deep=True)

# Features that are removed for this experiment.
low_impact_features = [
    'DeviceProtection', 'OnlineBackup', 'PaymentMethod_Electronic check',
    'numAdminTickets', 'TechSupport', 'Dependents', 'MonthlyCharges',
    'PaymentMethod_Mailed check', 'MultipleLines', 'PhoneService',
    'StreamingMovies', 'StreamingTV', 'PaperlessBilling',
    'PaymentMethod_Credit card (automatic)', 'gender', 'TotalCharges',
    'PaymentMethod_Bank transfer (automatic)', 'Partner', 'SeniorCitizen',
]
X = df_calc_slim.drop(columns=['Churn', *low_impact_features])
y = df_calc_slim['Churn']

# Just Use Best Model
best_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42, max_depth=6, learning_rate=0.03
)
get_scores({'model': best_xgb, 'name': 'XG Boost'}, {'X': X, 'y': y})

Limiting to only the most important features gave worse results.

## Revert to former features 

In [None]:
# Back to the full (non-engineered) feature set.
y = df_calc['Churn']
X = df_calc.drop(columns=['Churn'])

# Rerun Best Model
best_xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': 42,
    'max_depth': 6,
    'learning_rate': 0.03,
}
get_scores(
    {'model': xgb.XGBClassifier(**best_xgb_params), 'name': 'XG Boost'},
    {'X': X, 'y': y},
)

*Conclusion:* The best case was XG Boost with a learning rate of 0.03 and a maximum depth of 6 levels. It gave an accuracy of 86%
and a precision of 76% with acceptable recall.