# Churn Prediction

## Importing libraries

In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import lightgbm as lgb
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import set_config
set_config(display='diagram')

from typing import List
from typing import Dict

## Loading Data

In [15]:
# Load the raw training set into `df`; later cells in this notebook read this name.
df = pd.read_csv('../data/raw/customer_churn_dataset-training-master.csv')

# `DataFrame.info()` prints its report itself and returns None, so it is called
# directly: wrapping it in display() would render a stray `None` below the report.
df.info()

# Preview the first rows as the cell's rich output.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440833 entries, 0 to 440832
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   CustomerID         440832 non-null  float64
 1   Age                440832 non-null  float64
 2   Gender             440832 non-null  object 
 3   Tenure             440832 non-null  float64
 4   Usage Frequency    440832 non-null  float64
 5   Support Calls      440832 non-null  float64
 6   Payment Delay      440832 non-null  float64
 7   Subscription Type  440832 non-null  object 
 8   Contract Length    440832 non-null  object 
 9   Total Spend        440832 non-null  float64
 10  Last Interaction   440832 non-null  float64
 11  Churn              440832 non-null  float64
dtypes: float64(9), object(3)
memory usage: 40.4+ MB


None

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


## Validating Data

* <code>CustomerID</code> must be unique
* <code>Age</code> must be non-negative
* <code>Age</code> can't go above 100
* <code>Gender</code> must be one of ['Male', 'Female']
* <code>Usage Frequency</code> must be non-negative
* <code>Support Calls</code> must be non-negative
* <code>Payment Delay</code> must be non-negative
* <code>Subscription Type</code> must be one of ['Standard', 'Basic', 'Premium']
* <code>Contract Length</code> must be one of ['Annual', 'Monthly', 'Quarterly']
* <code>Total Spend</code> must be non-negative
* <code>Last Interaction</code> must be non-negative
* <code>Churn</code> must be one of [1.0, 0.0]

In [69]:
def check_non_negative(data: pd.core.frame.DataFrame, ls_columns: List[str]) -> Dict[str,bool]:
    """Flag the columns whose minimum value is not greater than or equal to zero.

    Note the polarity of the result: ``True`` marks a *violation* (the
    comparison ``min() >= 0`` did not hold), ``False`` means the column
    passed the check.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to inspect.
    ls_columns : list of str
        Names of the columns to check.

    Returns
    -------
    dict
        Maps every name in ``ls_columns`` to ``True`` when the column minimum
        is not ``>= 0`` and to ``False`` otherwise.
    """
    # The smallest value decides the outcome for the whole column, so a single
    # comparison per column is enough.
    return {name: not data[name].min() >= 0 for name in ls_columns}

def valid_value_of_list(data: pd.core.frame.DataFrame, ls_columns: List[str], rules_ls: Dict[str,list]) -> Dict[str,bool]:
    """Check that the given columns only contain values from an allowed list.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to validate.
    ls_columns : list of str
        Names of the columns to check.
    rules_ls : dict
        Maps each column name in ``ls_columns`` to the list of values that
        column is allowed to contain.

    Returns
    -------
    dict
        Maps every checked column to ``True`` when all of its values are
        allowed. Otherwise the value is the two-item list
        ``[False, offending_values]``, where ``offending_values`` is the set
        of distinct values found in the column that are not in its allowed
        list. (The return annotation says ``bool`` but failing columns carry
        this list.)

    Raises
    ------
    KeyError
        If a column is missing from ``data`` or has no entry in ``rules_ls``.
    """
    results = {}

    for column in ls_columns:
        # Validate the frame that was passed in, not a notebook-level global.
        values = data[column]
        is_allowed = values.isin(rules_ls[column])

        if is_allowed.all():
            results[column] = True
        else:
            # Report only values present in the data but not allowed; allowed
            # values that simply never occur are not violations. `unique()`
            # collapses repeated NaNs into one entry before building the set.
            offending = set(values[~is_allowed].unique().tolist())
            results[column] = [False, offending]

    return results

In [70]:
# Run the non-negativity rule on the numeric columns.
# In the result, True flags a column whose minimum is not >= 0.
check_non_negative(
    df,
    [
        'Age',
        'Usage Frequency',
        'Support Calls',
        'Payment Delay',
        'Total Spend',
        'Last Interaction',
    ],
)

{'Age': False,
 'Usage Frequency': False,
 'Support Calls': False,
 'Payment Delay': False,
 'Total Spend': False,
 'Last Interaction': False}

In [71]:
# Check each categorical column against its allowed values.
# Every column is listed once; a repeated name would only redo the same check.
valid_value_of_list(df, ['Gender','Subscription Type','Contract Length','Churn'],
                    {'Gender':['Male', 'Female'],
                     'Subscription Type':['Standard', 'Basic', 'Premium'],
                     'Contract Length':['Annual', 'Monthly', 'Quarterly'],
                     'Churn':[1.0,0.0]
                    })

{'Gender': [False, {nan}],
 'Subscription Type': [False, {nan}],
 'Contract Length': [False, {nan}],
 'Churn': [False, {nan}]}

In [78]:
# Occurrences per CustomerID, most frequent first: if the top count is 1,
# no identifier appears more than once.
df['CustomerID'].value_counts(ascending=False)

2.0         1
301011.0    1
301009.0    1
301008.0    1
301007.0    1
           ..
151752.0    1
151751.0    1
151750.0    1
151749.0    1
449999.0    1
Name: CustomerID, Length: 440832, dtype: int64

In [83]:
# Largest Age in the data, to compare against the upper bound of 100.
df['Age'].max()

65.0

## EDA

## Preprocessing

## Model development & Evaluation