In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Installation for auto-sklearn
# !apt-get remove swig
# !apt-get install swig3.0 build-essential -y
# !ln -s /usr/bin/swig3.0 /usr/bin/swig
# !apt-get install build-essential
# !pip install --upgrade setuptools
# !pip install auto-sklearn
# !pip install -U scikit-learn
# !pip freeze | grep scikit-learn

**Goal**
* See why customers are churning
* How to stop customers from churning (retention)
* Predict customers likely to churn

**Useful insights**
* Lost revenue due to customers churning
* Revenue gained in x time from current customers not churning

In [3]:
# Display configuration: never truncate cell contents, show every column,
# and render floats with two decimals. Full option keys are spelled out
# rather than relying on pandas' partial-name matching.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

In [4]:
# Load the Telco customer churn CSV from the Kaggle input directory
# and preview the first rows.
df = pd.read_csv(
    '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)
df.head()

In [5]:
# Initial EDA: shape, summary statistics, dtypes, null counts and cardinality
print(df.shape)
print('\n')
print(df.describe())
print('\n')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print('\n')
print(df.isnull().sum())
print('\n')
print(df.nunique())

No null values. There are a few columns with fewer than 5 unique values; these can probably be used as categorical features in the model. Let's see what the unique values are.

In [6]:
# For every column with at most 4 distinct values, print its name,
# the distinct values themselves, and how often each one occurs.
low_card_cols = df.columns[df.nunique() <= 4]
for name in low_card_cols:
    values = df[name]
    print(name)
    print(values.unique())
    print(values.value_counts())
    print('\n')

In [7]:
# Total charges should be a float value.
# errors='coerce' turns entries that cannot be parsed as numbers into NaN
# (they are inspected and dropped in the following cells).
# No downcast is requested: downcast='float' would store the column as
# float32 and round the charges, and casting back to float64 afterwards
# cannot recover the lost precision.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [8]:
# Count the rows whose TotalCharges is NaN, then display those rows.
missing_total = df['TotalCharges'].isna()
print(missing_total.sum())
print('\n')
df.loc[missing_total]

11 rows of NaN TotalCharges. These rows can be dropped so the dataset doesn't negatively affect the analysis. This drop affects less than 1% of the dataset

In [9]:
# Drop the rows whose TotalCharges is NaN.
# The result is reassigned instead of using inplace=True. The original row
# index is intentionally kept (no reset_index) because later cells align
# predictions back onto the raw CSV by index.
df = df.dropna(axis=0, how='any', subset=['TotalCharges'])

print(df.isna().sum()) # should be 0
# Enforce the expectation above instead of relying on reading the printout.
assert not df.isna().any().any(), 'missing values remain after dropping NaN TotalCharges'

In [10]:
# Store TotalCharges as 64-bit float so it lines up with MonthlyCharges,
# then list the resulting column dtypes.
df['TotalCharges'] = df['TotalCharges'].astype('float64')
df.dtypes

In [11]:
# Change the 0/1 flag to 'No'/'Yes' like the other columns.
# A plain value mapping is used instead of np.select: np.select falls back
# to the integer default 0 for unmatched rows, which cannot be combined with
# string choices on recent NumPy versions (dtype-promotion TypeError).
# replace() also leaves already-converted 'No'/'Yes' values untouched, so
# re-running this cell is harmless.
df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})

df['SeniorCitizen'].unique()

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# One stacked histogram per low-cardinality column (at most 4 distinct
# values), split by Churn, each drawn on its own numbered figure.
categorical_cols = df.columns[df.nunique() <= 4]
for fig_num, feature in enumerate(categorical_cols):
    plt.figure(fig_num)
    ax = sns.histplot(data=df, x=feature, hue='Churn', multiple='stack')
    ax.set_title(feature+' & Churn')

Customers who have added a particular set of services (Online Security, Online Backup, Tech Support, Device Protection) are **less likely to churn**

Customers with more flexibility (no partner, not dependent, month-to-month contract) are **more likely to churn**

Customers with streaming are **more likely to churn**. This is most likely because no single streaming service carries every movie or TV show, which makes customers more likely to hop between streaming services

Customers with Electronic Payments, Fiber Optic Internet, and Paperless Billing are **more likely to churn**

In [14]:
# Stacked histograms of the int64/float64 columns (charges and tenure),
# split by Churn, one numbered figure per column.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
for fig_num, feature in enumerate(numeric_cols):
    plt.figure(fig_num)
    ax = sns.histplot(data=df, x=feature, hue='Churn', multiple='stack')
    ax.set_title(feature+' & Churn')

Customers churn early, indicated by the high churn rate in Total Charges and Tenure in the lower tail of the distribution

Customers are more likely to churn when their monthly charges are roughly \$70 to \$110

In [15]:
# Per Churn group: total of TotalCharges, mean tenure and mean MonthlyCharges.
churn_summary = df.groupby('Churn').agg(
    TotalCharges=('TotalCharges', 'sum'),
    tenure=('tenure', 'mean'),
    MonthlyCharges=('MonthlyCharges', 'mean'),
)
churn_summary

It seems customers that churn are getting charged more in total and monthly than charges of customers that do not churn

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
# Integer-encode every column with at most 4 distinct values and store the
# codes as a pandas categorical. The encoder is refit for each column.
lb = LabelEncoder()
columns_to_encode = df.columns[df.nunique() <= 4]
for name in columns_to_encode:
    codes = lb.fit_transform(df[name])
    df[name] = codes
    df[name] = df[name].astype('category')

In [18]:
from sklearn.preprocessing import MinMaxScaler

In [19]:
# Min-max scale the int64/float64 columns in place of their raw values.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split that happens further down, so the scaling parameters are computed
# with knowledge of the test rows - consider fitting on the training split only.
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [20]:
from autosklearn.classification import AutoSklearnClassifier

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [22]:
# Split data into X and y.
# Columns are selected by name rather than by position: a later cell appends
# 'ChurnPreds' and 'ConfusionMatrix' to df, so positional slicing
# (iloc[:, 1:-1] / iloc[:, -1]) would pick the wrong target and leak Churn
# into the features if this cell were ever re-run after that point.
id_column = df.columns[0]  # the customer identifier is the first column
non_feature_columns = [id_column, 'Churn', 'ChurnPreds', 'ConfusionMatrix']
X = df.drop(columns=non_feature_columns, errors='ignore') # The numeric and categorical features
y = df['Churn'] # Churn

# Train test split (70/30). stratify=y keeps the churn / no-churn ratio the
# same in both splits, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1, stratify=y
)

In [23]:
# Configure the auto-sklearn search: 2*60 overall time budget, a limit of 30
# per individual run, and 8 parallel jobs; then fit on the training split.
automl = AutoSklearnClassifier(
    time_left_for_this_task=2*60,
    per_run_time_limit=30,
    n_jobs=8,
)
automl.fit(X_train, y_train)

# Predict the hold-out rows and wrap the result in a categorical Series that
# carries the test index, so it can later be aligned back onto df.
preds = automl.predict(X_test)
y_preds = pd.Series(preds, index=y_test.index, name='ChurnPreds').astype('category')

# Plot how the search's performance evolved over time
plot_options = dict(
    x='Timestamp',
    kind='line',
    legend=True,
    title='Auto-sklearn accuracy over time',
    grid=True,
)
automl.performance_over_time_.plot(**plot_options)
plt.show()

# Text summary of the search and of the models it kept
print(automl.sprint_statistics())
print(automl.show_models())

# Accuracy of the final predictions on the hold-out split
acc = accuracy_score(y_test, y_preds)
print("Accuracy: %.3f" % acc)

In [24]:
# Display the leaderboard of the fitted auto-sklearn search
automl.leaderboard()

In [25]:
# Evaluate model with dataset
df['ChurnPreds'] = y_preds

# Create ConfusionMatrix
df['ConfusionMatrix'] = np.select(condlist=[(df.Churn == 1) & (df.ChurnPreds == 1), 
                                            (df.Churn == 1) & (df.ChurnPreds == 0), 
                                            (df.Churn == 0) & (df.ChurnPreds == 1), 
                                            (df.Churn == 0) & (df.ChurnPreds == 0)],
                                  choicelist=['TP', 'FP', 'FN', 'TN'],
                                  default=None)

# Customers likely to churn
print(str(df.loc[df.ConfusionMatrix == 'FN'].shape[0]) + ' customers likely to churn')
df.loc[df.ConfusionMatrix == 'FN'].head()

In [26]:
# Get original dataset and bring in the ConfusionMatrix column to evaluate results
df_orig = pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df_orig['ConfusionMatrix'] = df.ConfusionMatrix

In [27]:
# See all features with 4 or less unique values and its relationship with Churning Predictions
for i, col in enumerate(df_orig.loc[:, df_orig.nunique() <= 4].columns):
    plt.figure(i)
    sns.histplot(x=col, 
                 data=df_orig, 
                 hue='ConfusionMatrix', 
                 multiple='stack',
                 palette={'TP':'green', 'TN':'blue', 'FP':'red', 'FN':'orange'}).set_title(col+' & ConfusionMatrix')


In [28]:
# Let's see how charges and tenure evaluate with customer churn
for i, col in enumerate(df_orig.select_dtypes(include=['int64','float64']).columns):
    plt.figure(i)
    sns.histplot(x=col, 
                 data=df_orig, 
                 hue='ConfusionMatrix', 
                 multiple='stack',
                 palette={'TP':'green', 'TN':'blue', 'FP':'red', 'FN':'orange'}).set_title(col+' & ConfusionMatrix')

In [29]:
# Customers likely to churn
df_orig.loc[df_orig.ConfusionMatrix == 'FN']

auto-sklearn does a good job of predicting which customers will churn. Performance could probably be improved by bootstrapping (oversampling) the churned customers, since the data is imbalanced, with more No Churns than Churns.

It seems the business model already does a good job of addressing churn by charging higher monthly rates and overall charges to the customers who are likely to churn.

To prevent future customers from churning, try reaching out to current customers to set them up with a longer-term contract and a lower monthly rate. Customers with these two characteristics show the lowest likelihood of churning.