In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file found below the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Telco customer-churn CSV from the Kaggle input directory into a DataFrame.
csv_path = '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)
df

In [None]:
import matplotlib.pyplot as plt

In [None]:
# The frame is too wide for df.head() to show every column, so flip the
# first rows: each column becomes a row and nothing is truncated.
df.head().transpose()

In [None]:
# Normalise the column names: lower-case them and use underscores instead of spaces.
df.columns = [col.lower().replace(' ', '_') for col in df.columns]
df.columns

In [None]:
# Names of all columns that pandas stored with the generic object dtype.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
categorical_columns

In [None]:
# Apply the same normalisation to the values of every object-dtype column:
# lower-case the text, then swap spaces for underscores.
for column in categorical_columns:
    lowered = df[column].str.lower()
    df[column] = lowered.str.replace(' ', '_')

In [None]:
# errors='coerce' converts every value that cannot be parsed as a number into
# NaN instead of raising, so the NaN positions in tc mark the malformed entries.
tc = pd.to_numeric(df.totalcharges, errors='coerce')
# Show only the customer id and the raw totalcharges text for the rows that
# failed to parse (single .loc selection instead of chained indexing).
df.loc[tc.isnull(), ['customerid', 'totalcharges']]

In [None]:
# Store totalcharges as numbers; entries that cannot be parsed become NaN.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [None]:
# Replace the missing totalcharges values with 0, then confirm none are left.
df['totalcharges'] = df['totalcharges'].fillna(0)
df['totalcharges'].isnull().sum()

In [None]:
# Encode the target as integers: 1 where churn is 'yes', 0 for anything else.
df['churn'] = df['churn'].eq('yes').astype(int)
df['churn']

Set up the validation framework using Scikit-Learn

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df.shape[0]

In [None]:
# Row counts of the two parts produced by the first split.
df_full_train.shape[0], df_test.shape[0]

In [None]:
# Split the full-train part again: 25% of it becomes the validation set
# (25% of the remaining 80% equals 20% of the whole dataset).
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
df_train.shape[0], df_val.shape[0]

In [None]:
# Give each split a fresh 0..n-1 index. reset_index returns a NEW DataFrame,
# so the result has to be assigned back; calling it without assignment leaves
# the original (post-split) row labels untouched.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
# Pull the target column of each split out as a NumPy array.
y_train = df_train['churn'].to_numpy()
y_val = df_val['churn'].to_numpy()
y_test = df_test['churn'].to_numpy()

In [None]:
# Remove the target column from every feature frame (in place) so it cannot
# be used as an input feature by accident.
for split_frame in (df_train, df_val, df_test):
    del split_frame['churn']

In [None]:
# List the columns currently present in the training frame.
df_train.columns

EDA - Exploratory Data Analysis

In [None]:
# Replace the row labels of df_full_train with a fresh default index;
# drop=True discards the old labels instead of keeping them as a column.
df_full_train = df_full_train.reset_index(drop=True)

In [None]:
df_full_train.churn.value_counts(normalize=True)
# normalize=True returns the share of each class instead of raw counts, so the
# share of churn == 1 is the churn rate.
# NOTE(review): the original note read this off as 26.9% - confirm against the displayed output.

In [None]:
# churn holds 0/1 values, so its mean is the overall churn rate of the full-train set.
global_churn_rate = df_full_train['churn'].mean()
round(global_churn_rate, 2)

In [None]:
# Names of the numeric feature columns.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [None]:

# Hand-picked feature columns that the sections below treat as categorical.
# (The bare `df.columns` expression that used to sit here was removed: it was
# not the last statement of the cell, so its value was never displayed.)
categorical = [
    'gender', 'seniorcitizen', 'partner', 'dependents',
    'phoneservice', 'multiplelines', 'internetservice',
    'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
    'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
    'paymentmethod',
]

In [None]:
# Count the distinct values in each of the categorical columns.
df_full_train[categorical].nunique()

Feature Importance

Churn Rate

In [None]:
# Churn rate among the rows whose gender is 'male'.
df_full_train.loc[df_full_train['gender'] == 'male', 'churn'].mean()

In [None]:
# Churn rate among the rows whose gender is 'female'.
df_full_train.loc[df_full_train['gender'] == 'female', 'churn'].mean()

In [None]:
# Overall churn rate, shown for comparison with the per-gender rates.
df_full_train['churn'].mean()

In [None]:
# Churn rate among the rows whose partner value is 'yes'.
churn_partner = df_full_train.loc[df_full_train['partner'] == 'yes', 'churn'].mean()

In [None]:
# Churn rate among the rows whose partner value is 'no', shown next to the 'yes' rate.
churn_no_partner = df_full_train.loc[df_full_train['partner'] == 'no', 'churn'].mean()
churn_partner, churn_no_partner

In [None]:
# We can now see that gender makes little difference to the churn rate, whereas having a partner does matter and is an important feature.


Risk Ratio

In [None]:
# Risk ratio for the partner == 'no' group: its churn rate divided by the
# global rate. A value above 1 means the group churns more than average.
churn_no_partner / global_churn_rate

In [None]:
# Risk ratio for the partner == 'yes' group: its churn rate divided by the
# global rate. A value below 1 means the group churns less than average.
churn_partner / global_churn_rate

Doing this in SQL

SELECT 
    gender,
    AVG(churn),
    AVG(churn) - global_churn as diff,
    AVG(churn) / global_churn as risk
FROM
    data
GROUP BY 
    gender;

In [None]:
# Pandas version of the SQL query: per-gender churn rate and group size, plus
# the difference from and the ratio to the global churn rate.
df_group = (
    df_full_train.groupby('gender')['churn']
    .agg(['mean', 'count'])
    .assign(
        diff=lambda stats: stats['mean'] - global_churn_rate,
        risk=lambda stats: stats['mean'] / global_churn_rate,
    )
)
df_group

In [None]:
# Same summary for the partner column: churn rate and size per group, then the
# difference from and the ratio to the global churn rate.
df_partner_group = (
    df_full_train.groupby('partner')['churn']
    .agg(['mean', 'count'])
    .assign(
        diff=lambda stats: stats['mean'] - global_churn_rate,
        risk=lambda stats: stats['mean'] / global_churn_rate,
    )
)
df_partner_group

In [None]:
from IPython.display import display    

In [None]:
# Print the column name and display the churn summary table (mean, count,
# diff and risk versus the global rate) for every categorical feature.
for c in categorical:
    print(c)
    churn_by_value = df_full_train.groupby(c)['churn'].agg(['mean', 'count'])
    df_group = churn_by_value.assign(
        diff=churn_by_value['mean'] - global_churn_rate,
        risk=churn_by_value['mean'] / global_churn_rate,
    )
    display(df_group)
    # Two blank lines between tables (the string newline plus print's own).
    print('\n')

Feature Importance: Mutual Information

In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
# Mutual information between each categorical feature and the churn target.
churn_labels = df_full_train['churn']
for c in categorical:
    print(c)
    score = mutual_info_score(df_full_train[c], churn_labels)
    display(score)
    print()

In [None]:
# The same score can be wrapped in a function and applied to every column of
# our data frame with DataFrame.apply().
def mutual_info_churn_score(series, target=None):
    """Return the mutual information between ``series`` and a target column.

    Parameters
    ----------
    series
        Labels of one feature column; passed as the first argument to
        ``mutual_info_score``.
    target : optional
        Labels to compare against. When omitted, ``df_full_train.churn`` is
        used, which is what the one-argument form has always done.

    Returns
    -------
    The value returned by ``mutual_info_score(series, target)``.
    """
    if target is None:
        # Resolved at call time so the function always sees the current frame.
        target = df_full_train.churn
    return mutual_info_score(series, target)



In [None]:
# Score every categorical column against churn: apply() calls the function
# once per column and collects the results in a Series.
mi = df_full_train[categorical].apply(mutual_info_churn_score)
# Rank the features from highest to lowest mutual information.
mi.sort_values(ascending=False)