In [238]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from IPython.display import display
from sklearn.metrics import mutual_info_score

### 1. Data importing 

In [200]:
df = pd.read_csv('telecom_churn.csv')

In [201]:
df.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


### 2. Data Preparation 

In [202]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [203]:
df.head().T

Unnamed: 0,0,1,2,3,4
state,KS,OH,NJ,OH,OK
account_length,128,107,137,84,75
area_code,415,415,415,408,415
international_plan,No,No,No,Yes,Yes
voice_mail_plan,Yes,Yes,No,No,No
number_vmail_messages,25,26,0,0,0
total_day_minutes,265.1,161.6,243.4,299.4,166.7
total_day_calls,110,123,114,71,113
total_day_charge,45.07,27.47,41.38,50.9,28.34
total_eve_minutes,197.4,195.5,121.2,61.9,148.3


In [204]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account_length          3333 non-null   int64  
 2   area_code               3333 non-null   int64  
 3   international_plan      3333 non-null   object 
 4   voice_mail_plan         3333 non-null   object 
 5   number_vmail_messages   3333 non-null   int64  
 6   total_day_minutes       3333 non-null   float64
 7   total_day_calls         3333 non-null   int64  
 8   total_day_charge        3333 non-null   float64
 9   total_eve_minutes       3333 non-null   float64
 10  total_eve_calls         3333 non-null   int64  
 11  total_eve_charge        3333 non-null   float64
 12  total_night_minutes     3333 non-null   float64
 13  total_night_calls       3333 non-null   int64  
 14  total_night_charge      3333 non-null   

In [205]:
df.isna().sum()

state                     0
account_length            0
area_code                 0
international_plan        0
voice_mail_plan           0
number_vmail_messages     0
total_day_minutes         0
total_day_calls           0
total_day_charge          0
total_eve_minutes         0
total_eve_calls           0
total_eve_charge          0
total_night_minutes       0
total_night_calls         0
total_night_charge        0
total_intl_minutes        0
total_intl_calls          0
total_intl_charge         0
customer_service_calls    0
churn                     0
dtype: int64

In [206]:
df.international_plan = (df.international_plan == 'Yes').astype(int)

In [207]:
df.voice_mail_plan = (df.voice_mail_plan == 'Yes').astype(int)

In [208]:
df.churn = (df.churn == True).astype(int)

In [209]:
(df.churn == 'True').astype(int)

0       0
1       0
2       0
3       0
4       0
       ..
3328    0
3329    0
3330    0
3331    0
3332    0
Name: churn, Length: 3333, dtype: int64

In [210]:
df.head()

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn
0,KS,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,OH,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,NJ,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,OH,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,OK,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [211]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account_length          3333 non-null   int64  
 2   area_code               3333 non-null   int64  
 3   international_plan      3333 non-null   int64  
 4   voice_mail_plan         3333 non-null   int64  
 5   number_vmail_messages   3333 non-null   int64  
 6   total_day_minutes       3333 non-null   float64
 7   total_day_calls         3333 non-null   int64  
 8   total_day_charge        3333 non-null   float64
 9   total_eve_minutes       3333 non-null   float64
 10  total_eve_calls         3333 non-null   int64  
 11  total_eve_charge        3333 non-null   float64
 12  total_night_minutes     3333 non-null   float64
 13  total_night_calls       3333 non-null   int64  
 14  total_night_charge      3333 non-null   

In [212]:
categorical = list(df.dtypes[df.dtypes == 'object'].index)
categorical

['state']

In [213]:
numerical = df.select_dtypes(include=np.number).columns.tolist()
numerical

['account_length',
 'area_code',
 'international_plan',
 'voice_mail_plan',
 'number_vmail_messages',
 'total_day_minutes',
 'total_day_calls',
 'total_day_charge',
 'total_eve_minutes',
 'total_eve_calls',
 'total_eve_charge',
 'total_night_minutes',
 'total_night_calls',
 'total_night_charge',
 'total_intl_minutes',
 'total_intl_calls',
 'total_intl_charge',
 'customer_service_calls',
 'churn']

### 3. Data Splitting (Validation Framework)

In [214]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1, shuffle= True)
df_train, df_val = train_test_split(df_full_train, test_size=0.25 ,random_state=1, shuffle=True)

In [215]:
len(df), len(df_full_train), len (df_train), len (df_val), len(df_test)

(3333, 2666, 1999, 667, 667)

In [216]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [217]:
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values


del df_train['churn']
del df_val['churn']
del df_test['churn']

### 4. EDA 'Exploratory Data Analysis'

In [218]:
df_full_train = df_full_train.reset_index(drop=True)

In [219]:
df_full_train.isnull().sum()

state                     0
account_length            0
area_code                 0
international_plan        0
voice_mail_plan           0
number_vmail_messages     0
total_day_minutes         0
total_day_calls           0
total_day_charge          0
total_eve_minutes         0
total_eve_calls           0
total_eve_charge          0
total_night_minutes       0
total_night_calls         0
total_night_charge        0
total_intl_minutes        0
total_intl_calls          0
total_intl_charge         0
customer_service_calls    0
churn                     0
dtype: int64

In [220]:
df_full_train.churn.value_counts()

0    2281
1     385
Name: churn, dtype: int64

In [221]:
df_full_train.churn.mean()

0.1444111027756939

In [223]:
df_full_train[numerical].nunique()

account_length             211
area_code                    3
international_plan           2
voice_mail_plan              2
number_vmail_messages       45
total_day_minutes         1479
total_day_calls            118
total_day_charge          1479
total_eve_minutes         1447
total_eve_calls            117
total_eve_charge          1303
total_night_minutes       1441
total_night_calls          119
total_night_charge         883
total_intl_minutes         159
total_intl_calls            21
total_intl_charge          159
customer_service_calls      10
churn                        2
dtype: int64

In [224]:
df_full_train[categorical].nunique()

state    51
dtype: int64

### 5. Feature Importance

#### Churn Rate

In [227]:
churn_international_plan = df_full_train[df_full_train.international_plan == 1].churn.mean()
churn_international_plan

0.4367816091954023

In [228]:
churn_no_international_plan = df_full_train[df_full_train.international_plan == 0].churn.mean()
churn_no_international_plan

0.11268191268191269

In [230]:
global_churn = df_full_train.churn.mean()
global_churn

0.1444111027756939

In [231]:
global_churn - churn_international_plan

-0.2923705064197084

In [232]:
global_churn - churn_no_international_plan

0.03172919009378122

In [234]:
churn_cst_service = df_full_train[df_full_train.customer_service_calls >= 1].churn.mean()
churn_cst_service

0.1460620525059666

In [235]:
churn_no_cst_service = df_full_train[df_full_train.customer_service_calls < 1].churn.mean()
churn_no_cst_service

0.138353765323993

#### Risk Ratio

In [237]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).churn.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_churn
    df_group['risk'] = df_group['mean'] / global_churn
    display(df_group)
    print()
    print()

state


Unnamed: 0_level_0,mean,count,diff,risk
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,0.066667,45,-0.077744,0.461645
AL,0.112903,62,-0.031508,0.781818
AR,0.23913,46,0.094719,1.655901
AZ,0.083333,48,-0.061078,0.577056
CA,0.269231,26,0.12482,1.864336
CO,0.152542,59,0.008131,1.056306
CT,0.157895,57,0.013484,1.09337
DC,0.088889,45,-0.055522,0.615527
DE,0.145833,48,0.001422,1.009848
FL,0.12766,47,-0.016752,0.884001






### 6. Mutual Information and Correlation 

In [239]:
mutual_info_score(df_full_train.state,df_full_train.churn)

0.013741615376543012

In [240]:
df_full_train[numerical].corrwith(df_full_train.churn).abs()

account_length            0.016006
area_code                 0.006031
international_plan        0.274008
voice_mail_plan           0.109294
number_vmail_messages     0.097776
total_day_minutes         0.200082
total_day_calls           0.014284
total_day_charge          0.200084
total_eve_minutes         0.090056
total_eve_calls           0.018048
total_eve_charge          0.090056
total_night_minutes       0.041987
total_night_calls         0.001996
total_night_charge        0.041982
total_intl_minutes        0.071595
total_intl_calls          0.071652
total_intl_charge         0.071566
customer_service_calls    0.216366
churn                     1.000000
dtype: float64

In [243]:
corr = df_full_train.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn
account_length,1.0,-0.016101,0.022297,-0.01073,-0.016168,0.00674,0.029958,0.006737,-0.00223,0.016401,-0.002226,-0.011431,-0.021813,-0.011438,-0.002561,0.024995,-0.002553,-0.003687,0.016006
area_code,-0.016101,1.0,0.059935,-0.003604,-0.00631,-0.019102,-0.019999,-0.019102,-0.00192,-0.016817,-0.001882,0.000632,0.03213,0.000623,-0.016375,-0.020938,-0.016452,0.026529,0.006031
international_plan,0.022297,0.059935,1.0,-0.011009,-0.00594,0.061642,0.017776,0.061642,0.027353,0.004791,0.027358,-0.02634,0.026493,-0.026335,0.050342,0.002568,0.050221,-0.034475,0.274008
voice_mail_plan,-0.01073,-0.003604,-0.011009,1.0,0.956982,-0.001338,0.004215,-0.001341,0.011807,-0.017103,0.011822,0.018137,0.007184,0.018148,0.004871,0.004033,0.00492,-0.034046,-0.109294
number_vmail_messages,-0.016168,-0.00631,-0.00594,0.956982,1.0,0.000711,0.007157,0.000707,0.008763,-0.015767,0.00878,0.023786,-0.001015,0.023787,0.009042,0.008847,0.009088,-0.030173,-0.097776
total_day_minutes,0.00674,-0.019102,0.061642,-0.001338,0.000711,1.0,0.00402,1.0,0.012765,0.014696,0.012764,-0.010892,0.015652,-0.010911,-0.009545,-0.003042,-0.009464,-0.018928,0.200082
total_day_calls,0.029958,-0.019999,0.017776,0.004215,0.007157,0.00402,1.0,0.004019,-0.032081,0.011092,-0.032075,0.023592,-0.013038,0.023587,0.009559,-0.000452,0.009657,-0.013681,0.014284
total_day_charge,0.006737,-0.019102,0.061642,-0.001341,0.000707,1.0,0.004019,1.0,0.012777,0.014695,0.012777,-0.01089,0.015652,-0.010909,-0.009549,-0.00305,-0.009468,-0.018931,0.200084
total_eve_minutes,-0.00223,-0.00192,0.027353,0.011807,0.008763,0.012765,-0.032081,0.012777,1.0,-0.035357,1.0,-0.006744,0.008484,-0.006738,-0.012046,-0.003328,-0.012062,-0.017817,0.090056
total_eve_calls,0.016401,-0.016817,0.004791,-0.017103,-0.015767,0.014696,0.011092,0.014695,-0.035357,1.0,-0.035355,-0.022888,0.015857,-0.02284,0.015195,0.027905,0.015217,0.006301,0.018048


In [256]:
churn_day_minutes = df_full_train[df_full_train.total_day_minutes < 4].churn.mean()
churn_day_minutes

0.5

### 7. One Hot Encoding 

### 8. Model Training 

### 9. Using the model 