In [85]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Load the raw customer-churn dataset and preview its first five rows.
df = pd.read_csv('../data/customer_churn_data.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,CUST0000,Male,0,No,Yes,23,No,No phone service,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer,49.85,1146.55,No
1,CUST0001,Female,0,Yes,No,43,No,No phone service,DSL,Yes,...,Yes,No,Yes,No,Month-to-month,No,Mailed check,100.7,4330.1,Yes
2,CUST0002,Male,1,No,No,51,Yes,No,DSL,No,...,Yes,Yes,No,No,One year,No,Electronic check,97.33,4963.83,Yes
3,CUST0003,Male,1,No,No,72,Yes,Yes,DSL,Yes,...,Yes,No,No,No,Month-to-month,No,Credit card,101.38,7299.36,No
4,CUST0004,Male,1,No,No,25,Yes,Yes,DSL,No,...,No,Yes,No,Yes,Month-to-month,No,Electronic check,52.22,1305.5,Yes


In [3]:
# Report the dimensions of the loaded frame.
row_count, column_count = df.shape
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")

Number of rows: 5880
Number of columns: 21


In [4]:
# Print the column names, non-null counts and dtypes of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5880 entries, 0 to 5879
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        5880 non-null   object 
 1   gender            5880 non-null   object 
 2   SeniorCitizen     5880 non-null   int64  
 3   Partner           5880 non-null   object 
 4   Dependents        5880 non-null   object 
 5   tenure            5880 non-null   int64  
 6   PhoneService      5880 non-null   object 
 7   MultipleLines     5880 non-null   object 
 8   InternetService   5880 non-null   object 
 9   OnlineSecurity    5880 non-null   object 
 10  OnlineBackup      5880 non-null   object 
 11  DeviceProtection  5880 non-null   object 
 12  TechSupport       5880 non-null   object 
 13  StreamingTV       5880 non-null   object 
 14  StreamingMovies   5880 non-null   object 
 15  Contract          5880 non-null   object 
 16  PaperlessBilling  5880 non-null   object 


In [5]:
# Print the frequency of every distinct value, one column at a time.
for column_name in df.columns:
    print(f"Details of the '{column_name}' Column:")
    value_frequencies = df[column_name].value_counts()
    print(value_frequencies)
    # Blank lines to visually separate one column's report from the next.
    print('\n\n')

Details of the 'customerID' Column:
customerID
CUST5879    1
CUST0000    1
CUST0001    1
CUST0002    1
CUST0003    1
           ..
CUST0012    1
CUST0011    1
CUST0010    1
CUST0009    1
CUST0008    1
Name: count, Length: 5880, dtype: int64



Details of the 'gender' Column:
gender
Male      2950
Female    2930
Name: count, dtype: int64



Details of the 'SeniorCitizen' Column:
SeniorCitizen
1    2944
0    2936
Name: count, dtype: int64



Details of the 'Partner' Column:
Partner
Yes    2995
No     2885
Name: count, dtype: int64



Details of the 'Dependents' Column:
Dependents
No     3004
Yes    2876
Name: count, dtype: int64



Details of the 'tenure' Column:
tenure
15    101
48    100
17     98
18     98
10     97
     ... 
11     71
40     70
20     69
35     63
27     61
Name: count, Length: 72, dtype: int64



Details of the 'PhoneService' Column:
PhoneService
Yes    2941
No     2939
Name: count, dtype: int64



Details of the 'MultipleLines' Column:
MultipleLines
No phone servic

#### Converting categorical column values to numeric codes using `.map`

For the PaymentMethod column

In [6]:
# Disabled cell: the PaymentMethod encoding and the CSV write below are wrapped
# in a string literal, so none of it executes when this cell runs.
# NOTE(review): a later df.head() output shows PaymentMethod already numeric,
# so this presumably ran once before being disabled — confirm.
""" df['PaymentMethod'] = df['PaymentMethod'].map({
    'Credit card': 0,
    'Electronic check': 1,
    'Mailed check': 2,
    'Bank transfer': 3,
})
df.to_csv('../data/preprocessed_data.csv') """

For the columns `OnlineSecurity`, `OnlineBackup`, `DeviceProtection`, `TechSupport`, `StreamingTV` and `StreamingMovies`:

In [15]:
# Disabled cell: the StreamingTV encoding and the CSV write below are wrapped
# in a string literal, so none of it executes when this cell runs.
# NOTE(review): only StreamingTV appears here; presumably the cell was edited
# and re-run for each of the other internet add-on columns — confirm.
""" df['StreamingTV'] = df['StreamingTV'].map({
    'No': 0,
    'Yes': 1,
    'No internet service': 2,
})
df.to_csv('../data/preprocessed_data.csv') """

For the columns `PhoneService`, `MultipleLines`, `InternetService` and `PaperlessBilling`:

In [23]:
# Encode PaperlessBilling as 0/1.
# Series.map turns every value that is missing from the dict into NaN, so
# running the unguarded mapping a second time on the already-encoded column
# would replace all of it with NaN (and then save that to disk). Only map
# while the column still holds the raw string labels.
paperless_billing_codes = {
    'No': 0,
    'Yes': 1,
}
if not pd.api.types.is_numeric_dtype(df['PaperlessBilling']):
    df['PaperlessBilling'] = df['PaperlessBilling'].map(paperless_billing_codes)

# Persist the current state of the frame (the index is written too, as before).
df.to_csv('../data/preprocessed_data.csv')

In [24]:
# Preview the first five rows of df.
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,CUST0000,Male,0,No,Yes,23,0,2,0,2,2,2,2,2,2,Month-to-month,1,3,49.85,1146.55,No
1,CUST0001,Female,0,Yes,No,43,0,2,1,1,0,1,0,1,0,Month-to-month,0,2,100.7,4330.1,Yes
2,CUST0002,Male,1,No,No,51,1,0,1,0,1,1,1,0,0,One year,0,1,97.33,4963.83,Yes
3,CUST0003,Male,1,No,No,72,1,1,1,1,0,1,0,0,0,Month-to-month,0,0,101.38,7299.36,No
4,CUST0004,Male,1,No,No,25,1,1,1,0,0,0,1,0,1,Month-to-month,0,1,52.22,1305.5,Yes


In [54]:
# Recompute TotalCharges as tenure multiplied by MonthlyCharges,
# then write the frame (index included) to the preprocessed CSV.
df['TotalCharges'] = df['tenure'].mul(df['MonthlyCharges'])
df.to_csv(path_or_buf='../data/preprocessed_data.csv')

In [55]:
# Preview the first four rows of df.
df.head(4)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Actual_Charges
0,CUST0000,Male,0,No,Yes,23,0,2,0,2,2,2,2,2,2,Month-to-month,1,3,49.85,1146.55,No,1146.55
1,CUST0001,Female,0,Yes,No,43,0,2,1,1,0,1,0,1,0,Month-to-month,0,2,100.7,4330.1,Yes,4330.1
2,CUST0002,Male,1,No,No,51,1,0,1,0,1,1,1,0,0,One year,0,1,97.33,4963.83,Yes,4963.83
3,CUST0003,Male,1,No,No,72,1,1,1,1,0,1,0,0,0,Month-to-month,0,0,101.38,7299.36,No,7299.36


In [64]:
# Remove the Actual_Charges column from df in place.
# errors='ignore' keeps the cell re-runnable: without it, a second run (or any
# run where the column is not present) raises KeyError.
df.drop(columns=['Actual_Charges'], errors='ignore', inplace=True)

In [93]:
# Read the preprocessed CSV from disk into a separate frame named data.
data = pd.read_csv('../data/preprocessed_data.csv')

In [92]:
# Encode Contract as an ordinal code (0 = month-to-month, 1 = one year, 2 = two year).
# Series.map turns every value that is missing from the dict into NaN, so
# running the unguarded mapping a second time on the already-encoded column
# would replace all of it with NaN (and then save that to disk). Only map
# while the column still holds the raw string labels.
contract_codes = {
    'Month-to-month': 0,
    'One year': 1,
    'Two year': 2,
}
if not pd.api.types.is_numeric_dtype(df['Contract']):
    df['Contract'] = df['Contract'].map(contract_codes)

# Persist the current state of the frame (the index is written too, as before).
df.to_csv('../data/preprocessed_data.csv')

In [75]:
# Write data back to the preprocessed CSV; index=False omits the row index.
data.to_csv('../data/preprocessed_data.csv', index=False)

#### Feature Selection

The `customerID` and `PaperlessBilling` columns will be dropped, along with the stray `Unnamed: 0` index column.

In [95]:
# Drop the columns that are not used as model features, in place.
# errors='ignore' makes the cell safe to re-run and tolerant of a CSV that was
# written with index=False and therefore has no 'Unnamed: 0' column; without it
# either situation raises KeyError.
data.drop(
    columns=['customerID', 'PaperlessBilling', 'Unnamed: 0'],
    errors='ignore',
    inplace=True,
)

In [101]:
# Print the frequency of every distinct value for each remaining column of data.
for column_name in data.columns:
    print(f"Details of {column_name} column:")
    value_frequencies = data[column_name].value_counts()
    print(value_frequencies)
    # Blank lines to visually separate one column's report from the next.
    print('\n\n')

Details of gender column:
gender
0    2950
1    2930
Name: count, dtype: int64



Details of SeniorCitizen column:
SeniorCitizen
1    2944
0    2936
Name: count, dtype: int64



Details of Partner column:
Partner
1    2995
0    2885
Name: count, dtype: int64



Details of Dependents column:
Dependents
0    3004
1    2876
Name: count, dtype: int64



Details of tenure column:
tenure
15    101
48    100
17     98
18     98
10     97
     ... 
11     71
40     70
20     69
35     63
27     61
Name: count, Length: 72, dtype: int64



Details of PhoneService column:
PhoneService
1    2941
0    2939
Name: count, dtype: int64



Details of MultipleLines column:
MultipleLines
2    2939
1    1512
0    1429
Name: count, dtype: int64



Details of InternetService column:
InternetService
0    2029
1    1936
2    1915
Name: count, dtype: int64



Details of OnlineSecurity column:
OnlineSecurity
2    2029
0    1947
1    1904
Name: count, dtype: int64



Details of OnlineBackup column:
OnlineBackup
2

In [102]:
from sklearn.model_selection import train_test_split

In [103]:
# Separate the Churn target from the remaining feature columns.
y = data['Churn']
X = data.drop(columns='Churn')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Re-attach the target to each split so every saved file is self-contained.
train_data = pd.concat([X_train, y_train], axis='columns')
test_data = pd.concat([X_test, y_test], axis='columns')

# Write both splits to disk without the row index.
for split_frame, split_path in (
    (train_data, '../data/train_data.csv'),
    (test_data, '../data/test_data.csv'),
):
    split_frame.to_csv(split_path, index=False)