In [1]:
import tensorflow as tf
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
filepath = 'telco_customer_churn.csv'
df = pd.read_csv(filepath)
df = df.drop(columns=['customerID'])
df.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
5975,Male,0,Yes,Yes,7,Yes,No,Fiber optic,No,Yes,No,Yes,Yes,No,Month-to-month,Yes,Electronic check,89.75,608.8,Yes
4584,Male,1,No,No,69,Yes,Yes,DSL,Yes,No,Yes,Yes,No,Yes,Two year,No,Credit card (automatic),74.1,5031.0,No
1493,Male,0,No,No,11,Yes,No,Fiber optic,Yes,No,Yes,No,No,No,Month-to-month,Yes,Bank transfer (automatic),79.5,795.65,No
958,Male,0,No,No,25,Yes,Yes,DSL,Yes,Yes,Yes,Yes,No,Yes,Month-to-month,No,Bank transfer (automatic),81.75,2028.8,No
1431,Female,0,Yes,No,43,Yes,Yes,DSL,No,No,No,Yes,Yes,No,One year,Yes,Bank transfer (automatic),64.85,2908.2,No


In [3]:
df.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [4]:
features_cat = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
               'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
features_num = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [5]:
for col in features_cat:
    df[col] = df[col].astype('category')

for col in features_num:
    try:
        df[col] = df[col].astype('float')
    except:
        df[col] = pd.to_numeric(df[col], errors='coerce')

df.dtypes

gender              category
SeniorCitizen       category
Partner             category
Dependents          category
tenure               float64
PhoneService        category
MultipleLines       category
InternetService     category
OnlineSecurity      category
OnlineBackup        category
DeviceProtection    category
TechSupport         category
StreamingTV         category
StreamingMovies     category
Contract            category
PaperlessBilling    category
PaymentMethod       category
MonthlyCharges       float64
TotalCharges         float64
Churn                 object
dtype: object

In [6]:
# Check for missing values
for c in df.columns:
    print(f'{c}: {df[c].isna().sum()}')

gender: 0
SeniorCitizen: 0
Partner: 0
Dependents: 0
tenure: 0
PhoneService: 0
MultipleLines: 0
InternetService: 0
OnlineSecurity: 0
OnlineBackup: 0
DeviceProtection: 0
TechSupport: 0
StreamingTV: 0
StreamingMovies: 0
Contract: 0
PaperlessBilling: 0
PaymentMethod: 0
MonthlyCharges: 0
TotalCharges: 11
Churn: 0


In [7]:
df = df.dropna()

In [8]:
np.unique(df['Churn'], return_counts=True)

(array(['No', 'Yes'], dtype=object), array([5163, 1869], dtype=int64))

In [9]:
from FeatureEncoder import FeatureEncoder
from sklearn.preprocessing import LabelEncoder

y = df['Churn'].values
X = df.drop(columns=['Churn'])

args = {'dataframe': X, 'features_cat': features_cat, 'features_num': features_num}

feat_enc = FeatureEncoder(**args)
feat_enc.encode()
X = feat_enc.get_encoded_features()

labelEnc = LabelEncoder()
y = labelEnc.fit_transform(y)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, train_size=0.7, random_state=0)
X_vald, X_test, y_vald, y_test = train_test_split(X_tmp, y_tmp, train_size=0.5, random_state=0)

print(X_train.shape)
print(X_vald.shape)
print(X_test.shape)

print(np.unique(y_train, return_counts=True))
print(np.unique(y_vald, return_counts=True))
print(np.unique(y_test, return_counts=True))

(4922, 46)
(1055, 46)
(1055, 46)
(array([0, 1]), array([3608, 1314], dtype=int64))
(array([0, 1]), array([759, 296], dtype=int64))
(array([0, 1]), array([796, 259], dtype=int64))


In [11]:
def array_to_dataset(data, target, shuffle=True, batch_size=64):
    ds = tf.data.Dataset.from_tensor_slices((data, target))
    if shuffle:
        ds = ds.shuffle(batch_size*2).batch(batch_size).prefetch(batch_size)
    else:
        ds = ds.batch(batch_size)
    return ds

train_ds = array_to_dataset(X_train, y_train)
vald_ds = array_to_dataset(X_vald, y_vald, shuffle=False)
test_ds = array_to_dataset(X_test, y_test, shuffle=False)

In [12]:
import IterativeFeatureExclusion as IFE

n_features = X_train.shape[1]
_, counts = np.unique(y_train, return_counts=True)
n_classes = len(counts)
hidden_size = 64
r = 5.6498

print(f'Number of classes: {n_classes}')
model = IFE.IFENetClassifier(n_features=n_features, n_classes=n_classes, hidden_size=hidden_size, r=r)

loss_fn = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

checkpoint_path = 'checkpoints/ifeNet_telco.weights.h5'

callbacks = [tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_loss'),
             tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, monitor='val_accuracy')]

epochs = 50
batch_size = 64
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

Number of classes: 2


In [13]:
history = model.fit(train_ds, validation_data=vald_ds, epochs=epochs, callbacks=callbacks)

Epoch 1/50
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 23ms/step - accuracy: 0.7343 - loss: 0.7290 - val_accuracy: 0.7839 - val_loss: 0.5429
Epoch 2/50
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7818 - loss: 0.5273 - val_accuracy: 0.7896 - val_loss: 0.4833
Epoch 3/50
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7866 - loss: 0.4695 - val_accuracy: 0.8000 - val_loss: 0.4638
Epoch 4/50
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7935 - loss: 0.4475 - val_accuracy: 0.7962 - val_loss: 0.4477
Epoch 5/50
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7971 - loss: 0.4347 - val_accuracy: 0.7953 - val_loss: 0.4442
Epoch 6/50
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8020 - loss: 0.4322 - val_accuracy: 0.7896 - val_loss: 0.4421
Epoch 7/50
[1m77/77[0m [32m━━━━━━━━

In [14]:
model.load_weights(checkpoint_path)

In [15]:
y_pred = np.empty((0,))
y_test = np.empty((0,))

for data,label in test_ds:
    y_hat = model(data)
    y_hat = np.round(y_hat)
    y_pred = np.append(y_pred, y_hat.ravel())

    label = label.numpy()
    y_test = np.append(y_test, label.ravel())

In [16]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8
[[696 100]
 [111 148]]
              precision    recall  f1-score   support

         0.0       0.86      0.87      0.87       796
         1.0       0.60      0.57      0.58       259

    accuracy                           0.80      1055
   macro avg       0.73      0.72      0.73      1055
weighted avg       0.80      0.80      0.80      1055



In [17]:
feat_scores = feat_enc.get_feature_scores(input_scores = model.input_scores)

gender: [0.00396583 0.00368709]
SeniorCitizen: [0.01455062 0.04766959]
Partner: [0.00592292 0.00313444]
Dependents: [0.00289369 0.02467238]
tenure: [0.02871167]
PhoneService: [0.01177441 0.00842367]
MultipleLines: [0.06750987 0.01140814 0.0047435 ]
InternetService: [0.00809666 0.06747852 0.00463734]
OnlineSecurity: [0.01259974 0.00471788 0.04169517]
OnlineBackup: [0.02384704 0.00612591 0.01761342]
DeviceProtection: [0.00324003 0.00421393 0.00467762]
TechSupport: [0.0337355  0.00347348 0.02277196]
StreamingTV: [0.00418844 0.00739328 0.01919173]
StreamingMovies: [0.00429465 0.00544259 0.03856732]
Contract: [0.08250404 0.03982641 0.11185734]
PaperlessBilling: [0.0795919  0.00390136]
PaymentMethod: [0.00423606 0.00759046 0.06826049 0.00615047]
MonthlyCharges: [0.01448499]
TotalCharges: [0.00452651]
