In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
# Check if GPU is available
import tensorflow as tf

# Hide GPU from visible devices
USE_GPU = False
if not (USE_GPU):
    tf.config.set_visible_devices([], 'GPU')

print(tf.test.is_gpu_available())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True


In [3]:
def display_scores(scores):
    print("Scores:",scores)
    print("Mean:",scores.mean())
    print("Std dev:",scores.std())

# Import and prepare data

In [4]:
churn = pd.read_csv('dataset/churn.csv')

In [5]:
churn['Total Charges'] = pd.to_numeric(churn['Total Charges'], errors='coerce')
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7044 entries, 0 to 7043
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Customer ID        7044 non-null   object 
 1   Gender             7044 non-null   object 
 2   Senior Citizen     7044 non-null   int64  
 3   Partner            7044 non-null   object 
 4   Dependents         7044 non-null   object 
 5   tenure             7044 non-null   int64  
 6   Phone Service      7044 non-null   object 
 7   Multiple Lines     7044 non-null   object 
 8   Internet Service   7044 non-null   object 
 9   Online Security    7044 non-null   object 
 10  Online Backup      7044 non-null   object 
 11  Device Protection  7044 non-null   object 
 12  Tech Support       7044 non-null   object 
 13  Streaming TV       7044 non-null   object 
 14  Streaming Movies   7044 non-null   object 
 15  Contract           7044 non-null   object 
 16  Paperless Billing  7044 

In [6]:
X = churn.drop(['Customer ID', 'Churn'], axis=1)
y = churn['Churn'].apply(lambda x: 1 if x=='Yes' else 0)

In [7]:
X_cat = X.drop(['Monthly Charges', 'Total Charges'], axis=1)
X_cat.head()

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,tenure,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check
1,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check
2,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check
3,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check
4,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic)


In [8]:
X_num = X[['Monthly Charges', 'Total Charges']]
X_num.head()

Unnamed: 0,Monthly Charges,Total Charges
0,29.85,29.85
1,29.85,29.85
2,56.95,1889.5
3,53.85,108.15
4,42.3,1840.75


In [9]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('one_hot', OneHotEncoder()),
])

num_attribs = list(X_num)
cat_attribs = list(X_cat)

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

In [10]:
X_prepared = full_pipeline.fit_transform(X).todense()

# Split in train and test datasets 
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, test_size=.2)

In [11]:
X_train.shape

(5635, 118)

In [12]:
y_train

142     1
4747    1
181     1
6409    1
3645    0
       ..
3202    0
4236    0
1671    1
5286    0
6460    0
Name: Churn, Length: 5635, dtype: int64

# (a) ML models

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

## 1. SGD Classifier

In [None]:
sgd_scores = cross_val_score(
    SGDClassifier(penalty='l1',alpha=0.01,random_state=42,n_jobs=-1),
    X_train,
    y_train,
    scoring = "accuracy",
    cv=3
)
display_scores(sgd_scores)

In [None]:
sgd_clf = SGDClassifier(penalty='l1',alpha=0.01,random_state=42,n_jobs=-1)
sgd_clf.fit(X_train,y_train)

In [None]:
y_hat = sgd_clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_hat)

## 2. KNN classifier

In [None]:
knn_scores = cross_val_score(
    KNeighborsClassifier(n_neighbors=3,n_jobs=-1),
    X_train,
    y_train,
    scoring = "accuracy",
    cv=3
)
display_scores(knn_scores)

In [None]:
knn_clf = KNeighborsClassifier(n_neighbors=3,n_jobs=-1)
knn_clf.fit(X_train, y_train)

In [None]:
y_hat = knn_clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_hat)

## 3. Gaussian Naive Bayes

In [None]:
gnb_scores = cross_val_score(
    GaussianNB(),
    X_train,
    y_train,
    scoring = "accuracy",
    cv=3
)
display_scores(gnb_scores)

In [None]:
gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)

In [None]:
y_hat = gnb_clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_hat)

## 4. Decision Tree

In [None]:
tree_scores = cross_val_score(
    DecisionTreeClassifier(splitter='best', max_depth=30),
    X_train,
    y_train,
    scoring = "accuracy",
    cv=3
)
display_scores(tree_scores)

In [None]:
tree_clf = DecisionTreeClassifier(splitter='best', max_depth=30)
tree_clf.fit(X_train, y_train)

In [None]:
y_hat = tree_clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_hat)

## 5. Suport Vector Machine Classifier

In [None]:
svc_scores = cross_val_score(
    SVC(random_state=42),
    X_train,
    y_train,
    scoring = "accuracy",
    cv=3
)
display_scores(svc_scores)

In [None]:
svc_clf = SVC(random_state=42)
svc_clf.fit(X_train, y_train)

In [None]:
y_hat = svc_clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_hat)

## 6. Random Forest

In [None]:
rf_scores = cross_val_score(
    RandomForestClassifier(n_estimators=500,max_leaf_nodes=20,n_jobs=-1),
    X_train,
    y_train,
    scoring = "accuracy",
    cv=3
)
display_scores(rf_scores)

In [None]:
rf_clf = RandomForestClassifier(n_estimators=500,max_leaf_nodes=20,n_jobs=-1)
rf_clf.fit(X_train, y_train)

In [None]:
y_hat = rf_clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_hat)

# (b) Neural Networks with TensorFlow

## 1. Import Dependencies

In [13]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense

## 2. Build and Compile Model

In [14]:
model = Sequential()
model.add(Dense(units=32, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

In [15]:
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics='accuracy')

## 3. Fit, Predict and Evaluate

In [None]:
model.fit(X_train, y_train, epochs=200, batch_size=32)

In [None]:
y_hat = model.predict(X_test)
y_hat = [0 if val < 0.5 else 1 for val in y_hat]

In [None]:
accuracy_score(y_test, y_hat)

## 4. Saving and Reloading

In [None]:
model.save('tfmodel')

In [None]:
del model 

In [None]:
model = load_model('tfmodel')