In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import shap
import matplotlib.pyplot as plt
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pickle
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# 'model.pkl' must only be unpickled when it comes from a trusted source and was
# written with a compatible scikit-learn version (see the scikit-learn model
# persistence guide).
# NOTE(review): svm_rbf is assigned again by the SVC(...) training cell further
# down, so the object loaded here is replaced before any prediction is made.
with open('model.pkl', 'rb') as f:
    svm_rbf = pickle.load(f)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
# Read the training and testing CSV files (in that order) from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "customer_churn_dataset-training-master.csv",
        "customer_churn_dataset-testing-master.csv",
    )
)

In [4]:
# Column groups are chosen by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical.
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train_data.select_dtypes(include=['object']).columns

# Numerical gaps are filled with each column's median.
numeric_medians = train_data[numerical_cols].median()
train_data[numerical_cols] = train_data[numerical_cols].fillna(numeric_medians)

# Categorical gaps are filled with each column's most frequent value
# (first row of DataFrame.mode()).
most_frequent_values = train_data[categorical_cols].mode().iloc[0]
train_data[categorical_cols] = train_data[categorical_cols].fillna(most_frequent_values)

In [5]:
# Integer-encode every categorical column with its own LabelEncoder and keep the
# fitted encoders. A single shared encoder is refit on each iteration, so after
# the loop it only remembers the classes of the last column and the mappings of
# all earlier columns are lost (no inverse_transform, no consistent reuse).
label_encoders = {}
label_encoder = LabelEncoder()  # stays defined even when there are no categorical columns
for col in categorical_cols:
    label_encoder = LabelEncoder()
    train_data[col] = label_encoder.fit_transform(train_data[col])
    label_encoders[col] = label_encoder

In [6]:
# The target is the 'Churn' column; the features are all remaining columns
# except 'CustomerID'.
target_column = 'Churn'
y = train_data[target_column]
X = train_data.drop(columns=['CustomerID', target_column])

In [7]:
# Split first, then fit the scaler on the training rows only, so the validation
# rows do not influence the scaling statistics (fitting on all of X before the
# split leaks validation information into preprocessing).
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Kept for any code that still expects the fully scaled feature matrix.
X_scaled = scaler.transform(X)

In [9]:
# RBF-kernel support vector classifier; probability=True enables predict_proba
# (at additional fitting cost, per the scikit-learn documentation).
svm_config = dict(kernel='rbf', probability=True, random_state=42)
svm_rbf = SVC(**svm_config)
svm_rbf.fit(X_train, y_train)

In [10]:
# Class labels predicted by the fitted SVM for the validation rows.
y_val_pred = svm_rbf.predict(X=X_val)

In [11]:
# Score the validation predictions; each metric function is called with its
# default arguments on (y_val, y_val_pred).
accuracy, precision, recall, f1 = (
    score_fn(y_val, y_val_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Validation Set Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Validation Set Metrics:
Accuracy: 0.60
Precision: 0.59
Recall: 1.00
F1 Score: 0.74


In [12]:
# Prepare the test set with statistics and category mappings learned from the
# TRAINING data. Imputing with test-set medians/modes, or refitting the label
# encoder on the test set, can make the test features inconsistent with what the
# scaler and the model were fitted on (e.g. a category missing from the test set
# shifts every integer code).
# train_data has already been imputed and label-encoded in place, so the raw
# training file is read again to recover the original values.
train_raw = pd.read_csv("customer_churn_dataset-training-master.csv")
train_medians = train_raw[numerical_cols].median()
train_modes = train_raw[categorical_cols].mode().iloc[0]

# Work on a copy: test_data itself is left untouched, so re-running this cell
# always starts from the data as loaded.
# NOTE(review): numerical_cols was built from every int64/float64 training
# column, so it presumably includes 'CustomerID' and 'Churn'; rows with a missing
# label are therefore imputed rather than dropped — confirm this is intended.
test_prepared = test_data.copy()
test_prepared[numerical_cols] = test_prepared[numerical_cols].fillna(train_medians)
test_prepared[categorical_cols] = test_prepared[categorical_cols].fillna(train_modes)
for col in categorical_cols:
    # Fit on the training categories (imputed the same way as the training
    # frame); transform() raises ValueError if the test set contains a category
    # that never appeared in training instead of silently mis-encoding it.
    col_encoder = LabelEncoder().fit(train_raw[col].fillna(train_modes[col]))
    test_prepared[col] = col_encoder.transform(test_prepared[col])

y_test = test_prepared['Churn']
X_test = test_prepared.drop(columns=['CustomerID', 'Churn'])
# Reuse the scaler fitted on the training data; never refit it on the test set.
X_test_scaled = scaler.transform(X_test)

# Make predictions on the test set
y_test_pred = svm_rbf.predict(X_test_scaled)

# Compute and print the individual test-set metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
print("\nTest Set Metrics:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"Precision: {test_precision:.2f}")
print(f"Recall: {test_recall:.2f}")
print(f"F1 Score: {test_f1:.2f}")


Test Set Metrics:
Accuracy: 0.48
Precision: 0.47
Recall: 1.00
F1 Score: 0.64


In [13]:
# Explain svm_rbf.predict with a KernelExplainer, passing the first 100 rows of
# X_train as the explainer's data (background) argument.
background_rows = X_train[:100]
explainer = shap.KernelExplainer(svm_rbf.predict, background_rows)

In [None]:
# SHAP values for the first 100 scaled test rows. The recorded progress output
# shows roughly 50 s for the first row, so expect this cell to run for a long time.
rows_to_explain = X_test_scaled[:100]
shap_values = explainer.shap_values(rows_to_explain)

  1%|          | 1/100 [00:49<1:22:02, 49.72s/it]

In [None]:
# Mean absolute SHAP value per feature across the explained rows.
average_shap_values = np.abs(shap_values).mean(axis=0)
# X_train holds scaled values as a NumPy array (StandardScaler output), which has
# no .columns attribute; the column labels live on the feature DataFrame X.
feature_names = X.columns

# Plot the average SHAP values as a bar plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_names, average_shap_values)
ax.set_xlabel("Mean |SHAP value|")
ax.set_title("Average SHAP Values over All Examples")
plt.show()

In [None]:
# Pick one of the rows that actually has SHAP values. shap_values was computed
# for a leading slice of X_test_scaled only, so an index drawn from the whole
# test set would usually be out of range for shap_values.
rng = np.random.default_rng(42)  # seeded so the same row is explained on every run
random_index = int(rng.integers(0, len(shap_values)))

# Row i of shap_values corresponds to row i of X_test, whose unscaled values are
# shown in the plot; the column labels are taken directly from X_test.
shap.force_plot(
    explainer.expected_value,
    shap_values[random_index],
    X_test.iloc[random_index, :],
    feature_names=list(X_test.columns),
    matplotlib=True,
)

In [None]:
# Requests SHAP interaction values for the first 10 scaled test rows.
# NOTE(review): shap documents shap_interaction_values as a TreeExplainer method;
# `explainer` is built with shap.KernelExplainer in this notebook, which may not
# provide it (AttributeError) — confirm against the installed shap version
# before running this cell.
shap_interaction_values = explainer.shap_interaction_values(X_test_scaled[:10])

In [None]:
# Summary (dot) plot of the interaction values against the first 10 test rows.
# NOTE(review): `feature_names` comes from the bar-plot cell earlier in the
# notebook, and index [0] selects the first entry along the leading axis of
# shap_interaction_values — TODO confirm that entry has the shape summary_plot
# expects for the 10 rows passed as features.
shap.summary_plot(shap_interaction_values[0], X_test.iloc[:10, :], feature_names=feature_names, plot_type="dot")