In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (confusion_matrix, f1_score, make_scorer,roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [None]:
# One CSV per country; each file is named Churn_Modelling_<country>.csv.
countries = ['Spain', 'France']
data_file_paths = [f'Churn_Modelling_{country}.csv' for country in countries]

In [None]:
# Load each per-country CSV and stack them into a single frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every file contributes its own 0-based labels, so the index would contain
# duplicates and label-based lookups (.loc, index alignment) would be ambiguous.
df = pd.concat(
    [pd.read_csv(fpath) for fpath in data_file_paths],
    ignore_index=True,
)
df.head()

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
# Count missing values per column to see whether any feature needs imputing.
df.isna().sum()

In [None]:
# Row count per Geography value in the combined frame.
df['Geography'].value_counts()

In [None]:
# Columns fed to the model as features.
feat_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]

# Column the classifier is trained to predict.
targ_col = 'Exited'

In [None]:
# Feature matrix and target vector, both selected from the combined frame.
X = df[feat_cols]
y = df[targ_col]

In [None]:
# Mean of the target column; assuming Exited is coded 0/1, this is the share of positive rows.
y.mean()

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Seed shared by the forest here and by the permutation-importance step further down.
random_state = 42
# Forest hyperparameters, kept in one dict so they are passed on as keyword arguments.
train_params = dict(n_estimators=100, max_depth=10)

clf = RandomForestClassifier(**train_params, random_state=random_state)

# Impute missing values (SimpleImputer defaults) before the classifier sees the data.
pipeline_steps = [
    ("preprocessor", SimpleImputer()),
    ("clf", clf),
]
model = Pipeline(steps=pipeline_steps)

model.fit(X_train, y_train)

In [None]:
# Class probabilities on the held-out set; column 1 is used as the positive-class score.
y_prob = model.predict_proba(X_test)
pos_prob = y_prob[:, 1]

# Hard labels at a 0.5 probability cut-off (boolean array).
y_pred = pos_prob >= 0.5

f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, pos_prob)
f1, roc_auc

In [None]:
# Row-normalised confusion matrix: with normalize='true' each row (true class)
# sums to 1, so the diagonal holds the per-class recall.
cm = confusion_matrix(y_test, y_pred, normalize='true')

# confusion_matrix puts true labels on rows and predictions on columns;
# label the axes so the heatmap can be read without knowing that convention.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, cmap=plt.cm.Blues, ax=ax)
ax.set(
    xlabel='Predicted label',
    ylabel='True label',
    title='Confusion matrix (normalised by true class)',
);

In [None]:
# Feature names as they leave the preprocessing part of the pipeline
# (every step except the final classifier).
preprocessing = model[:-1]
out_feat_names = preprocessing.get_feature_names_out(feat_cols)
out_feat_names

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.metrics import make_scorer


# Pull the fitted steps out of the pipeline so the importance is computed on
# exactly what the classifier sees: the preprocessed test matrix.
preprocessor = model.named_steps['preprocessor']
clf = model.named_steps['clf']
X_test_transformed = preprocessor.transform(X_test)

# Permutation importance scored with F1, seeded for repeatability.
f1_scorer = make_scorer(f1_score)
perm = PermutationImportance(
    clf,
    scoring=f1_scorer,
    random_state=random_state,
).fit(X_test_transformed, y_test)
eli5.show_weights(perm, feature_names=out_feat_names)

In [None]:
# Pair each permutation importance with the name of the column it was computed
# on. The importances come from the preprocessed matrix, so the names must come
# from the preprocessor output (out_feat_names) rather than the raw X_test
# columns: the two only agree while the imputer keeps every input column.
# Building from a dict also raises on a length mismatch instead of silently
# truncating the way zip() does.
df_feat_imp = (
    pd.DataFrame({
        'feature': out_feat_names,
        'importance': perm.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)
df_feat_imp

In [None]:
# Persist the ranked importances as CSV, leaving the frame's index out of the file.
feat_importance_fpath = 'feat_imp.csv'
df_feat_imp.to_csv(path_or_buf=feat_importance_fpath, index=False)

In [None]:
from joblib import dump

# Serialise the fitted pipeline (imputer + forest) to disk; the trailing
# semicolon keeps the notebook from echoing dump()'s return value.
dump(value=model, filename='clf-model.joblib');