In [2]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image

In [4]:
import pandas as pd
# Read in data and display first 5 ro
bank_data= pd.read_csv('churn_modelling.csv')
bank_data.head(10)



Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [9]:
#Identify Anomalies/ Missing Data
# Missing data can impact an analysis as can incorrect data or outliers.
#In this case, the missing data will not have a large effect, and the data quality is good because of the source
print('The shape of our bank_data is:', bank_data.shape)

The shape of our bank_data is: (10000, 14)


In [10]:
#We can easily compute summary statistics to find anomalies.
# Descriptive statistics for each column
bank_data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [12]:
# One-hot encode the data using pandas get_dummies
bank_data = pd.get_dummies(bank_data)
# Display the first 5 rows of the last 12 columns
bank_data.iloc[:,5:].head(10)

Unnamed: 0,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Surname_Abazu,Surname_Abbie,Surname_Abbott,Surname_Abdullah,...,Surname_Zubarev,Surname_Zubareva,Surname_Zuev,Surname_Zuyev,Surname_Zuyeva,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0.0,1,1,1,101348.88,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,83807.86,1,0,1,112542.58,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,159660.8,3,1,0,113931.57,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0.0,2,0,0,93826.63,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,125510.82,1,1,1,79084.1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
5,113755.78,2,1,0,149756.71,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
6,0.0,2,1,1,10062.8,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
7,115046.74,4,1,0,119346.88,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
8,142051.07,2,0,1,74940.5,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
9,134603.88,1,1,1,71725.73,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [13]:
# code provides an overview of the data stored in a particular dataset.
#It displays information about the dataset including the number of columns, the data types for each column, the number of non-null values for each column, and the memory usage. 
#This information can be used to identify potential issues with the dataset, such as missing values or columns with incorrect data types.
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 2948 entries, RowNumber to Gender_Male
dtypes: float64(2), int64(9), uint8(2937)
memory usage: 28.8 MB


In [17]:
# Use numpy to convert to arrays
import numpy as np
# Labels are the values we want to predict
labels = np.array(bank_data['CreditScore'])
# Remove the labels from the bank_data
# axis 1 refers to the columns
bank_data= bank_data.drop('CreditScore', axis = 1)
# Saving bank_data names for later use
bank_data_list = list(bank_data.columns)
# Convert to numpy array
bank_data = np.array(bank_data)

In [19]:
Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_bank_data, test_bank_data, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [20]:
print('Training Features Shape:', train_bank_data.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_bank_data.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (7500, 2947)
Training Labels Shape: (7500,)
Testing Features Shape: (2500, 2947)
Testing Labels Shape: (2500,)


In [23]:
# The baseline predictions are the historical averages
baseline_preds = test_bank_data[:, bank_data_list.index('Tenure')]
# Baseline errors, and display average baseline error
baseline_errors = abs(baseline_preds - test_labels)
print('bmi baseline error: ', round(np.mean(baseline_errors), 2))

bmi baseline error:  641.1


In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_bank_data, train_labels);

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('age Absolute Error:', round(np.mean(errors), 2), 'degrees.')

In [None]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# Get numerical feature importances
importances = list(rf.bank_data_importances_)
# List of tuples with variable and importance
bank_data_importances = [(bank_data, round(importance, 2)) for bank_data, importance in zip(bank_data_list, importances)]
# Sort the feature importances by most important first
bank_data_importances = sorted(bank_data_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in bank_data_importances];
#At the top of the list is charges, the max charges of the age.
#This tells us the best predictor of the max charges for a age is the max chargws of the age, a rather intuitive finding. 
#The second most important factor is the historical average max bmi, also not that surprising.
#Your friend turns out to not be very helpful, along with the age, the  sex, the smoker, and the regions.

In [None]:
# New random forest with only the two most important variables
rf_most_important = RandomForestRegressor(n_estimators= 1000, random_state=42)
# Extract the two most important features
important_indices = [feature_list.index('Tenure'), feature_list.index('	CreditScore')]
train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]
# Train the random forest
rf_most_important.fit(train_important, train_labels)
# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)
errors = abs(predictions - test_labels)
# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
mape = np.mean(100 * (errors / test_labels))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')
#This demonstrates that we don't truly need all the information we gathered to make reliable forecasts. 
#If we were to continue using this model, we could merely collect the two variables and attain approximately the same performance.
#In a production environment, we would have to balance the loss of accuracy against the additional time needed to gather more data.

In [None]:
# Import matplotlib for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt
%matplotlib inline
# Set the style
plt.style.use('fivethirtyeight')
# list of x locations for plotting
x_values = list(range(len(importances)))
# Make a bar chart
plt.bar(x_values, importances, orientation = 'vertical')
# Tick labels for x axis
plt.xticks(x_values, feature_list, rotation='vertical')
# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances');
# we can plot the entire dataset with predictions highlighted. 
#This requires a little data manipulation, but its not too difficult. 
#We can use this plot to determine if there are any outliers in either the data or our predictions.