In [1]:
# Dependencies
import pandas as pd
from pathlib import Path

# Location of the customer churn CSV, relative to the notebook's working directory
customer_data_to_load = Path("Resources/churn.csv")

# Parse the CSV into a pandas DataFrame
churn_df = pd.read_csv(customer_data_to_load)

# Preview the first five rows
churn_df.head(n=5)

Unnamed: 0,Names,Age,Total Purchase,Account Manager,Years,Number of Websites Used,Onboard Date,Location,Company,Churn
0,Cameron Williams,42.0,11066.8,0,7.22,8.0,2013-08-30 07:00:40,"10265 Elizabeth Mission Barkerburgh, AK 89518",Harvey LLC,1
1,Kevin Mueller,41.0,11916.22,0,6.5,11.0,2013-08-13 00:38:46,"6157 Frank Gardens Suite 019 Carloshaven, RI 1...",Wilson PLC,1
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,2016-06-29 06:20:07,"1331 Keith Court Alyssahaven, DE 90114","Miller, Johnson and Wallace",1
3,Phillip White,42.0,8010.76,0,6.71,10.0,2014-04-22 12:43:12,"13120 Daniel Mount Angelabury, WY 30645-4695",Smith Inc,1
4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,2016-01-19 15:31:15,"765 Tricia Row Karenshire, MH 71730",Love-Jones,1


In [2]:
# Drop the name, date, address and company columns, which are not used as
# model features by the random forest classifier trained later in this notebook.
# `errors="ignore"` keeps this cell safe to re-run: `churn_df` is reassigned
# here, so on a second execution the columns are already gone and a plain
# `drop` would raise a KeyError.
non_feature_columns = ["Names", "Onboard Date", "Location", "Company"]
churn_df = churn_df.drop(columns=non_feature_columns, errors="ignore")

In [3]:
# Define the feature set: every remaining column except the "Churn" target.
# `drop` returns a new DataFrame, so `churn_df` itself is left untouched and
# no explicit copy or in-place mutation is needed.
X = churn_df.drop(columns="Churn")
X.head()

Unnamed: 0,Age,Total Purchase,Account Manager,Years,Number of Websites Used
0,42.0,11066.8,0,7.22,8.0
1,41.0,11916.22,0,6.5,11.0
2,38.0,12884.75,0,6.67,12.0
3,42.0,8010.76,0,6.71,10.0
4,37.0,9191.58,0,5.56,9.0


In [4]:
# Build the target as a single-column 2-D array of the "Churn" values
churn_labels = churn_df["Churn"].values
y = churn_labels.reshape(-1, 1)

# Peek at the first five entries
y[:5]

array([[1],
       [1],
       [1],
       [1],
       [1]], dtype=int64)

In [5]:
# imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [6]:
# Hold out a test partition; no test_size is given, so scikit-learn's default
# proportion applies. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Learn the scaling parameters from the training rows only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-fitted scaler to both the training and the test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [7]:
# Build a 500-tree random forest with a fixed seed and train it in one step.
# y_train is flattened with ravel() before being handed to fit().
rf_model = RandomForestClassifier(n_estimators=500, random_state=78).fit(
    X_train_scaled, y_train.ravel()
)

In [8]:
# Predict churn labels for the scaled test features
predictions = rf_model.predict(X=X_test_scaled)

In [9]:
# Tabulate the confusion matrix with labelled rows (actual) and columns (predicted)
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

# Overall accuracy on the test partition
acc_score = accuracy_score(y_test, predictions)

In [10]:
# Show the confusion matrix, the accuracy score and the per-class report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
report_text = classification_report(y_test, predictions)
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,179,4
Actual 1,25,17


Accuracy Score : 0.8711111111111111
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       183
           1       0.81      0.40      0.54        42

    accuracy                           0.87       225
   macro avg       0.84      0.69      0.73       225
weighted avg       0.86      0.87      0.85       225



In [11]:
# Read the feature importance array once and reuse it below, instead of
# accessing `rf_model.feature_importances_` a second time.
importances = rf_model.feature_importances_

# Pair each importance with its column name and rank from most to least important
importances_sorted = sorted(zip(importances, X.columns), reverse=True)

# Show up to the ten highest-ranked features (the slice returns all of them
# when there are fewer than ten)
importances_sorted[:10]

[(0.407965561602517, 'Number of Websites Used'),
 (0.23033678162195417, 'Years'),
 (0.191671643059909, 'Total Purchase'),
 (0.14051942625630776, 'Age'),
 (0.02950658745931214, 'Account Manager')]