In [1]:
import pandas as pd
import numpy as np

# import cleaned csv file

file_path = '..\\DataCleaning\\cleaned_dataset.csv'

try:
    df = pd.read_csv(file_path)
    print("File loaded successfully.")
except FileNotFoundError:
    print("File not found in the specified path.")
except PermissionError:
    print("Permission denied to read the file.")
except Exception as e:
    print(f"An error occurred: {e}")

File loaded successfully.


Handling text and categorical variables (Feature Engineering)

In [2]:
categorical_variables = df.select_dtypes(include=['object']).columns
categorical_variables

Index(['Gender', 'Segment', 'SubscriptionPlan', 'FrequencyOfInteractions'], dtype='object')

In [3]:
df['SubscriptionPlan'].value_counts()

SubscriptionPlan
Basic        680
Student      661
Essential    656
Unlimited    653
Bronze       649
Plus         647
Silver       646
Select       629
Gold         622
Prime        620
Eco          615
Flex         614
Deluxe       614
Smart        613
Pro          610
VIP          609
Express      593
Family       593
Trial        585
Elite        574
Name: count, dtype: int64

In [4]:
# Ordinal Encoding for categorical variables
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(categories=[['Basic', 'Student', 'Essential', 'Unlimited', 'Bronze', 'Plus', 'Silver', 'Select', 'Gold', 'Prime', 'Eco', 'Flex', 'Deluxe', 'Smart', 'Pro', 'VIP', 'Express', 'Family', 'Trial', 'Elite']])
df['SubscriptionPlan_encoded'] = ordinal_encoder.fit_transform(df[['SubscriptionPlan']])

In [5]:
df.head()

Unnamed: 0,CustomerID,Age,Gender,Segment,NPS,ChurnLabel,TotalPurchaseValue,SubscriptionPlan,TotalLatePayments,PageViews,TimeSpentOnWebsite,NumberOfLogins,FrequencyOfInteractions,Rating,SubscriptionPlan_encoded
0,1001,31,Male,Segment B,3,1,20819.84,Express,40,49,15,19,Weekly,1,16.0
1,1002,66,Female,Segment C,6,0,3804.05,Pro,10,100,9,9,Weekly,2,14.0
2,1003,36,Female,Segment B,3,0,5259.54,Essential,8,1,97,19,Monthly,4,2.0
3,1004,62,Female,Segment C,1,1,6067.36,Smart,79,25,31,4,Daily,1,13.0
4,1005,68,Female,Segment C,3,0,18165.53,Basic,2,77,51,12,Weekly,3,0.0


To avoid introducing a large number of columms into the dataset with One-Hot Encoding of the SubscriptionPlan column, I decided to use Ordinal Encoding instead.

In [6]:
# One-hot Encoding for Gender variable

gender_dummies = pd.get_dummies(df['Gender'], prefix='Gender')
df = pd.concat([df, gender_dummies], axis=1)


In [7]:
# One-hot Encoding for Segment column
segment_dummies = pd.get_dummies(df['Segment'], prefix='Segment')
df = pd.concat([df, segment_dummies], axis=1)

In [8]:
# One-hot Encoding for FrequencyOfInteractions column
frequency_dummies = pd.get_dummies(df['FrequencyOfInteractions'], prefix='FrequencyOfInteractions')
df = pd.concat([df, frequency_dummies], axis=1)

In [9]:
df.head()

Unnamed: 0,CustomerID,Age,Gender,Segment,NPS,ChurnLabel,TotalPurchaseValue,SubscriptionPlan,TotalLatePayments,PageViews,...,Rating,SubscriptionPlan_encoded,Gender_Female,Gender_Male,Segment_Segment A,Segment_Segment B,Segment_Segment C,FrequencyOfInteractions_Daily,FrequencyOfInteractions_Monthly,FrequencyOfInteractions_Weekly
0,1001,31,Male,Segment B,3,1,20819.84,Express,40,49,...,1,16.0,False,True,False,True,False,False,False,True
1,1002,66,Female,Segment C,6,0,3804.05,Pro,10,100,...,2,14.0,True,False,False,False,True,False,False,True
2,1003,36,Female,Segment B,3,0,5259.54,Essential,8,1,...,4,2.0,True,False,False,True,False,False,True,False
3,1004,62,Female,Segment C,1,1,6067.36,Smart,79,25,...,1,13.0,True,False,False,False,True,True,False,False
4,1005,68,Female,Segment C,3,0,18165.53,Basic,2,77,...,3,0.0,True,False,False,False,True,False,False,True


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12483 entries, 0 to 12482
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   CustomerID                       12483 non-null  int64  
 1   Age                              12483 non-null  int64  
 2   Gender                           12483 non-null  object 
 3   Segment                          12483 non-null  object 
 4   NPS                              12483 non-null  int64  
 5   ChurnLabel                       12483 non-null  int64  
 6   TotalPurchaseValue               12483 non-null  float64
 7   SubscriptionPlan                 12483 non-null  object 
 8   TotalLatePayments                12483 non-null  int64  
 9   PageViews                        12483 non-null  int64  
 10  TimeSpentOnWebsite               12483 non-null  int64  
 11  NumberOfLogins                   12483 non-null  int64  
 12  FrequencyOfInterac

In [11]:
# Optimise the data for memory efficiency
# Convert float columns to float32 and int columns to int32 for memory optimization
# Select only numeric columns for validation
numeric_cols = df.select_dtypes(include=[np.number])
print(numeric_cols)

for col in numeric_cols.columns:
    if df[col].dtype == "float64":
        df[col] = df[col].astype("float32")
    elif df[col].dtype == "int64":
        df[col] = df[col].astype("int32")

# Display final dataset summary
print("Final Dataset Info After Optimization:")
df.info()

       CustomerID  Age  NPS  ChurnLabel  TotalPurchaseValue  \
0            1001   31    3           1            20819.84   
1            1002   66    6           0             3804.05   
2            1003   36    3           0             5259.54   
3            1004   62    1           1             6067.36   
4            1005   68    3           0            18165.53   
...           ...  ...  ...         ...                 ...   
12478       13479   55    8           0             6190.74   
12479       13480   29    7           0              710.57   
12480       13481   38    1           1            30987.04   
12481       13482   26    0           0            38268.87   
12482       13483   29    2           1              154.20   

       TotalLatePayments  PageViews  TimeSpentOnWebsite  NumberOfLogins  \
0                     40         49                  15              19   
1                     10        100                   9               9   
2                 

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12483 entries, 0 to 12482
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   CustomerID                       12483 non-null  int32  
 1   Age                              12483 non-null  int32  
 2   Gender                           12483 non-null  object 
 3   Segment                          12483 non-null  object 
 4   NPS                              12483 non-null  int32  
 5   ChurnLabel                       12483 non-null  int32  
 6   TotalPurchaseValue               12483 non-null  float32
 7   SubscriptionPlan                 12483 non-null  object 
 8   TotalLatePayments                12483 non-null  int32  
 9   PageViews                        12483 non-null  int32  
 10  TimeSpentOnWebsite               12483 non-null  int32  
 11  NumberOfLogins                   12483 non-null  int32  
 12  FrequencyOfInterac

In [13]:
df.columns

Index(['CustomerID', 'Age', 'Gender', 'Segment', 'NPS', 'ChurnLabel',
       'TotalPurchaseValue', 'SubscriptionPlan', 'TotalLatePayments',
       'PageViews', 'TimeSpentOnWebsite', 'NumberOfLogins',
       'FrequencyOfInteractions', 'Rating', 'SubscriptionPlan_encoded',
       'Gender_Female', 'Gender_Male', 'Segment_Segment A',
       'Segment_Segment B', 'Segment_Segment C',
       'FrequencyOfInteractions_Daily', 'FrequencyOfInteractions_Monthly',
       'FrequencyOfInteractions_Weekly'],
      dtype='object')

In [14]:
# Drop caregorical columns
df = df.drop(['Gender', 'Segment', 'FrequencyOfInteractions', 'SubscriptionPlan'], axis=1)

# Drop CustomerID column
df = df.drop(['CustomerID'], axis=1)

In [15]:
df.columns

Index(['Age', 'NPS', 'ChurnLabel', 'TotalPurchaseValue', 'TotalLatePayments',
       'PageViews', 'TimeSpentOnWebsite', 'NumberOfLogins', 'Rating',
       'SubscriptionPlan_encoded', 'Gender_Female', 'Gender_Male',
       'Segment_Segment A', 'Segment_Segment B', 'Segment_Segment C',
       'FrequencyOfInteractions_Daily', 'FrequencyOfInteractions_Monthly',
       'FrequencyOfInteractions_Weekly'],
      dtype='object')

In [16]:
df.head()

Unnamed: 0,Age,NPS,ChurnLabel,TotalPurchaseValue,TotalLatePayments,PageViews,TimeSpentOnWebsite,NumberOfLogins,Rating,SubscriptionPlan_encoded,Gender_Female,Gender_Male,Segment_Segment A,Segment_Segment B,Segment_Segment C,FrequencyOfInteractions_Daily,FrequencyOfInteractions_Monthly,FrequencyOfInteractions_Weekly
0,31,3,1,20819.839844,40,49,15,19,1,16.0,False,True,False,True,False,False,False,True
1,66,6,0,3804.050049,10,100,9,9,2,14.0,True,False,False,False,True,False,False,True
2,36,3,0,5259.540039,8,1,97,19,4,2.0,True,False,False,True,False,False,True,False
3,62,1,1,6067.359863,79,25,31,4,1,13.0,True,False,False,False,True,True,False,False
4,68,3,0,18165.529297,2,77,51,12,3,0.0,True,False,False,False,True,False,False,True


In [17]:
y = df['ChurnLabel']
X = df.drop(['ChurnLabel'], axis=1)

In [19]:
X.head()

Unnamed: 0,Age,NPS,TotalPurchaseValue,TotalLatePayments,PageViews,TimeSpentOnWebsite,NumberOfLogins,Rating,SubscriptionPlan_encoded,Gender_Female,Gender_Male,Segment_Segment A,Segment_Segment B,Segment_Segment C,FrequencyOfInteractions_Daily,FrequencyOfInteractions_Monthly,FrequencyOfInteractions_Weekly
0,31,3,20819.839844,40,49,15,19,1,16.0,False,True,False,True,False,False,False,True
1,66,6,3804.050049,10,100,9,9,2,14.0,True,False,False,False,True,False,False,True
2,36,3,5259.540039,8,1,97,19,4,2.0,True,False,False,True,False,False,True,False
3,62,1,6067.359863,79,25,31,4,1,13.0,True,False,False,False,True,True,False,False
4,68,3,18165.529297,2,77,51,12,3,0.0,True,False,False,False,True,False,False,True


In [20]:
y.head()

0    1
1    0
2    0
3    1
4    0
Name: ChurnLabel, dtype: int32

In [None]:
# Perform the test-train split, setting apart 20%
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Model Training using RandomForest

In [26]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [27]:
predictions = model.predict(X_test)
predictions

array([0, 1, 0, ..., 1, 0, 1])

In [28]:
# Evaluate accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9720


The accuracy is 97.20% meaning that the model has captured the pattern in the data. I will therefore proceed to fine-tune the model to see whether it will improve the accuracy

Model tuning
 - RandomizedSearchCV

In [29]:
from sklearn.model_selection import RandomizedSearchCV

parameter_grid = {
    "n_estimators": [10, 50, 100], 
    "max_features": [2, 8, 13],
    "max_depth": [2, 10, None],
}

# Use RandomizedSearchCV instead of GridSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=parameter_grid,
    n_iter=10, 
    cv=3,  
    scoring="accuracy",  # Use accuracy for classification
    return_train_score=True,
    n_jobs=-1,  # Use all available CPU cores for parallel processing
    random_state=42
)

# Fit the model
random_search.fit(X, y)

In [30]:
print(f"Best Parameters RandomizedSearchCV: {random_search.best_params_}")

Best Parameters RandomizedSearchCV: {'n_estimators': 100, 'max_features': 13, 'max_depth': 2}


In [32]:
final_model = RandomForestClassifier(n_estimators=100, max_features=13, max_depth=2, random_state=42)
final_model.fit(X_train, y_train)

In [33]:
predictions_encoded = final_model.predict(X_test)
predictions

array([0, 1, 0, ..., 1, 0, 1])

In [35]:
accuracy = accuracy_score(y_test, predictions_encoded)
print(f"Accuracy: {accuracy:.4f}")

for i in range(5):  # Show first 5 examples
    print(f"Actual: {([y_test.iloc[i]])[0]}, Predicted: {predictions[i]}")

Accuracy: 0.9724
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1


The final accuracy is 97.24% which is a slight improvement by 0.04%. This model is accurate enough for production and I will demonstrate the script to make it deployable

In [36]:
import joblib
joblib.dump(final_model, "customer_churn_prediction_model.pk1")

['customer_churn_prediction_model.pk1']

The model can now be deployed in production.