In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier 

## Step 1: Load Data

In [2]:
# Location of the customer fact table. The default is the path this notebook was
# originally run with; set the CHURN_DATA_PATH environment variable to point at
# the CSV on another machine instead of editing the notebook.
DATA_PATH = os.environ.get(
    "CHURN_DATA_PATH",
    "C:/Users/PMLS/eccomerce-churn-analysis/data/fact_customer.csv",
)

print("Loading dataset...")
df = pd.read_csv(DATA_PATH)
print("Dataset loaded successfully!\n")

Loading dataset...
Dataset loaded successfully!



In [3]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,invoice_no,category,item,quantity,price,location_id,customer_id,invoice_date,payment_id,...,dob,email,phone_number,is_churned,days_since_last_purchase,tenure,discount_used,last_purchase_date,purchase_frequency,avg_purchase_value
0,0,100000,Hair Care,Hair Oil,1,17.45,1,398,2023-03-22,2,...,1999-07-13,michealbender@gmail.com,(309)938-3205x8088,1.0,1056.0,8.0,0.0,2021-08-15,30.0,280.82
1,1,100001,Body Care and Hygiene,Deodorant,4,20.81,21,938,2024-04-06,9,...,1995-01-18,nicholasclark@flowers.net,(363)377-7602,1.0,1494.0,1.0,0.0,2020-01-19,30.0,52.4
2,2,100002,Skin Care,Face Wash,4,40.96,17,258,2022-05-31,10,...,2002-10-11,geraldcastro@mosley.com,661-041-4070x466,0.0,386.0,6.0,1.0,2021-04-06,42.0,165.9
3,3,100003,Body Care and Hygiene,Body Lotion,2,27.5,9,892,2021-01-01,10,...,1992-12-29,vguerrero@green.org,712-518-4596x428,0.0,482.0,3.0,1.0,2022-05-17,28.0,427.81
4,4,100004,Oral Care,Toothpaste,1,16.43,55,978,2020-02-10,7,...,1989-08-17,qfloyd@gmail.com,249-201-6223,0.0,1309.0,2.0,1.0,2023-01-28,31.0,321.2


## Step 2: Initial Data Exploration

In [4]:
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                10000 non-null  int64  
 1   invoice_no                10000 non-null  int64  
 2   category                  10000 non-null  object 
 3   item                      10000 non-null  object 
 4   quantity                  10000 non-null  int64  
 5   price                     10000 non-null  float64
 6   location_id               10000 non-null  int64  
 7   customer_id               10000 non-null  int64  
 8   invoice_date              10000 non-null  object 
 9   payment_id                10000 non-null  int64  
 10  payment_method            10000 non-null  object 
 11  card_type                 6020 non-null   object 
 12  Unnamed: 3                0 non-null      float64
 13  shopping_mall             10000 non-null  objec

In [5]:
# Per-column count of null cells.
print("\nChecking for missing values:", df.isna().sum(), sep="\n")


Checking for missing values:
Unnamed: 0                      0
invoice_no                      0
category                        0
item                            0
quantity                        0
price                           0
location_id                     0
customer_id                     0
invoice_date                    0
payment_id                      0
payment_method                  0
card_type                    3980
Unnamed: 3                  10000
shopping_mall                   0
city                            0
province_state                  0
country                         0
first_name                      4
last_name                       4
gender                          4
age                             4
dob                             4
email                           4
phone_number                    4
is_churned                      4
days_since_last_purchase        4
tenure                          4
discount_used                   4
last_purchase_date

In [6]:

# Show the dtype pandas inferred for every column.
print("\nChecking data types:", df.dtypes, sep="\n")



Checking data types:
Unnamed: 0                    int64
invoice_no                    int64
category                     object
item                         object
quantity                      int64
price                       float64
location_id                   int64
customer_id                   int64
invoice_date                 object
payment_id                    int64
payment_method               object
card_type                    object
Unnamed: 3                  float64
shopping_mall                object
city                         object
province_state               object
country                      object
first_name                   object
last_name                    object
gender                       object
age                         float64
dob                          object
email                        object
phone_number                 object
is_churned                  float64
days_since_last_purchase    float64
tenure                      float64
discou

In [11]:
# Inspect the dtype of the target column (float64 in the output below).
df.dtypes["is_churned"]


dtype('float64')

In [12]:
# Descriptive statistics for the numeric columns.
print("\nSummary Statistics:", df.describe(), sep="\n")


Summary Statistics:
        Unnamed: 0    invoice_no      quantity         price   location_id  \
count  10000.00000   10000.00000  10000.000000  10000.000000  10000.000000   
mean    4999.50000  104999.50000      2.490400     31.062060     30.655300   
std     2886.89568    2886.89568      1.121443     24.461435     17.300682   
min        0.00000  100000.00000      1.000000      2.010000      1.000000   
25%     2499.75000  102499.75000      1.000000     12.980000     16.000000   
50%     4999.50000  104999.50000      2.000000     22.815000     31.000000   
75%     7499.25000  107499.25000      3.000000     42.972500     46.000000   
max     9999.00000  109999.00000      4.000000    100.000000     60.000000   

        customer_id    payment_id  Unnamed: 3          age   is_churned  \
count  10000.000000  10000.000000         0.0  9996.000000  9996.000000   
mean     503.279500      5.491200         NaN    44.094038     0.490496   
std      288.437875      2.863345         NaN    15

In [13]:

# Number of rows that are exact copies of an earlier row.
print("\nChecking for duplicate rows:")
duplicate_mask = df.duplicated()
print(duplicate_mask.sum())


Checking for duplicate rows:
0


In [14]:
# Cardinality of every object-typed column, one line per column.
print("\nChecking unique values in categorical columns:")
object_columns = df.select_dtypes(include=['object']).columns
for column_name, distinct_count in df[object_columns].nunique().items():
    print(f"{column_name}: {distinct_count} unique values")


Checking unique values in categorical columns:
category: 4 unique values
item: 12 unique values
invoice_date: 1866 unique values
payment_method: 10 unique values
card_type: 4 unique values
shopping_mall: 60 unique values
city: 50 unique values
province_state: 31 unique values
country: 4 unique values
first_name: 346 unique values
last_name: 503 unique values
gender: 3 unique values
dob: 974 unique values
email: 999 unique values
phone_number: 999 unique values
last_purchase_date: 782 unique values


## Step 3: Data Preprocessing

In [15]:
print("Cleaning and processing data...")

# 'Unnamed: 3' is entirely null and 'Unnamed: 0' is a 0..9999 row counter (see the
# exploration outputs above). errors='ignore' lets this cell be re-run after the
# columns are already gone.
unneeded_columns = ['Unnamed: 3', 'Unnamed: 0']
df.drop(columns=unneeded_columns, errors='ignore', inplace=True)

Cleaning and processing data...


In [16]:
# Handle missing values by carrying the previous row's value forward.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling, which
# emits a FutureWarning on current pandas; the filled result is the same.
# NOTE(review): forward-filling copies values from whichever row happens to precede
# the gap (including card_type and the is_churned target) -- confirm this is the
# intended imputation.
print("Filling missing values...")
df.ffill(inplace=True)


Filling missing values...


  df.fillna(method='ffill', inplace=True)


In [17]:
# Parse the two date columns (read from the CSV as strings) into datetime64 values.
print("Converting date columns...")
for date_column in ('invoice_date', 'last_purchase_date'):
    df[date_column] = pd.to_datetime(df[date_column])


Converting date columns...


In [18]:
# Confirm the invoice dates are now datetime64 values.
df.loc[:, 'invoice_date']

0      2023-03-22
1      2024-04-06
2      2022-05-31
3      2021-01-01
4      2020-02-10
          ...    
9995   2020-05-07
9996   2021-09-22
9997   2021-02-25
9998   2023-06-24
9999   2025-02-12
Name: invoice_date, Length: 10000, dtype: datetime64[ns]

In [19]:
# Confirm the last-purchase dates are now datetime64 values.
df.loc[:, 'last_purchase_date']

0      2021-08-15
1      2020-01-19
2      2021-04-06
3      2022-05-17
4      2023-01-28
          ...    
9995   2024-03-29
9996   2024-06-26
9997   2023-10-26
9998   2020-09-30
9999   2023-07-27
Name: last_purchase_date, Length: 10000, dtype: datetime64[ns]

In [20]:

# Feature engineering: derive three extra columns from the existing ones.
print("Performing feature engineering...")

# Days between the most recent invoice in the data set and each row's last purchase.
latest_invoice_date = df['invoice_date'].max()
df['recency'] = (latest_invoice_date - df['last_purchase_date']).dt.days

# The +1 in each denominator avoids a division by zero when tenure or price is 0.
tenure_plus_one = df['tenure'] + 1
df['purchase_per_tenure'] = df['purchase_frequency'] / tenure_plus_one

price_plus_one = df['price'] + 1
df['discount_ratio'] = df['discount_used'] / price_plus_one


Performing feature engineering...


In [21]:
# First rows of the three engineered features.
engineered_columns = ['recency', 'purchase_per_tenure', 'discount_ratio']
df.loc[:, engineered_columns].head()

Unnamed: 0,recency,purchase_per_tenure,discount_ratio
0,1279,3.333333,0.0
1,1853,15.0,0.0
2,1410,6.0,0.023832
3,1004,7.0,0.035088
4,748,10.333333,0.057372


In [22]:
# Categorical columns as they look before encoding.
categorical_preview_columns = [
    'category', 'item', 'payment_method', 'card_type', 'shopping_mall',
    'city', 'province_state', 'country', 'gender',
]
df.loc[:, categorical_preview_columns]

Unnamed: 0,category,item,payment_method,card_type,shopping_mall,city,province_state,country,gender
0,Hair Care,Hair Oil,Credit Card - MasterCard,MasterCard,Mall of America,Bloomington,Minnesota,US,Female
1,Body Care and Hygiene,Deodorant,Apple Pay,MasterCard,Fashion Show Mall,Las Vegas,Nevada,US,Male
2,Skin Care,Face Wash,Google Pay,MasterCard,Lakeside Shopping Centre,Thurrock,Essex,UK,Female
3,Body Care and Hygiene,Body Lotion,Google Pay,MasterCard,Bullring & Grand Central,Birmingham,West Midlands,UK,Other
4,Oral Care,Toothpaste,Cash,MasterCard,The Galleria at Fort Lauderdale,Fort Lauderdale,Florida,US,Other
...,...,...,...,...,...,...,...,...,...
9995,Hair Care,Conditioner,PayPal,Visa,Metrocentre,Gateshead,Tyne and Wear,UK,Female
9996,Skin Care,Face Wash,Credit Card - AMEX,AMEX,The Gardens Mall,Palm Beach Gardens,Florida,US,Female
9997,Body Care and Hygiene,Hand Sanitizer,Apple Pay,AMEX,The Shops at La Cantera,San Antonio,Texas,US,Female
9998,Skin Care,Face Wash,Debit Card - MasterCard,MasterCard,Lenox Square,Atlanta,Georgia,US,Male


In [23]:
# Replace each categorical column with integer codes, keeping the fitted encoder
# per column in `label_encoders` so the mapping can be looked up later.
print("Encoding categorical features...")
label_encoders = {}
categorical_cols = ['category','item','location_id', 'payment_method', 'card_type', 'shopping_mall', 'city', 'province_state', 'country', 'gender']
for col in categorical_cols:
    # Values are cast to str first, so the codes follow the sorted string values.
    values_as_text = df[col].astype(str)
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(values_as_text)
    label_encoders[col] = encoder

Encoding categorical features...


In [24]:
# The same categorical columns after label encoding (integer codes).
encoded_preview_columns = [
    'category', 'item', 'payment_method', 'card_type', 'shopping_mall',
    'city', 'province_state', 'country', 'gender',
]
df.loc[:, encoded_preview_columns]

Unnamed: 0,category,item,payment_method,card_type,shopping_mall,city,province_state,country,gender
0,1,4,4,2,17,5,16,3,0
1,0,2,0,2,12,24,17,3,1
2,3,3,8,2,15,47,8,2,0
3,0,0,8,2,7,4,30,2,2
4,2,11,2,2,38,18,9,3,2
...,...,...,...,...,...,...,...,...,...
9995,1,1,9,3,19,21,27,2,0
9996,3,3,3,1,39,36,9,3,0
9997,0,5,0,1,46,40,26,3,0
9998,3,3,6,2,16,1,10,3,1


In [31]:

# Standardize the model features to zero mean and unit variance.
# The list mixes true numeric columns with label-encoded categoricals
# (category, item, city, ...), so the integer codes are standardized as well.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics influence the training features -- consider fitting on the
# training split only.
print("Scaling numerical features...")
numerical_cols = ['quantity', 'price', 'shopping_mall','city','province_state','category','item','country','gender','age','days_since_last_purchase', 'tenure', 'discount_used', 'purchase_frequency', 'avg_purchase_value', 'recency', 'purchase_per_tenure', 'discount_ratio']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values


Scaling numerical features...


In [32]:
# First three rows after encoding and scaling.
df.iloc[:3]

Unnamed: 0,invoice_no,category,item,quantity,price,location_id,customer_id,invoice_date,payment_id,payment_method,...,is_churned,days_since_last_purchase,tenure,discount_used,last_purchase_date,purchase_frequency,avg_purchase_value,recency,purchase_per_tenure,discount_ratio
0,100000,-0.492368,-0.434533,-1.329069,-0.556498,0,398,2023-03-22,2,4,...,1.0,0.276269,0.821407,-1.01187,2021-08-15,0.320442,0.22217,0.618873,-0.374158,-0.623628
1,100001,-1.331584,-1.014567,1.346191,-0.419132,13,938,2024-04-06,9,0,...,1.0,1.112218,-1.584991,-1.01187,2020-01-19,0.320442,-1.381175,1.703033,2.071564,-0.623628
2,100002,1.186064,-0.72455,1.346191,0.404655,8,258,2022-05-31,10,8,...,0.0,-1.002465,0.133864,0.988269,2021-04-06,1.143267,-0.584486,0.866303,0.184865,-0.115951


## Step 4: Prepare Data for Model Training

In [36]:
# Build the feature matrix by removing everything that is not a model input.
# The previous drop list contained a typo'd '\tpayment_method' entry that
# errors='ignore' silently skipped. The list is now exact and errors='ignore' is
# gone, so a misspelled or missing column raises a KeyError instead of quietly
# staying in (or out of) the features.
non_feature_columns = [
    'is_churned',                                               # prediction target
    'invoice_no', 'customer_id', 'location_id', 'payment_id',   # identifiers
    'first_name', 'last_name', 'dob', 'email', 'phone_number',  # personal details
    'invoice_date', 'last_purchase_date',                       # raw datetime columns
    'payment_method', 'card_type',
]
# NOTE(review): the tree-based models trained below score ~100% on the test split,
# which usually means a remaining feature leaks the target -- verify how is_churned
# was derived before trusting those scores.
X = df.drop(columns=non_feature_columns)

In [37]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
print("Preparing training and testing datasets...")
y = df.loc[:, 'is_churned']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Preparing training and testing datasets...


In [38]:
# Display the training feature matrix (pandas truncates the rendering of long frames).
X_train

Unnamed: 0,category,item,quantity,price,shopping_mall,city,province_state,country,gender,age,days_since_last_purchase,tenure,discount_used,purchase_frequency,avg_purchase_value,recency,purchase_per_tenure,discount_ratio
9254,0.346848,1.595588,-0.437316,-0.757232,0.602673,0.163494,-1.171105,0.512275,-1.232014,0.829945,1.316434,0.821407,-1.011870,-1.530913,0.304646,-0.221635,-1.003058,-0.623628
1561,-0.492368,-0.434533,-0.437316,-0.125594,-1.236900,-1.239705,0.112989,0.512275,1.227587,-1.678315,1.234366,-0.553678,-1.011870,-0.913795,-0.487269,0.135344,-0.569815,-0.623628
1670,1.186064,1.305570,-0.437316,0.463117,0.602673,0.163494,-1.171105,0.512275,-0.002214,0.379744,-0.244767,0.477636,-1.011870,-0.913795,-1.200850,1.102401,-0.758485,-0.623628
6087,-1.331584,-1.594602,-1.329069,-0.875383,-0.317114,-1.660664,-0.470690,0.512275,-1.232014,-0.649286,-0.824969,-1.584991,0.988269,0.183305,1.522913,-0.718385,1.861931,1.376571
6669,0.346848,0.725536,-0.437316,-1.022152,0.602673,0.163494,-1.171105,0.512275,-0.002214,-0.584971,0.577822,1.165178,0.988269,-1.462345,1.139449,0.671758,-0.989082,2.393670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.346848,0.725536,1.346191,-0.576939,0.142780,-0.397786,1.397082,0.512275,-1.232014,0.572687,-1.221950,-0.209907,-1.011870,-1.119501,0.196409,-1.749660,-0.758485,-0.623628
5191,1.186064,0.435519,-1.329069,2.078393,1.579946,1.145732,-1.171105,0.512275,-1.232014,-1.549687,-1.231493,0.821407,0.988269,-0.845226,-0.500466,-0.059200,-0.770132,-0.366667
5390,-0.492368,-0.434533,-0.437316,0.526076,0.372726,0.654613,-0.587426,0.512275,1.227587,-0.970857,0.877465,-0.553678,0.988269,-0.776657,-1.336813,-0.484176,-0.485962,-0.149510
860,-1.331584,-1.594602,-1.329069,-0.947746,-1.351873,0.514293,0.696667,0.512275,-0.002214,-0.520657,-0.282939,0.821407,-1.011870,0.594717,-0.059374,0.758642,-0.280987,-0.623628


In [39]:
# Write the training features (row index included) to x_train.csv in the working directory.
X_train.to_csv(path_or_buf="x_train.csv")

## Step 5: Training and Evaluating ML Models 

### 1. Logistic Regression

In [40]:
# Baseline: logistic regression with default settings, scored on the held-out split.
lr_model = LogisticRegression().fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# Accuracy as a percentage rounded to two decimals.
accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
print("accuracy", accuracy, "%")

accuracy 54.65 %


### 2. XGBoost

In [41]:
# XGBoost classifier with default hyper-parameters, scored on the held-out split.
xgb_model = XGBClassifier().fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("XGboost Classifier")
print("---"*30)
# Accuracy as a percentage rounded to two decimals.
xgb_accuracy = round(accuracy_score(y_test, y_pred_xgb) * 100, 2)
print("accuracy", xgb_accuracy, "%")

XGboost Classifier
------------------------------------------------------------------------------------------
accuracy 100.0 %


### 3. Decision Tree

In [42]:
# Decision tree with default hyper-parameters, scored on the held-out split.
# random_state is fixed (matching the seed used for the train/test split) so the
# fitted tree and its accuracy are reproducible across runs; without it the
# tie-breaking between equally good splits is random.
# DecisionTreeClassifier is imported in the notebook's import cell.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

print("Decision Tree Classifier")
print("---"*30)
# Accuracy as a percentage rounded to two decimals.
dt_accuracy = round(accuracy_score(y_test, y_pred_dt) * 100, 2)
print("accuracy", dt_accuracy, "%")

Decision Tree Classifier
------------------------------------------------------------------------------------------
accuracy 99.85 %


### 4. Random Forest

In [43]:
# Random forest with default hyper-parameters, scored on the held-out split.
# random_state is fixed (matching the seed used for the train/test split) because
# the bootstrap sampling and per-split feature sampling are random; without a seed
# the reported accuracy changes from run to run.
# RandomForestClassifier is imported in the notebook's import cell.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

print("Random Forest Classifier")
print("---"*30)
# Accuracy as a percentage rounded to two decimals.
rfc_accuracy = round(accuracy_score(y_test, y_pred_rfc) * 100, 2)
print("accuracy", rfc_accuracy, "%")


Random Forest Classifier
------------------------------------------------------------------------------------------
accuracy 100.0 %


### 5. Gradient Boosting

In [44]:
# Gradient boosting with default hyper-parameters, scored on the held-out split.
# random_state is fixed (matching the seed used for the train/test split) so the
# fitted ensemble and its accuracy are reproducible across runs.
# GradientBoostingClassifier is imported in the notebook's import cell.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
y_pred_gbc = gbc.predict(X_test)

print("Gradient boosting Classifier")
print("---"*30)
# Accuracy as a percentage rounded to two decimals.
gbc_accuracy = round(accuracy_score(y_test, y_pred_gbc) * 100, 2)
print("accuracy", gbc_accuracy, "%")

Gradient boosting Classifier
------------------------------------------------------------------------------------------
accuracy 79.6 %


## Step 6: Save Model

In [45]:
print("Saving the model...")

# The trained XGBoost classifier, written exactly as before (same file, same object).
with open('churn_prediction_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

# The model was trained on label-encoded and standardized features, so it cannot be
# applied to raw records without the fitted transformers. They are stored in a
# separate file so existing consumers of churn_prediction_model.pkl are unaffected.
preprocessing_artifacts = {
    'label_encoders': label_encoders,    # column name -> fitted LabelEncoder
    'scaler': scaler,                    # fitted StandardScaler
    'categorical_cols': categorical_cols,
    'numerical_cols': numerical_cols,
    'feature_columns': list(X.columns),  # column order the model was trained with
}
with open('churn_preprocessing.pkl', 'wb') as preprocessing_file:
    pickle.dump(preprocessing_artifacts, preprocessing_file)

print("Model saved successfully!")

Saving the model...
Model saved successfully!
