# This is Purchase Propensity Project

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split


In [2]:
data=pd.read_csv('online_retail_II.csv')
data.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Invoice      1067371 non-null  object 
 1   StockCode    1067371 non-null  object 
 2   Description  1062989 non-null  object 
 3   Quantity     1067371 non-null  int64  
 4   InvoiceDate  1067371 non-null  object 
 5   Price        1067371 non-null  float64
 6   Customer ID  824364 non-null   float64
 7   Country      1067371 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 65.1+ MB


In [4]:
data.isnull().sum()

Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64

In [5]:
data.isna().sum()

Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64

In [6]:
# Cleaining the data 
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])

In [7]:
data=data.dropna(subset=['Customer ID'])

In [8]:
data['Customer ID']=data['Customer ID'].astype('int64')

In [9]:
data.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085,United Kingdom


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 824364 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      824364 non-null  object        
 1   StockCode    824364 non-null  object        
 2   Description  824364 non-null  object        
 3   Quantity     824364 non-null  int64         
 4   InvoiceDate  824364 non-null  datetime64[ns]
 5   Price        824364 non-null  float64       
 6   Customer ID  824364 non-null  int64         
 7   Country      824364 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 56.6+ MB


In [11]:
data.isna().sum()

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64

In [12]:
data.isnull().sum()

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64

Now we want to make total price for each transaction as a new feature.

In [13]:
data['total_price'] = data['Quantity'] * data['Price']

In [14]:
data.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,total_price
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085,United Kingdom,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085,United Kingdom,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085,United Kingdom,30.0


In [15]:
# We will now create a snapshot date - its like the most recent data used to find number of days passed since last purchase

snapshot_date = data['InvoiceDate'].max() + pd.Timedelta(days=1)
print("Snapshot Date:", snapshot_date)

Snapshot Date: 2011-12-10 12:50:00


In [16]:
# On the basis of this data we will create purchase propensity features such as RFM(Recency, Frequency, Monetary Value)

Cretaing Recency 

Recency = Days since last purchase.

In [17]:
# Recency
last_purchase=data.groupby('Customer ID')['InvoiceDate'].max()
recency=(snapshot_date - last_purchase).dt.days
recency.head()

Customer ID
12346    326
12347      2
12348     75
12349     19
12350    310
Name: InvoiceDate, dtype: int64

In [18]:
# Frequency
frequency=data.groupby('Customer ID')['Invoice'].nunique()
frequency.head()

# nunique is used to find number of unique values

Customer ID
12346    17
12347     8
12348     5
12349     5
12350     1
Name: Invoice, dtype: int64

In [19]:
# Monetary 
monetary=data.groupby('Customer ID')['total_price'].sum()
monetary.head()

Customer ID
12346     -64.68
12347    5633.32
12348    2019.40
12349    4404.54
12350     334.40
Name: total_price, dtype: float64

In [20]:
# Combining the rfm 
rfm=pd.DataFrame({
  'Recenccy':recency,
  'Frequency':frequency,
  'Monetary':monetary
})

In [21]:
rfm.head()

Unnamed: 0_level_0,Recenccy,Frequency,Monetary
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346,326,17,-64.68
12347,2,8,5633.32
12348,75,5,2019.4
12349,19,5,4404.54
12350,310,1,334.4


Creating the target varioable in rfm 

A customer is considered as "BuyAgain = 1" if they purchased within last 30 day

In [22]:
rfm['Buy_again']=(rfm['Recenccy']<30).astype(int)

In [23]:
rfm.head()

Unnamed: 0_level_0,Recenccy,Frequency,Monetary,Buy_again
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12346,326,17,-64.68,0
12347,2,8,5633.32,1
12348,75,5,2019.4,0
12349,19,5,4404.54,1
12350,310,1,334.4,0


In [24]:
# Seperating target and features 
X=rfm.drop('Buy_again',axis=1)
y=rfm['Buy_again']

In [25]:
# SPlitting into train and test split 

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42,stratify=y)

In [26]:
rfm['Buy_again'].value_counts()

Buy_again
0    4288
1    1654
Name: count, dtype: int64

In [27]:
# Scaling the values for better model perfromance 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
numeric_cols=['Recenccy', 'Frequency', 'Monetary']

preprocessor=ColumnTransformer([
    ('num',StandardScaler(),numeric_cols)
])



Training Models 

In [28]:
# Linear Regression 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
log_pipe=Pipeline([
    ('preprocessor',preprocessor),
    ('model',LogisticRegression())
])
log_pipe.fit(X_train, y_train)
log_pred = log_pipe.predict(X_test)
log_proba = log_pipe.predict_proba(X_test)[:,1]

print("\n=== Logistic Regression ===")
print(classification_report(y_test, log_pred))
print("AUC =", roc_auc_score(y_test, log_proba))


=== Logistic Regression ===
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      1072
           1       0.93      1.00      0.96       414

    accuracy                           0.98      1486
   macro avg       0.96      0.99      0.97      1486
weighted avg       0.98      0.98      0.98      1486

AUC = 0.9992992465210181


In [29]:
from sklearn.ensemble import RandomForestClassifier
rf_pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=300,
        class_weight='balanced',
        random_state=42
    ))
])

rf_pipe.fit(X_train, y_train)
rf_pred = rf_pipe.predict(X_test)
rf_proba = rf_pipe.predict_proba(X_test)[:,1]

print("\n=== Random Forest ===")
print(classification_report(y_test, rf_pred))
print("AUC =", roc_auc_score(y_test, rf_proba))



=== Random Forest ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1072
           1       1.00      1.00      1.00       414

    accuracy                           1.00      1486
   macro avg       1.00      1.00      1.00      1486
weighted avg       1.00      1.00      1.00      1486

AUC = 1.0


In [30]:
from xgboost import XGBClassifier
xgb_pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', XGBClassifier(
        n_estimators=250,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric='logloss'
    ))
])

xgb_pipe.fit(X_train, y_train)
xgb_pred = xgb_pipe.predict(X_test)
xgb_proba = xgb_pipe.predict_proba(X_test)[:,1]

print("\n=== XGBoost ===")
print(classification_report(y_test, xgb_pred))
print("AUC =", roc_auc_score(y_test, xgb_proba))



=== XGBoost ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1072
           1       1.00      1.00      1.00       414

    accuracy                           1.00      1486
   macro avg       1.00      1.00      1.00      1486
weighted avg       1.00      1.00      1.00      1486

AUC = 1.0


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [31]:
print("\n================ MODEL COMPARISON ================")
print("Logistic Regression AUC:", roc_auc_score(y_test, log_proba))
print("Random Forest AUC:", roc_auc_score(y_test, rf_proba))
print("XGBoost AUC:", roc_auc_score(y_test, xgb_proba))



Logistic Regression AUC: 0.9992992465210181
Random Forest AUC: 1.0
XGBoost AUC: 1.0


 These models give such a high accuracy means that there is data leakage 

Removing Recency column and testing again 

⭐ NEW CORRECT PIPELINE (NO DATA LEAKAGE)
✔ We will use past behavior (Frequency, Monetary)

✔ We will NOT use Recency as a feature

✔ Recency is ONLY used to create the target



In [34]:
df = pd.read_csv("online_retail_II.csv", encoding="latin1")

df.head()


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [37]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df = df.dropna(subset=['Customer ID'])
df['Customer ID'] = df['Customer ID'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Customer ID'] = df['Customer ID'].astype(int)


Create Monetary per transaction

In [39]:
df['TotalPrice'] = df['Quantity'] * df['Price']


Set Snapshot Date (for Recency)

In [40]:
snapshot_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)


Compute RFM Features 

In [41]:
# Recency (used ONLY for label creation)
last_purchase = df.groupby('Customer ID')['InvoiceDate'].max()
recency = (snapshot_date - last_purchase).dt.days


In [43]:
#Frequency
frequency = df.groupby('Customer ID')['Invoice'].nunique()


In [44]:
# Monetary
monetary = df.groupby('Customer ID')['TotalPrice'].sum()


Build RFM DataFrame

In [45]:
rfm = pd.DataFrame({
    'Recency': recency,
    'Frequency': frequency,
    'Monetary': monetary
})


In [46]:
# Creating target 
rfm['BuyAgain'] = (rfm['Recency'] < 30).astype(int)


In [47]:
# Not using recency to avoid data leakages 
X = rfm[['Frequency', 'Monetary']]   # <-- FIXED HERE
y = rfm['BuyAgain']


In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)


In [49]:
numeric_features = ['Frequency', 'Monetary']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ]
)


In [50]:
log_pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', LogisticRegression(max_iter=1000))
])

log_pipe.fit(X_train, y_train)
log_pred = log_pipe.predict(X_test)
log_proba = log_pipe.predict_proba(X_test)[:,1]

print("\n=== Logistic Regression ===")
print(classification_report(y_test, log_pred))
print("AUC:", roc_auc_score(y_test, log_proba))



=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.77      0.96      0.86      1072
           1       0.72      0.26      0.38       414

    accuracy                           0.77      1486
   macro avg       0.74      0.61      0.62      1486
weighted avg       0.76      0.77      0.72      1486

AUC: 0.7372557502343355


In [51]:
rf_pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        class_weight='balanced'
    ))
])

rf_pipe.fit(X_train, y_train)
rf_pred = rf_pipe.predict(X_test)
rf_proba = rf_pipe.predict_proba(X_test)[:,1]

print("\n=== Random Forest ===")
print(classification_report(y_test, rf_pred))
print("AUC:", roc_auc_score(y_test, rf_proba))



=== Random Forest ===
              precision    recall  f1-score   support

           0       0.79      0.78      0.79      1072
           1       0.45      0.46      0.46       414

    accuracy                           0.69      1486
   macro avg       0.62      0.62      0.62      1486
weighted avg       0.70      0.69      0.69      1486

AUC: 0.672169046073978


In [52]:
xgb_pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric='logloss'
    ))
])

xgb_pipe.fit(X_train, y_train)
xgb_pred = xgb_pipe.predict(X_test)
xgb_proba = xgb_pipe.predict_proba(X_test)[:,1]

print("\n=== XGBoost ===")
print(classification_report(y_test, xgb_pred))
print("AUC:", roc_auc_score(y_test, xgb_proba))



=== XGBoost ===
              precision    recall  f1-score   support

           0       0.78      0.93      0.85      1072
           1       0.64      0.34      0.44       414

    accuracy                           0.76      1486
   macro avg       0.71      0.63      0.64      1486
weighted avg       0.74      0.76      0.74      1486

AUC: 0.742364941596366


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [53]:
print("\n========== MODEL COMPARISON ==========")
print("Logistic Regression AUC:", roc_auc_score(y_test, log_proba))
print("Random Forest AUC:", roc_auc_score(y_test, rf_proba))
print("XGBoost AUC:", roc_auc_score(y_test, xgb_proba))



Logistic Regression AUC: 0.7372557502343355
Random Forest AUC: 0.672169046073978
XGBoost AUC: 0.742364941596366
