In [2]:
import pandas as pd
import warnings

# Suppress every warning (all categories, all modules) for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells will not be shown either — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [4]:
# Load the raw transaction export and preview the first rows.
raw_csv_path = './Data/online_retail_data.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [5]:
# (row count, column count) of the raw frame.
df.shape

(541909, 8)

In [6]:
# Per-column dtype and non-null count; a quick way to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [7]:
# Number of missing values in each column.
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [8]:
# Count rows that repeat an earlier row exactly (all columns are compared).
print(df.duplicated().sum())

5268


In [9]:
# Clean the transactions:
#   1. drop rows that are exact duplicates,
#   2. drop rows with no CustomerID (they cannot be attributed to a customer),
#   3. store CustomerID as a string label ("17850" rather than 17850.0).
df = df.drop_duplicates().dropna(subset=['CustomerID'])

# float -> int -> str so the trailing ".0" is removed before stringifying.
customer_id = df['CustomerID'].astype(int).astype(str)

# Build a new frame instead of writing into the filtered one: this avoids the
# temporary column / in-place rename round-trip and never assigns into a frame
# that pandas may treat as a copy. CustomerID ends up as the last column.
df = df.drop(columns='CustomerID').assign(CustomerID=customer_id)

In [10]:
# Parse the raw timestamp strings (e.g. "12/1/2010 8:26" -> 2010-12-01 08:26:00).
# The values are month/day/year; giving the format explicitly removes any
# day/month guessing and makes an unexpected layout fail loudly instead of
# being parsed differently row by row.
df['InvoiceDateTime'] = pd.to_datetime(df['InvoiceDate'], format='%m/%d/%Y %H:%M')

In [11]:
# Spot-check the cleaned frame: CustomerID is now a string column placed last,
# and InvoiceDateTime holds the parsed timestamps.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,Country,CustomerID,InvoiceDateTime
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,United Kingdom,17850,2010-12-01 08:26:00
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,United Kingdom,17850,2010-12-01 08:26:00
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,United Kingdom,17850,2010-12-01 08:26:00
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,United Kingdom,17850,2010-12-01 08:26:00
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,United Kingdom,17850,2010-12-01 08:26:00


In [12]:
# Monetary value of each transaction line, followed by pruning of the columns
# that are not needed for the per-customer features.
# NOTE(review): if the data contains negative quantities (e.g. returns), those
# rows reduce a customer's total — confirm that is intended.
line_amount = df['Quantity'] * df['UnitPrice']
unused_columns = ['Description', 'InvoiceDate', 'Country', 'Quantity', 'UnitPrice']
df = df.assign(Amount=line_amount).drop(columns=unused_columns)

In [13]:
# Spot-check: only the invoice, stock code, customer, timestamp and Amount
# columns should remain.
df.head()

Unnamed: 0,InvoiceNo,StockCode,CustomerID,InvoiceDateTime,Amount
0,536365,85123A,17850,2010-12-01 08:26:00,15.3
1,536365,71053,17850,2010-12-01 08:26:00,20.34
2,536365,84406B,17850,2010-12-01 08:26:00,22.0
3,536365,84029G,17850,2010-12-01 08:26:00,20.34
4,536365,84029E,17850,2010-12-01 08:26:00,20.34


In [14]:
# Collapse the transaction lines to one row per customer.
# Named aggregation ties each output column name to its source column and
# function, so the labels cannot drift out of sync with the aggregation spec
# (as a positional `cdf.columns = [...]` rename could).
cdf = df.groupby('CustomerID').agg(
    unique_invoiceno_count=('InvoiceNo', 'nunique'),    # distinct invoices
    unique_stockcode_count=('StockCode', 'nunique'),    # distinct products
    total_amount=('Amount', 'sum'),                     # summed line amounts
    invoice_min_time=('InvoiceDateTime', 'min'),        # first invoice timestamp
    invoice_max_time=('InvoiceDateTime', 'max'),        # last invoice timestamp
)

# Whole days between the first and the last invoice; customers whose invoices
# all fall within 24 hours get 0.
cdf['active_days'] = (cdf['invoice_max_time'] - cdf['invoice_min_time']).dt.days

# The raw timestamps were only needed to derive active_days.
cdf = cdf.drop(columns=['invoice_min_time', 'invoice_max_time'])

In [15]:
# The groupby left CustomerID as the index; move it back to a regular column
# and give the frame a fresh 0..n-1 index.
cdf = cdf.reset_index()

In [16]:
# Preview the per-customer feature table (one row per CustomerID).
cdf.head()

Unnamed: 0,CustomerID,unique_invoiceno_count,unique_stockcode_count,total_amount,active_days
0,12346,2,1,0.0,0
1,12347,7,103,4310.0,365
2,12348,4,22,1797.24,282
3,12349,1,73,1757.55,0
4,12350,1,17,334.4,0


In [17]:
###### cdf now holds the per-customer feature table

In [18]:
# Define churn score and churn status

In [19]:
# Empty frame that will collect the min-max scaled copies of the customer
# features, plus the churn score and status derived from them.
scaled_cdf = pd.DataFrame()

In [20]:
# Rule-based churn label.
# Each customer feature is min-max scaled, the scaled features are combined
# into a weighted "engagement" value, and churn_score = 1 - engagement.
# Customers whose score reaches CHURN_THRESHOLD are labelled as churned (1).
#
# NOTE(review): the label is a deterministic function of the same four features
# that are later used as model inputs, so a model trained on them is learning to
# reproduce this rule rather than an independently observed churn outcome.

# Weight of each scaled feature in the engagement value (weights sum to 1).
CHURN_WEIGHTS = {
    'total_amount': 0.3,
    'invoice_count': 0.3,
    'stockcode_count': 0.2,
    'active_days': 0.2,
}
# Minimum churn_score for a customer to be labelled as churned.
CHURN_THRESHOLD = 0.95

# (scaled column name, source column in cdf), in output column order.
SCALED_FEATURES = [
    ('invoice_count', 'unique_invoiceno_count'),
    ('stockcode_count', 'unique_stockcode_count'),
    ('total_amount', 'total_amount'),
    ('active_days', 'active_days'),
]


def min_max_scale(values):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    values : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        Scaled values on the same index. A constant Series (max == min) is
        mapped to all zeros instead of producing 0/0 = NaN.
    """
    low = values.min()
    span = values.max() - low
    if span == 0:
        return pd.Series(0.0, index=values.index)
    return (values - low) / span


scaled_cdf['CustomerID'] = cdf['CustomerID']
for scaled_name, source_name in SCALED_FEATURES:
    scaled_cdf[scaled_name] = min_max_scale(cdf[source_name])

# Weighted engagement in [0, 1]; terms are added in the order listed in CHURN_WEIGHTS.
engagement = sum(weight * scaled_cdf[column] for column, weight in CHURN_WEIGHTS.items())
scaled_cdf['churn_score'] = 1 - engagement

# Vectorised threshold: True/False -> 1/0.
scaled_cdf['churn_status'] = (scaled_cdf['churn_score'] >= CHURN_THRESHOLD).astype(int)

In [21]:
# Attach the label to the feature table. The assignment aligns on the index;
# scaled_cdf was populated from cdf's own columns, so both frames share it.
cdf['Churn_Status'] = scaled_cdf['churn_status']

In [22]:
# Preview the customer features together with the derived Churn_Status label.
cdf.head()

Unnamed: 0,CustomerID,unique_invoiceno_count,unique_stockcode_count,total_amount,active_days,Churn_Status
0,12346,2,1,0.0,0,1
1,12347,7,103,4310.0,365,0
2,12348,4,22,1797.24,282,0
3,12349,1,73,1757.55,0,1
4,12350,1,17,334.4,0,1


In [23]:
############# Model training ###########

In [24]:
# Model inputs: the four per-customer features. Target: the 0/1 churn label
# as a NumPy array.
feature_columns = [
    'unique_invoiceno_count',
    'unique_stockcode_count',
    'total_amount',
    'active_days',
]
X = cdf[feature_columns]
Y = cdf['Churn_Status'].to_numpy()

In [25]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [26]:
# Hold out a test set. random_state fixes the shuffle so the split is
# reproducible; no test_size or stratify is passed, so scikit-learn's defaults apply.
x_train , x_test , y_train , y_test = train_test_split(X,Y,random_state=42)

In [27]:
# Learn each feature's min/max from the training rows only, then apply that
# same mapping to both splits so the test rows never influence the scaling.
scale = MinMaxScaler().fit(x_train)
x_train_scaled = scale.transform(x_train)
x_test_scaled = scale.transform(x_test)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [29]:
# Fit a logistic regression with default settings on the scaled training
# features, then predict the held-out rows.
lr = LogisticRegression().fit(x_train_scaled, y_train)
y_test_predict = lr.predict(x_test_scaled)

# Evaluation on the test split: overall accuracy, confusion matrix, and the
# per-class precision / recall / F1 report.
accuracy = accuracy_score(y_test, y_test_predict)
conf_matrix = confusion_matrix(y_test, y_test_predict)
class_report = classification_report(y_test, y_test_predict)

# Print evaluation metrics, one report section per line group.
print(
    f"Accuracy: {accuracy:.4f}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)

Accuracy: 0.9826
Confusion Matrix:
[[578  17]
 [  2 496]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       595
           1       0.97      1.00      0.98       498

    accuracy                           0.98      1093
   macro avg       0.98      0.98      0.98      1093
weighted avg       0.98      0.98      0.98      1093

