In [1]:
# importing necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [3]:
customer = pd.read_csv('cleaned_online_retail.csv')
customer.head()

Unnamed: 0,Transaction_ID,Product_Code,Units_Sold,Transaction_Timestamp,Unit_Cost,Client_ID,Client_Region,Product_Title_Cleaned,Neg_Units
0,489434,85048,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,15cm christmas glass ball 20 lights,False
1,489434,79323P,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,pink cherry lights,False
2,489434,79323W,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,white cherry lights,False
3,489434,22041,48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,"record frame 7"" single size",False
4,489434,21232,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,strawberry ceramic trinket box,False


In [4]:
# purchase predictions checks if a customer is likely to buy again or not

## Feature Engineering

In [5]:
customer['Transaction_Timestamp'] = pd.to_datetime(customer['Transaction_Timestamp'])

In [6]:
# receny feature
latest_date = customer['Transaction_Timestamp'].max()
last_purchase = customer.groupby('Client_ID')['Transaction_Timestamp'].transform('max')
customer['recency'] = (latest_date - last_purchase).dt.days
customer[['Client_ID', 'Transaction_Timestamp', 'recency']].head()

Unnamed: 0,Client_ID,Transaction_Timestamp,recency
0,13085.0,2009-12-01 07:45:00,314
1,13085.0,2009-12-01 07:45:00,314
2,13085.0,2009-12-01 07:45:00,314
3,13085.0,2009-12-01 07:45:00,314
4,13085.0,2009-12-01 07:45:00,314


In [7]:
# frequency feature
customer['frequency'] = customer.groupby('Client_ID')['Transaction_Timestamp'].transform('count')
customer[['Client_ID', 'Transaction_ID', 'frequency']].head()

Unnamed: 0,Client_ID,Transaction_ID,frequency
0,13085.0,489434,62
1,13085.0,489434,62
2,13085.0,489434,62
3,13085.0,489434,62
4,13085.0,489434,62


In [8]:
# monetery feature
# money per each transaction
customer['monetary'] = customer['Unit_Cost'] * customer['Units_Sold']
# adding all the transactions for a customer
customer['monetary'] = customer.groupby('Client_ID')['monetary'].transform('sum')
customer[['Client_ID', 'monetary']].drop_duplicates().head()

Unnamed: 0,Client_ID,monetary
0,13085.0,2017.2
12,13078.0,16904.51
31,15362.0,613.08
54,18102.0,349164.35
71,12682.0,10665.69


In [9]:
#avg_per_transaction
customer['avg_per_transaction'] = customer['monetary'] - customer['frequency']

In [10]:
# customer tenure
first_purchase = customer.groupby('Client_ID')['Transaction_Timestamp'].transform('min')
last_purchase = customer.groupby('Client_ID')['Transaction_Timestamp'].transform('max')
customer['customer_tenure'] = (last_purchase - first_purchase).dt.days
customer[['Client_ID', 'customer_tenure']].drop_duplicates().head()

Unnamed: 0,Client_ID,customer_tenure
0,13085.0,59
12,13078.0,372
31,15362.0,290
54,18102.0,373
71,12682.0,373


In [11]:
#purchase frequency
#to avoid division by zero
customer['customer_tenure'] = customer['customer_tenure'].replace(0, 1)
customer['purchase_frequency'] = customer['frequency'] / customer['customer_tenure']
customer[['Client_ID', 'purchase_frequency']].drop_duplicates().head()

Unnamed: 0,Client_ID,purchase_frequency
0,13085.0,1.050847
12,13078.0,1.19086
31,15362.0,0.137931
54,18102.0,1.680965
71,12682.0,1.321716


## Feature Transformation

In [12]:
customer.columns

Index(['Transaction_ID', 'Product_Code', 'Units_Sold', 'Transaction_Timestamp',
       'Unit_Cost', 'Client_ID', 'Client_Region', 'Product_Title_Cleaned',
       'Neg_Units', 'recency', 'frequency', 'monetary', 'avg_per_transaction',
       'customer_tenure', 'purchase_frequency'],
      dtype='object')

In [13]:
num_cols = ['Units_Sold', 'Unit_Cost', 'recency', 'frequency', 'monetary', 'avg_per_transaction', 'customer_tenure', 'purchase_frequency']

In [14]:
# we should transform the columns that are numerical and skewed

In [15]:
for col in num_cols:
    print(f"{col} skewness: {customer[col].skew()}")

Units_Sold skewness: 87.92501905128374
Unit_Cost skewness: 20.546140656682553
recency skewness: 2.8781371785118184
frequency skewness: 1.4617250349168318
monetary skewness: 1.450616718283678
avg_per_transaction skewness: 1.4487261716098148
customer_tenure skewness: -1.041556188256739
purchase_frequency skewness: 1.4124957303056647


In [16]:
skewed_cols = ['Units_Sold', 'Unit_Cost', 'recency', 'frequency', 'monetary', 'avg_per_transaction', 'purchase_frequency']

In [17]:
log1p_transformations = ['recency', 'frequency', 'monetary', 'avg_per_transaction', 'purchase_frequency']
yeojohnson_transformations = ['Units_Sold', 'Unit_Cost']

In [18]:
for col in log1p_transformations:
    customer[col + '_log'] = np.log1p(customer[col])

In [19]:
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method='yeo-johnson')
customer_yeojohnson = pt.fit_transform(customer[yeojohnson_transformations])
customer_yeojohnson_df = pd.DataFrame(customer_yeojohnson, columns=[col + '_yj' for col in yeojohnson_transformations])

customer = pd.concat([customer, customer_yeojohnson_df], axis=1)

In [20]:
for col in customer.columns:
    if col.endswith('_log') or col.endswith('_yj'):
        print(f"{col} skewness: {customer[col].skew()}")

recency_log skewness: 0.07861280265680148
frequency_log skewness: 0.8787266318786177
monetary_log skewness: 0.7628170002460615
avg_per_transaction_log skewness: 0.7423302018711314
purchase_frequency_log skewness: 0.9737004565418438
Units_Sold_yj skewness: 0.188105003995073
Unit_Cost_yj skewness: 0.05816525139905832


In [21]:
# any numerical columns are not skewed now

## Pipeline for Feature Scaling and Categorical Encoding

In [22]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [23]:
num_features = [ 'Units_Sold_yj', 'Unit_Cost_yj', 'recency_log', 'frequency_log', 'monetary_log', 'avg_per_transaction_log', 'purchase_frequency_log']
cat_features = ['Client_Region']

In [24]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown = 'ignore'), cat_features)
    ]
)

pipeline = Pipeline(steps = [
    ('preprocessing', preprocessor)
])

In [25]:
# dataframe with useful features only
x = customer[num_features + cat_features]
#to learn and aplly scalings, encodings on the dataframe x
x_preprocessed = pipeline.fit_transform(x)

In [27]:
x.columns

Index(['Units_Sold_yj', 'Unit_Cost_yj', 'recency_log', 'frequency_log',
       'monetary_log', 'avg_per_transaction_log', 'purchase_frequency_log',
       'Client_Region'],
      dtype='object')

In [29]:
features = ['Units_Sold_yj', 'Unit_Cost_yj', 'recency_log', 'frequency_log', 'monetary_log', 'avg_per_transaction_log', 'purchase_frequency_log', 'Client_Region']

In [None]:
# since we do not have target variable, we will do unsupervised learning