### Data Dictionary
- InvoiceNo: Invoice number. Nominal. A 6-digit integral number uniquely assigned to each transaction. If this code starts with the letter 'C', it indicates a cancellation.
- StockCode: Product (item) code. Nominal. A 5-digit integral number uniquely assigned to each distinct product.
- Description: Product (item) name. Nominal.
- Quantity: The quantities of each product (item) per transaction. Numeric.
- InvoiceDate: Invoice date and time. Numeric. The day and time when a transaction was generated.
- UnitPrice: Unit price. Numeric. Product price per unit in sterling (£).
- CustomerID: Customer number. Nominal. A 5-digit integral number uniquely assigned to each customer.
- Country: Country name. Nominal. The name of the country where a customer resides.

In [105]:
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np
from azure.storage.blob import ContainerClient, BlobClient
import pandas as pd
from io import BytesIO
from copy import deepcopy

StatementMeta(RetailPool, 11, 27, Finished, Available)

In [106]:
# Storage connection settings. The #STORAGE_ACCOUNT_NAME# / #STORAGE_ACCOUNT_KEY# tokens
# are placeholders — presumably substituted before the notebook runs (TODO confirm).
# Never commit a real account key in this cell.
CONNECTIONSTRING = 'DefaultEndpointsProtocol=https;AccountName=#STORAGE_ACCOUNT_NAME#;AccountKey=#STORAGE_ACCOUNT_KEY#;EndpointSuffix=core.windows.net'
CONTAINER_NAME = 'customer-churn-data'

BLOBNAME = 'online_retail_II.xlsx'

# Download the Excel workbook from blob storage and parse it directly from memory.
blob = BlobClient.from_connection_string(conn_str=CONNECTIONSTRING, container_name=CONTAINER_NAME, blob_name=BLOBNAME)
blob_data = blob.download_blob()

# Read the blob content exactly once. The previous version also built a throwaway
# BytesIO from a separate content_as_bytes() call whose result was never used.
blob_bytes = blob_data.content_as_bytes()
data = pd.read_excel(BytesIO(blob_bytes))

StatementMeta(RetailPool, 11, 28, Finished, Available)

In [107]:
# Preview the first rows to sanity-check that the workbook loaded as expected.
data.head()

StatementMeta(RetailPool, 11, 29, Finished, Available)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [108]:
# data.describe(include = 'all')

StatementMeta(RetailPool, 11, 30, Finished, Available)

In [109]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
data.shape

StatementMeta(RetailPool, 11, 31, Finished, Available)

(525461, 8)

In [110]:
# Column dtypes as inferred by read_excel.
data.dtypes

StatementMeta(RetailPool, 11, 32, Finished, Available)

Invoice                object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
Price                 float64
Customer ID           float64
Country                object
dtype: object

In [111]:
# Discard every row that has a missing value in any column.
data = data.dropna()

StatementMeta(RetailPool, 11, 33, Finished, Available)

In [112]:
# Per-column count of missing values; every entry should be 0 once incomplete rows are gone.
data.isna().sum()

StatementMeta(RetailPool, 11, 34, Finished, Available)

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64

In [113]:
# Summary statistics for the numeric columns of the current frame.
data.describe()

StatementMeta(RetailPool, 11, 35, Finished, Available)

Unnamed: 0,Quantity,Price,Customer ID
count,417534.0,417534.0,417534.0
mean,12.758815,3.887547,15360.645478
std,101.220424,71.131797,1680.811316
min,-9360.0,0.0,12346.0
25%,2.0,1.25,13983.0
50%,4.0,1.95,15311.0
75%,12.0,3.75,16799.0
max,19152.0,25111.09,18287.0


In [114]:
# Ensure InvoiceDate is a pandas datetime column. The dtypes listing earlier in the
# notebook already reports datetime64[ns], so this acts as a safeguard rather than
# a real conversion.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])

StatementMeta(RetailPool, 11, 36, Finished, Available)

In [115]:
# Cancellation invoices are identified by a leading "C" (see the data dictionary), so
# test the prefix instead of looking for a "C" anywhere in the invoice code.
# The Invoice column has object dtype; na=False makes any value the .str accessor
# cannot evaluate count as "not a cancellation".
df_cancel = data[data["Invoice"].str.startswith("C", na=False)]

StatementMeta(RetailPool, 11, 37, Finished, Available)

In [116]:
# The five cancellation invoices with the most line items.
(
    df_cancel
    .groupby("Invoice")
    .agg(Invoice_Count=("Invoice", "count"))
    .sort_values("Invoice_Count", ascending=False)
    .head(5)
)

StatementMeta(RetailPool, 11, 38, Finished, Available)

Unnamed: 0_level_0,Invoice_Count
Invoice,Unnamed: 1_level_1
C524235,45
C536164,44
C509015,40
C531807,38
C512272,38


In [117]:
# Cross-check the count reported for invoice C524235 against the full frame.
data.loc[data["Invoice"].eq("C524235")].shape

StatementMeta(RetailPool, 11, 39, Finished, Available)

(45, 8)

In [125]:
# Remove every row whose invoice number belongs to a cancellation invoice.
# The explicit .copy() makes the result an independent frame, so assigning new
# columns to it afterwards does not trigger SettingWithCopyWarning.
data = data[~data["Invoice"].isin(df_cancel["Invoice"])].copy()

StatementMeta(RetailPool, 11, 47, Finished, Available)

In [126]:
# Most recent invoice timestamp remaining in the data.
data["InvoiceDate"].max()

StatementMeta(RetailPool, 11, 48, Finished, Available)

Timestamp('2010-12-09 20:01:00')

In [127]:
# Reference date for the recency calculation: the latest invoice timestamp in the data.
# A customer whose last purchase falls on this date therefore gets a recency of 0 days.
today_date=data["InvoiceDate"].max()

StatementMeta(RetailPool, 11, 49, Finished, Available)

In [128]:
# Line-item revenue: units purchased multiplied by the unit price.
data["TotalPrice"] = data["Quantity"].mul(data["Price"])

StatementMeta(RetailPool, 11, 50, Finished, Available)

In [138]:
# One row per customer holding the three RFM inputs:
#   InvoiceDate -> whole days between today_date and the customer's latest invoice
#   Invoice     -> number of distinct invoices
#   TotalPrice  -> sum of line-item revenue
# The built-in "nunique" and "sum" aggregations replace the equivalent lambdas; only
# the recency calculation needs a custom function. Output column labels are unchanged.
rfm = data.groupby("Customer ID").agg(
    {
        "InvoiceDate": lambda dates: (today_date - dates.max()).days,
        "Invoice": "nunique",
        "TotalPrice": "sum",
    }
)

StatementMeta(RetailPool, 11, 60, Finished, Available)

In [139]:
# Positional rename of the aggregated columns; the order must match the aggregation
# above: InvoiceDate -> recency, Invoice -> frequency, TotalPrice -> monetary.
rfm.columns = ['recency', 'frequency', 'monetary']


StatementMeta(RetailPool, 11, 61, Finished, Available)

In [140]:
# Move the group key out of the index so 'Customer ID' becomes an ordinary column.
rfm = rfm.reset_index()

StatementMeta(RetailPool, 11, 62, Finished, Available)

In [141]:
# Display the per-customer RFM table.
rfm

StatementMeta(RetailPool, 11, 63, Finished, Available)

Unnamed: 0,Customer ID,recency,frequency,monetary
0,12346.0,164,11,372.86
1,12347.0,2,2,1323.32
2,12348.0,73,1,222.16
3,12349.0,42,3,2671.14
4,12351.0,10,1,300.93
...,...,...,...,...
4309,18283.0,17,6,641.77
4310,18284.0,66,1,461.68
4311,18285.0,295,1,427.00
4312,18286.0,111,2,1296.43


In [136]:
# Inspect the individual transactions of customer 12346.
data.loc[data["Customer ID"].eq(12346.0)]

StatementMeta(RetailPool, 11, 58, Finished, Available)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,TotalPrice
27994,491725,TEST001,This is a test product.,10,2009-12-14 08:34:00,4.5,12346.0,United Kingdom,45.0
28251,491742,TEST001,This is a test product.,5,2009-12-14 11:00:00,4.5,12346.0,United Kingdom,22.5
28254,491744,TEST001,This is a test product.,5,2009-12-14 11:02:00,4.5,12346.0,United Kingdom,22.5
39398,492718,TEST001,This is a test product.,5,2009-12-18 10:47:00,4.5,12346.0,United Kingdom,22.5
39411,492722,TEST002,This is a test product.,1,2009-12-18 10:55:00,1.0,12346.0,United Kingdom,1.0
45228,493410,TEST001,This is a test product.,5,2010-01-04 09:24:00,4.5,12346.0,United Kingdom,22.5
45230,493412,TEST001,This is a test product.,5,2010-01-04 09:53:00,4.5,12346.0,United Kingdom,22.5
56117,494450,TEST001,This is a test product.,5,2010-01-14 13:50:00,4.5,12346.0,United Kingdom,22.5
66084,495295,TEST001,This is a test product.,5,2010-01-22 13:30:00,4.5,12346.0,United Kingdom,22.5
107800,499763,20682,RED SPOTTY CHILDS UMBRELLA,1,2010-03-02 13:08:00,3.25,12346.0,United Kingdom,3.25
