# Association analysis sample (Online Retail II)

- Online Retail II Data Set https://archive.ics.uci.edu/ml/datasets/Online+Retail+II

In [1]:
import pandas as pd
import dask.dataframe as dd
import dask

In [2]:
# Dataset
_read_xlsx_delayed = dask.delayed(pd.read_excel)(
    'online_retail_II.xlsx'
)
DF_RAW = dd.from_delayed(_read_xlsx_delayed).compute()

In [3]:
display(DF_RAW.shape[0])
display(DF_RAW.head())

525461

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [4]:
display(DF_RAW.isnull().sum())

Invoice             0
StockCode           0
Description      2928
Quantity            0
InvoiceDate         0
Price               0
Customer ID    107927
Country             0
dtype: int64

In [5]:
print(
    DF_RAW['Country'].value_counts().head(15)
)

United Kingdom     485852
EIRE                 9670
Germany              8129
France               5772
Netherlands          2769
Spain                1278
Switzerland          1187
Portugal             1101
Belgium              1054
Channel Islands       906
Sweden                902
Italy                 731
Australia             654
Cyprus                554
Austria               537
Name: Country, dtype: int64


In [6]:
# Data preprocessing.
df = DF_RAW.copy()

df['order_type'] = df['Invoice'].map(
    lambda x: str(x)[0]
)

display(df.head(3))
display(df.tail(3))
display(df['order_type'].value_counts())

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,order_type
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,4
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,4


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,order_type
525458,538171,22751,FELTCRAFT PRINCESS OLIVIA DOLL,1,2010-12-09 20:01:00,3.75,17530.0,United Kingdom,5
525459,538171,20970,PINK FLORAL FELTCRAFT SHOULDER BAG,2,2010-12-09 20:01:00,3.75,17530.0,United Kingdom,5
525460,538171,21931,JUMBO STORAGE BAG SUKI,2,2010-12-09 20:01:00,1.95,17530.0,United Kingdom,5


5    406763
4    108489
C     10206
A         3
Name: order_type, dtype: int64

In [7]:
# Limit new orders only.
df = df[
    df['order_type'].isin(['5', '4'])
]

display(df['order_type'].value_counts())
display(df.shape[0])

5    406763
4    108489
Name: order_type, dtype: int64

515252

In [8]:
df = df[
    df['Country'] == 'Germany'
]

display(df.shape[0])

7661

In [9]:
# Convert vertically arranged data to horizontally.
qt_groupeby_invoice_and_stockcode = df.groupby(
    ['Invoice', 'StockCode']
)['Quantity'].sum()

display(qt_groupeby_invoice_and_stockcode.head())

Invoice  StockCode
489526   20676         8
         20682         6
         20718        10
         20914        12
         20964         3
Name: Quantity, dtype: int64

In [10]:
moved_stockcode_to_column = \
    qt_groupeby_invoice_and_stockcode.unstack().reset_index().fillna(0).set_index('Invoice')

display(moved_stockcode_to_column.shape)
display(moved_stockcode_to_column.head())

(347, 1457)

StockCode,10002,10125,10135,11001,15034,15036,15039,16012,16016,16033,...,85232A,85232B,85232D,90018A,90019A,90200E,ADJUST,M,PADS,POST
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
490395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
490563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
490682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [11]:
basket_df = moved_stockcode_to_column.apply(lambda x: x > 0)

display(basket_df.head())

StockCode,10002,10125,10135,11001,15034,15036,15039,16012,16016,16033,...,85232A,85232B,85232D,90018A,90019A,90200E,ADJUST,M,PADS,POST
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489526,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
490395,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
490563,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
490564,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
490682,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [12]:
# Prepare items name dictionary.
stockcodes_and_descriptions = df[
    ['StockCode', 'Description']
].drop_duplicates()

stockcodes_and_descriptions['StockCode'].astype('str')
stockcodes_and_descriptions['Description'].astype('str')

stockcodes_and_descriptions = stockcodes_and_descriptions.set_index('StockCode')

display(stockcodes_and_descriptions.head())
display(stockcodes_and_descriptions.loc[
    stockcodes_and_descriptions.index[:10]
])

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
85049E,SCANDINAVIAN REDS RIBBONS
21976,PACK OF 60 MUSHROOM CAKE CASES
21498,RED SPOTS WRAP
22077,6 RIBBONS RUSTIC CHARM
84946,ANTIQUE SILVER TEA GLASS ETCHED


Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
85049E,SCANDINAVIAN REDS RIBBONS
21976,PACK OF 60 MUSHROOM CAKE CASES
21498,RED SPOTS WRAP
21498,RED RETROSPOT WRAP
22077,6 RIBBONS RUSTIC CHARM
84946,ANTIQUE SILVER TEA GLASS ETCHED
84948,SILVER HANGING T-LIGHT HOLDER DOME
21537,RETRO SPOTS PUDDING BOWL
21537,RED RETROSPOT PUDDING BOWL
21733,RED HANGING HEART T-LIGHT HOLDER


There are 2 different Description for the same product, and the policy is to select the product with the longest string.

In [13]:
merge_different_desc_items = stockcodes_and_descriptions.copy()

merge_different_desc_items = merge_different_desc_items[
    merge_different_desc_items['Description'].map(
        lambda x: x.isupper()
    )
]

merge_different_desc_items['characters'] = merge_different_desc_items['Description'].map(len)
merge_different_desc_items = merge_different_desc_items.sort_values(
    ['StockCode', 'characters'],
    ascending=[True, False]
)


KeyError: 0