In [13]:
import pandas as pd
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display
import time
import matplotlib as mpl
import os, sys
import matplotlib.pyplot as plt
import seaborn as sns
mpl.rcParams['agg.path.chunksize'] = 10000

In [14]:
df = pd.read_csv(r'Data/NORMAL_DATA_RFM.csv')

In [15]:
df

Unnamed: 0,DOCNUM_ID,PEOPLE,STKCOD,DOCNUM,TRNQTY,STKDES
0,1,บ-001,T2355519DHPAMZBS,RR1507040003,1.0,ยางนอก BS 235/55-19 DHPAMZ
1,2,บ-001,T2355519DHPAMZBS,RR1507090001,1.0,ยางนอก BS 235/55-19 DHPAMZ
2,3,บ-001,T2257015EP850BS00,RR1507280006,5.0,ยางนอก BS 225/70-15 EP850
3,4,บ-001,T2354018TZ700FS,RR1507280009,2.0,ยางนอก FS 235/40-18 TZ700
4,5,บ-001,T2154517GS32BS,RR1508250002,4.0,ยางนอก BS 215/45-17 MY02
...,...,...,...,...,...,...
20640,9859,บต-2573,M-SERVICE-01,HN1901092,1.0,ค่าบริการ
20641,9859,บต-2573,M-08-02-0002,HN1901092,1.0,ตั้งศูนย์
20642,9859,บต-2573,S-LAMP,HN1901092,1.0,หลอดไฟหน้า-ขวา
20643,9859,บต-2573,B-3K-VS120R,HN1901092,1.0,แบตเตอรี่ 3K VS120R:รถยนต์ 95D31R


# Function that returns the size of an object in MB

In [16]:
def size(obj):
    return "{0:.2f} MB".format(sys.getsizeof(obj) / (1000 * 1000))

### Part 1:  Data Preparation

In [17]:
df_re=df.reindex(columns= [ 'DOCNUM_ID','DOCNUM', 'TRNQTY','STKDES','STKCOD']) 
print('orders -- dimensions: {0};   size: {1}'.format(df_re.shape, size(df_re)))
df_re

orders -- dimensions: (20645, 5);   size: 5.58 MB


Unnamed: 0,DOCNUM_ID,DOCNUM,TRNQTY,STKDES,STKCOD
0,1,RR1507040003,1.0,ยางนอก BS 235/55-19 DHPAMZ,T2355519DHPAMZBS
1,2,RR1507090001,1.0,ยางนอก BS 235/55-19 DHPAMZ,T2355519DHPAMZBS
2,3,RR1507280006,5.0,ยางนอก BS 225/70-15 EP850,T2257015EP850BS00
3,4,RR1507280009,2.0,ยางนอก FS 235/40-18 TZ700,T2354018TZ700FS
4,5,RR1508250002,4.0,ยางนอก BS 215/45-17 MY02,T2154517GS32BS
...,...,...,...,...,...
20640,9859,HN1901092,1.0,ค่าบริการ,M-SERVICE-01
20641,9859,HN1901092,1.0,ตั้งศูนย์,M-08-02-0002
20642,9859,HN1901092,1.0,หลอดไฟหน้า-ขวา,S-LAMP
20643,9859,HN1901092,1.0,แบตเตอรี่ 3K VS120R:รถยนต์ 95D31R,B-3K-VS120R


#### B. Convert order data into format expected by the association rules function

# Convert from DataFrame to a Series, with order_id as index and item_id as value

In [18]:
orders = df_re.set_index('DOCNUM_ID')['STKCOD'].rename('item_id')
display(orders.head(10))
type(orders)

DOCNUM_ID
1      T2355519DHPAMZBS
2      T2355519DHPAMZBS
3     T2257015EP850BS00
4       T2354018TZ700FS
5        T2154517GS32BS
6         T2457016R16FS
7      T1955516MY02BS00
8        T2157015LE02FS
9       T1956015F01FS00
10     T2157015R611BS00
Name: item_id, dtype: object

pandas.core.series.Series

#### C. Display summary statistics for order data

In [19]:
print('dimensions: {0};   size: {1};   unique_orders: {2};   unique_items: {3}'
      .format(orders.shape, size(orders), len(orders.index.unique()), len(orders.value_counts())))

dimensions: (20645,);   size: 1.60 MB;   unique_orders: 9859;   unique_items: 1784


### Part 2: Association Rules Function

#### A. Helper functions to the main association rules functionv

In [20]:
# Returns frequency counts for items and item pairs
def freq(iterable):
    if type(iterable) == pd.core.series.Series:
        return iterable.value_counts().rename("freq")
    else: 
        return pd.Series(Counter(iterable)).rename("freq")

    
# Returns number of unique orders
def order_count(order_item):
    return len(set(order_item.index))


# Returns generator that yields item pairs, one at a time
def get_item_pairs(order_item):
    order_item = order_item.reset_index().values
    for order_id, order_object in groupby(order_item, lambda x: x[0]):
        item_list = [item[1] for item in order_object]
              
        for item_pair in combinations(item_list, 2):
            yield item_pair
            

# Returns frequency and support associated with item
def merge_item_stats(item_pairs, item_stats):
    return (item_pairs
                .merge(item_stats.rename(columns={'freq': 'freqA', 'support': 'supportA'}), left_on='item_A', right_index=True)
                .merge(item_stats.rename(columns={'freq': 'freqB', 'support': 'supportB'}), left_on='item_B', right_index=True))


# Returns name associated with item
def merge_item_name(rules, item_name):
    columns = ['itemA','itemB','freqAB','supportAB','freqA','supportA','freqB','supportB', 
               'confidenceAtoB','confidenceBtoA','lift']
    rules = (rules
                .merge(item_name.rename(columns={'item_name': 'itemA'}), left_on='item_A', right_on='item_id')
                .merge(item_name.rename(columns={'item_name': 'itemB'}), left_on='item_B', right_on='item_id'))
    return rules[columns]               

#### B. Association rules function

In [21]:
def association_rules(order_item, min_support):
    
    print("Starting order_item: {:22d}".format(len(order_item)))


    # Calculate item frequency and support
    item_stats             = freq(order_item).to_frame("freq")
    item_stats['support']  = item_stats['freq'] / order_count(order_item) * 100


    # Filter from order_item items below min support 
    qualifying_items       = item_stats[item_stats['support'] >= min_support].index
    order_item             = order_item[order_item.isin(qualifying_items)]

    print("Items with support >= {}: {:15d}".format(min_support, len(qualifying_items)))
    print("Remaining order_item: {:21d}".format(len(order_item)))


    # Filter from order_item orders with less than 2 items
    order_size             = freq(order_item.index)
    qualifying_orders      = order_size[order_size >= 2].index
    order_item             = order_item[order_item.index.isin(qualifying_orders)]

    print("Remaining orders with 2+ items: {:11d}".format(len(qualifying_orders)))
    print("Remaining order_item: {:21d}".format(len(order_item)))


    # Recalculate item frequency and support
    item_stats             = freq(order_item).to_frame("freq")
    item_stats['support']  = item_stats['freq'] / order_count(order_item) * 100


    # Get item pairs generator
    item_pair_gen          = get_item_pairs(order_item)


    # Calculate item pair frequency and support
    item_pairs              = freq(item_pair_gen).to_frame("freqAB")
    item_pairs['supportAB'] = item_pairs['freqAB'] / len(qualifying_orders) * 100

    print("Item pairs: {:31d}".format(len(item_pairs)))


    # Filter from item_pairs those below min support
    item_pairs              = item_pairs[item_pairs['supportAB'] >= min_support]

    print("Item pairs with support >= {}: {:10d}\n".format(min_support, len(item_pairs)))


    # Create table of association rules and compute relevant metrics
    item_pairs = item_pairs.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    item_pairs = merge_item_stats(item_pairs, item_stats)
    
    item_pairs['confidenceAtoB'] = item_pairs['supportAB'] / item_pairs['supportA']
    item_pairs['confidenceBtoA'] = item_pairs['supportAB'] / item_pairs['supportB']
    item_pairs['lift']           = item_pairs['supportAB'] / (item_pairs['supportA'] * item_pairs['supportB'])
    
    
    # Return association rules sorted by lift in descending order
    return item_pairs.sort_values('lift', ascending=False)

### Part 3:  Association Rules Mining

In [22]:
%%time
rules = association_rules(orders, 0.01) 

Starting order_item:                  20645
Items with support >= 0.01:            1784
Remaining order_item:                 20645
Remaining orders with 2+ items:        3468
Remaining order_item:                 14254
Item pairs:                           17791
Item pairs with support >= 0.01:      17791

Wall time: 8.93 s


# Replace item ID with item name and display association rules

In [24]:
# Replace item ID with item name and display association rules
item_name   = pd.read_csv(r'Data/DATA_PRODUCT.csv')
item_name   = item_name.rename(columns={'STKCOD':'item_id', 'STKDES':'item_name'})
#item_name

rules_final = merge_item_name(rules, item_name).sort_values('lift', ascending=False)
rules_final.to_csv(r'ASSOCIATION.csv', encoding='utf-8')
display(rules_final)

Unnamed: 0,itemA,itemB,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
0,ยางนอก YK 205/55-16 AA01,ยางนอก YK 215/70-16 G055,1,0.028835,1,0.028835,1,0.028835,1.000000,1.000000,34.680000
3758,โช้ค R NISSAN,โช้ค R NISSAN,1,0.028835,1,0.028835,1,0.028835,1.000000,1.000000,34.680000
5415,ยางนอก KUMHO 195/65-15 KH17,ยางนอก KUMHO 265/40-22 KL12,1,0.028835,1,0.028835,1,0.028835,1.000000,1.000000,34.680000
5414,ยางนอก KUMHO 195/60-15 KH17,ยางนอก KUMHO 265/40-22 KL12,1,0.028835,1,0.028835,1,0.028835,1.000000,1.000000,34.680000
8138,ยางนอก GY 175 /65 -15 GT3,ยางนอก GY 185 /55 -16 EXCELLENT,1,0.028835,1,0.028835,1,0.028835,1.000000,1.000000,34.680000
...,...,...,...,...,...,...,...,...,...,...,...
12655,ค่าบริการปรับแต่งช่วงล่าง+ปรับน๊อตมุมโท,เทิร์นยางเก่า,3,0.086505,1251,36.072664,316,9.111880,0.002398,0.009494,0.000263
6426,ตั้งศูนย์ (เปลี่ยนยาง 4 เส้น ฟรี),จุ๊บเติมลม ฟรี,1,0.028835,337,9.717416,443,12.773933,0.002967,0.002257,0.000232
11428,ค่าบริการปรับแต่งช่วงล่าง+ปรับน๊อตมุมโท,เติมลมไนโตรเจน,1,0.028835,1251,36.072664,143,4.123414,0.000799,0.006993,0.000194
6447,จุ๊บเติมลม ฟรี,จุ๊บเติมลม ฟรี,1,0.028835,443,12.773933,443,12.773933,0.002257,0.002257,0.000177
