# Setting

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth

# 데이터 불러오기 및 구조 확인

Transaction : Transaction id which is unique for each order   
Item : List of items to be ordered/placed by customer  

In [2]:
bread_df = pd.read_csv('bread basket.csv')
print(bread_df.head(4))

   Transaction           Item         date_time period_day weekday_weekend
0            1          Bread  30-10-2016 09:58    morning         weekend
1            2   Scandinavian  30-10-2016 10:05    morning         weekend
2            2   Scandinavian  30-10-2016 10:05    morning         weekend
3            3  Hot chocolate  30-10-2016 10:07    morning         weekend


In [3]:
print(bread_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20507 entries, 0 to 20506
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Transaction      20507 non-null  int64 
 1   Item             20507 non-null  object
 2   date_time        20507 non-null  object
 3   period_day       20507 non-null  object
 4   weekday_weekend  20507 non-null  object
dtypes: int64(1), object(4)
memory usage: 801.2+ KB
None


In [4]:
# 장바구니 데이터 고유항목 구분 출력
items = set()   # 중복을 허용하지 않는 set 구조 활용
for col in bread_df:
    items.update(bread_df['Item'].unique())

print(f'{len(items)}개의 항목이 존재합니다.')
print(items)

94개의 항목이 존재합니다.
{'Chimichurri Oil', 'Chicken Stew', 'Fairy Doors', 'Salad', 'Jammie Dodgers', 'Drinking chocolate spoons ', 'Lemon and coconut', 'My-5 Fruit Shoot', 'Focaccia', 'Baguette', 'Bread', 'Toast', 'Vegan Feast', 'Juice', 'Argentina Night', 'Chicken sand', 'Pick and Mix Bowls', "Valentine's card", 'Chocolates', 'Tea', 'Eggs', 'Adjustment', 'Pastry', 'Hot chocolate', 'Scone', 'Panatone', 'Brownie', 'Truffles', 'Gift voucher', 'Granola', 'Brioche and salami', 'Nomad bag', 'Tshirt', 'Dulce de Leche', 'Raspberry shortbread sandwich', 'Olum & polenta', 'Muffin', 'Basket', 'Sandwich', 'Mighty Protein', 'Bakewell', 'Coke', 'Victorian Sponge', 'Vegan mincepie', 'Afternoon with the baker', 'Coffee granules ', 'Alfajores', 'Empanadas', 'Cherry me Dried fruit', 'Scandinavian', 'Smoothies', 'Jam', "Ella's Kitchen Pouches", 'Siblings', 'Tiffin', 'Bacon', 'Honey', 'Mineral water', 'Polenta', 'Bread Pudding', 'Hearty & Seasonal', 'Extra Salami or Feta', 'Muesli', 'Caramel bites', 'Crisps', '

# 데이터 전처리

In [5]:
group_trans = bread_df.groupby(['Transaction', 'Item'])['Item'].count().reset_index(name ='Count')
print(group_trans)

       Transaction           Item  Count
0                1          Bread      1
1                2   Scandinavian      2
2                3        Cookies      1
3                3  Hot chocolate      1
4                3            Jam      1
...            ...            ...    ...
18882         9682   Tacos/Fajita      1
18883         9682            Tea      1
18884         9683         Coffee      1
18885         9683         Pastry      1
18886         9684      Smoothies      1

[18887 rows x 3 columns]


In [6]:
basket = group_trans.pivot_table(index='Transaction', columns='Item', values='Count', aggfunc='sum').fillna(0)
#print(basket.head())
basket.head()

# basket.loc[2, 'Scandinavian']은 현재 2

Item,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# 0, 1 로 변환해주는 함수
def zero_or_one(x):
    if x<=0:
        return 0
    if x>=1:    # 한 구매에 같은 상품을 2번 이상 구매해도 1로 입력되도록 변경
        return 1

basket = basket.map(zero_or_one)
print(basket.loc[2, 'Scandinavian'])   # 2였던 값이 1로 변했음을 확인함

1


# Apriori

In [17]:
frequent_itemsets = apriori(basket, min_support = 0.035, use_colnames = True)
print(frequent_itemsets.sort_values('support', ascending = False))

     support             itemsets
4   0.478394             (Coffee)
1   0.327205              (Bread)
13  0.142631                (Tea)
3   0.103856               (Cake)
14  0.090016      (Bread, Coffee)
11  0.086107             (Pastry)
12  0.071844           (Sandwich)
9   0.061807          (Medialuna)
7   0.058320      (Hot chocolate)
15  0.054728       (Coffee, Cake)
5   0.054411            (Cookies)
19  0.049868        (Coffee, Tea)
17  0.047544     (Pastry, Coffee)
2   0.040042            (Brownie)
6   0.039197         (Farm House)
8   0.038563              (Juice)
10  0.038457             (Muffin)
18  0.038246   (Coffee, Sandwich)
0   0.036344          (Alfajores)
16  0.035182  (Medialuna, Coffee)




In [18]:
rules = association_rules(frequent_itemsets, metric = "lift", min_threshold = 1)  # 양의 상관관계
rules.sort_values('confidence', ascending = False, inplace = True)   # confidence 순으로 정렬
# print(rules)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(Medialuna),(Coffee),0.061807,0.478394,0.035182,0.569231,1.189878,0.005614,1.210871,0.170091
4,(Pastry),(Coffee),0.086107,0.478394,0.047544,0.552147,1.154168,0.006351,1.164682,0.146161
7,(Sandwich),(Coffee),0.071844,0.478394,0.038246,0.532353,1.112792,0.003877,1.115384,0.109205
1,(Cake),(Coffee),0.103856,0.478394,0.054728,0.526958,1.101515,0.005044,1.102664,0.10284
0,(Coffee),(Cake),0.478394,0.103856,0.054728,0.114399,1.101515,0.005044,1.011905,0.176684
5,(Coffee),(Pastry),0.478394,0.086107,0.047544,0.099382,1.154168,0.006351,1.01474,0.256084
6,(Coffee),(Sandwich),0.478394,0.071844,0.038246,0.079947,1.112792,0.003877,1.008807,0.194321
3,(Coffee),(Medialuna),0.478394,0.061807,0.035182,0.073542,1.189878,0.005614,1.012667,0.305936


Lift 값이 1 이상으로 양의 상관관계가 있다고 보이는 조합을 추출  
신뢰도 기준으로 정렬하였을 때 결과는  
1. Medialuna -> Coffee
2. Pastry -> Coffee
3. Sandwich -> Coffee
4. Cake -> Coffee

### 커피 다음으로 많이 팔린 것이 빵이지만, 빵은 연관 규칙이 나타나지 않았음

In [19]:
# 최소 지지도를 기반으로 음의 상관관계 확인
rules_support = association_rules(frequent_itemsets, metric = "support", min_threshold = 0.035)
negative_correlation_rules = rules_support[rules_support['lift'] < 1]   # 음의 상관관계
print(negative_correlation_rules)

   antecedents consequents  antecedent support  consequent support   support  \
0      (Bread)    (Coffee)            0.327205            0.478394  0.090016   
1     (Coffee)     (Bread)            0.478394            0.327205  0.090016   
10    (Coffee)       (Tea)            0.478394            0.142631  0.049868   
11       (Tea)    (Coffee)            0.142631            0.478394  0.049868   

    confidence      lift  leverage  conviction  zhangs_metric  
0     0.275105  0.575059 -0.066517    0.719561      -0.523431  
1     0.188163  0.575059 -0.066517    0.828731      -0.586210  
10    0.104240  0.730840 -0.018366    0.957142      -0.413856  
11    0.349630  0.730840 -0.018366    0.802014      -0.300482  


### support와 confidence로 보아 강하진 않지만, 빵과 커피, 차와 커피는 음의 상관관계를 보이고 있음  
빵과 커피가 음의 상관관계를 가졌음은 의외의 결과임  
커피를 마시는 고객과 차를 마시는 고객이 나뉘므로 당연한 것으로 판단됨

In [12]:
print(negative_correlation_rules['antecedents'][0])

frozenset({'Bread'})


In [21]:
# 빵에 대한 데이터를 추가 확인
frequent_itemsets = apriori(basket, min_support = 0.002, use_colnames = True)
rules_support = association_rules(frequent_itemsets, metric = "support", min_threshold = 0.001)
rules_bread = rules_support[rules_support['antecedents'] == frozenset({'Bread'})]
rules_bread.sort_values('lift', ascending = False, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rules_bread.sort_values('lift', ascending = False, inplace = True)


In [22]:
rules_bread.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
54,(Bread),(Jammie Dodgers),0.327205,0.013207,0.004649,0.014207,1.075777,0.000327,1.001015,0.104696
43,(Bread),(Focaccia),0.327205,0.005705,0.002007,0.006135,1.075324,0.000141,1.000432,0.104114
66,(Bread),(Pastry),0.327205,0.086107,0.02916,0.089119,1.034977,0.000985,1.003306,0.050231
52,(Bread),(Jam),0.327205,0.015003,0.005071,0.015499,1.033076,0.000162,1.000504,0.047588
58,(Bread),(Keeping It Local),0.327205,0.006656,0.002219,0.006781,1.018728,4.1e-05,1.000126,0.027324


빵과 양의 상관관계를 나타내는 것은 비슷한 빵류로 보임. 이외의 항목들은 모두 음의 상관관계를 나타냄  
빵을 사서 다른 디저트를 사지 않는 경우, 빵과 함께 Pastry와 Jam등의 디저트를 구매한 경우의 2가지 형태가 모두 나타나는 것으로 보임  

In [23]:
rules_bread.tail(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
419,(Bread),"(Coffee, Tea)",0.327205,0.049868,0.007396,0.022603,0.453248,-0.008921,0.972104,-0.641958
371,(Bread),"(Juice, Coffee)",0.327205,0.020602,0.002958,0.009041,0.438837,-0.003783,0.988333,-0.655251
407,(Bread),"(Coffee, Soup)",0.327205,0.015848,0.002219,0.006781,0.427866,-0.002967,0.990871,-0.665272
383,(Bread),"(Coffee, Muffin)",0.327205,0.018806,0.00243,0.007427,0.3949,-0.003723,0.988535,-0.694889
40,(Bread),(Farm House),0.327205,0.039197,0.004966,0.015176,0.387171,-0.00786,0.975609,-0.701726


빵과 음의 상관관계를 나타내는 항목은 Coffee임을 재차 확인함

### 일반적으로 생각했을 때 빵과 커피의 음의 상관관계는 이해하기 어려우며, 해당 데이터가 온라인 주문 내역이 포함되어 있어, 온라인 데이터가 섞이면서 이런 결과가 나타났다고 생각됨. 온/오프라인 판매 구분이 없어 해당사항을 확인할 수는 없었음

#### FP Growth로 수행했을 때도 유사한 결과가 나타나 해당 과정은 생략함

---