# **Apriori로 연관 분석 구현 방법**

In [1]:
import apriori

In [2]:
# The `imp` module is deprecated and was removed in Python 3.12;
# importlib.reload is the supported way to re-import an edited module.
from importlib import reload
reload(apriori)

<module 'apriori' from 'C:\\MLPython3\\Apriori\\apriori.py'>

In [3]:
# Load the sample transactions shipped with the apriori module and echo them
# (a list of item lists, as the output below shows).
dataSet = apriori.loadDataSet()
dataSet

[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [4]:
# Build the size-1 candidate itemsets: one frozenset per distinct item
# found in dataSet (see the echoed result below).
C1 = apriori.createC1(dataSet)
C1

[frozenset({1}),
 frozenset({2}),
 frozenset({3}),
 frozenset({4}),
 frozenset({5})]

In [5]:
# Represent every transaction as a set; D is the input handed to scanD below.
D = [set(transaction) for transaction in dataSet]

In [6]:
# Scan the transactions D against the candidates C1 with a threshold of 0.5.
# NOTE(review): presumably returns the candidates meeting the minimum support
# plus a support table for all candidates — confirm against apriori.scanD.
L1, suppData0 = apriori.scanD(D, C1, 0.5)

In [7]:
# Run the full Apriori search with the module's default minimum support.
# L is a list of frequent-itemset lists indexed by itemset size - 1;
# suppData maps each evaluated itemset to its support (both echoed below).
L, suppData = apriori.apriori(dataSet)

In [8]:
L

[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})],
 [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})],
 [frozenset({2, 3, 5})],
 []]

L은 빈발 아이템 집합에 대한 목록을 포함하고 있으며, 최소 지지도는 0.5이다. 이를 확인해보자.

In [9]:
L[0]

[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]

In [10]:
L[1]

[frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})]

In [11]:
L[2]

[frozenset({2, 3, 5})]

In [12]:
L[3]

[]

In [13]:
suppData

{frozenset({1}): 0.5,
 frozenset({3}): 0.75,
 frozenset({4}): 0.25,
 frozenset({2}): 0.75,
 frozenset({5}): 0.75,
 frozenset({1, 3}): 0.5,
 frozenset({2, 5}): 0.75,
 frozenset({3, 5}): 0.5,
 frozenset({2, 3}): 0.5,
 frozenset({1, 5}): 0.25,
 frozenset({1, 2}): 0.25,
 frozenset({2, 3, 5}): 0.5}

이들 각각의 아이템 집합은 aprioriGen()과 함께 apriori()에서 생성된다.

In [14]:
# Generate the size-2 candidate itemsets from the frequent single items in L[0].
apriori.aprioriGen(L[0], 2)

[frozenset({2, 5}),
 frozenset({3, 5}),
 frozenset({1, 5}),
 frozenset({2, 3}),
 frozenset({1, 2}),
 frozenset({1, 3})]

여기에 있는 6개의 아이템 집합은 후보 아이템 집합인 Ck이다. 이 중 4개는 L[1]에 있고, 나머지 2개({1, 5}와 {1, 2})는 지지도가 0.25로 최소 지지도 0.5에 미치지 못해 scanD()에 의해 걸러지게 된다.
지지도를 70%로 변경하여 시도해 보자.

In [15]:
# Re-run Apriori with a stricter minimum support of 0.7 and echo the
# resulting frequent itemsets.
L, suppData = apriori.apriori(dataSet, minSupport=0.7)
L

[[frozenset({5}), frozenset({2}), frozenset({3})], [frozenset({2, 5})], []]

## 빈발 아이템 집합으로 연관 규칙 마이닝하기

지지도를 0.5로 하는 빈발 아이템 집합을 생성하자.

In [16]:
# Rebuild the frequent itemsets at minimum support 0.5, then mine association
# rules whose confidence is at least 0.7. generateRules prints each rule it
# keeps (see the output below) and returns them as a list.
L, suppData = apriori.apriori(dataSet, minSupport=0.5)
rules = apriori.generateRules(L, suppData, minConf=0.7)

frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({1}) --> frozenset({3}) conf: 1.0


In [17]:
rules

[(frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({1}), frozenset({3}), 1.0)]

여기서는 {1}-->{3}, {5}-->{2}, {2}-->{5}의 3개의 규칙이 생성되었다. 흥미로운 것은 2와 5를 가지는 규칙은 서로 뒤바뀔 수 있는 반면, 1과 3을 가지는 규칙은 그렇지 않다는 것이다({3}-->{1}의 신뢰도는 약 0.67로 최소 신뢰도 0.7에 미치지 못한다).
신뢰도의 임계 값을 더 낮게 설정하고 어떤 규칙이 생성되는지 확인해 보자.

In [18]:
# Lower the confidence threshold to 0.5 to admit more rules
# (same frequent itemsets L and support table suppData as above).
rules = apriori.generateRules(L, suppData, minConf=0.5)

frozenset({3}) --> frozenset({2}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({5}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({5}) conf: 0.6666666666666666
frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({3}) --> frozenset({1}) conf: 0.6666666666666666
frozenset({1}) --> frozenset({3}) conf: 1.0
frozenset({5}) --> frozenset({2, 3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({2, 5}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3, 5}) conf: 0.6666666666666666


In [19]:
rules

[(frozenset({3}), frozenset({2}), 0.6666666666666666),
 (frozenset({2}), frozenset({3}), 0.6666666666666666),
 (frozenset({5}), frozenset({3}), 0.6666666666666666),
 (frozenset({3}), frozenset({5}), 0.6666666666666666),
 (frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({3}), frozenset({1}), 0.6666666666666666),
 (frozenset({1}), frozenset({3}), 1.0),
 (frozenset({5}), frozenset({2, 3}), 0.6666666666666666),
 (frozenset({3}), frozenset({2, 5}), 0.6666666666666666),
 (frozenset({2}), frozenset({3, 5}), 0.6666666666666666)]

신뢰도를 더 낮게 설정하여 더 많은 규칙(11개)을 얻게 되었다. 

## 예제: 장바구니 항목의 연관 분석

In [20]:
import apriori as ap

In [21]:
# Read one comma-separated transaction per line. The context manager closes
# the file handle as soon as the data has been loaded (the original left it
# open), and iterating the file avoids materializing readlines() first.
with open('groceries.csv') as fr:
    dataSet = [inst.strip().split(',') for inst in fr]


In [22]:
dataSet

[['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'],
 ['tropical fruit', 'yogurt', 'coffee'],
 ['whole milk'],
 ['pip fruit', 'yogurt', 'cream cheese', 'meat spreads'],
 ['other vegetables',
  'whole milk',
  'condensed milk',
  'long life bakery product'],
 ['whole milk', 'butter', 'yogurt', 'rice', 'abrasive cleaner'],
 ['rolls/buns'],
 ['other vegetables',
  'UHT-milk',
  'rolls/buns',
  'bottled beer',
  'liquor (appetizer)'],
 ['potted plants'],
 ['whole milk', 'cereals'],
 ['tropical fruit',
  'other vegetables',
  'white bread',
  'bottled water',
  'chocolate'],
 ['citrus fruit',
  'tropical fruit',
  'whole milk',
  'butter',
  'curd',
  'yogurt',
  'flour',
  'bottled water',
  'dishes'],
 ['beef'],
 ['frankfurter', 'rolls/buns', 'soda'],
 ['chicken', 'tropical fruit'],
 ['butter', 'sugar', 'fruit/vegetable juice', 'newspapers'],
 ['fruit/vegetable juice'],
 ['packaged fruit/vegetables'],
 ['chocolate'],
 ['specialty bar'],
 ['other vegetables'],
 ['butter mi

In [23]:
# Build the size-1 candidate itemsets for the grocery transactions.
C1 = ap.createC1(dataSet)

In [24]:
C1

[frozenset({'Instant food products'}),
 frozenset({'UHT-milk'}),
 frozenset({'abrasive cleaner'}),
 frozenset({'artif. sweetener'}),
 frozenset({'baby cosmetics'}),
 frozenset({'baby food'}),
 frozenset({'bags'}),
 frozenset({'baking powder'}),
 frozenset({'bathroom cleaner'}),
 frozenset({'beef'}),
 frozenset({'berries'}),
 frozenset({'beverages'}),
 frozenset({'bottled beer'}),
 frozenset({'bottled water'}),
 frozenset({'brandy'}),
 frozenset({'brown bread'}),
 frozenset({'butter'}),
 frozenset({'butter milk'}),
 frozenset({'cake bar'}),
 frozenset({'candles'}),
 frozenset({'candy'}),
 frozenset({'canned beer'}),
 frozenset({'canned fish'}),
 frozenset({'canned fruit'}),
 frozenset({'canned vegetables'}),
 frozenset({'cat food'}),
 frozenset({'cereals'}),
 frozenset({'chewing gum'}),
 frozenset({'chicken'}),
 frozenset({'chocolate'}),
 frozenset({'chocolate marshmallow'}),
 frozenset({'citrus fruit'}),
 frozenset({'cleaner'}),
 frozenset({'cling film/bags'}),
 frozenset({'cocoa drink

In [25]:
# Represent every grocery transaction as a set; D is passed to scanD below.
D = [set(transaction) for transaction in dataSet]

In [26]:
D

[{'citrus fruit', 'margarine', 'ready soups', 'semi-finished bread'},
 {'coffee', 'tropical fruit', 'yogurt'},
 {'whole milk'},
 {'cream cheese', 'meat spreads', 'pip fruit', 'yogurt'},
 {'condensed milk',
  'long life bakery product',
  'other vegetables',
  'whole milk'},
 {'abrasive cleaner', 'butter', 'rice', 'whole milk', 'yogurt'},
 {'rolls/buns'},
 {'UHT-milk',
  'bottled beer',
  'liquor (appetizer)',
  'other vegetables',
  'rolls/buns'},
 {'potted plants'},
 {'cereals', 'whole milk'},
 {'bottled water',
  'chocolate',
  'other vegetables',
  'tropical fruit',
  'white bread'},
 {'bottled water',
  'butter',
  'citrus fruit',
  'curd',
  'dishes',
  'flour',
  'tropical fruit',
  'whole milk',
  'yogurt'},
 {'beef'},
 {'frankfurter', 'rolls/buns', 'soda'},
 {'chicken', 'tropical fruit'},
 {'butter', 'fruit/vegetable juice', 'newspapers', 'sugar'},
 {'fruit/vegetable juice'},
 {'packaged fruit/vegetables'},
 {'chocolate'},
 {'specialty bar'},
 {'other vegetables'},
 {'butter mi

In [27]:
# Scan the grocery transactions against C1 with a threshold of 0.08.
# NOTE(review): presumably keeps only single items meeting that minimum
# support — confirm against apriori.scanD.
L1, suppData0 = ap.scanD(D, C1, 0.08)

In [28]:
L1

[frozenset({'shopping bags'}),
 frozenset({'sausage'}),
 frozenset({'root vegetables'}),
 frozenset({'pastry'}),
 frozenset({'soda'}),
 frozenset({'bottled water'}),
 frozenset({'bottled beer'}),
 frozenset({'rolls/buns'}),
 frozenset({'other vegetables'}),
 frozenset({'whole milk'}),
 frozenset({'yogurt'}),
 frozenset({'tropical fruit'}),
 frozenset({'citrus fruit'})]

In [29]:
# Run the full Apriori search on the grocery data with a low minimum
# support of 0.02.
L, suppData = ap.apriori(dataSet, minSupport = 0.02)

In [30]:
L

[[frozenset({'meat'}),
  frozenset({'sliced cheese'}),
  frozenset({'onions'}),
  frozenset({'frozen meals'}),
  frozenset({'specialty chocolate'}),
  frozenset({'frozen vegetables'}),
  frozenset({'ice cream'}),
  frozenset({'oil'}),
  frozenset({'chewing gum'}),
  frozenset({'ham'}),
  frozenset({'cat food'}),
  frozenset({'hard cheese'}),
  frozenset({'misc. beverages'}),
  frozenset({'domestic eggs'}),
  frozenset({'dessert'}),
  frozenset({'grapes'}),
  frozenset({'whipped/sour cream'}),
  frozenset({'pork'}),
  frozenset({'berries'}),
  frozenset({'napkins'}),
  frozenset({'hygiene articles'}),
  frozenset({'hamburger meat'}),
  frozenset({'beverages'}),
  frozenset({'shopping bags'}),
  frozenset({'brown bread'}),
  frozenset({'sausage'}),
  frozenset({'canned beer'}),
  frozenset({'waffles'}),
  frozenset({'salty snack'}),
  frozenset({'root vegetables'}),
  frozenset({'candy'}),
  frozenset({'pastry'}),
  frozenset({'butter milk'}),
  frozenset({'specialty bar'}),
  frozenset(

In [31]:
L[0]

[frozenset({'meat'}),
 frozenset({'sliced cheese'}),
 frozenset({'onions'}),
 frozenset({'frozen meals'}),
 frozenset({'specialty chocolate'}),
 frozenset({'frozen vegetables'}),
 frozenset({'ice cream'}),
 frozenset({'oil'}),
 frozenset({'chewing gum'}),
 frozenset({'ham'}),
 frozenset({'cat food'}),
 frozenset({'hard cheese'}),
 frozenset({'misc. beverages'}),
 frozenset({'domestic eggs'}),
 frozenset({'dessert'}),
 frozenset({'grapes'}),
 frozenset({'whipped/sour cream'}),
 frozenset({'pork'}),
 frozenset({'berries'}),
 frozenset({'napkins'}),
 frozenset({'hygiene articles'}),
 frozenset({'hamburger meat'}),
 frozenset({'beverages'}),
 frozenset({'shopping bags'}),
 frozenset({'brown bread'}),
 frozenset({'sausage'}),
 frozenset({'canned beer'}),
 frozenset({'waffles'}),
 frozenset({'salty snack'}),
 frozenset({'root vegetables'}),
 frozenset({'candy'}),
 frozenset({'pastry'}),
 frozenset({'butter milk'}),
 frozenset({'specialty bar'}),
 frozenset({'sugar'}),
 frozenset({'newspapers

In [32]:
L[1]

[frozenset({'whipped/sour cream', 'yogurt'}),
 frozenset({'other vegetables', 'yogurt'}),
 frozenset({'other vegetables', 'pip fruit'}),
 frozenset({'other vegetables', 'pastry'}),
 frozenset({'other vegetables', 'shopping bags'}),
 frozenset({'other vegetables', 'sausage'}),
 frozenset({'bottled beer', 'whole milk'}),
 frozenset({'shopping bags', 'whole milk'}),
 frozenset({'citrus fruit', 'other vegetables'}),
 frozenset({'fruit/vegetable juice', 'whole milk'}),
 frozenset({'frankfurter', 'whole milk'}),
 frozenset({'newspapers', 'whole milk'}),
 frozenset({'margarine', 'whole milk'}),
 frozenset({'pip fruit', 'tropical fruit'}),
 frozenset({'pip fruit', 'whole milk'}),
 frozenset({'rolls/buns', 'whole milk'}),
 frozenset({'beef', 'whole milk'}),
 frozenset({'sausage', 'whole milk'}),
 frozenset({'frozen vegetables', 'whole milk'}),
 frozenset({'pastry', 'rolls/buns'}),
 frozenset({'fruit/vegetable juice', 'other vegetables'}),
 frozenset({'domestic eggs', 'other vegetables'}),
 froz

In [33]:
L[2]

[frozenset({'other vegetables', 'whole milk', 'yogurt'}),
 frozenset({'other vegetables', 'root vegetables', 'whole milk'})]

In [34]:
L[3]

[]

In [35]:
# Generate the size-2 candidate itemsets from the frequent single items in L[0].
ap.aprioriGen(L[0], 2)

[frozenset({'meat', 'sliced cheese'}),
 frozenset({'meat', 'onions'}),
 frozenset({'frozen meals', 'meat'}),
 frozenset({'meat', 'specialty chocolate'}),
 frozenset({'frozen vegetables', 'meat'}),
 frozenset({'ice cream', 'meat'}),
 frozenset({'meat', 'oil'}),
 frozenset({'chewing gum', 'meat'}),
 frozenset({'ham', 'meat'}),
 frozenset({'cat food', 'meat'}),
 frozenset({'hard cheese', 'meat'}),
 frozenset({'meat', 'misc. beverages'}),
 frozenset({'domestic eggs', 'meat'}),
 frozenset({'dessert', 'meat'}),
 frozenset({'grapes', 'meat'}),
 frozenset({'meat', 'whipped/sour cream'}),
 frozenset({'meat', 'pork'}),
 frozenset({'berries', 'meat'}),
 frozenset({'meat', 'napkins'}),
 frozenset({'hygiene articles', 'meat'}),
 frozenset({'hamburger meat', 'meat'}),
 frozenset({'beverages', 'meat'}),
 frozenset({'meat', 'shopping bags'}),
 frozenset({'brown bread', 'meat'}),
 frozenset({'meat', 'sausage'}),
 frozenset({'canned beer', 'meat'}),
 frozenset({'meat', 'waffles'}),
 frozenset({'meat', '

In [36]:
# Mine association rules from the grocery itemsets with a minimum
# confidence of 0.1; each kept rule is printed as it is found.
rules = ap.generateRules(L, suppData, minConf=0.1)

frozenset({'yogurt'}) --> frozenset({'whipped/sour cream'}) conf: 0.14868804664723032
frozenset({'whipped/sour cream'}) --> frozenset({'yogurt'}) conf: 0.2893617021276596
frozenset({'other vegetables'}) --> frozenset({'yogurt'}) conf: 0.22438255386232264
frozenset({'yogurt'}) --> frozenset({'other vegetables'}) conf: 0.3112244897959184
frozenset({'pip fruit'}) --> frozenset({'other vegetables'}) conf: 0.3454301075268817
frozenset({'other vegetables'}) --> frozenset({'pip fruit'}) conf: 0.1350499211770888
frozenset({'pastry'}) --> frozenset({'other vegetables'}) conf: 0.2537142857142857
frozenset({'other vegetables'}) --> frozenset({'pastry'}) conf: 0.11665790856542302
frozenset({'other vegetables'}) --> frozenset({'shopping bags'}) conf: 0.11981082501313715
frozenset({'shopping bags'}) --> frozenset({'other vegetables'}) conf: 0.23529411764705885
frozenset({'other vegetables'}) --> frozenset({'sausage'}) conf: 0.13925380977404098
frozenset({'sausage'}) --> frozenset({'other vegetables'

In [37]:
rules

[(frozenset({'yogurt'}),
  frozenset({'whipped/sour cream'}),
  0.14868804664723032),
 (frozenset({'whipped/sour cream'}),
  frozenset({'yogurt'}),
  0.2893617021276596),
 (frozenset({'other vegetables'}), frozenset({'yogurt'}), 0.22438255386232264),
 (frozenset({'yogurt'}), frozenset({'other vegetables'}), 0.3112244897959184),
 (frozenset({'pip fruit'}),
  frozenset({'other vegetables'}),
  0.3454301075268817),
 (frozenset({'other vegetables'}),
  frozenset({'pip fruit'}),
  0.1350499211770888),
 (frozenset({'pastry'}), frozenset({'other vegetables'}), 0.2537142857142857),
 (frozenset({'other vegetables'}), frozenset({'pastry'}), 0.11665790856542302),
 (frozenset({'other vegetables'}),
  frozenset({'shopping bags'}),
  0.11981082501313715),
 (frozenset({'shopping bags'}),
  frozenset({'other vegetables'}),
  0.23529411764705885),
 (frozenset({'other vegetables'}),
  frozenset({'sausage'}),
  0.13925380977404098),
 (frozenset({'sausage'}), frozenset({'other vegetables'}), 0.28679653679

## **예제: 독버섯에서 유사한 속성 찾기**

apriori 알고리즘을 이용해 독버섯의 공통적인 속성을 찾아보자.

In [38]:
# Each line of mushroom.dat is one whitespace-separated transaction.
# Use a context manager so the file handle is closed after loading
# (the original open(...).readlines() never closed it).
with open('mushroom.dat') as mushFile:
    mushDatSet = [line.split() for line in mushFile]

이제 이 데이터 집합으로 어프라이어리 알고리즘을 수행해 보자.

In [39]:
# Run Apriori on the mushroom transactions with a minimum support of 0.3.
L, suppData = apriori.apriori(mushDatSet, minSupport=0.3)

독성을 나타내는 속성 값 2를 포함하는 빈발 아이템 집합을 검색해 보자.

In [40]:
# Print every frequent 2-itemset that contains feature '2'.
# A membership test is used instead of item.intersection('2'): intersection
# iterates the string character by character, so it is only correct for a
# one-character feature id.
for item in L[1]:
    if '2' in item:
        print(item)

frozenset({'2', '28'})
frozenset({'2', '53'})
frozenset({'2', '23'})
frozenset({'2', '34'})
frozenset({'2', '36'})
frozenset({'2', '59'})
frozenset({'2', '63'})
frozenset({'2', '67'})
frozenset({'2', '76'})
frozenset({'2', '85'})
frozenset({'2', '86'})
frozenset({'2', '90'})
frozenset({'2', '93'})
frozenset({'39', '2'})


또한, 더 큰 아이템 집합에 대해서도 같은 검색을 반복할 수 있다.

In [41]:
# Print every frequent 4-itemset that contains feature '2'.
# A membership test is used instead of item.intersection('2'): intersection
# iterates the string character by character, so it is only correct for a
# one-character feature id.
for item in L[3]:
    if '2' in item:
        print(item)

frozenset({'63', '2', '34', '28'})
frozenset({'85', '2', '34', '28'})
frozenset({'86', '2', '34', '28'})
frozenset({'90', '2', '34', '28'})
frozenset({'2', '59', '34', '28'})
frozenset({'63', '2', '59', '28'})
frozenset({'85', '2', '59', '28'})
frozenset({'86', '2', '59', '28'})
frozenset({'90', '2', '59', '28'})
frozenset({'63', '85', '2', '28'})
frozenset({'86', '63', '2', '28'})
frozenset({'86', '85', '2', '28'})
frozenset({'86', '90', '2', '28'})
frozenset({'90', '85', '2', '28'})
frozenset({'2', '39', '59', '28'})
frozenset({'39', '2', '34', '28'})
frozenset({'63', '39', '2', '28'})
frozenset({'85', '39', '2', '28'})
frozenset({'86', '39', '2', '28'})
frozenset({'90', '39', '2', '28'})
frozenset({'53', '85', '2', '34'})
frozenset({'86', '53', '2', '34'})
frozenset({'53', '90', '2', '34'})
frozenset({'53', '2', '34', '28'})
frozenset({'53', '85', '2', '28'})
frozenset({'86', '53', '85', '2'})
frozenset({'86', '53', '90', '2'})
frozenset({'86', '53', '2', '28'})
frozenset({'53', '90

이제 독성이 있는 버섯들의 속성이 어떤 것들이 있는지 살펴보고 이 속성에 따라 독성이 있음을 알아낼 수 있다.