# 11.Apriori算法

In [3]:
from numpy import *

## 11.1.Ariori算法中的辅助函数

In [4]:
def loadDataSet():
    """产生简单的数据集
    :return: 简单的数据集
    """
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

def createC1(dataSet):
    """构建数据集的集合
    :param dataSet: 数据集
    :return: 不变集
    """
    #创建一个新列表
    C1 = []
    #遍历数据集中的所有项，得到项的集合
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    #排序
    C1.sort()
    #使用frozenset，得到不变集
    return list(map(frozenset, C1))

def scanD(D, Ck, minSupport):
    """计算支持度，得到满足要求的集合
    :param D: 数据集
    :param Ck: 集合Ck
    :param minSupport: 最小支持度
    :return retList: 返回的字典元素的列表
    :return supportData: 最频繁集的支持度
    """
    #新建以Ck项为关键词的计数字典
    ssCnt = {}
    #遍历数据和项的组合
    for tid in D:
        for can in Ck:
            #如果项是数据的子集
            if can.issubset(tid):
                #计数+1，如果以前没统计过就新建一个字典，从1开始
                if not ssCnt.__contains__(can): ssCnt[can]=1
                else: ssCnt[can] += 1
    #数据的个数
    numItems = float(len(D))
    #新建返回字典构成的列表
    retList = []
    #支持度字典
    supportData = {}
    #对于计数器中的每一项
    for key in ssCnt:
        #计算对应的支持度
        support = ssCnt[key]/numItems
        #如果支持度大于最小值支持度
        if support >= minSupport:
            #添加到输出列表中
            retList.insert(0,key)
        #记录频繁集的支持度
        supportData[key] = support
    return retList, supportData

测试

In [5]:
dataSet = loadDataSet()
print(f"数据集：\n{dataSet}")

C1 = createC1(dataSet)
print(f"第一个候选项集合C1：\n{C1}")

D = list(map(set, dataSet))
print(f"数据集的集合D：\n{D}")

L1, suppData0 = scanD(D, C1, 0.5)
print(f"结果：\n{L1}")

数据集：
[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
第一个候选项集合C1：
[frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]
数据集的集合D：
[{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]
结果：
[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]


## 11.2.组织完整的Apriori算法

In [6]:
def aprioriGen(Lk, k): 
    """产生Ck
    :param Lk: 频繁项的集合
    :param k: 项集元素的个数
    :return retList: 返回Ck的列表
    """
    #新建空的返回列表
    retList = []
    #Lk的项数
    lenLk = len(Lk)
    #遍历Lk中项的两两组合
    for i in range(lenLk):
        for j in range(i+1, lenLk): 
            #为了避免并集后出现重复，只需要合并前k-2个元素相同的集合
            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
            #排序，组合，不考虑元素间的顺序
            L1.sort(); L2.sort()
            #如果前k-2个元素相同
            if L1==L2: 
                #取并集
                retList.append(Lk[i] | Lk[j])
    return retList

In [7]:
def apriori(dataSet, minSupport = 0.5):
    """Apriori算法
    :param dataSet: 数据集
    :param minSupport: 最小支持度，默认0.5
    :return L: 满足条件的项集
    :return supportData: 最频繁项集的支持度
    """
    #建立C1集和D集
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    #验证C1的超集，留下满足最小支持度的
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    #当还有项时，不断取超集
    while (len(L[k-2]) > 0):
        Ck = aprioriGen(L[k-2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData

验证

In [8]:
L, suppData = apriori(dataSet)
print(f"最终结果为：\n{L}")

最终结果为：
[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})], [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})], [frozenset({2, 3, 5})], []]


检验一下，aprioriGen是否真的不会产生重复的项

In [9]:
aprioriGen(L[0], 2)

[frozenset({2, 5}),
 frozenset({3, 5}),
 frozenset({1, 5}),
 frozenset({2, 3}),
 frozenset({1, 2}),
 frozenset({1, 3})]

70%的支持度

In [10]:
L, _ = apriori(dataSet, minSupport=0.7)
print(f"70%支持度下的最终结果为：\n{L}")

70%支持度下的最终结果为：
[[frozenset({5}), frozenset({2}), frozenset({3})], [frozenset({2, 5})], []]


## 11.3.从频繁项集中挖掘关联规则

In [11]:
def generateRules(L, supportData, minConf=0.7):
    """生成规则
    :param L: 频繁项集合所构成的列表
    :param supportData: apriori函数得到的最频繁项集的支持度
    :param minConf: 最小置信度，默认0.7
    :return bigRuleList: 规则集合
    """
    #新建一个规则列表
    bigRuleList = []
    #对于含有两个以上项的集合
    for i in range(1, len(L)):
        #对于每一项
        for freqSet in L[i]:
            #H1为频繁项对应的不变集组成的列表
            H1 = [frozenset([item]) for item in freqSet]
            if (i > 1):
                #生成候选规则集
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                #计算置信度
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList         

In [12]:
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """计算置信度
    :param freqSet: 频繁集
    :param H: 频繁集对应的不变集列表
    :param supportData: 支持度
    :param brl: 规则列表
    :param minConf: 最小置信度，默认值0.7
    :return prunedH: 满足最小置信度的规则列表
    """
    #新建一个列表
    prunedH = []
    #对于频繁集中的每一项
    for conseq in H:
        #计算置信度
        conf = supportData[freqSet]/supportData[freqSet-conseq]
        # 如果置信度满足最小置信度要求
        if conf >= minConf: 
            #打印结果
            print(freqSet-conseq,'-->',conseq,'conf:',conf)
            #添加到规则
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH

In [13]:
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """最初项集中生成更多关联规则
    :param freqSet: 频繁集合
    :param H: 频繁集对应的不变集列表
    :param supportData: 支持度
    :param brl: 规则列表
    :param minConf: 最小置信度，默认值0.7
    """
    #频繁集大小m
    m = len(H[0])
    #判断频繁集是否可以移除子集
    if (len(freqSet) > (m + 1)): 
        #迭代地取得满足可信度要求的规则
        Hmp1 = aprioriGen(H, m+1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1) > 1):
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)

测试

In [14]:
L, suppData = apriori(dataSet, minSupport=0.5)
rules = generateRules(L, suppData, minConf=0.7)
rules

frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({1}) --> frozenset({3}) conf: 1.0


[(frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({1}), frozenset({3}), 1.0)]

降低可信度要求

In [15]:
rules = generateRules(L, suppData, minConf=0.5)
rules

frozenset({3}) --> frozenset({2}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({5}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({5}) conf: 0.6666666666666666
frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({3}) --> frozenset({1}) conf: 0.6666666666666666
frozenset({1}) --> frozenset({3}) conf: 1.0
frozenset({5}) --> frozenset({2, 3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({2, 5}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3, 5}) conf: 0.6666666666666666


[(frozenset({3}), frozenset({2}), 0.6666666666666666),
 (frozenset({2}), frozenset({3}), 0.6666666666666666),
 (frozenset({5}), frozenset({3}), 0.6666666666666666),
 (frozenset({3}), frozenset({5}), 0.6666666666666666),
 (frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({3}), frozenset({1}), 0.6666666666666666),
 (frozenset({1}), frozenset({3}), 1.0),
 (frozenset({5}), frozenset({2, 3}), 0.6666666666666666),
 (frozenset({3}), frozenset({2, 5}), 0.6666666666666666),
 (frozenset({2}), frozenset({3, 5}), 0.6666666666666666)]

## 11.4.发现毒蘑菇的相似特征

In [17]:
mushDatSet = [line.split() for line in open('mushroom.dat').readlines()]
L, suppData = apriori(mushDatSet, minSupport=0.3)

In [18]:
for item in L[1]:
    if item.intersection('2'): print(item)
for item in L[3]:
    if item.intersection('2'): print(item)

frozenset({'28', '2'})
frozenset({'2', '53'})
frozenset({'2', '23'})
frozenset({'34', '2'})
frozenset({'2', '36'})
frozenset({'2', '59'})
frozenset({'63', '2'})
frozenset({'67', '2'})
frozenset({'2', '76'})
frozenset({'85', '2'})
frozenset({'2', '86'})
frozenset({'2', '90'})
frozenset({'93', '2'})
frozenset({'39', '2'})
frozenset({'28', '2', '34', '59'})
frozenset({'28', '2', '34', '86'})
frozenset({'28', '2', '90', '34'})
frozenset({'28', '2', '34', '53'})
frozenset({'28', '63', '2', '34'})
frozenset({'28', '63', '2', '59'})
frozenset({'28', '63', '2', '85'})
frozenset({'28', '63', '2', '86'})
frozenset({'28', '85', '2', '34'})
frozenset({'28', '85', '2', '59'})
frozenset({'28', '85', '2', '86'})
frozenset({'28', '85', '2', '90'})
frozenset({'28', '85', '2', '53'})
frozenset({'28', '2', '86', '59'})
frozenset({'28', '2', '90', '59'})
frozenset({'28', '2', '90', '86'})
frozenset({'28', '2', '86', '53'})
frozenset({'28', '2', '90', '53'})
frozenset({'39', '28', '2', '53'})
frozenset({'3