# Frequent Pattern Mining

Dataset:
Bank-marketing (https://www.kaggle.com/datasets/janiobachmann/bank-marketing-dataset)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import scipy.stats as stats

# 即时生成图片
%matplotlib inline

# 隠藏Wranning 
import warnings
#warnings.filterwarnings("ignore")

# 1. 导入数据集


In [2]:
# import bank.csv
dataset = pd.read_csv("data/bank.csv")
#print(f'Dataset size: {dataset.shape}')
#print(f'{"Column Name":<16}Data Type')
dataset.info()

#dataset = dataset.drop('Id',axis=1)
#print(f'After Delete, Dataset size: {dataset.shape}')



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


# 2. 数据预处理

此次数据集没有缺失值, 但有一项栏位 `duration` 表示营销联系时的持续时间, 然而在执行呼叫之前持续时间是未知的, 因此将其丢弃。

In [3]:

# 持续时间：最后一次联系持续时间，以秒为单位（数字）
dataset = dataset.drop('duration',axis=1)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  campaign   11162 non-null  int64 
 12  pdays      11162 non-null  int64 
 13  previous   11162 non-null  int64 
 14  poutcome   11162 non-null  object
 15  deposit    11162 non-null  object
dtypes: int64(6), object(10)
memory usage: 1.4+ MB


# 3. Apriori 算法
观察数据集


In [4]:
dataset.describe() 

Unnamed: 0,age,balance,day,campaign,pdays,previous
count,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0
mean,41.231948,1528.538524,15.658036,2.508421,51.330407,0.832557
std,11.913369,3225.413326,8.42074,2.722077,108.758282,2.292007
min,18.0,-6847.0,1.0,1.0,-1.0,0.0
25%,32.0,122.0,8.0,1.0,-1.0,0.0
50%,39.0,550.0,15.0,2.0,-1.0,0.0
75%,49.0,1708.0,22.0,3.0,20.75,1.0
max,95.0,81204.0,31.0,63.0,854.0,58.0


# 算法实现

In [5]:
from itertools import combinations, chain

# 数据集转換成patters
def convertPatterns(dataset, k):
    patterns = set()
    # 遍历数据集
    for data in dataset:
        for itemset in combinations(data, k):
            patterns.add(itemset)
    return patterns

# 计算支持度
def calcSupport(pattern, dataset):
    count = 0
    for data in dataset:
        if set(pattern).issubset(set(data)):
            count += 1
    return count

# 生成超集
def powerset(iterable):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))


def selfApriori(dataset, min_support):
    frequent_itemsets = []
    k = 1

    while True:
        # 生成候选 patterns 集
        patterns = convertPatterns(dataset, k)
        
        if not patterns:
            break

        frequent_itemsets_k = []
        for pattern in patterns:
            # 计算 pattern 的支持度
            support = calcSupport(pattern, dataset)
            if support >= min_support:
                frequent_itemsets_k.append(pattern)

        frequent_itemsets.extend(frequent_itemsets_k)
        k += 1

    return frequent_itemsets

# 测试代码
aprioriDataset = dataset.select_dtypes(object).head(100).values.tolist()
min_support = 50
frequent_itemsets = selfApriori(aprioriDataset, min_support)
print(len(frequent_itemsets))
#print(list(frequent_itemsets))

731


In [6]:
aprioriDataset2 = dataset.select_dtypes(object)

aprioriDataset2 = aprioriDataset2.drop(['month', 'default', 'housing', 'loan'], axis=1)
#print(aprioriDataset2.describe())
aprioriDataset2 = aprioriDataset2.head(100).values.tolist()
min_support = 50
frequent_itemsets = selfApriori(aprioriDataset, min_support)
print(len(frequent_itemsets))
#print(list(frequent_itemsets))

731


由于对数据处理的不够完善, 导致 Apriori 的结果很多, 不太好
应该可以在输入数据集时, 对数据集进行进一步处理