### DDW - Association Rules Mining

In [7]:
! pip install pandas



In [8]:
import pandas as pd

#### Basic Operations

In [13]:
# Reading csv to a data frame
import pandas as pd
df = pd.read_csv('data/bank-data.csv')

# print head(tail) of the data frame
display(df.head()) # df.tail()

# select column
display(df[['age', 'car']])

# select by index
display(df.iloc[3:6,5:9])

# delete column
del df["id"]
display(df.head())

# discretize continous values to categorical values
df["income"] = pd.cut(df["income"],10)
display(df.head())

Unnamed: 0,id,age,sex,region,income,married,children,car,save_act,current_act,mortgage,pep
0,ID12101,48,FEMALE,INNER_CITY,17546.0,NO,1,NO,NO,NO,NO,YES
1,ID12102,40,MALE,TOWN,30085.1,YES,3,YES,NO,YES,YES,NO
2,ID12103,51,FEMALE,INNER_CITY,16575.4,YES,0,YES,YES,YES,NO,NO
3,ID12104,23,FEMALE,TOWN,20375.4,YES,3,NO,NO,YES,NO,NO
4,ID12105,57,FEMALE,RURAL,50576.3,YES,0,NO,YES,NO,NO,NO


Unnamed: 0,age,car
0,48,NO
1,40,YES
2,51,YES
3,23,NO
4,57,NO
...,...,...
595,61,YES
596,30,YES
597,31,YES
598,29,NO


Unnamed: 0,married,children,car,save_act
3,YES,3,NO,NO
4,YES,0,NO,YES
5,YES,2,NO,YES


Unnamed: 0,age,sex,region,income,married,children,car,save_act,current_act,mortgage,pep
0,48,FEMALE,INNER_CITY,17546.0,NO,1,NO,NO,NO,NO,YES
1,40,MALE,TOWN,30085.1,YES,3,YES,NO,YES,YES,NO
2,51,FEMALE,INNER_CITY,16575.4,YES,0,YES,YES,YES,NO,NO
3,23,FEMALE,TOWN,20375.4,YES,3,NO,NO,YES,NO,NO
4,57,FEMALE,RURAL,50576.3,YES,0,NO,YES,NO,NO,NO


Unnamed: 0,age,sex,region,income,married,children,car,save_act,current_act,mortgage,pep
0,48,FEMALE,INNER_CITY,"(16637.388, 22448.977]",NO,1,NO,NO,NO,NO,YES
1,40,MALE,TOWN,"(28260.566, 34072.155]",YES,3,YES,NO,YES,YES,NO
2,51,FEMALE,INNER_CITY,"(10825.799, 16637.388]",YES,0,YES,YES,YES,NO,NO
3,23,FEMALE,TOWN,"(16637.388, 22448.977]",YES,3,NO,NO,YES,NO,NO
4,57,FEMALE,RURAL,"(45695.333, 51506.922]",YES,0,NO,YES,NO,NO,NO


### Apriori algorithm implementation

In [4]:
from collections import Counter

def frequentItems(transactions, support):
    counter = Counter()
    for trans in transactions:
        counter.update(frozenset([t]) for t in trans)
    return set(item for item in counter if counter[item]/len(transactions) >= support), counter

def generateCandidates(L, k):
    candidates = set()
    for a in L:
        for b in L:
            union = a | b
            if len(union) == k and a != b:
                candidates.add(union)
    return candidates

def filterCandidates(transactions, itemsets, support):
    counter = Counter()
    for trans in transactions:
        subsets = [itemset for itemset in itemsets if itemset.issubset(trans)]
        counter.update(subsets)
    return set(item for item in counter if counter[item]/len(transactions) >= support), counter

def apriori(transactions, support):
    result = list()
    resultc = Counter()
    candidates, counter = frequentItems(transactions, support)
    result += candidates
    resultc += counter
    k = 2
    while candidates:
        candidates = generateCandidates(candidates, k)
        candidates,counter = filterCandidates(transactions, candidates, support)
        result += candidates
        resultc += counter
        k += 1
    resultc = {item:(resultc[item]/len(transactions)) for item in resultc}
    return result, resultc

#### Frequent item sets

In [5]:
dataset = [
    ['bread', 'milk'],
    ['bread', 'diaper', 'beer', 'egg'],
    ['milk', 'diaper', 'beer', 'cola'],
    ['bread', 'milk', 'diaper', 'beer'],
    ['bread', 'milk', 'diaper', 'cola'],
]

frequentItemsets, supports = apriori(dataset, 0.1)
for f in frequentItemsets:
    print("{} - {}".format(f,supports[f]))

frozenset({'milk'}) - 0.8
frozenset({'diaper'}) - 0.8
frozenset({'beer'}) - 0.6
frozenset({'cola'}) - 0.4
frozenset({'egg'}) - 0.2
frozenset({'bread'}) - 0.8
frozenset({'diaper', 'milk'}) - 0.6
frozenset({'milk', 'bread'}) - 0.6
frozenset({'beer', 'milk'}) - 0.4
frozenset({'cola', 'diaper'}) - 0.4
frozenset({'egg', 'bread'}) - 0.2
frozenset({'egg', 'diaper'}) - 0.2
frozenset({'cola', 'bread'}) - 0.2
frozenset({'beer', 'diaper'}) - 0.6
frozenset({'beer', 'bread'}) - 0.4
frozenset({'diaper', 'bread'}) - 0.6
frozenset({'egg', 'beer'}) - 0.2
frozenset({'cola', 'milk'}) - 0.4
frozenset({'cola', 'beer'}) - 0.2
frozenset({'cola', 'diaper', 'bread'}) - 0.2
frozenset({'beer', 'diaper', 'bread'}) - 0.4
frozenset({'egg', 'diaper', 'bread'}) - 0.2
frozenset({'diaper', 'cola', 'milk'}) - 0.4
frozenset({'cola', 'beer', 'diaper'}) - 0.2
frozenset({'egg', 'beer', 'diaper'}) - 0.2
frozenset({'beer', 'milk', 'bread'}) - 0.2
frozenset({'diaper', 'milk', 'bread'}) - 0.4
frozenset({'cola', 'beer', 'milk'})

#### Rules

In [6]:
def genereateRules(frequentItemsets, supports, minConfidence):
    print(" .... ")

# bank dataset preprocessing
import pandas as pd
df = pd.read_csv("data/bank-data.csv")
del df["id"]
df["income"] = pd.cut(df["income"],10)
dataset = []
for index, row in df.iterrows():
    row = [col+"="+str(row[col]) for col in list(df)]
    dataset.append(row)
frequentItemsets, supports = apriori(dataset, 0.3)
genereateRules(frequentItemsets, supports, 0.5)

# ...
# {'car=YES'} => married=YES, 0.3233333333333333, 0.6554054054054054
# ...
# {'married=YES', 'save_act=YES'} => current_act=YES, 0.3433333333333333, 0.7436823104693141
# ...

 .... 
