In [1]:
#import libraries
import numpy as np
import pandas as pd
from itertools import combinations

#import UCI adult census data
dataSet=pd.read_csv('adult.csv')

#categorize the continuous attributes :  age,weekly_hours, capital_gain, capital_loss values, weight

age_category=pd.cut(dataSet.age,bins=[21,40,65,99],labels=['Adult','Middle-aged','Elderly'])
dataSet.insert(0,'age group',age_category)

hours_category=pd.cut(dataSet.weekly_hours,bins=[1,20,50,100],labels=['Low hours','Medium hours','High hours'])
dataSet.insert(13,'weekly group',hours_category)

capgain_category=pd.cut(dataSet.capital_gain,bins=[-1,1,99999],labels=['Zero capgain','Non-zero capgain'])
dataSet.insert(11,'cap gain',capgain_category)

caploss_category=pd.cut(dataSet.capital_loss,bins=[-1,1,5000],labels=['Zero caploss','Non-zero caploss'])
dataSet.insert(13,'cap loss',caploss_category)

weight_category=pd.cut(dataSet.final_weight,5,labels=['w1','w2','w3','w4','w5'])
dataSet.insert(4,'weight category',weight_category)

#drop unnecessary columns

dataSet=dataSet.drop(columns=['age','final_weight','education_num','capital_gain','capital_loss','weekly_hours'])

dataSet = dataSet.dropna()


In [2]:
len(dataSet)

29412

In [3]:
def get_data():
    #df=pd.read_csv('new_adult.csv')
    df=pd.DataFrame(dataSet)
    df = df.stack().reset_index(-1, drop=True)
    df.index.names = ['tx_id']
    df = pd.get_dummies(df,prefix='',prefix_sep='')
    return df.groupby(level='tx_id').sum()


In [4]:
#ONLY RUN THIS CELL FOR IMPROVEMENT OF THE APRIORI ALGORITHM
#One improvement on Apriori algorithm: sampling 

sample= dataSet.sample(n= int(round(0.60*len(dataSet))),random_state=1) #sample the dataset, in this case we consider a 
                                                                        #sample that is 60% the size of the original data
def get_sample_data():
    df=pd.DataFrame(sample)
    df = df.stack().reset_index(-1, drop=True)
    df.index.names = ['tx_id']
    df = pd.get_dummies(df,prefix='',prefix_sep='')
    return df.groupby(level='tx_id').sum()

dataset = get_sample_data()
dataset.columns
item_id = pd.Series(dict(enumerate(dataset.columns)))  #different possible values in the dataset (before get dummies)
transactions = dataset.values #transaction matrix

#set minimum support threshold value
min_sup=0.50
item_length = 1
candidates = list(zip(item_id.index))
candidates_tested = 0 #no candidates have been tested initially
itemsets = pd.DataFrame()



In [5]:
#create the itemset candidates

#filename=input('Enter filename')
dataset = get_data()
#dataset = get_sample_data()   #run this line in place of the above line to get a SAMPLE of the data set (improvement of apriori)
dataset.columns
item_id = pd.Series(dict(enumerate(dataset.columns)))  #different possible values in the dataset (before get dummies)
transactions = dataset.values #transaction matrix

#set minimum support threshold value
min_sup=0.60
item_length = 1
candidates = list(zip(item_id.index))
candidates_tested = 0 #no candidates have been tested initially
itemsets = pd.DataFrame()



In [6]:
#generate the candidates

for i in range (1,5):
    remaining_items = np.unique([item for t in candidates for item in t])
    new_candidates = list(combinations(remaining_items,r=i))
    print('Length {}: {:10,.0f}'.format(i,len(new_candidates)), new_candidates[:5])
    
    
    

Length 1:        117 [(0,), (1,), (2,), (3,), (4,)]
Length 2:      6,786 [(0, 1), (0, 2), (0, 3), (0, 4), (0, 5)]
Length 3:    260,130 [(0, 1, 2), (0, 1, 3), (0, 1, 4), (0, 1, 5), (0, 1, 6)]
Length 4:  7,413,705 [(0, 1, 2, 3), (0, 1, 2, 4), (0, 1, 2, 5), (0, 1, 2, 6), (0, 1, 2, 7)]


In [7]:
def prune_candidates(all_transactions, candidates, cand_size, min_sup):
    itemsets = {} #dictionary where keys are the itemsets and values are the support levels
    for candidate in candidates:
        cand_transaction = all_transactions[:,candidate] #selecting from all the transactions, those columns that correspond to the candidates we are considering
        relevant_txn = cand_transaction[(cand_transaction==1).all(axis=1)] #transactions where all items are present i.e value is 1
        cand_sup = relevant_txn.shape[0]/all_transactions.shape[0] #calc support by dividing length of number of relevant transactions by all transactions
        if cand_sup >= min_sup: #does candidate support meet required threshold
            itemsets[frozenset(candidate)] = cand_sup #add to itemsets dictionary
    result = pd.Series(itemsets).to_frame('support') #series where index corresponds to itemsets
    return result.assign(length=cand_size) #convert series to frame


while candidates:
    new_items = prune_candidates(transactions, candidates, item_length,min_sup)
    itemsets = itemsets.append(new_items)
    candidates_tested += len(candidates)
    print('itemset length {}\tCandidates: {:>7.0f}\tNew Items: {:>7,.0f}'.format(item_length,len(candidates),len(new_items)))
    item_length +=1
    remaining_items = np.unique([item for t in new_items.index for item in t])
    candidates = list(combinations(remaining_items, r=item_length))
print('\nPotential itemsets: {:,.0f} \nTested itemsets: {:,.0f}'.format(
     2**len(item_id) - 1,candidates_tested))

itemset length 1	Candidates:     117	New Items:       9
itemset length 2	Candidates:      36	New Items:      29
itemset length 3	Candidates:      84	New Items:      26
itemset length 4	Candidates:      70	New Items:       8
itemset length 5	Candidates:       6	New Items:       0

Potential itemsets: 166,153,499,473,114,484,112,975,882,535,043,072 
Tested itemsets: 313


  result = pd.Series(itemsets).to_frame('support') #series where index corresponds to itemsets


In [8]:
#itemsets = itemsets.drop(columns=['Length','Support'])
itemsets.info()
itemsets.sort_values('support',ascending=False)
itemsets

<class 'pandas.core.frame.DataFrame'>
Index: 72 entries, frozenset({7}) to frozenset({112, 105, 110, 111})
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   support  72 non-null     float64
 1   length   72 non-null     int64  
dtypes: float64(1), int64(1)
memory usage: 1.7+ KB


Unnamed: 0,support,length
(7),0.733646,1
(59),0.685197,1
(64),0.815517,1
(84),0.689582,1
(102),0.892765,1
...,...,...
"(105, 110, 102, 111)",0.667721,4
"(112, 105, 110, 102)",0.628621,4
"(112, 105, 102, 111)",0.658065,4
"(112, 110, 102, 111)",0.674453,4


In [9]:
print('List of frequent patterns with support')
for fp in itemsets.index:
    print('pattern is: ')
    for i in fp:
        print(item_id[i])
    print('support is ',itemsets.support[fp])
    print('\n')
    

List of frequent patterns with support
pattern is: 
<=50K
support is  0.7336461308309533


pattern is: 
Male
support is  0.6851965184278526


pattern is: 
Medium hours
support is  0.8155174758601931


pattern is: 
Private
support is  0.6895824833401333


pattern is: 
United-States
support is  0.8927648578811369


pattern is: 
White
support is  0.8519651842785257


pattern is: 
Zero capgain
support is  0.9104447164422684


pattern is: 
Zero caploss
support is  0.9510403916768666


pattern is: 
w1
support is  0.8752549979600163


pattern is: 
Medium hours
<=50K
support is  0.6068271453828369


pattern is: 
United-States
<=50K
support is  0.649156806745546


pattern is: 
White
<=50K
support is  0.6101591187270502


pattern is: 
Zero capgain
<=50K
support is  0.7010403916768666


pattern is: 
Zero caploss
<=50K
support is  0.710968312253502


pattern is: 
w1
<=50K
support is  0.6424928600571196


pattern is: 
Male
United-States
support is  0.6116891064871481


pattern is: 
White
Male
suppo