In [1]:
# importing modules needed
import pandas as pd
import numpy as np
import math
import time
from itertools import combinations

In [2]:
# reading file using Pandas and converting it to DataFrame
file = pd.read_csv("retail.txt", names=['items'], dtype=object)

# converting the data into an array and removing "\n"
for i in file.index:
    file.iloc[i]['items'] = file.iloc[i]['items'].split(" ")
    del file.iloc[i]['items'][-1]

In [21]:
# defining the number of buckets and creating hashtable and bitmap
num_buckets = 135
hashtable1 = np.zeros((num_buckets))
hashtable2 = np.zeros((num_buckets))
bitmap1 = np.zeros((num_buckets), dtype=bool)
bitmap2 = np.zeros((num_buckets), dtype=bool)

In [22]:
# hashing functions that hashes two items to a index in hashtable/bitmap
def hash1(item1: str, item2: str)->int:
    num1 = int(item1)
    num2 = int(item2)
    return ((num1 + num2) % num_buckets)

In [23]:
def hash2(item1: str, item2: str)->int:
    num1 = int(item1)
    num2 = int(item2)
    return ((num1*num2) % num_buckets)

In [24]:
# using combinations from itertools to create pairs of items
def createPairs(basket: list):
    return list(combinations(basket, 2))

In [25]:
def PCYMultiHash(transactions: pd.DataFrame, min_support: int) -> pd.DataFrame:
    # creating empty dataframe
    candidate_set = pd.DataFrame(columns=['support'])
    
    # iterating the dataset to find support of individual items.
    for i in transactions.index:
        for item in transactions['items'][i]:
            if item in candidate_set.index:
                candidate_set.at[item, 'support'] += 1
            else:
                df2 = pd.DataFrame(columns=['support'])
                df2.loc[item, 'support'] = 1 
                candidate_set = pd.concat([candidate_set, df2])
        
        # creating pair of items in a basket
        pairs = createPairs(transactions['items'][i])
        # hashing the pairs to hashtable and increasing the count
        for pair in pairs:
            hashtable1[hash1(pair[0], pair[1])] += 1
            hashtable2[hash2(pair[0], pair[1])] += 1

    # converting hashtable to bitmap
    for i in range(len(hashtable1)):
        if(hashtable1[i] >= min_support):
            bitmap1[i] = 1
        else:
            bitmap1[i] = 0
        
        if(hashtable2[i] >= min_support):
            bitmap2[i] = 1
        else:
            bitmap2[i] = 0

    frequent_itemset = pd.DataFrame(columns=['items'])
    
    pairs = list(combinations(candidate_set.index,2))
    index = candidate_set.index
    # checking if the pair is frequent
    for pair in pairs:
        item1 = pair[0]
        item2 = pair[1]
        if (bitmap1[hash1(item1, item2)] and bitmap2[hash2(item1, item2)]):
            if(candidate_set.loc[index[index.get_loc(item1)]]['support'] >= min_support and 
               candidate_set.loc[index[index.get_loc(item2)]]['support'] >= min_support):
                df = pd.DataFrame({'items': [pair]})
                frequent_itemset = pd.concat([frequent_itemset,df], ignore_index=True)

    
    return frequent_itemset

In [30]:
start_time = time.time()
freq = PCYMultiHash(file.sample(frac=0.0005, ignore_index= True), 2)
end_time = time.time()
print(end_time-start_time)

4.488422155380249


In [31]:
print(freq)

            items
0        (39, 48)
1        (39, 38)
2        (39, 41)
3       (39, 310)
4       (39, 170)
..            ...
320      (60, 78)
321    (60, 1146)
322    (2987, 78)
323  (2987, 1146)
324    (78, 1146)

[325 rows x 1 columns]
