In [1]:
# importing the necessary modules
import pandas as pd
import numpy as np
import time
from itertools import combinations

In [2]:
# reading the file using Pandas and changing into DataFrame
file = pd.read_csv("retail.txt", names=['items'], dtype=object)

# Removing "\n" values from the data
for i in file.index:
    file.iloc[i]['items'] = file.iloc[i]['items'].split(" ")
    del file.iloc[i]['items'][-1]

In [3]:
# function to create frequent itemset from candidate set
def createFrequentItemSet(candidate_set: pd.DataFrame, min_support: int):
    # creating an empty DataFrame
    frequent_itemset = pd.DataFrame(columns=['support'])
    
    # iterating through the candidate set
    for item in candidate_set.index:
        # checking if the support of the item is >= min_support and putting them in frequent itemset
        if (candidate_set.loc[item]['support'] >= min_support):
            frequent_itemset.loc[item, 'support'] = candidate_set.loc[item, 'support']

    return frequent_itemset

In [4]:
# function to create frequent itemset of pairs from candidate set containing pairs
def createFrequentItemSet2(candidate_set: pd.DataFrame, min_support: int):
    #creating an empty DataFrame
    frequent_itemset = pd.DataFrame(columns=['items', 'support'])
    
    # iterating through candidate_set
    for item in candidate_set.index:
        # checking the support of pair with min_support
        if (candidate_set.loc[item, 'support'] >= min_support):
            frequent_itemset.loc[item, 'support'] = candidate_set.loc[item, 'support']
            frequent_itemset.loc[item, 'items'] = candidate_set.loc[item, 'items']

    return frequent_itemset

In [5]:
# function to check if the pair of items are in the basket
def isSubset(bigger_array, smaller_array):
    isSubset = True
    for i in smaller_array:
        if i in bigger_array:
            continue
        else:
            isSubset = False
            break

    return isSubset

In [6]:
# function to create pair from the frequent itemset
def createPairs(frequent_itemset: pd.DataFrame):
    index_array = frequent_itemset.index
    candidate_set = pd.DataFrame(columns=['items', 'support'])
    
    #using combinations from itertools to create pairs
    pairs = list(combinations(index_array, 2))
    pair_df = pd.DataFrame(columns=['items', 'support'])
    
    index = 0
    for i in range(len(pairs)):
            pair_df.loc[i,'items'] = pairs[i]
            pair_df.loc[i,'support'] = 0
            candidate_set = pd.concat([candidate_set, pair_df])
            index += 1
            pair_df.drop(pair_df.index, inplace=True)
    
    return candidate_set

In [7]:
#function implementing apriori algorithm
def Apriori(transactions: pd.DataFrame, min_support: int) -> pd.DataFrame:
    candidate_set = pd.DataFrame(columns=['support'])
    
    # iterating through the baskets to create candidate set with the support of individual items
    for i in transactions.index:
        for item in transactions['items'][i]:
            if item in candidate_set.index:
                candidate_set.at[item, 'support'] += 1
            else:
                df2 = pd.DataFrame(columns=['support'])
                df2.loc[item, 'support'] = 1 
                candidate_set = pd.concat([candidate_set, df2])
                

    # calling createFrequentItemSet to create a frequent itemset
    frequent_itemset = createFrequentItemSet(candidate_set, min_support)

    # creating pairs from frequent itemset
    candidate_set = createPairs(frequent_itemset)

    # clearing frequent_itemset
    frequent_itemset.drop(frequent_itemset.index, inplace=True)

    # creating support of the pairs in the candidate_set
    for i in candidate_set.index:
        for j in transactions.index:
            if isSubset(transactions.loc[j]['items'], candidate_set.loc[i]['items']):
                candidate_set.iloc[i]['support'] += 1

    # using createFrequentItemsSet2 to create frequent itemset containing pairs.
    frequent_itemset = createFrequentItemSet2(candidate_set, min_support)

    return frequent_itemset

In [25]:
# using time.time() to calculate the time taken by the algorithm to execute
start_time = time.time()
freq = Apriori(file.sample(frac=0.0001, ignore_index = True), 2)
end_time = time.time()
print(end_time-start_time)

0.16098713874816895


In [26]:
print(freq)

        items support
0    (39, 48)       4
1   (39, 856)       2
2    (39, 38)       2
3    (39, 65)       2
13  (38, 170)       2
