In [1]:
# importing modules needed
import pandas as pd
import numpy as np
import math
import time

In [2]:
# reading the file using Pandas and changing into DataFrame
file = pd.read_csv("retail.txt", names=['items'], dtype=object)

# Removing "\n" values from the data
for i in file.index:
    file.iloc[i]['items'] = file.iloc[i]['items'].split(" ")
    del file.iloc[i]['items'][-1]

In [34]:
data = file.sample(frac=0.0005, ignore_index=True)

In [28]:
# loading the apriori algorithm create previously.
# %load apy.py
# importing the necessary modules
import pandas as pd
import numpy as np
import time
from itertools import combinations

# reading the file using Pandas and changing into DataFrame
file = pd.read_csv("retail.txt", names=['items'], dtype=object)

# Removing "\n" values from the data
for i in file.index:
    file.iloc[i]['items'] = file.iloc[i]['items'].split(" ")
    del file.iloc[i]['items'][-1]

# function to create frequent itemset from candidate set
def createFrequentItemSet(candidate_set, min_support):
    # creating an empty DataFrame
    frequent_itemset = pd.DataFrame(columns=['support'])
    
    # iterating through the candidate set
    for item in candidate_set.index:
        # checking if the support of the item is >= min_support and putting them in frequent itemset
        if (candidate_set.loc[item]['support'] >= min_support):
            frequent_itemset.loc[item, 'support'] = candidate_set.loc[item, 'support']

    return frequent_itemset

# function to create frequent itemset of pairs from candidate set containing pairs
def createFrequentItemSet2(candidate_set, min_support):
    #creating an empty DataFrame
    frequent_itemset = pd.DataFrame(columns=['items', 'support'])
    
    # iterating through candidate_set
    for item in candidate_set.index:
        # checking the support of pair with min_support
        if (candidate_set.loc[item, 'support'] >= min_support):
            frequent_itemset.loc[item, 'support'] = candidate_set.loc[item, 'support']
            frequent_itemset.loc[item, 'items'] = candidate_set.loc[item, 'items']

    return frequent_itemset

# function to check if the pair of items are in the basket
def isSubset(bigger_array, smaller_array):
    isSubset = True
    for i in smaller_array:
        if i in bigger_array:
            continue
        else:
            isSubset = False
            break

    return isSubset

# function to create pair from the frequent itemset
def createPairs(frequent_itemset):
    index_array = frequent_itemset.index
    candidate_set = pd.DataFrame(columns=['items', 'support'])
    
    #using combinations from itertools to create pairs
    pairs = list(combinations(index_array, 2))
    pair_df = pd.DataFrame(columns=['items', 'support'])
    
    index = 0
    for i in range(len(pairs)):
            pair_df.loc[i,'items'] = pairs[i]
            pair_df.loc[i,'support'] = 0
            candidate_set = pd.concat([candidate_set, pair_df])
            index += 1
            pair_df.drop(pair_df.index, inplace=True)
    
    return candidate_set

#function implementing apriori algorithm
def Apriori(transactions, min_support) -> pd.DataFrame:
    candidate_set = pd.DataFrame(columns=['support'])
    
    # iterating through the baskets to create candidate set with the support of individual items
    for i in transactions.index:
        for item in transactions['items'][i]:
            if item in candidate_set.index:
                candidate_set.at[item, 'support'] += 1
            else:
                df2 = pd.DataFrame(columns=['support'])
                df2.loc[item, 'support'] = 1 
                candidate_set = pd.concat([candidate_set, df2])
                

    # calling createFrequentItemSet to create a frequent itemset
    frequent_itemset = createFrequentItemSet(candidate_set, min_support)

    # creating pairs from frequent itemset
    candidate_set = createPairs(frequent_itemset)

    # clearing frequent_itemset
    frequent_itemset.drop(frequent_itemset.index, inplace=True)

    # creating support of the pairs in the candidate_set
    for i in candidate_set.index:
        for j in transactions.index:
            if isSubset(transactions.loc[j]['items'], candidate_set.loc[i]['items']):
                candidate_set.iloc[i]['support'] += 1

    # using createFrequentItemsSet2 to create frequent itemset containing pairs.
    frequent_itemset = createFrequentItemSet2(candidate_set, min_support)

    return frequent_itemset


In [29]:
# function to divide dataframes into smaller chunks
def split_dataframe_by_position(df, splits):
    """
    Takes a dataframe and an integer of the number of splits to create.
    Returns a list of dataframes.
    """
    dataframes = []
    index_to_split = len(df) // splits
    start = 0
    end = index_to_split
    for split in range(splits):
        temporary_df = df.iloc[start:end, :]
        dataframes.append(temporary_df)
        start += index_to_split
        end += index_to_split
    return dataframes

In [30]:
# function to create frequent itemset using user_inputted support
def createFrequentItemSet3(candidate_set: pd.DataFrame, min_support: int) -> pd.DataFrame:
    freq = pd.DataFrame(columns=['items', 'support'])
    candidate_set = candidate_set.infer_objects()
    for i in candidate_set.index:
        support = candidate_set.iloc[i]['support']
        if (support >= min_support):
            freq.loc[i] = candidate_set.loc[i]
            #freq.loc[i, 'support'] = support
    return freq

In [41]:
# SON algorithm implementation
def SON(transactions: pd.DataFrame, min_support: int) -> pd.DataFrame:
    # defining the number of chunks
    num_chunks = 20

    #dividing the user_input data into chuncks
    chunks = split_dataframe_by_position(transactions, num_chunks)

    #recalculating user_support for chunks processing
    chunk_support = math.ceil(min_support/num_chunks)

    # creating an empty dataframe
    candidate_itemset = pd.DataFrame(columns=['items', 'support'])
    
    # for loop to process individual chunks
    for i in range(len(chunks)):
        # using Apriori algorithm on individual chunks and adding the results to candidate set
        candidates = Apriori(chunks[i], chunk_support)
        candidate_itemset = pd.concat([candidate_itemset, candidates], ignore_index=True)
    
    #print(candidate_itemset)
    # Pass 2
    # finding actual supports of the candidate set, thus reducing false negative
    # generated in the first pass.
    for i in candidate_itemset.index:
        for j in transactions.index:
            if isSubset(transactions.loc[j]['items'], candidate_itemset.loc[i]['items']):
                candidate_itemset.iloc[i]['support'] += 1

    
    freq = createFrequentItemSet3(candidate_itemset, min_support)
    return freq

In [42]:
start_time = time.time()
freq = SON(data, 2)
end_time = time.time()
print(end_time-start_time)

32.00025773048401


In [43]:
print(freq)

               items support
0          (39, 185)       2
1          (39, 664)       2
2          (39, 710)       2
3         (39, 1426)       2
4         (39, 3236)       3
...              ...     ...
3108   (2528, 13309)       2
3109   (2528, 13621)       2
3110  (10587, 13309)       2
3111  (10587, 13621)       2
3112  (13309, 13621)       2

[3113 rows x 2 columns]
