In [1]:
import pandas as pd

def load_data(file_path, sample_size=None):
    """
    Read a CSV file into a DataFrame, optionally down-sampling its rows.

    Parameters:
    - file_path: path (or any source accepted by ``pd.read_csv``) of the CSV file
    - sample_size: int or None; when given, that many rows are drawn with a
      fixed ``random_state`` of 42 so repeated calls return the same sample

    Returns:
    - pd.DataFrame with every row of the file, or the sampled rows
    """
    frame = pd.read_csv(file_path)
    # Placeholder: further preprocessing steps are planned to be added here.
    if sample_size is None:
        return frame
    return frame.sample(sample_size, random_state=42)


# Load the complete city C POI table (no sampling) and show it as the cell output.
cityc = load_data('POIdata_cityC.csv')
cityc

Unnamed: 0,x,y,category,POI_count
0,1,35,48,1
1,1,38,48,1
2,1,45,48,1
3,1,45,47,1
4,1,108,46,1
...,...,...,...,...
39059,187,200,81,1
39060,187,200,48,1
39061,188,199,63,1
39062,188,200,73,1


In [2]:
# The city C file has the columns: x, y, category, POI_count.
# NOTE(review): the loop below assumes the A, B and D files share the x / y /
# category columns - confirm against the data.

# One basket per (x, y) grid cell; baskets from all four cities are pooled
# into this single flat list.
baskets = []

for city in 'A','B', 'C', 'D':
    raw = load_data(f'POIdata_city{city}.csv')
    # Collapse each (x, y) cell into the list of category values found in it.
    groupd = raw.groupby(['x', 'y'], as_index=False).agg({'category': list})
    baskets.extend(groupd['category'].tolist())

In [10]:
import numpy as np


def generate_freq_itemsets(binary_matrix, unique_items, minsup_scale=0.5):
    """
    Mine frequent itemsets level by level (Apriori) from a basket/item matrix.

    The support threshold is not supplied by the caller. It is derived from the
    data as ``mean(support count of the single items) * minsup_scale`` and the
    same absolute count is applied to every itemset size.

    Parameters:
    - binary_matrix: np.ndarray of shape (num_baskets, num_items); entry [b, i]
      is truthy when basket b contains unique_items[i]
    - unique_items: sequence of hashable item labels, one per matrix column
    - minsup_scale: float, factor applied to the mean 1-itemset support to
      obtain the threshold (default 0.5, the value that used to be hard-coded)

    Returns:
    - freq_itemsets: dict mapping itemset size k to a list of frozensets. Key 1
      is always present; a larger k is present only when at least one frequent
      k-itemset was found.
    """
    num_items = len(unique_items)
    # Built locally so the column lookup always matches the unique_items that
    # was passed in, instead of depending on a notebook-level global.
    item_index = {item: idx for idx, item in enumerate(unique_items)}
    freq_itemsets = {}

    # Support count of every single item (column sums)
    item_counts = np.sum(binary_matrix, axis=0)

    # Dynamically calculate minimum support as the scaled average 1-itemset support
    minsup = np.mean(item_counts)
    scaled_minsup = minsup * minsup_scale
    print(f"Dynamic minimum support threshold (scaled): {scaled_minsup:.2f}")

    # Generate frequent 1-itemsets
    freq_1 = np.where(item_counts >= scaled_minsup)[0]
    freq_itemsets[1] = [frozenset([unique_items[i]]) for i in freq_1]
    print(f"Generated {len(freq_itemsets[1])} frequent 1-itemsets")

    k = 2
    while True:
        prev_freq = freq_itemsets.get(k - 1, [])
        if not prev_freq:
            break

        # Set view of the previous level: O(1) membership tests in the prune step
        prev_lookup = set(prev_freq)

        # Generate candidate k-itemsets by joining pairs of frequent (k-1)-itemsets
        candidates = []
        examined = set()  # unions already judged, so each is pruned only once
        len_prev = len(prev_freq)
        for i in range(len_prev):
            for j in range(i + 1, len_prev):
                union = prev_freq[i] | prev_freq[j]
                if len(union) != k or union in examined:
                    continue
                examined.add(union)
                # Prune step: every (k-1)-subset must itself be frequent
                if all((union - {item}) in prev_lookup for item in union):
                    candidates.append(union)

        if not candidates:
            break

        # Convert candidates to binary masks, one row per candidate
        candidate_masks = np.zeros((len(candidates), num_items), dtype=int)
        for idx, candidate in enumerate(candidates):
            for item in candidate:
                candidate_masks[idx, item_index[item]] = 1

        # A basket contains a candidate exactly when the dot product of its row
        # with the candidate mask equals k (all k items present).
        dot_product = binary_matrix @ candidate_masks.T  # Shape: (num_baskets, num_candidates)
        freq_counts = np.sum(dot_product == k, axis=0)

        # Keep the candidates that meet or exceed the scaled minimum support
        freq_k = [candidates[i] for i in np.where(freq_counts >= scaled_minsup)[0]]
        if not freq_k:
            break

        freq_itemsets[k] = freq_k
        print(f"Generated {len(freq_k)} frequent {k}-itemsets")
        k += 1

    return freq_itemsets

# Example Usage
if __name__ == "__main__":
    # Build the item vocabulary and the basket/item matrix from `baskets`,
    # then mine frequent itemsets from it.
    
    # Sorted list of every distinct item, plus the item -> column lookup.
    unique_items = sorted(set(item for basket in baskets for item in basket))
    item_index = {item: idx for idx, item in enumerate(unique_items)}
    
    # Create binary matrix: one row per basket, one column per unique item;
    # a cell is True when that basket contains that item.
    binary_matrix = np.zeros((len(baskets), len(unique_items)), dtype=np.bool_)
    for i, basket in enumerate(baskets):
        for item in basket:
            binary_matrix[i, item_index[item]] = 1
    
    # Apriori Settings
    # NOTE: this value is not passed to generate_freq_itemsets below, which
    # derives its own threshold from the data (see the printed output).
    minsup = 2000  # Minimum support threshold
    
    # Generate frequent itemsets
    freq_itemsets = generate_freq_itemsets(binary_matrix, unique_items)
    

Dynamic minimum support threshold (scaled): 2289.92
Generated 48 frequent 1-itemsets
Generated 563 frequent 2-itemsets
Generated 2917 frequent 3-itemsets
Generated 7263 frequent 4-itemsets
Generated 9563 frequent 5-itemsets
Generated 6962 frequent 6-itemsets
Generated 2708 frequent 7-itemsets
Generated 483 frequent 8-itemsets
Generated 30 frequent 9-itemsets


In [11]:
import numpy as np
from itertools import chain, combinations

def generate_association_rules(freq_itemsets, binary_matrix, unique_items, minsup, minconf):
    """
    Derive association rules (antecedent -> consequent) from frequent itemsets.

    Every frequent itemset with two or more items is split into each possible
    non-empty antecedent and the complementary consequent; a rule is kept when
    support(itemset) / support(antecedent) reaches ``minconf``.

    Parameters:
    - freq_itemsets: dict mapping itemset size to a list of frozensets
    - binary_matrix: np.ndarray, one row per basket and one column per item
    - unique_items: list of item labels in matrix column order
    - minsup: int, minimum support threshold (accepted for interface
      compatibility; the body does not read it)
    - minconf: float, minimum confidence threshold (e.g., 0.7 for 70%)

    Returns:
    - rules: list of (antecedent, consequent, support, confidence) tuples where
      antecedent and consequent are sets and support is the basket count of
      the whole itemset
    """
    column_of = dict(zip(unique_items, range(len(unique_items))))

    # Support count (number of baskets containing all members) per frequent itemset
    support_counts = {}
    for level_itemsets in freq_itemsets.values():
        for members in level_itemsets:
            columns = [column_of[label] for label in members]
            contains_all = np.all(binary_matrix[:, columns] == 1, axis=1)
            support_counts[members] = np.sum(contains_all)

    rules = []
    for size, level_itemsets in freq_itemsets.items():
        if size < 2:
            # A rule needs at least one item on each side
            continue
        print(f"Generating rules from {size}-itemsets")
        for members in level_itemsets:
            support = support_counts[members]
            # Walk the non-empty proper subsets, smallest antecedents first
            for antecedent_size in range(1, len(members)):
                for picked in combinations(members, antecedent_size):
                    antecedent = frozenset(picked)
                    consequent = members - antecedent
                    if not consequent:
                        continue
                    antecedent_support = support_counts.get(antecedent, 0)
                    if antecedent_support == 0:
                        # Antecedent was not among the frequent itemsets
                        continue
                    confidence = support / antecedent_support
                    if confidence >= minconf:
                        rules.append((set(antecedent), set(consequent), support, confidence))

    return rules


In [12]:
# Derive the association rules whose confidence is at least 0.8 from the mined itemsets.
rus = generate_association_rules(freq_itemsets, binary_matrix, unique_items, minsup, 0.8)

Generating rules from 2-itemsets
Generating rules from 3-itemsets
Generating rules from 4-itemsets
Generating rules from 5-itemsets
Generating rules from 6-itemsets
Generating rules from 7-itemsets
Generating rules from 8-itemsets
Generating rules from 9-itemsets


In [24]:
# Check generated frequent itemsets
#print("Frequent Itemsets:", freq_itemsets)

# Check rule generation
#for k, itemsets in freq_itemsets.items():
#    print(f"Generating rules from {k}-itemsets")
#    for itemset in itemsets:
#        print("Itemset:", itemset)

# Check individual rules
#print("Generated Rules:")
#for rule in rus:
#    print(rule)


In [25]:
#for rule in rus:
#    antecedent, consequent, support, confidence = rule
#    confidence_str = f"{confidence:.2f}" if confidence is not None else "N/A"
#    print(f"Rule: {antecedent} -> {consequent} (Support: {support}, Confidence: {confidence_str})\n")
