# ASSOCIATION RULE BASED CONTRASTIVE LEARNING FOR RECOMMENDATION SYSTEM

In [2]:
!pip install torch



In [3]:
#Import the required libraries
import pandas as pd
import numpy as np

from itertools import combinations
from collections import defaultdict
from tqdm import tqdm  # optional: for progress bars

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Read the dataset
df = pd.read_csv("Dunnhumby.csv")


In [5]:
items = df["MATERIAL_NUMBER"].unique().tolist()

In [6]:
# Group by CUSTOMER_ID and extract unique MATERIAL_NUMBERs
transaction_series = df.groupby('CUSTOMER_ID')['MATERIAL_NUMBER'].apply(lambda x: list(set(x)))
transactions = [set(t) for t in transaction_series.tolist()[:200]]  # subset for testing

In [7]:
#Start the Extracting Associarion Rue (positive and negative)

In [8]:
# ------------------ Support & Confidence Functions ------------------

def get_support(itemset, support_cache, total):
    """Fetch support from cache or compute on the fly."""
    return support_cache.get(itemset, 0)

def get_confidence(X, Y, support_cache, total):
    """Calculate confidence using cached supports."""
    supp_X = get_support(X, support_cache, total)
    supp_XY = get_support(X | Y, support_cache, total)
    return supp_XY / supp_X if supp_X else 0

In [9]:
# ------------------ Correlation Function ------------------
def get_correlation(X, Y, transactions):
    """Compute correlation (phi-coefficient) for itemsets."""
    N = len(transactions)
    f11 = f10 = f01 = f00 = 0

    for t in transactions:
        x_in = X.issubset(t)
        y_in = Y.issubset(t)

        if x_in and y_in:
            f11 += 1
        elif x_in:
            f10 += 1
        elif y_in:
            f01 += 1
        #else:
            #f00 += 1

    f00 = N - f11 - f10 - f01
    f1_ = f11 + f10
    f0_ = f01 + f00
    f_1 = f11 + f01
    f_0 = f10 + f00

    numerator = f11 * f00 - f10 * f01
    denominator = (f1_ * f0_ * f_1 * f_0) ** 0.5

    return numerator / denominator if denominator else 0

In [10]:
def get_support_negated(pos_set, neg_set, transactions, total):
    """Returns support of itemsets like X and not Y"""
    count = sum(1 for t in transactions if pos_set.issubset(t) and not neg_set.issubset(t))
    return count / total

In [11]:
def get_confidence_negAR(pos_set, neg_set, support_cache, transactions, total):
    """Compute confidence of rule: pos_set ⇒ not neg_set"""
    sup_pos = support_cache.get(pos_set, 0)
    sup_pos_and_not_neg = get_support_negated(pos_set, neg_set, transactions, total)
    return sup_pos_and_not_neg / sup_pos if sup_pos > 0 else 0

In [12]:
# ------------------ Rule Generation ------------------

def generate_2_itemset_rules(transactions, minsupp=0.2, minconf=0.6, p_min=0.9):
    positiveAR = []
    negativeAR = []
    item_counts = defaultdict(int)
    total = len(transactions)

    print("Counting single item frequencies...")
    for t in transactions:
        for item in t:
            item_counts[item] += 1

    # Generate frequent 1-itemsets
    F1 = [frozenset([item]) for item, count in item_counts.items() if count / total >= minsupp]
    print(f"Frequent 1-itemsets: {len(F1)}")

    # Precompute support values
    print("Caching support values...")
    support_cache = {}
    for item in F1:
        support_cache[item] = item_counts[list(item)[0]] / total

    for a, b in combinations(F1, 2):
        union = a | b
        if union not in support_cache:
            count = sum(1 for t in transactions if union.issubset(t))
            support_cache[union] = count / total

    # Generate frequent 2-itemsets
    F2 = [a | b for a, b in combinations(F1, 2) if support_cache[a | b] >= minsupp]
    print(f"Frequent 2-itemsets: {len(F2)}")

    print("Generating rules...")
    for itemset in tqdm(F2):
        for X in combinations(itemset, 1):
            X = frozenset(X)
            Y = itemset - X

            rho = get_correlation(X, Y, transactions)
            support = get_support(X | Y, support_cache, total)

            if rho >= p_min and support >= minsupp:
                conf = get_confidence(X, Y, support_cache, total)
                if conf >= minconf:
                    positiveAR.append((set(X), set(Y), 'X → Y', conf, rho))

            elif rho < -0.01:
                conf1 = get_confidence_negAR(X, Y, support_cache, transactions, total)  # X ⇒ ¬Y
                conf2 = get_confidence_negAR(Y, X, support_cache, transactions, total)  # Y ⇒ ¬X
                if conf1 >= minconf:
                    negativeAR.append((set(X), set(Y), 'X → Y (Negative)', conf1, rho))
                elif conf2 >= minconf:
                    negativeAR.append((set(Y), set(X), 'Y → X (Negative)', conf2, rho))

    return positiveAR, negativeAR

In [13]:
pos_rules, neg_rules = generate_2_itemset_rules(transactions, minsupp=0.005, minconf=0.8, p_min=0.3)

Counting single item frequencies...
Frequent 1-itemsets: 3943
Caching support values...
Frequent 2-itemsets: 897130
Generating rules...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 897130/897130 [01:12<00:00, 12343.75it/s]


In [14]:
print(len(pos_rules))
print(len(neg_rules))

167435
3974


In [15]:
def filter_redundant_rules(rules):
    rule_dict = {}
    for X, Y, rule_type, conf, rho in rules:
        key = frozenset(X | Y)
        if key not in rule_dict or conf > rule_dict[key][3]:
            rule_dict[key] = (X, Y, rule_type, conf, rho)
    return list(rule_dict.values()), set(rule_dict.keys())

# Step 1: Filter both rules
pos_rules_filtered, pos_keys = filter_redundant_rules(pos_rules)
neg_rules_filtered, neg_keys = filter_redundant_rules(neg_rules)

# Step 2: Remove overlaps
pos_rules = [rule for rule in pos_rules_filtered if frozenset(rule[0] | rule[1]) not in neg_keys]
neg_rules = [rule for rule in neg_rules_filtered if frozenset(rule[0] | rule[1]) not in pos_keys]

In [16]:
print(len(pos_rules))
print(len(neg_rules))

156014
1987


In [17]:
#Keep only one of them of A->B or B->A whichever has more confidence
def remove_weaker_symmetric_rules(rules):
    rule_map = {}  # key: (A, B), value: (X, Y, direction, conf, rho)

    for X, Y, direction, conf, rho in rules:
        A, B = sorted([tuple(X), tuple(Y)])  # canonical form for the pair
        key = (A, B)

        if key not in rule_map or conf > rule_map[key][3]:  # compare confidence
            rule_map[key] = (X, Y, direction, conf, rho)

    return list(rule_map.values())

In [18]:
pos_rules_cleaned = remove_weaker_symmetric_rules(pos_rules)
neg_rules_cleaned = remove_weaker_symmetric_rules(neg_rules)

In [19]:
print(len(pos_rules))
print(len(neg_rules))

156014
1987


In [20]:
'''print(f"\nPositive Association Rules:{len(pos_rules)}")
for rule in pos_rules:
    print(f"{rule[0]} => {rule[1]} | conf: {rule[3]:.2f}, corr: {rule[4]:.2f}")

print(f"\nNegative Association Rules:{len(neg_rules)}")
for rule in neg_rules:
    print(f"{rule[0]} => ~{rule[1]} | conf: {rule[3]:.2f}, corr: {rule[4]:.2f}")'''

print(len(pos_rules))
print(len(neg_rules))

156014
1987


In [21]:
#START teh CONTRASTIVE LEARNIG FOR ITEM REPRESENTATION
#contrastive learning model that learns item embeddings by pushing similar items closer and dissimilar ones apart. 
#The distinction between positive and negative pairs is made through the label and the loss function.

In [22]:
#Define Embeddings and Encoder (e.g., simple linear layers)
# Simple embedding-based encoder
class ItemEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x):
        return self.embedding(x)

# Map item names to indices
item_to_idx = {item: idx for idx, item in enumerate(items)}

In [23]:
#Prepare Training Pairs
# Convert item names to index pairs
positive_pairs = [
    (item_to_idx[next(iter(a))], item_to_idx[next(iter(b))])
    for a, b, _, _, _ in pos_rules
    if next(iter(a)) in item_to_idx and next(iter(b)) in item_to_idx
]

negative_pairs = [
    (item_to_idx[next(iter(a))], item_to_idx[next(iter(b))])
    for a, b, _, _, _ in neg_rules
    if next(iter(a)) in item_to_idx and next(iter(b)) in item_to_idx
]

In [24]:
#Define Contrastive Loss (e.g., NT-Xent-style)

def contrastive_loss(z1, z2, label, margin=1.0):
    # Cosine similarity
    sim = F.cosine_similarity(z1, z2)
    # Positive: minimize distance (maximize similarity), Negative: maximize distance
    loss = torch.where(label == 1, 1 - sim, F.relu(sim - margin))
    return loss.mean()

'''def contrastive_loss(z1, z2, label, margin=1.0):
    distance = torch.nn.functional.pairwise_distance(z1, z2)
    loss = label * distance.pow(2) + (1 - label) * torch.clamp(margin - distance, min=0).pow(2)
    return loss.mean() '''

'def contrastive_loss(z1, z2, label, margin=1.0):\n    distance = torch.nn.functional.pairwise_distance(z1, z2)\n    loss = label * distance.pow(2) + (1 - label) * torch.clamp(margin - distance, min=0).pow(2)\n    return loss.mean() '

In [25]:
class ItemEncoder(nn.Module):
    def __init__(self, num_items, embedding_dim):
        super(ItemEncoder, self).__init__()
        self.embedding = nn.Embedding(num_items, embedding_dim)

    def forward(self, x):
        return self.embedding(x)

In [26]:
class ItemEncoder(nn.Module):
    def __init__(self, num_items, embedding_dim):
        super(ItemEncoder, self).__init__()
        self.embedding = nn.Embedding(num_items, embedding_dim)

    def forward(self, x):
        return self.embedding(x)

In [27]:
items = df['MATERIAL_NUMBER'].unique().tolist()
item_to_idx = {item: idx for idx, item in enumerate(items)}

embedding_dim = 16
encoder = ItemEncoder(len(items), embedding_dim)

In [28]:
item_embeddings = encoder.embedding.weight.detach().numpy()

for item, idx in item_to_idx.items():
    print(f"{item}: {item_embeddings[idx]}")


2: [ 0.2615844  -1.3161895   0.9965863   1.6625717   0.8220307   0.08417652
  0.661044   -1.006521   -0.57543665  0.18044938  1.6613266  -0.59075713
 -2.1093686   0.06471614  0.04162313 -0.9657409 ]
3: [-0.6967705   1.6391177   0.37165177 -0.31896687 -0.355895    0.687585
 -0.04621823  0.19950086  0.25819337  0.98099893 -1.9195964   0.29367545
  0.35723275 -1.7912849   0.3399433  -0.14507237]
4: [ 0.51580256  1.6659971   0.6309046   0.42550132 -0.55456686 -0.28717357
 -0.49456608 -0.44163027 -1.4537501  -0.24531913  0.69581735  0.22170761
 -0.4926136   0.03335559 -0.873373   -1.9650414 ]
5: [-0.43726066  0.8414464  -0.11828526 -0.24339478 -0.09110421  0.2562376
  0.72337383  0.5510263   1.3880495  -0.63905096  0.05358011 -0.74416727
  2.1783967  -0.8012375  -0.44621587 -0.78003436]
6: [-0.34635475 -0.17426486  0.41508517 -0.23336384  0.59703726 -0.20651159
  0.65050673 -0.25144628 -1.0533308   0.57958394 -0.8385751  -2.23747
  1.6257656   0.31602296 -0.21768335  0.5967288 ]
7: [-1.3772

# SASRec BASED Recommendation System using learned embeddings

In [30]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from sklearn.metrics import ndcg_score

# Load and preprocess dataset
def load_and_preprocess(file_path, user_col="CUSTOMER_ID", order_col="ORDER_NUMBER", item_col="MATERIAL_NUMBER"):
    df = pd.read_csv(file_path)
    df = df.sort_values(by=[user_col, order_col])
    user_histories = defaultdict(list)

    for row in df.itertuples(index=False):
        user_histories[getattr(row, user_col)].append(getattr(row, item_col))

    train_data, test_data = {}, {}
    for user, items in user_histories.items():
        if len(items) < 4:  # Minimum sequence length
            continue
        train_data[user] = items[:-1]  # All except last
        test_data[user] = [items[-1]]  # Last item

    items = list(set(df[item_col]))
    item_to_idx = {item: idx for idx, item in enumerate(items)}
    idx_to_item = {idx: item for item, idx in item_to_idx.items()}
    return train_data, test_data, item_to_idx, idx_to_item

# Dataset class
class SASRecDataset(Dataset):
    def __init__(self, sequences, item_to_idx, max_len):
        self.samples = []
        self.max_len = max_len
        for user, seq in sequences.items():
            indexed = [item_to_idx[i] for i in seq if i in item_to_idx]
            if len(indexed) < 2:
                continue
            # Truncate to max_len and create samples
            for i in range(1, len(indexed)):
                input_seq = indexed[max(0, i-self.max_len):i]
                self.samples.append((input_seq, indexed[i]))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        seq, target = self.samples[idx]
        return torch.tensor(seq, dtype=torch.long), torch.tensor(target, dtype=torch.long)

# Collate function for padding
def collate_fn(batch):
    sequences, targets = zip(*batch)
    lengths = [len(seq) for seq in sequences]
    max_len = max(lengths)
    padded = torch.zeros(len(sequences), max_len, dtype=torch.long)
    for i, seq in enumerate(sequences):
        padded[i, -len(seq):] = seq
    return padded, torch.tensor(targets), torch.tensor(lengths)

# SASRec model with pre-trained embeddings
class SASRec(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_len, pretrained_embeddings, num_layers=2, num_heads=2, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.max_len = max_len
        # Load pre-trained item embeddings
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(pretrained_embeddings, dtype=torch.float), freeze=False
        )
        # Learnable positional embeddings
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, embed_dim))
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, dim_feedforward=embed_dim*4, dropout=dropout, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.output = nn.Linear(embed_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths):
        B, L = x.shape
        # Create padding mask: True for padded positions
        mask = torch.arange(L, device=x.device)[None, :] >= lengths[:, None]
        # Combine item and positional embeddings
        x = self.embedding(x) + self.pos_embedding[:, :L]
        x = self.dropout(x)
        # Transformer processing
        x = self.transformer(x, src_key_padding_mask=mask)
        # Predict next item using last position
        out = self.output(x[:, -1, :])
        return out

# Training function
def train_model(model, train_loader, criterion, optimizer, device, epochs=5):
    model.to(device)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x, y, lengths in train_loader:
            x, y, lengths = x.to(device), y.to(device), lengths.to(device)
            optimizer.zero_grad()
            logits = model(x, lengths)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation function
def evaluate(model, test_data, train_data, item_to_idx, idx_to_item, device, k_values=[5, 10]):
    model.eval()
    recall_k = {k: [] for k in k_values}
    ndcg_k = {k: [] for k in k_values}

    with torch.no_grad():
        for user in test_data:
            train_seq = [item_to_idx[i] for i in train_data[user] if i in item_to_idx]
            if not train_seq:
                continue
            # Truncate to max_len
            train_seq = train_seq[-model.max_len:]
            input_seq = torch.tensor(train_seq, dtype=torch.long).unsqueeze(0).to(device)
            lengths = torch.tensor([len(train_seq)], dtype=torch.long).to(device)
            logits = model(input_seq, lengths)
            probs = torch.softmax(logits, dim=1).cpu().numpy().flatten()
            top_preds = np.argsort(probs)[::-1]

            actual_idx = item_to_idx[test_data[user][0]] if test_data[user][0] in item_to_idx else None
            if actual_idx is None:
                continue

            for k in k_values:
                recall = int(actual_idx in top_preds[:k])
                recall_k[k].append(recall)

                true_relevance = np.zeros(len(probs))
                true_relevance[actual_idx] = 1
                ndcg = ndcg_score([true_relevance], [probs], k=k)
                ndcg_k[k].append(ndcg)

    for k in k_values:
        print(f"Recall@{k}: {np.mean(recall_k[k]):.4f}")
        print(f"NDCG@{k}: {np.mean(ndcg_k[k]):.4f}")

# Main execution
if __name__ == "__main__":
    # Configuration
    file_path = "Dunnhumby.csv"  # Replace with your dataset path
    max_len = 100  # Maximum sequence length
    embed_dim = 16  # Must match item_embeddings dimension (from RS_AR_CL.ipynb)
    batch_size = 64
    epochs = 10
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load dataset and preprocess
    train_data, test_data, item_to_idx, idx_to_item = load_and_preprocess(file_path)

    # Load pre-trained embeddings from RS_AR_CL.ipynb
    # REPLACE this with your actual item_embeddings from the notebook
    # Example: item_embeddings = encoder.embedding.weight.detach().numpy()
    # For demonstration, we create a dummy array (replace with your actual embeddings)
    vocab_size = len(item_to_idx)
    item_embeddings = np.random.randn(vocab_size, embed_dim)  # Shape: (8332, 16)
    
    # IMPORTANT: Ensure item_embeddings aligns with item_to_idx
    # If your item_to_idx from RS_AR_CL.ipynb differs, reindex embeddings:
    """
    old_item_to_idx = {...}  # From RS_AR_CL.ipynb
    new_embeddings = np.zeros((vocab_size, embed_dim))
    for item, idx in item_to_idx.items():
        if item in old_item_to_idx:
            old_idx = old_item_to_idx[item]
            new_embeddings[idx] = item_embeddings[old_idx]
    item_embeddings = new_embeddings
    """

    # Create dataset and dataloader
    train_dataset = SASRecDataset(train_data, item_to_idx, max_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    # Initialize model with pre-trained embeddings
    model = SASRec(
        vocab_size=vocab_size,
        embed_dim=embed_dim,
        max_len=max_len,
        pretrained_embeddings=item_embeddings,  # Pass pre-trained embeddings
        num_layers=2,
        num_heads=2,
        dropout=0.1
    )

    # Training setup
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Train model
    train_model(model, train_loader, criterion, optimizer, device, epochs)

    # Evaluate model
    evaluate(model, test_data, train_data, item_to_idx, idx_to_item, device)

Epoch 1, Loss: 7.9227
Epoch 2, Loss: 7.7390
Epoch 3, Loss: 7.5939
Epoch 4, Loss: 7.4301
Epoch 5, Loss: 7.3212
Epoch 6, Loss: 7.2530
Epoch 7, Loss: 7.2050
Epoch 8, Loss: 7.1692
Epoch 9, Loss: 7.1412
Epoch 10, Loss: 7.1191


  output = torch._nested_tensor_from_mask(


Recall@5: 0.1081
NDCG@5: 0.0817
Recall@10: 0.1357
NDCG@10: 0.0906


# Item-Based k-Nearest Neighbors (k-NN)

In [33]:
# import pandas as pd
# from collections import defaultdict
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# # 1. Load and preprocess
# df = pd.read_csv("Dunnhumby.csv")

# # Sort by time to split properly
# df = df.sort_values(by=["CUSTOMER_ID", "ORDER_NUMBER"])

# # Build interaction history
# user_histories = defaultdict(list)
# for row in df.itertuples(index=False):
#     user_histories[row.CUSTOMER_ID].append(row.MATERIAL_NUMBER)

# # Split each user history into train/test (leave last N as test)
# train_data, test_data = {}, {}
# for user, items in user_histories.items():
#     if len(items) < 3: continue  # skip very short sessions
#     split = int(0.8 * len(items))
#     train_data[user] = items[:split]
#     test_data[user] = items[split:]

# # Build item index
# items = df["MATERIAL_NUMBER"].unique().tolist()

# item_to_idx = {item: idx for idx, item in enumerate(items)}
# idx_to_item = {idx: item for item, idx in item_to_idx.items()}

# # 2. Compute recommendation candidates using learned embeddings
# item_embeddings = encoder.embedding.weight.detach().numpy()

# def recommend_knn(user_items, top_k=10):
#     vecs = [item_embeddings[item_to_idx[i]] for i in user_items if i in item_to_idx]
#     if not vecs: return []
#     profile = np.mean(vecs, axis=0).reshape(1, -1)
#     sim = cosine_similarity(profile, item_embeddings)[0]
    
#     ranked_indices = np.argsort(sim)[::-1]
#     recs = []
#     for idx in ranked_indices:
#         item = idx_to_item[idx]
#         if item not in user_items:
#             recs.append(item)
#         if len(recs) >= top_k:
#             break
#     return recs

# # 3. Evaluation metrics
# def recall_at_k(actual, predicted, k):
#     actual_set = set(actual)
#     pred_k = predicted[:k]
#     return len(set(pred_k) & actual_set) / len(actual_set)

# def ndcg_at_k(actual, predicted, k):
#     actual_set = set(actual)
#     dcg = 0.0
#     for i, p in enumerate(predicted[:k]):
#         if p in actual_set:
#             dcg += 1 / np.log2(i + 2)  # i+2 since log2(1) = 0
#     idcg = sum(1 / np.log2(i + 2) for i in range(min(len(actual_set), k)))
#     return dcg / idcg if idcg > 0 else 0.0

# # 4. Evaluate across users
# recall_5, recall_10, ndcg_5, ndcg_10 = [], [], [], []

# for user in test_data:
#     train_items = train_data.get(user, [])
#     test_items = test_data[user]

#     if not train_items or not test_items:
#         continue

#     preds = recommend_knn(train_items, top_k=20)

#     recall_5.append(recall_at_k(test_items, preds, 5))
#     recall_10.append(recall_at_k(test_items, preds, 10))
#     ndcg_5.append(ndcg_at_k(test_items, preds, 5))
#     ndcg_10.append(ndcg_at_k(test_items, preds, 10))

# print(f"Recall@5: {np.mean(recall_5):.4f}")
# print(f"Recall@10: {np.mean(recall_10):.4f}")
# print(f"NDCG@5: {np.mean(ndcg_5):.4f}")
# print(f"NDCG@10: {np.mean(ndcg_10):.4f}")
