In [2]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from collections import defaultdict, Counter

In [4]:
# Dataset (purchase logs)
purchase_history = [
    ("John", ["Milk", "Bread", "Napkin", "Butter", "Table salt"]),
    ("Mary", ["Lipstick", "Facewash", "Hair color", "Nail polish", "Bread"]),
    ("Ram", ["Rice", "Sugar", "Garam masala", "potato", "onion"]),
    ("Raj", ["Tea", "Milk", "wafers", "Chips", "nuts"]),
    ("Gita", ["Tomato", "Onion", "Cooking Oil", "Tur dal", "sugar"]),
    ("Raj", ["Bread", "Chips", "Sauce", "Pepsi", "Milk"]),
    ("Mary", ["Talcum Powder", "Fair & Lovely", "Nail cutter", "Ribbons", "Napkin"]),
    ("John", ["Onion", "Tea", "Milk", "Butter", "jam"]),
    ("Ram", ["Tur dal", "Tamarind", "Sugar", "pumpkin", "Milk"]),
    ("Raj", ["Noodles", "chips", "nuts", "wafers", "Tomato"]),
    ("Gita", ["Milk Powder", "Bread", "Napkin", "Butter", "Milk", "Table salt"]),
    ("Mary", ["Ribbon", "Body Wash", "Liquid Soap", "Nail polish", "Floor Cleaner"]),
    ("Ram", ["Cake", "Floor Cleaner", "Garam masala", "potato", "onion"]),
    ("Raj", ["Tea", "Milk", "wafers", "Chips", "nuts"]),
    ("John", ["Tomato", "Onion", "Floor Cleaner", "Tur dal", "sugar"]),
    ("Raj", ["Bread", "Chips", "Sauce", "Pepsi", "Milk"]),
    ("Gita", ["Talcum Powder", "Fair & Lovely", "grapes", "Apple", "Napkin"]),
    ("John", ["Onion", "Floor Cleaner", "Milk", "Butter", "jam"]),
    ("Mary", ["Tur dal", "Tamarind", "Sugar", "pumpkin", "Milk"]),
    ("Gita", ["Noodles", "chips", "nuts", "wafers", "Tomato"]),
    ("Raj", ["Apple", "Milk", "wafers", "Chips", "nuts"]),
    ("John", ["grapes", "Onion", "Cooking Oil", "Tur dal", "sugar"]),
    ("Gita", ["Apple", "Chips", "Sauce", "Pepsi", "Milk"]),
    ("Ram", ["Fair & Lovely", "Talcum Powder", "Nail cutter", "Ribbons", "Napkin"]),
    ("John", ["Onion", "Tea", "Milk", "Butter", "jam"]),
    ("Mary", ["Tur dal", "Floor Cleaner", "Sugar", "grapes", "Milk"]),
    ("Raj", ["Noodles", "chips", "nuts", "wafers", "Tomato"]),
    ("Raj", ["Tea", "Milk", "wafers", "Chips", "nuts"]),
    ("John", ["Tomato", "Floor Cleaner", "Cooking Oil", "Tur dal", "sugar"]),
    ("Mary", ["Tur dal", "Tamarind", "Sugar", "pumpkin", "Apple"]),
]

In [5]:
# Count item popularity
all_items = [item for _, items in purchase_history for item in items]
item_counts = Counter(all_items)
top_10_items = [item for item, _ in item_counts.most_common(10)]
top_10_items

['Milk',
 'Tur dal',
 'wafers',
 'Chips',
 'nuts',
 'Tomato',
 'Onion',
 'Floor Cleaner',
 'Bread',
 'Napkin']

In [6]:
#  Build user-item frequency table
user_item_freq = defaultdict(lambda: defaultdict(int))

for user, items in purchase_history:
    for item in items:
        if item in top_10_items:
            user_item_freq[user][item] += 1

In [13]:
for item in top_10_items:
    print(f"{item}: {sum(user_item_freq[user][item] for user in user_item_freq)}")

Milk: 15
Tur dal: 8
wafers: 7
Chips: 7
nuts: 7
Tomato: 6
Onion: 6
Floor Cleaner: 6
Bread: 5
Napkin: 5


In [11]:
# Convert frequency to ratings (cap at 5)
user_item_matrix = pd.DataFrame(
    index=tuple(set([u for u, _ in purchase_history])), columns=top_10_items
)

for user in user_item_matrix.index:
    for item in top_10_items:
        freq = user_item_freq[user][item]
        user_item_matrix.loc[user, item] = min(freq, 5) if freq > 0 else 0

user_item_matrix = user_item_matrix.fillna(0).astype(int)

print("Top 10 Items:", top_10_items)
print("\nUser-Item Interaction Matrix:")
print(user_item_matrix)

Top 10 Items: ['Milk', 'Tur dal', 'wafers', 'Chips', 'nuts', 'Tomato', 'Onion', 'Floor Cleaner', 'Bread', 'Napkin']

User-Item Interaction Matrix:
      Milk  Tur dal  wafers  Chips  nuts  Tomato  Onion  Floor Cleaner  Bread  \
Gita     2        1       1      1     1       2      1              0      1   
Mary     2        3       0      0     0       0      0              2      1   
Raj      5        0       5      5     5       2      0              0      2   
John     4        3       0      0     0       2      5              3      1   
Ram      1        1       0      0     0       0      0              1      0   

      Napkin  
Gita       2  
Mary       1  
Raj        0  
John       1  
Ram        1  


  user_item_matrix = user_item_matrix.fillna(0).astype(int)


Normalized Ratings Range Cap 5

In [17]:
# Find max purchase count per user (for normalization)
user_max_freq = {
    user: max(freqs.values()) if freqs else 1 for user, freqs in user_item_freq.items()
}

# Normalized rating calculation [0, 1] range
user_item_normalized = pd.DataFrame(
    index=tuple(set([u for u, _ in purchase_history])), columns=top_10_items
)

for user in user_item_normalized.index:
    max_count = user_max_freq[user]
    for item in top_10_items:
        freq = user_item_freq[user][item]
        user_item_normalized.loc[user, item] = (
            round(freq / max_count, 2) * 5 if max_count > 0 else 0
        )

user_item_normalized = user_item_normalized.fillna(0).astype(float)

print(user_item_normalized)

      Milk  Tur dal  wafers  Chips  nuts  Tomato  Onion  Floor Cleaner  Bread  \
Gita  5.00      2.5     2.5    2.5   2.5    5.00    2.5           0.00   2.50   
Mary  3.35      5.0     0.0    0.0   0.0    0.00    0.0           3.35   1.65   
Raj   5.00      0.0     5.0    5.0   5.0    1.65    0.0           0.00   1.65   
John  4.00      3.0     0.0    0.0   0.0    2.00    5.0           3.00   1.00   
Ram   5.00      5.0     0.0    0.0   0.0    0.00    0.0           5.00   0.00   

      Napkin  
Gita    5.00  
Mary    1.65  
Raj     0.00  
John    1.00  
Ram     5.00  


  user_item_normalized = user_item_normalized.fillna(0).astype(float)
