Imports

In [1]:
from IPython.display import display, HTML
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [2]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [3]:
def get_embeddings(sentences):
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

  with torch.no_grad():
      model_output = model(**encoded_input)

  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

  return sentence_embeddings

In [4]:
def table(df):
    df_copy = df.head(300).copy()
    display(HTML("""
        <style>
            .dataframe-container {
                height: 400px;
                overflow: auto;
                width: 100%;
            }
            .dataframe {
                table-layout: fixed;
                width: 100%;
            }
            .dataframe td, .dataframe th {
                white-space: normal;
                word-wrap: break-word;
                word-break: break-word;
                max-width: 150px; /* Adjust this value as needed */
                overflow: hidden;
                text-overflow: ellipsis;
            }
        </style>
        <div class='dataframe-container'>
            """ + df_copy.style.set_table_attributes("class='dataframe'").to_html() + """
        </div>
    """))

In [5]:
data_dir = Path.cwd() / "data"
data_dir

PosixPath('/Users/lawrence/Documents/PYTHON/item_recommendations_retail/data')

In [6]:
files = list(data_dir.iterdir())
files

[PosixPath('/Users/lawrence/Documents/PYTHON/item_recommendations_retail/data/recommendations_main.csv'),
 PosixPath('/Users/lawrence/Documents/PYTHON/item_recommendations_retail/data/custs.csv'),
 PosixPath('/Users/lawrence/Documents/PYTHON/item_recommendations_retail/data/lift.csv'),
 PosixPath('/Users/lawrence/Documents/PYTHON/item_recommendations_retail/data/items.csv')]

In [7]:
custs = pd.read_csv(data_dir / "custs.csv")
items = pd.read_csv(data_dir / "items.csv")
lift = pd.read_csv(data_dir / "lift.csv")

## Items that will need placing into other categories

In [8]:
super_cats_with_less_than_four_items = items.groupby("SUPER_CAT").filter(lambda x: len(x) < 4).SUPER_CAT.unique()
super_cats_with_less_than_four_items

array(['042a172740f7b6de757d54c5c8ae417f4af70326caf49174e3daec81d6f29a8e',
       '7a5a5e180aa843f2bf55ba649d20145b799e432059079308da4c2fe4dadd8d58',
       '6ba7d494efdf1a5ab73cad55c6cf76b4c199372ebaf0828c9c1b77ec9e64cfa7',
       '16e07576851f55b4c8bf497b8ceaac92b9dbcd2f57dbbc2202f7315e334eecb1',
       'ea1b80d4a3ab48e7a83f0777c7bb851760fdfd644fa67f3234330eca37a22674',
       '06d2c2d77a6923df8f14ae34f3f6662bc20d1833ec3a623a7b7672e6b5f5d7be',
       '74c1a1e360223d09e349364e20f179e68c673045857fb3e6f642ac84e81d40fa',
       '128e9e0029ed6bd2c7b33fab214183b44a6719fa90b40fae65a7a6e4b57413ea',
       '3f7b2207bad71d03813ba16857e15f95259c820696203c97e51192ff6fd9c03d'],
      dtype=object)

In [9]:
items_in_super_cats_with_less_than_four_items = items[items["SUPER_CAT"].isin(super_cats_with_less_than_four_items)][['ITEM', 'ITEM_NAME']]
table(items_in_super_cats_with_less_than_four_items)

Unnamed: 0,ITEM,ITEM_NAME
18,d49da2240fcc6c8ce39e6121b992182b3d1b71e9356f4af92ffa13d601a3f369,Dairy Pride UHT Skimmed Milk 1l
19,865b7c05a2859d4bdc0e68e13a7ae03c41ec32940903ef2b82496ea4d1c9e76a,Dairy Pride Semi Skimmed UHT Milk 1ltr
20,5a7672762401c638088791e8d25fdb13c8dd46b84a270272f6317972fc292f8b,Rizla Regular Green x 5 Multi Pack
21,0d4376d8f3f3fbdd1b1d705f59a16dda694661c38bfe505951cfb746d4cf15d3,Swan Extra Slim Filter Tips X120
137,3b47ead8994dd9980ab2bd6fdfc9d667417e07de869e408815c9ee0998241810,LITTLE ONES DRY FIT 6+EP 30
147,5681e194e8db0231215b34aec6396b185890eba668de6ec5d2d978d490431252,TV TIMES 22/23
148,dd5f2227014e3cac07caa5fdc6e8651cb88104530d401c3cda67a303f5d92604,Stamford Street Strawberry &Vanilla Roll
149,56a6972024ec538baf8b0695aa9cc6fcf347f5dbe3c5238f7f6171b75fac81b3,Stamford Street Fairy Cakes X12
150,c62772b3bf43bafc30d89276129d4fc6e14ac824083eb9ff18df170eb6d1c5a9,Stamford Street Mini Rolls 102g
158,fcb4860524f4b3f5503a6f6096ade548f01e2374b5f082189b19ceb5eba54487,TAKE A BREAK


In [10]:
items_with_super_cat_more_than_four_items = items[~items["SUPER_CAT"].isin(super_cats_with_less_than_four_items)]
items_with_super_cat_more_than_four_items

Unnamed: 0,ITEM,QUANTITY,TRANSACTIONS,ITEM_NAME,SEGMENT,SUB_CAT,CAT,SUPER_CAT,ITEM_TOTAL_WEIGHT_OR_VOL_QTY
0,6446506d5cf328e5e4e0b47f3a2914a86d55b95e8b24f6...,4527.0,3354,Tena lady - extra plus duo pack x16,47fc30d2dfeaba94b1b61322cb705babf98b120dab4c89...,581a25a4ede6489bea182a647896302985576f54f040c4...,5a1f6e711a956a3d6246d6dbccf78ee28f12aeffc07f9d...,5df3e4da2395aef5c8b3267b7253bb118eae178abf3d68...,16.0
1,abc0c83eaef2366dbec3b35a478c81547fc451f3666347...,2557.0,2124,Tena Lady Mini X20,47fc30d2dfeaba94b1b61322cb705babf98b120dab4c89...,581a25a4ede6489bea182a647896302985576f54f040c4...,5a1f6e711a956a3d6246d6dbccf78ee28f12aeffc07f9d...,5df3e4da2395aef5c8b3267b7253bb118eae178abf3d68...,150.0
2,63f3da95314c90b3195468322b10677a17a43d92b4dddc...,2343.0,1847,Tena lady - extra duo pack x20,47fc30d2dfeaba94b1b61322cb705babf98b120dab4c89...,581a25a4ede6489bea182a647896302985576f54f040c4...,5a1f6e711a956a3d6246d6dbccf78ee28f12aeffc07f9d...,5df3e4da2395aef5c8b3267b7253bb118eae178abf3d68...,512.0
3,232247fa7953669d85b54088b2f236766342d1ce96f978...,3679.0,1844,TENA LADY MAXI NIGHT x6,47fc30d2dfeaba94b1b61322cb705babf98b120dab4c89...,581a25a4ede6489bea182a647896302985576f54f040c4...,5a1f6e711a956a3d6246d6dbccf78ee28f12aeffc07f9d...,5df3e4da2395aef5c8b3267b7253bb118eae178abf3d68...,258.0
4,cb5142698732aa569c90c8762ebdb0a7ed40330fa1abc6...,3348.0,2680,Tena Lady Discreet Normal x16pc,47fc30d2dfeaba94b1b61322cb705babf98b120dab4c89...,581a25a4ede6489bea182a647896302985576f54f040c4...,5a1f6e711a956a3d6246d6dbccf78ee28f12aeffc07f9d...,5df3e4da2395aef5c8b3267b7253bb118eae178abf3d68...,160.0
...,...,...,...,...,...,...,...,...,...
390,fd8c70d6b32291b152aa5cfc47f567ab8f5b0c2af4bcf3...,7226.0,6230,NESCAFE ORIGINAL COFFEE 300G,0f8cbbfebb0c691320c5fd068ca200919612b65ac86047...,4914e72d40087b6aeb394d98b1da861cf9f3bb741e3b66...,3e1d75f4aa1ecc2e7186f84b8526fc783dd9be494ae104...,d7a0999920f6139b0eb50b75a58a373b41b30098394b4d...,300.0
391,88b3eb994e047ecd49628eb0fbccb6100b6baaff8c7702...,2452.0,1282,Nescafe Original 3in1 102g,59114c66e1118846d237601e9bb65c3fe0743bc70ed6f3...,4914e72d40087b6aeb394d98b1da861cf9f3bb741e3b66...,3e1d75f4aa1ecc2e7186f84b8526fc783dd9be494ae104...,d7a0999920f6139b0eb50b75a58a373b41b30098394b4d...,102.0
392,78ada47bdcba69f1ab471d5d4df3d5a22a42f04a38aebb...,2545.0,1611,Nescafe Cafe Irish Cream Latte 158g,fa61bb6fbc53df0d5ab65d0e552c318ff5430001db98da...,4914e72d40087b6aeb394d98b1da861cf9f3bb741e3b66...,3e1d75f4aa1ecc2e7186f84b8526fc783dd9be494ae104...,d7a0999920f6139b0eb50b75a58a373b41b30098394b4d...,158.4
393,f7c33ba71e42717c7ebc73cc4cab5b6383e548fa0d5c90...,2069.0,1964,L'OR Intense Coffee 150g,8843def33b0e8b5d98e3dd7aa11631e32cdd38f0bf8c69...,4914e72d40087b6aeb394d98b1da861cf9f3bb741e3b66...,3e1d75f4aa1ecc2e7186f84b8526fc783dd9be494ae104...,d7a0999920f6139b0eb50b75a58a373b41b30098394b4d...,150.0


### I will now iterate through catgories and for each item return the items of the category until aquiring four items by transaction ammount

In [11]:
item_with_four_recommendations_dict = {}

for item in items_with_super_cat_more_than_four_items["ITEM_NAME"].unique():
    recommended_items = []
    

    item_segment = items_with_super_cat_more_than_four_items[
        items_with_super_cat_more_than_four_items["ITEM_NAME"] == item
    ].sort_values('TRANSACTIONS', ascending=False)["SEGMENT"].values[0]
    
    segment_items = items_with_super_cat_more_than_four_items[
        items_with_super_cat_more_than_four_items["SEGMENT"] == item_segment
    ]
    for segment_item in segment_items["ITEM_NAME"]:
        if segment_item not in recommended_items:
            recommended_items.append(segment_item)
            if len(recommended_items) >= 5:
                break

    if len(recommended_items) < 5:
        sub_cat = items_with_super_cat_more_than_four_items[
            items_with_super_cat_more_than_four_items["ITEM_NAME"] == item
        ].sort_values('TRANSACTIONS', ascending=False)["SUB_CAT"].values[0]
        sub_cat_items = items_with_super_cat_more_than_four_items[
            items_with_super_cat_more_than_four_items["SUB_CAT"] == sub_cat
        ].sort_values('TRANSACTIONS', ascending=False)["ITEM_NAME"].values
        for sub_cat_item in sub_cat_items:
            if sub_cat_item not in recommended_items:
                recommended_items.append(sub_cat_item)
                if len(recommended_items) >= 5:
                    break

    if len(recommended_items) < 5:
        cat = items_with_super_cat_more_than_four_items[
            items_with_super_cat_more_than_four_items["ITEM_NAME"] == item
        ].sort_values('TRANSACTIONS', ascending=False)["CAT"].values[0]
        cat_items = items_with_super_cat_more_than_four_items[
            items_with_super_cat_more_than_four_items["CAT"] == cat
        ].sort_values('TRANSACTIONS', ascending=False)["ITEM_NAME"].values
        for cat_item in cat_items:
            if cat_item not in recommended_items:
                recommended_items.append(cat_item)
                if len(recommended_items) >= 5:
                    break

    if len(recommended_items) < 5:
        super_cat = items_with_super_cat_more_than_four_items[
            items_with_super_cat_more_than_four_items["ITEM_NAME"] == item
        ].sort_values('TRANSACTIONS', ascending=False)["SUPER_CAT"].values[0]
        super_cat_items = items_with_super_cat_more_than_four_items[
            items_with_super_cat_more_than_four_items["SUPER_CAT"] == super_cat
        ].sort_values('TRANSACTIONS', ascending=False)["ITEM_NAME"].values
        for super_cat_item in super_cat_items:
            if super_cat_item not in recommended_items:
                recommended_items.append(super_cat_item)
                if len(recommended_items) >= 5:
                    break

    item_for_dict = item
    list_for_dict = recommended_items[1:]
    
    item_with_four_recommendations_dict[item_for_dict] = list_for_dict

    print("Item: ", item_for_dict, "Recommended Items: ", item_with_four_recommendations_dict[item_for_dict], "Number of items: ", len(item_with_four_recommendations_dict[item_for_dict]))

Item:  Tena lady - extra plus duo pack x16 Recommended Items:  ['Tena Lady Mini X20', 'Tena lady - extra duo pack x20', 'TENA LADY MAXI NIGHT x6', 'Tena Lady Discreet Normal x16pc'] Number of items:  4
Item:  Tena Lady Mini X20 Recommended Items:  ['Tena Lady Mini X20', 'Tena lady - extra duo pack x20', 'TENA LADY MAXI NIGHT x6', 'Tena Lady Discreet Normal x16pc'] Number of items:  4
Item:  Tena lady - extra duo pack x20 Recommended Items:  ['Tena Lady Mini X20', 'Tena lady - extra duo pack x20', 'TENA LADY MAXI NIGHT x6', 'Tena Lady Discreet Normal x16pc'] Number of items:  4
Item:  TENA LADY MAXI NIGHT x6 Recommended Items:  ['Tena Lady Mini X20', 'Tena lady - extra duo pack x20', 'TENA LADY MAXI NIGHT x6', 'Tena Lady Discreet Normal x16pc'] Number of items:  4
Item:  Tena Lady Discreet Normal x16pc Recommended Items:  ['Tena Lady Mini X20', 'Tena lady - extra duo pack x20', 'TENA LADY MAXI NIGHT x6', 'Tena Lady Discreet Normal x16pc'] Number of items:  4
Item:  TENA Lady Discreet No

so now we can take our items with not enough super cat and use cosine similarity to find an item that is similar and copy its recommendations

In [12]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] 
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

In [13]:
def get_embeddings(sentences):
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

  with torch.no_grad():
      model_output = model(**encoded_input)

  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

  return sentence_embeddings

In [14]:
def remove_company_names(df, column, company_names, new_column=None):
    copy_df = df.copy()
    cleaned_column = copy_df[column].copy()
    for name in company_names:
        cleaned_column = cleaned_column.str.replace(name, '', regex=False).str.strip()
    
    if new_column:
        copy_df[new_column] = cleaned_column
    else:
        copy_df[column] = cleaned_column
    
    return copy_df


company_names = ['Stamford Street', 'JS ', 'SSTC', 'Stamford St', 'Sainsburys']


In [15]:
items_in_super_cats_with_less_than_four_items = remove_company_names(
    items_in_super_cats_with_less_than_four_items, 'ITEM_NAME', company_names, 'ITEM_NAME_CLEANED'
)
items_with_super_cat_more_than_four_items = remove_company_names(
    items_with_super_cat_more_than_four_items, 'ITEM_NAME', company_names, 'ITEM_NAME_CLEANED'
)

In [16]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

embed_df = items_with_super_cat_more_than_four_items.copy()

product_embeddings = model.encode(items_with_super_cat_more_than_four_items['ITEM_NAME_CLEANED'].tolist())

In [17]:
def get_recommendations(query, embeddings, df, top_n=200):
    copy_df = df.copy()
    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, embeddings)
    top_indices = similarities[0].argsort()[-top_n:][::-1]
    copy_df['COSINE_SIMILARITY'] = similarities[0]
    return copy_df.iloc[top_indices]

In [18]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

for item_for_swap in items_in_super_cats_with_less_than_four_items['ITEM_NAME_CLEANED'].values:
    query = item_for_swap
    recommendations = get_recommendations(query, product_embeddings, embed_df, top_n=1)
    top_cosine_item = recommendations['ITEM_NAME'].values[0]

    copied_recomendations = item_with_four_recommendations_dict[top_cosine_item]

    item_for_dict = items_in_super_cats_with_less_than_four_items[items_in_super_cats_with_less_than_four_items['ITEM_NAME_CLEANED'] == item_for_swap]['ITEM_NAME'].values[0]
    list_for_dict = copied_recomendations
    
    item_with_four_recommendations_dict[item_for_dict] = list_for_dict

    print("\nItem for swap: ", item_for_dict, 
          "\nTaking Recommendations From: ", top_cosine_item, 
          "\nRecommended Items: ", list_for_dict)


Item for swap:  Dairy Pride UHT Skimmed Milk 1l 
Taking Recommendations From:  JS Scottish Skimmed Milk 2 Pint 1.136L 
Recommended Items:  ['SSTC SOFT CREAM CHEESE 200G', 'JS Butterlicious 500g', 'SSTC GREEK STYLE SALAD CHEESE 200G', 'Anchor Spreadable 250G']

Item for swap:  Dairy Pride Semi Skimmed UHT Milk 1ltr 
Taking Recommendations From:  JS Scottish Skimmed Milk 2 Pint 1.136L 
Recommended Items:  ['SSTC SOFT CREAM CHEESE 200G', 'JS Butterlicious 500g', 'SSTC GREEK STYLE SALAD CHEESE 200G', 'Anchor Spreadable 250G']

Item for swap:  Rizla Regular Green x 5 Multi Pack 
Taking Recommendations From:  Tena lady - extra plus duo pack x16 
Recommended Items:  ['Tena Lady Mini X20', 'Tena lady - extra duo pack x20', 'TENA LADY MAXI NIGHT x6', 'Tena Lady Discreet Normal x16pc']

Item for swap:  Swan Extra Slim Filter Tips X120 
Taking Recommendations From:  Strongbow 18x440ml 
Recommended Items:  ['JS Original Dry Cider 4x440ml', 'JS Original Dry Cider 2lt', 'Thatchers Gold 10x440ml', '

In [19]:
recommendations_df = pd.DataFrame.from_dict(item_with_four_recommendations_dict, orient='index')

recommendations_df.columns = [f"RECOMMENDATION {i+1}" for i in range(recommendations_df.shape[1])]

recommendations_df = recommendations_df.reset_index().rename(columns={'index': 'ITEM_NAME'})

recommendations_df

Unnamed: 0,ITEM_NAME,RECOMMENDATION 1,RECOMMENDATION 2,RECOMMENDATION 3,RECOMMENDATION 4
0,Tena lady - extra plus duo pack x16,Tena Lady Mini X20,Tena lady - extra duo pack x20,TENA LADY MAXI NIGHT x6,Tena Lady Discreet Normal x16pc
1,Tena Lady Mini X20,Tena Lady Mini X20,Tena lady - extra duo pack x20,TENA LADY MAXI NIGHT x6,Tena Lady Discreet Normal x16pc
2,Tena lady - extra duo pack x20,Tena Lady Mini X20,Tena lady - extra duo pack x20,TENA LADY MAXI NIGHT x6,Tena Lady Discreet Normal x16pc
3,TENA LADY MAXI NIGHT x6,Tena Lady Mini X20,Tena lady - extra duo pack x20,TENA LADY MAXI NIGHT x6,Tena Lady Discreet Normal x16pc
4,Tena Lady Discreet Normal x16pc,Tena Lady Mini X20,Tena lady - extra duo pack x20,TENA LADY MAXI NIGHT x6,Tena Lady Discreet Normal x16pc
...,...,...,...,...,...
390,Stamford St White Fish Fillets 520g,JS Roast Lamb Dinner 400g,JS chicken hotpot dinner 400g,Stamford St Large Pork Sausages 1kg,Stamford St Beef Burgers x8 397g
391,Comfort creations strawberry lily 30w,Hubbard's Peach Slices 411g,Hubbards Spaghetti Rings 400g,Hubbard's Strawberry Jam 454g,Hubbards Peanut Butter 340g
392,Stamford St Mixed Portions Pack 2kg,Stamford Street Peas 850g,Stamford Street Sweetcorn 950g,Stamford St Large Pork Sausages 1kg,JS Rocket Lollies 8pkt 464ml
393,Js Frozen Chicken Liver Pots 250g,Whiskas temptations chicken & cheese 60g,Webbox chckn & liver tasty sticks x6 30g,Dreamies cat treats chicken 60g,Webbox turkey & lamb tasty sticks x6 30g
