# Supermarket Items Order Model

This notebook is an experimentation space for training a model that sorts the items of a supermarket shopping list into the optimal pick-up order.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import time
from datetime import datetime
import random
import seaborn as sns
import warnings
from IPython.display import clear_output
from itertools import combinations

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

import boto3
from botocore.exceptions import ClientError
import json

client = boto3.client("bedrock-runtime", region_name="eu-west-1")

import tensorflow as tf
import keras
# from tensorflow.keras import Sequential
# from tensorflow.keras.layers import Dense, Input, Flatten, Concatenate, Dropout
# from tensorflow.keras.models import Model

from tqdm.notebook import tqdm
from scipy.stats import uniform, randint, boxcox

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Silence FutureWarning and UserWarning messages for the rest of the notebook.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=UserWarning)

# Load the data
archived_lists = pd.read_json('/home/sagemaker-user/ml-notebooks/toto/supermarket/item-order/20241025-archivedLists.json')
game_examples = pd.read_json('/home/sagemaker-user/ml-notebooks/toto/supermarket/item-order/20241025-trainingExamples.json')

# Basic variables that are going to be used later
target_var = 'before'  # name of the label column the models are trained to predict
id_var = '_id'         # identifier column, dropped from both datasets right below

# The identifier column is removed in place from both raw datasets.
archived_lists.drop(columns=[id_var], inplace=True)
game_examples.drop(columns=[id_var], inplace=True)

In [None]:
archived_lists.head(2)

In [None]:
game_examples.head(2)

---
# Bedrock Experimentation

In [None]:
class Llama:
    """Thin wrapper that sends a single prompt to a Llama model through the Bedrock runtime client."""

    model_id = "eu.meta.llama3-2-3b-instruct-v1:0"

    def invoke(self, prompt: str) -> str:
        """Send `prompt` as one user turn and return the generated text.

        Args:
            prompt: The user message to send to the model.

        Returns:
            The value of the "generation" field of the model response.

        Raises:
            RuntimeError: If the call or the decoding of its response fails.
                The original exception is chained as the cause, so the caller
                (or the notebook) sees a normal traceback instead of having
                the interpreter terminated.
        """
        # Wrap the message in the header / end-of-turn tokens for one user turn,
        # followed by an open assistant header.
        formatted_prompt = f"""
        <|begin_of_text|><|start_header_id|>user<|end_header_id|>
        {prompt}
        <|eot_id|>
        <|start_header_id|>assistant<|end_header_id|>
        """

        native_request = {
            "prompt": formatted_prompt,
            "max_gen_len": 2000,
            "temperature": 0,
        }

        try:
            # Invoke the model with the JSON-encoded request.
            response = client.invoke_model(
                modelId=self.model_id,
                body=json.dumps(native_request),
            )

            # Decode the response body and extract the generated text.
            model_response = json.loads(response["body"].read())
            return model_response["generation"]

        except Exception as e:
            # Boundary handler: any failure (service error, malformed body,
            # missing key) is reported as one catchable error type.
            raise RuntimeError(
                f"ERROR: Can't invoke '{self.model_id}'. Reason: {e}"
            ) from e

In [None]:
class Claude:
    """Thin wrapper that sends a single prompt to a Claude model through the Bedrock runtime client."""

    model_id = 'eu.anthropic.claude-3-5-sonnet-20240620-v1:0'

    def invoke(self, prompt: str) -> str:
        """Send `prompt` as one user message and return the text of the reply.

        Args:
            prompt: The user message to send to the model.

        Returns:
            The text of the first content element of the reply message.

        Raises:
            RuntimeError: If the call fails or the response does not have the
                expected structure. The original exception is chained as the
                cause, so the caller (or the notebook) sees a normal traceback
                instead of having the interpreter terminated.
        """
        # Start a conversation with the user message.
        conversation = [
            {
                "role": "user",
                "content": [{"text": prompt}],
            }
        ]

        try:
            # Send the message to the model, using a basic inference configuration.
            response = client.converse(
                modelId=self.model_id,
                messages=conversation,
                inferenceConfig={"maxTokens": 2000, "temperature": 0, "topP": 0.9},
            )

            # Extract the response text.
            return response["output"]["message"]["content"][0]["text"]

        except Exception as e:
            # Boundary handler: any failure (service error, unexpected response
            # structure) is reported as one catchable error type.
            raise RuntimeError(
                f"ERROR: Can't invoke '{self.model_id}'. Reason: {e}"
            ) from e

In [None]:
import ast 

class ItemClassifier:
    """Assigns supermarket items to one of a fixed set of categories by prompting Claude."""

    # Category name -> example items. The whole mapping is embedded in every prompt.
    categories = {
        'Produce': ['apples', 'bananas', 'lettuce', 'tomatoes', 'sweet potato', 'carrots'],
        'Dairy': ['milk', 'cheese', 'yogurt', 'butter', 'mozzarella', 'ricotta'],
        'Bakery': ['bread', 'bagels', 'muffins', 'cakes', 'wasa'],
        'Meat': ['chicken', 'beef', 'pork', 'sausage', 'medister', 'bacon'],
        'Seafood': ['salmon', 'tuna', 'shrimp', 'cod'],
        'Frozen Foods': ['ice cream', 'frozen pizza', 'frozen vegetables', 'frozen bread'],
        'Canned Goods': ['soup', 'beans', 'corn', 'tomato sauce'],
        'Pasta and Rice': ['spaghetti', 'penne', 'rice', 'couscous'],
        'Condiments and Sauces': ['ketchup', 'mustard', 'mayo', 'salad dressing'],
        'Oils and Vinegars': ['olive oil', 'vegetable oil', 'balsamic vinegar'],
        'Snacks': ['chips', 'pretzels', 'popcorn', 'nuts'],
        'Breakfast Foods': ['cereal', 'oatmeal', 'musli', 'pancake mix'],
        'Beverages': ['soda', 'juice', 'water', 'coffee', 'havremælk'],
        'Baking Supplies': ['flour', 'sugar', 'baking powder', 'vanilla extract'],
        'Spreads': ['jam', 'honey', 'peanut butter', 'nutella'],
        'Pet Supplies': ['dog food', 'cat litter', 'pet toys', 'dog bags'],
        'Personal Care': ['shampoo', 'soap', 'toothpaste', 'deodorant'],
        'Household Items': ['paper towels', 'toilet paper', 'trash bags', 'diapers'],
        'Deli': ['sliced meats', 'olives', 'hummus', 'prepared salads'],
        'International Foods': ['salsa', 'soy sauce', 'curry paste', 'tortillas']
    }

    def categorize(self, item: str):
        """Ask the model for the category of a single item.

        Args:
            item: Name of the supermarket item.

        Returns:
            The raw text returned by the model (the prompt asks for the
            category name only, or 'unknown').
        """
        prompt = f"""
        The following are supermarket items' categories. Each category has some examples of typical items that would belong in that category. 
        ---
        CATEGORIES:
        {self.categories}
        ---
        
        I need you to pick the most probable category for the following item: '{item}'.
        You can only consider categories that are among the ones that I provided above. 
        
        If you cannot assign a category, return the category 'unknown'.

        Only return the name of the category. Nothing else. 
        """

        return Claude().invoke(prompt)

    def batch_categorize(self, items, return_as: str = None):
        """Ask the model for the categories of several items in one request.

        Args:
            items: Iterable of item names (joined with ", " into the prompt).
            return_as: Controls the requested output format:
                - None (default): the model is asked for a list of
                  (item, category) tuples and its raw text is returned.
                - 'array': the reply is parsed into a list of categories.
                - 'dict': the reply is parsed into an {item: category} dict.

        Returns:
            The raw model text, or the parsed Python literal for 'array' / 'dict'.

        Raises:
            ValueError: If a parsed format was requested but no Python literal
                could be extracted from the reply.
        """
        instructions = """
        For each item, return a tuple in this format: 
        (item, category)
        'category' must be a string 
        'item' must be the item name

        Do not return any text except the list of tuples. The list of tuples must be formatted as follows: 
        [tuple1, tuple2, ...]
        """

        if return_as == 'array':
            instructions = """
            Return the categories as a list. 
            Do not return any text except the list of categories, which must be formatted as follows: 
            ['category1', 'category2', ...]
            """
        elif return_as == 'dict':
            instructions = """
            Return the categories as a dict in this format: 
            {'item1': 'category_of_item1', 'item2': 'category_of_item2', ...}

            Do not return any text except the dict as specified above. 
            """

        items_string = ", ".join(items)

        prompt = f"""
        The following are supermarket items' categories. Each category has some examples of typical items that would belong in that category. 
        ---
        CATEGORIES:
        {self.categories}
        ---
        
        I need you to pick the most probable category for the following items.
        ---
        ITEMS:
        [{items_string}].
        ---
        You can only consider categories that are among the ones that I provided above. 
        
        If you cannot assign a category, return the category 'unknown'.

        {instructions}
        """

        result = Claude().invoke(prompt)

        if return_as in ('array', 'dict'):
            return self._parse_literal(result)

        return result

    @staticmethod
    def _parse_literal(text):
        """Parse a Python literal out of a model reply.

        The whole reply is tried first, so a clean reply is parsed exactly as
        before. If that fails (for example because the model wrapped the
        literal in a code fence or added a sentence around it), the outermost
        ``[...]`` / ``{...}`` spans are tried in order of appearance.
        ``ast.literal_eval`` only evaluates literals, never arbitrary code.

        Raises:
            ValueError: If no span of the reply parses as a literal.
        """
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            pass

        spans = []
        for opener, closer in (('[', ']'), ('{', '}')):
            start, end = text.find(opener), text.rfind(closer)
            if start != -1 and end > start:
                spans.append((start, end))

        # Earliest opener first: that is the outermost container in the reply.
        for start, end in sorted(spans):
            try:
                return ast.literal_eval(text[start:end + 1])
            except (ValueError, SyntaxError):
                continue

        raise ValueError(f"Could not parse a list or dict from the model response: {text!r}")
        

In [None]:
ItemClassifier().categorize('eggs')

In [None]:
ItemClassifier().batch_categorize(['eggs', 'nutella', 'peanut butter', 'letmælk', 'latte', 'medister', 'bacon i tern', 'bacon', 'pesto'], return_as='dict')

---
# Data Preparation & Feature Engineering

## Unifying Datasets

In [None]:
def convert_archived_lists_to_examples(dataset):
    """Turn archived shopping lists into pairwise ordering examples.

    Rows are grouped by "listId" and each group is sorted by "userIndex".
    Every pair of rows of a group, taken in sorted order, yields one example
    with the two item names, a "before" label (1 when the first item has a
    strictly smaller "userIndex" than the second, else 0) and the
    "supermarketId" of the first item.

    Returns:
        A DataFrame with columns item1, item2, before, supermarket_id
        (an empty DataFrame when no pair exists).
    """
    examples = []

    for _, shopping_list in dataset.groupby("listId"):
        # Pick-up order of the list.
        ordered = shopping_list.sort_values("userIndex")
        names = ordered["name"].tolist()
        positions = ordered["userIndex"].tolist()
        supermarkets = ordered["supermarketId"].tolist()

        # All pairs (first, second) with first preceding second in sorted order.
        for first, second in combinations(range(len(names)), 2):
            examples.append({
                "item1": names[first],
                "item2": names[second],
                "before": 1 if positions[first] < positions[second] else 0,
                "supermarket_id": supermarkets[first],
            })

    return pd.DataFrame(examples)

In [None]:
def prepare_game_examples(dataset):
    """Bring the game examples to the same columns as the archived-list examples.

    Adds a binary "before" column (1 when "label" equals 'before', else 0),
    copies "supermarketId" into "supermarket_id", and drops the
    "supermarketId", "label" and "date" columns. The input is not modified.
    """
    def _to_flag(label):
        return 1 if label == 'before' else 0

    prepared = dataset.assign(
        before=dataset['label'].apply(_to_flag),
        supermarket_id=dataset['supermarketId'],
    )

    return prepared.drop(columns=['supermarketId', 'label', 'date'])

In [None]:
def unite_and_balance_training_examples(archived_lists=archived_lists, game_examples=game_examples):
    """Merge both example sources and mirror the positive pairs to balance the labels.

    The archived lists are converted to pairwise examples, the game examples
    are brought to the same columns, and the two are stacked. For every row
    with before = 1 a mirrored row (item1 and item2 swapped, before = 0) is
    appended; without this the dataset is extremely unbalanced.

    Note: the defaults are the module-level DataFrames as they were when this
    function was defined.

    Returns:
        The stacked examples followed by the mirrored ones, with a fresh index.
    """
    # 1-3. Convert / prepare both sources and stack them.
    combined = pd.concat(
        [
            convert_archived_lists_to_examples(archived_lists),
            prepare_game_examples(game_examples),
        ],
        axis=0,
    )

    # 4. Mirror every positive pair into a negative one.
    positives = combined[combined["before"] == 1]
    mirrored = positives.assign(
        item1=positives["item2"],
        item2=positives["item1"],
        before=0,
    )

    # 5. Unite.
    return pd.concat([combined, mirrored], ignore_index=True, axis=0)

In [None]:
# Build the combined, rebalanced pairwise dataset and plot how many rows fall in each 'before' class.
df = unite_and_balance_training_examples()
plt.figure(figsize=(4, 3))
sns.countplot(df, x='before')

## Data Cleaning

In [1]:
def remove_useless_words(df):
    """Strip filler words (e.g. 'c', 'big', 'for', ...) from the item names.

    Each value of 'item1' and 'item2' is split on whitespace, words whose
    lower-cased form is in the filler list are dropped, and the rest is
    re-joined with single spaces. The columns are overwritten on the given
    DataFrame, which is also returned.
    """
    filler_words = {
        'c', 'n', 'noah', 'for', 'us', 'x2', 'big', 'pack', 'greek', 'or',
        'something', 'caro', 'sweet', 'p', 'small',
    }

    def _strip_fillers(text):
        kept = (token for token in text.split() if token.lower() not in filler_words)
        return ' '.join(kept)

    for column in ('item1', 'item2'):
        df[column] = df[column].apply(_strip_fillers)

    return df

In [None]:
def remove_rows_with_long_items(df):
    """Keep only rows where item1 and item2 each have fewer than 3 whitespace-separated words.

    The columns are filtered one after the other; the input is not modified.
    """
    def _is_short(name):
        return len(name.split()) < 3

    kept = df
    for column in ("item1", "item2"):
        kept = kept[kept[column].apply(_is_short)]

    return kept

In [None]:
def lower_case_of_items(df):
    """Lower-case the 'item1' and 'item2' columns on the given DataFrame and return it."""
    for column in ("item1", "item2"):
        df[column] = df[column].str.lower()

    return df
    

In [None]:
def clean_data(df):
    """Run the cleaning steps in order and return the cleaned DataFrame.

    Order: remove filler words -> drop rows with long item names ->
    remove empty rows -> lower-case the item names.
    """
    # NOTE(review): remove_empty_rows is not defined in the visible part of
    # this notebook — confirm it exists before running this cell.
    steps = (
        remove_useless_words,
        remove_rows_with_long_items,
        remove_empty_rows,
        lower_case_of_items,
    )

    cleaned = df
    for step in steps:
        cleaned = step(cleaned)

    return cleaned

In [None]:
# NOTE(review): this cell repeats the clean_data definition of the previous cell verbatim.
def clean_data(df):
    """Apply the cleaning steps, innermost call first, and return the result.

    Order: remove_useless_words -> remove_rows_with_long_items ->
    remove_empty_rows -> lower_case_of_items.
    """
    # NOTE(review): remove_empty_rows is not defined in the visible part of
    # this notebook — confirm it exists before running this cell.
    return lower_case_of_items(
        remove_empty_rows(
            remove_rows_with_long_items(
                remove_useless_words(df)
            )
        )
    )

In [None]:
# Clean a copy: remove_useless_words and lower_case_of_items overwrite columns on the
# DataFrame they receive, so passing a copy leaves the raw `df` untouched.
df_cleaned = clean_data(df.copy())
df_cleaned.info()

## Filtering
We're only going to train a model on supermarket 1, because there isn't enough data for the other supermarkets.

In [None]:
def filter_supermarkets(df, supermarket_id=1):
    """Keep only the rows of one supermarket and drop the 'supermarket_id' column.

    Args:
        df: DataFrame with a 'supermarket_id' column.
        supermarket_id: Supermarket to keep. Defaults to 1, the only one the
            notebook currently trains on.

    Returns:
        A new DataFrame with the matching rows, without the 'supermarket_id'
        column, and with a fresh 0..n-1 index.
    """
    selected = df[df['supermarket_id'] == supermarket_id]
    return selected.drop(columns='supermarket_id').reset_index(drop=True)

## Feature Engineering


In [None]:
def get_items_dictionnary(df):
    """Return the distinct item names found in 'item1' or 'item2'.

    Both columns are included so that items appearing only as the second
    element of a pair are part of the vocabulary as well.

    Returns:
        Array of unique item names, in order of first appearance
        (all of 'item1' first, then 'item2').
    """
    return pd.concat([df['item1'], df['item2']]).unique()

### Item Category

In [None]:
def get_items_cat_dict(df):
    """Ask the item classifier for the category of every distinct item of `df`.

    Returns whatever ItemClassifier.batch_categorize returns for
    return_as='dict' (requested as an {item: category} mapping).
    """
    classifier = ItemClassifier()
    distinct_items = get_items_dictionnary(df)

    return classifier.batch_categorize(distinct_items, return_as='dict')
    

In [None]:
def add_items_category(dataset):
    """Return a copy of `dataset` with 'item1_cat' and 'item2_cat' columns added.

    Categories come from get_items_cat_dict; items missing from that mapping
    get 'other'. Category names are lower-cased.
    """
    enriched = dataset.copy()
    category_by_item = get_items_cat_dict(enriched)

    def _category_of(item):
        return category_by_item.get(item, 'other').lower()

    for source, target in (('item1', 'item1_cat'), ('item2', 'item2_cat')):
        enriched[target] = enriched[source].apply(_category_of)

    return enriched

In [None]:
def engineer_features(dataset):
    """Run the feature-engineering steps; currently only the item categories are added."""
    with_categories = add_items_category(dataset)
    return with_categories
    

## Encoding

In [2]:
def encode_items(df, trained_encoders = None):
    """One-hot encode the item and category columns of `df`.

    Args:
        df: DataFrame with the columns 'item1', 'item2', 'item1_cat' and
            'item2_cat'. Any other column is passed through untouched.
        trained_encoders: Optional dict with already fitted encoders under
            the keys 'item_encoder' and 'cat_encoder'. When None, two new
            encoders are fitted on the values present in `df`.

    Returns:
        A dict with:
            "dataset": `df` with the four columns replaced by one-hot columns
                prefixed 'item1_', 'item2_', 'cat1_' and 'cat2_'.
            "item_encoder": the encoder used for the item names.
            "cat_encoder": the encoder used for the categories.
    """
    if trained_encoders is None:
        item_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

        item_vocabulary = np.asarray(get_items_dictionnary(df))
        # The categories are already columns of df, so they are read from
        # there instead of asking the classifier a second time.
        cat_vocabulary = np.asarray(pd.concat([df['item1_cat'], df['item2_cat']]).unique())

        item_encoder.fit(item_vocabulary.reshape(-1, 1))
        cat_encoder.fit(cat_vocabulary.reshape(-1, 1))
    else:
        item_encoder = trained_encoders['item_encoder']
        cat_encoder = trained_encoders['cat_encoder']

    def _one_hot(encoder, column, prefix):
        # Label the columns from the encoder's own category order: that is the
        # order of the transform output, and it stays correct when a
        # previously fitted encoder is applied to new data.
        encoded = encoder.transform(df[[column]].to_numpy())
        return pd.DataFrame(
            encoded,
            columns=encoder.categories_[0],
            index=df.index,  # keep rows aligned with df for the concat below
        ).add_prefix(prefix)

    encoded_parts = [
        _one_hot(item_encoder, 'item1', 'item1_'),
        _one_hot(item_encoder, 'item2', 'item2_'),
        _one_hot(cat_encoder, 'item1_cat', 'cat1_'),
        _one_hot(cat_encoder, 'item2_cat', 'cat2_'),
    ]

    passthrough = df.drop(columns=['item1', 'item2', 'item1_cat', 'item2_cat'])
    encoded_df = pd.concat([passthrough, *encoded_parts], axis=1)

    return {
        "dataset": encoded_df, 
        "item_encoder": item_encoder, 
        "cat_encoder": cat_encoder
    }

# Data Exploration

In [3]:
# 1. Get the dataset without encoding
# 1. Get the dataset without encoding
eda_df = filter_supermarkets(
                clean_data(
                    unite_and_balance_training_examples()
                )
            )

# 2. Plot how often each item name appears in the 'item1' column.
plt.figure(figsize=(25, 4))
sns.countplot(eda_df, x='item1')
plt.xticks(rotation=90)  # rotate the item names so they stay readable
plt.show()

NameError: name 'filter_supermarkets' is not defined


# Modelling


## Common Training functions


In [4]:
def train_model(model, model_name="Unamed Model", prepared_data=None):
    """Fit `model` on the encoded dataset and report its ROC AUC on train and test.

    Args:
        model: Estimator exposing fit(X, y) and predict_proba(X).
        model_name: Label used in the printed progress messages.
        prepared_data: Optional result of encode_items (a dict with at least
            'dataset' and 'item_encoder'). When None, the full preparation
            pipeline is run from scratch.

    Returns:
        A dict with:
            'model': the fitted model.
            'scores': {'train': ..., 'test': ...} ROC AUC scores.
            'item_encoder': the item encoder of the prepared data.
            'cat_encoder': the category encoder of the prepared data, or None
                when the prepared data does not contain one. It is returned
                so that both fitted encoders stay available after a run with
                prepared_data=None.
    """
    # 1. Prepare the data for Training
    if prepared_data is None: 
        preparation_result = encode_items(
            engineer_features(
                filter_supermarkets(
                    clean_data(
                        unite_and_balance_training_examples()
                    )
                )   
            )
        )
    else:
        preparation_result = prepared_data

    dataset = preparation_result['dataset']

    # 2. Split the data 
    X = dataset.drop(columns=[target_var]).to_numpy()
    y = dataset[target_var].to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=32)

    # 3. Fit
    print(f"\nFitting {model_name}")
    
    model.fit(X_train, y_train)

    # 4. Score, using column 1 of predict_proba as the score of the positive class
    y_train_pred = model.predict_proba(X_train)[:,1]
    y_test_pred = model.predict_proba(X_test)[:,1]

    scores = {
        'train': roc_auc_score(y_train, y_train_pred),
        'test': roc_auc_score(y_test, y_test_pred)
    }

    print(f"{model_name}: Train Score [{scores['train']}] - Test Score [{scores['test']}]")

    return {
        'model': model,
        'scores': scores, 
        'item_encoder': preparation_result['item_encoder'],
        'cat_encoder': preparation_result.get('cat_encoder'),
    }

In [5]:

# Run the preparation pipeline once (unite -> clean -> filter -> add categories -> encode)
# so the result can be passed to train_model instead of being rebuilt for every model.
prepared_data = encode_items(
    engineer_features(
        filter_supermarkets(
            clean_data(
                unite_and_balance_training_examples()
            )
        )   
    )
)
print(f"Prepared Data Shape: {prepared_data['dataset'].shape}")

NameError: name 'engineer_features' is not defined

In [None]:
# Baseline comparison: each classifier is created with its default hyper-parameters
# (only the verbosity of CatBoost and LightGBM is overridden) and trained on the same prepared data.
rf_model = train_model(RandomForestClassifier(), model_name="Random Forest", prepared_data=prepared_data)
xgb_model = train_model(XGBClassifier(), model_name="XGBoost", prepared_data=prepared_data)
cat_boost_model = train_model(CatBoostClassifier(verbose=0), model_name="Cat Boost", prepared_data=prepared_data)
lgbm_model = train_model(LGBMClassifier(verbose=-1), model_name="Light GBM", prepared_data=prepared_data)
mlp_model = train_model(MLPClassifier(), model_name="MLP", prepared_data=prepared_data)

## Experiments

In [None]:

def mlp_grid_search(prepared_data=None):
    """Grid-search an MLP classifier and return the training result for the best one.

    The search (3-fold CV, scored with ROC AUC) is fitted and scored through
    train_model; the best parameters and best CV score are printed.

    Args:
        prepared_data: Passed through to train_model.

    Returns:
        The dict returned by train_model, with 'model' replaced by the best
        estimator found by the search.
    """
    search = GridSearchCV(
        estimator=MLPClassifier(max_iter=500),
        param_grid={
            'hidden_layer_sizes': [(256, 256), (128, 128, 128, 128)],
            'activation': ['relu'],
            'alpha': [1.0, 3.0],
        },
        scoring='roc_auc',
        cv=3,
        n_jobs=-1,
    )

    outcome = train_model(search, model_name="Grid Search MLP", prepared_data=prepared_data)

    print("Best parameters found: ", search.best_params_)
    print("Best score: ", search.best_score_)

    # Hand back the winning estimator rather than the search wrapper.
    outcome['model'] = search.best_estimator_

    return outcome

In [None]:
# Run the MLP grid search on the prepared data built above.
mlp_best = mlp_grid_search(prepared_data=prepared_data)

## Final Model
The best model from the first version of this notebook, trained without item categories, scored: <br>
Train Score [0.8328987977864004] - Test Score [**0.7889**227377472335]

In [None]:
# prepared_data is not passed here, so train_model rebuilds the dataset through the
# full preparation pipeline before fitting.
final_model = train_model(MLPClassifier(alpha=1.0, hidden_layer_sizes=(20,20)), model_name="Chosen Model")

# Inference

In [None]:
# Score a single (item1, item2) pair with the final model.
example = ['bread', 'jam']
example_df = pd.DataFrame([example], columns=['item1', 'item2'])

# NOTE(review): encode_items is defined as encode_items(df, trained_encoders=None);
# `encoder=` is not one of its parameters — TODO fix this call.
# NOTE(review): encode_items also reads the 'item1_cat' / 'item2_cat' columns, which
# example_df does not have — TODO add the categories before encoding.
encoded_example = encode_items(example_df, encoder=final_model['item_encoder'])['dataset']

# Column 1 of predict_proba is taken as the probability of before = 1.
predicted_before = final_model['model'].predict_proba(encoded_example)[:,1]

print(f"Probability that '{example[0]}' comes before '{example[1]}': {predicted_before[0]}")