# Supermarkets modeling

The goal of this document....

In [1]:
import pandas as pd
import numpy as np

## 1. Import data

In [2]:
# import data
supermarkets = {
    'aldinorth': pd.read_pickle('data/products-clean/aldinorth-products-clean.pkl'),
    'aldisouth': pd.read_pickle('data/products-clean/aldisouth-products-clean.pkl'),
    'edeka': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
    #'kaufan': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
    #'rewe': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
    #'lidl': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
}

In [3]:
supermarkets['aldinorth'].head()

Unnamed: 0,Name,Price
0,Tamara extra strawberry jam,0.99
1,Nutella,3.79
2,Coke,2.89
3,Trader Joe's / Aldi Nord Lemon Iced Tea,0.49
4,Iglo fish fingers,3.49


## 2. Modeling to do's

- Algorithm 1.0 - Build an algorithm able to (input: ingredient, output: % coverage + total price per each supermarket)
- Algorithm 2.0 - Build an algorithm able to (input: list of ingredients, output: % coverage + total price per each supermarket)
- Algorithm 3.0 - Use fuzzy buzzy to match the input (potato) with the ingredients by similarity.

### 2.1 Algorithm 1.0
Build an algorithm able to: 

- [input]: ingredient

- [output]: price in each supermarket

In [4]:
# We are looking for the price of the ingredient that we are searching per each supermarket
supermarkets['aldinorth'][supermarkets['aldinorth']['Name'] == 'Nutella']['Price'].values[0]

3.79

In [5]:
# Function to get the prices with an ingredient as input
def search(ingredient:str):
    # New empty list
    result = []
    for market in supermarkets:
        a = supermarkets[market]
        try:
            price = a[a['Name'] == ingredient]['Price'].values[0]
            result.append(f'{market}: {price}')
        except:
            result.append(f'{market}: Not found')
    return result

# Try it with 'Nutella' because it is available in every supermarket
# Try with 'Coke'

In [6]:
# Testing the function
search('Nutella')

['aldinorth: 3.79', 'aldisouth: 3.89', 'edeka: 1.77']

In [7]:
# Testing the function
search('Coke')

['aldinorth: 2.89', 'aldisouth: 0.99', 'edeka: 1.39']

We can see that there are some inconsistencies with the 'Coca Cola' prices. The problem is that the size of the packs are not normalized. 

In [8]:
supermarkets['edeka'][supermarkets['edeka']['Name'] == 'Coke']

Unnamed: 0,Name,Price
0,Coke,1.39


In [9]:
supermarkets['edeka'][supermarkets['edeka']['Name'].str.contains('Coke')]

Unnamed: 0,Name,Price
0,Coke,1.39


### 2.2 Algorithm 2.0
Build an algorithm able to:

- [input]: list of ingredients (ing as it is in the dataset)
- [output]: % coverage + total price per each supermarket

In [10]:
def search_list(ingredients:list):
    # New empty list
    result = []
    
    for market in supermarkets:
        
        a = supermarkets[market]
        
        for ing in ingredients:    
            try:
                price = a[a['Name'] == ing]['Price'].values[0]
                result.append(f'{market}: {price} for the {ing}')
            except:
                result.append(f'{market}: Not found')
            
    return result

In [11]:
search_list(['Coke', 'Nutella'])

['aldinorth: 2.89 for the Coke',
 'aldinorth: 3.79 for the Nutella',
 'aldisouth: 0.99 for the Coke',
 'aldisouth: 3.89 for the Nutella',
 'edeka: 1.39 for the Coke',
 'edeka: 1.77 for the Nutella']

In [12]:
result = pd.DataFrame(columns=['Supermarket', 'Ingredient', 'Price', 'Coverage'])

In [13]:
result.append({'Supermarket': 'aldinorth', 'Ingredient': 'Coke', 'Price': 2.89, 'Coverage': 1}, ignore_index=True)

Unnamed: 0,Supermarket,Ingredient,Price,Coverage
0,aldinorth,Coke,2.89,1


In [14]:
result['Price'] = pd.to_numeric(result['Price'], errors='coerce')
result['Coverage'] = pd.to_numeric(result['Coverage'], errors='coerce')

In [15]:
result.dtypes

Supermarket    object
Ingredient     object
Price           int64
Coverage        int64
dtype: object

In [16]:
def search_list_grouped(ingredients:list):
    # New empty df
    result = pd.DataFrame(columns=['Supermarket', 'Ingredient', 'Price', 'Coverage'])

    for market in supermarkets:
        
        a = supermarkets[market]
        
        for ing in ingredients:    
            try:
                price = a[a['Name'] == ing]['Price'].values[0]
                df2 = {'Supermarket': market, 'Ingredient': ing, 'Price': price, 'Coverage': 1}
                result = result.append(df2, ignore_index=True)
            except:
                result = result.append({'Supermarket': market, 'Ingredient': ing, 'Price': np.nan, 'Coverage': 0}, ignore_index=True)
    
    # Change to numeric
    result['Price'] = pd.to_numeric(result['Price'], errors='coerce')
    result['Coverage'] = pd.to_numeric(result['Coverage'], errors='coerce') * 100
    
    # Group by 'Supermarket'
    results = result.groupby('Supermarket').agg({'Price':'sum', 'Coverage': 'mean'}).reset_index()
    results = results.sort_values('Price').reset_index(drop=True)
    
    # How to display it to users
#    for row in range(len(results)):
#        print(f"In {row['Supermarket']} you can find {row['Coverage']}% of the ingredients for row['Price']")
    
    return results

# 'result' dataframe contains detailes information
# 'results' dataframe contains grouped information

In [17]:
search_list_grouped(['Coke', 'Nutella', 'Vanilla', 'Tuna', 'Soap', 'Classic Tomato Sauce'])

Unnamed: 0,Supermarket,Price,Coverage
0,edeka,3.16,33.333333
1,aldisouth,4.88,33.333333
2,aldinorth,6.68,33.333333


As we can see, if we do not type the ingredient exactly as it is in the dataset, our algorithm is not able to find it. In order to make it more useful, we are going to add fuzzy words and create a basic search engine.

### 2.3 Algorithm 3.0

Build an algorithm able to:

- [input]: list of ingredients (similar word)
- [output]: % coverage + total price per each supermarket

We are going to use the fuzzywuzzy ratio

In [18]:
# Add the packages
from fuzzywuzzy import fuzz

In [19]:
fuzz.ratio("Coca Cola", "Coca Cola")

100

In [20]:
fuzz.ratio("Coca Cola", "Coke")

31

In [21]:
fuzz.ratio("Coca Cola".lower(), "Coke".lower())

31

Extract the ingredient name

In [22]:
supermarkets['edeka']

Unnamed: 0,Name,Price
0,Coke,1.39
1,Nutella,1.77
2,Becel Gold 70% fat,1.49
3,Iglo fish fingers,2.99
4,Good & Cheap Landgasthof Goulash Pan,1.99
...,...,...
5043,Nivea Extreme Stay Lipstick 17 Extrem Pinkini,11.49
5044,Nivea Extreme Stay Lipstick 21 Extreme Cinnam,11.49
5045,Nivea Extreme Stay Lipstick 11 Extremely Beige,11.49
5046,Nivea Turbo Color Nail Polish 12 Turbo Red,7.89


- After the numbers I can extract the rest
- Brands ? 
- Black list with words that I can exclude

In [23]:
input_ing = "Sunflower oil"

In [24]:
matches = []

for ing in supermarkets['aldinorth']['Name']:
    ratio = fuzz.ratio(ing.lower(), input_ing.lower())

    if ratio > 60:
        matches.append((ing, ratio))

In [25]:
matches

[('Buttella fine sunflower oil', 65)]

In [26]:
matches = sorted(matches, key=lambda x: x[1], reverse=True)

In [27]:
# Add the best match to a list
clean_list = []

clean_list.append(matches[0][0])

In [28]:
clean_list

['Buttella fine sunflower oil']

Our final algorithm should be able to: 
    
    Step 1:
    -------
    
    - [Input] - [List of ingredients (not necessarly exactly same as database)]
    - [Output] - list of supermarkets with total price and coverage
    
    Step 2:
    -------
    
    - [Input] - user selects one supermarket
    - [Output] - Shopping list with ingredient + price
    
    Step 3:
    -------
    
    - [Output] - list of 5 items that they might be interested in shopping
    


In [29]:
supermarkets['aldinorth']

Unnamed: 0,Name,Price
0,Tamara extra strawberry jam,0.99
1,Nutella,3.79
2,Coke,2.89
3,Trader Joe's / Aldi Nord Lemon Iced Tea,0.49
4,Iglo fish fingers,3.49
...,...,...
2984,Moser Roth Fine Easter Eggs Fine Bitter 70%,3.65
2985,Moser Roth Fine Easter Eggs Noble Nougat,1.49
2986,Moser Roth Fine Easter Eggs Chocolate Candy,0.99
2987,Moser Roth Fine Easter Eggs Noble Collection w...,1.15


In [30]:
def get_products(ingredients:list):
    
            # ...
    supermarkets_lists = {
            'aldinorth': [],
            'aldisouth': [],
            'edeka': []
            #'kaufan': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
            #'rewe': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
            #'lidl':
        }
    
    for market in supermarkets:

        for ing in ingredients:
        
            # Empty dictionary where we will store the matching products
            matches = []   
        
            # For each product, iterate through each supermarket and generate a list
            for product in supermarkets[market]['Name']:
                ratio = fuzz.ratio(product.lower(), ing.lower())

                if ratio > 50:
                    matches.append((product, ratio))
                    matches = sorted(matches, key=lambda x: x[1], reverse=True)
                
            # If matches list is not empty
            if matches:
                supermarkets_lists[market].append(matches[0][0])
            else: 
                supermarkets_lists[market].append(np.nan)

    return supermarkets_lists

The function has to have different input depending on the supermarket. That means that we have to modify the search_list_grouped function, since it has been developed to be able to take only 1 input argument.

In [31]:
get_products(['meat', 'Coce', 'Sunflower oil', 'Garlic', 'Onion', 'Basmati rice'])

{'aldinorth': ['meatloaf',
  'Coke',
  'Buttella fine sunflower oil',
  nan,
  'Onion baguette',
  'Albona milk rice'],
 'aldisouth': [nan,
  'Coke',
  'Bellasan sunflower oil',
  'garlic',
  'Onions',
  'RYZ Basmati rice'],
 'edeka': [nan,
  'Coke',
  'Thomy sunflower oil',
  'garlic',
  'Onions',
  'Lien Ying Basmati rice']}

In [32]:
def get_list_products_price(ingredients:list):
    
    # Create a dictionary with the supermarkets and a list with the products
    supermarkets_p = {
        'aldinorth': get_products(ingredients)['aldinorth'],
        'aldisouth': get_products(ingredients)['aldisouth'],
        'edeka': get_products(ingredients)['edeka']
        # ...
    }
    
    # New empty df
    result = pd.DataFrame(columns=['Ingredient', 'Supermarket', 'Product', 'Price', 'Coverage'])

    # Iterate and populate a df with the final result
    for market in supermarkets_p:
        
        # Iterate per each product in each supermarket
        s = supermarkets[market] # Dictionary which contains the detailed info of all products per supermarket
        
        ### NEW TO FIND HOW TO ADD THE NAN with THE COVERAGE 0.
        for prod, ing in zip(supermarkets_p[market], ingredients):    
            try:
                price = s[s['Name'] == prod]['Price'].values[0]
                df2 = {'Ingredient': ing, 'Supermarket': market, 'Product': prod, 'Price': price, 'Coverage': 1}
                result = result.append(df2, ignore_index=True)
            except:
                result = result.append({'Ingredient': ing, 'Supermarket': market, 'Product': np.nan, 'Price': np.nan, 'Coverage': 0}, ignore_index=True)
    
    # 'result' contains detailed information about product and price per each supermarket
    
    # Change to numeric
    result['Price'] = pd.to_numeric(result['Price'], errors='coerce')
    result['Coverage'] = pd.to_numeric(result['Coverage'], errors='coerce')
    
    return result


In [33]:
get_list_products_price(['oats', 'zuchinni', 'cherry tomatoes', 'pasta', 'meat', 'Coce', 'Sunflower oil', 'Onion', 'Basmati rice', 'Spaghetti', 'Miso soup', 'Tempeh', 'Soya Sauce'])

Unnamed: 0,Ingredient,Supermarket,Product,Price,Coverage
0,oats,aldinorth,Oranges,0.69,1
1,zuchinni,aldinorth,,,0
2,cherry tomatoes,aldinorth,Rocher chocolates,3.49,1
3,pasta,aldinorth,,,0
4,meat,aldinorth,meatloaf,1.89,1
5,Coce,aldinorth,Coke,2.89,1
6,Sunflower oil,aldinorth,Buttella fine sunflower oil,0.99,1
7,Onion,aldinorth,Onion baguette,0.89,1
8,Basmati rice,aldinorth,Albona milk rice,0.45,1
9,Spaghetti,aldinorth,Good organic spaghetti,0.75,1


In [35]:
# We want to see:
#    1. Where we will be able to find all the products
#    2. Where can we buy them at the best price

def get_supermarket_choice(ingredients:list):
    
    result = get_list_products_price(ingredients) 
    
    # Group by 'Supermarket'
    results = result.groupby('Supermarket').agg({'Price':'sum', 'Coverage': 'mean'}).reset_index()
    results = results.sort_values('Price').reset_index(drop=True)
    
    return results

In [40]:
get_supermarket_choice(['vodka', 'oats', 'zuchinni', 'cherry tomatoes', 'pasta', 'meat', 'Coce', 'Sunflower oil', 'Onion', 'Basmati rice', 'Spaghetti', 'Miso soup', 'Tempeh', 'Soya Sauce'])

Unnamed: 0,Supermarket,Price,Coverage
0,aldisouth,11.34,0.785714
1,edeka,13.42,0.571429
2,aldinorth,17.73,0.785714


In [37]:
# We want to see:
#    1. Detailed list of the product names and price
def get_shopping_list(ingredients:list):
    
    result = get_list_products_price(ingredients) 
    results = get_supermarket_choice(ingredients)
    
    # Ask the user where they would like to do the grocery shopping
    user_input = input('Where would you like to go shopping?\n')
    
    return result[result['Supermarket'] == user_input]

In [41]:
get_shopping_list(['vodka', 'oats', 'zuchinni', 'cherry tomatoes', 'pasta', 'meat', 'Coce', 'Sunflower oil', 'Onion', 'Basmati rice', 'Spaghetti', 'Miso soup', 'Tempeh', 'Soya Sauce'])

Where would you like to go shopping?
aldisouth


Unnamed: 0,Ingredient,Supermarket,Product,Price,Coverage
14,vodka,aldisouth,,,0
15,oats,aldisouth,Oranges,2.39,1
16,zuchinni,aldisouth,zucchini,0.69,1
17,cherry tomatoes,aldisouth,Date cherry tomatoes,0.66,1
18,pasta,aldisouth,,,0
19,meat,aldisouth,,,0
20,Coce,aldisouth,Coke,0.99,1
21,Sunflower oil,aldisouth,Bellasan sunflower oil,,1
22,Onion,aldisouth,Onions,1.19,1
23,Basmati rice,aldisouth,RYZ Basmati rice,1.99,1


### Create one function with the whole process

In [45]:
def demo(ingredients:list):
    
    # 1. Convert the ingredient list to product lists per each supermarket
    result = get_list_products_price(ingredients)
    
    # 2. Show an overview of their shopping list grouped by supermarket
    results = get_supermarket_choice(ingredients)
    
    print(results) # display as dataframe with print method..
    
    # 3. Ask the user where they would like to do the grocery shopping
    user_input = input('Where would you like to go shopping?\n')
    
    # 4. Show the user theis shopping list
    return result[result['Supermarket'] == user_input]
    
    # 5. Show products that they may be interested in buy

## HOW CAN I MAKE IT INTERACTIVE?
## Display -- > https://stackoverflow.com/questions/26873127/show-dataframe-as-table-in-ipython-notebook

In [46]:
demo(['tofu', 'black pepper', 'lemon' 'meatballs', 'Coce', 'Sunflower oil', 'Onion', 'Basmati rice', 'Spaghetti', 'Miso soup', 'Tempeh', 'Soya Sauce'])

  Supermarket  Price  Coverage
0   aldinorth   7.66  0.818182
1   aldisouth  11.38  0.909091
2       edeka  16.42  0.727273
Where would you like to go shopping?
edeka


Unnamed: 0,Ingredient,Supermarket,Product,Price,Coverage
22,tofu,edeka,,,0
23,black pepper,edeka,Red pepper,4.99,1
24,lemonmeatballs,edeka,Good & cheap meatballs,1.99,1
25,Coce,edeka,Coke,1.39,1
26,Sunflower oil,edeka,Thomy sunflower oil,1.99,1
27,Onion,edeka,Onions,0.99,1
28,Basmati rice,edeka,Lien Ying Basmati rice,1.89,1
29,Spaghetti,edeka,Buitoni spaghetti,1.29,1
30,Miso soup,edeka,,,0
31,Tempeh,edeka,,,0


### Unify with the ML model