# Supermarkets modeling

The goal of this document....

In [1]:
import pandas as pd
import numpy as np

## 1. Import data

In [2]:
# import data
supermarkets = {
    'aldinorth': pd.read_pickle('data/products-clean/aldinorth-products-clean.pkl'),
    'aldisouth': pd.read_pickle('data/products-clean/aldisouth-products-clean.pkl'),
    'edeka': pd.read_pickle('data/products-clean/edeka-products-clean.pkl'),
    'kaufland': pd.read_pickle('data/products-clean/kaufland-products-clean.pkl'),
    'lidl': pd.read_pickle('data/products-clean/lidl-products-clean.pkl'),
    'rewe': pd.read_pickle('data/products-clean/rewe-products-clean.pkl')
}

In [3]:
supermarkets['edeka'].head()

Unnamed: 0,Name,Price
0,Coke,1.39
1,Nutella,1.77
2,Becel Gold 70% fat,1.49
3,Iglo fish fingers,2.99
5,"""Landliebe country cheese original",2.29


## 2. Modeling to do's

- Algorithm 1.0 - Build an algorithm able to (input: ingredient, output: % coverage + total price per each supermarket)
- Algorithm 2.0 - Build an algorithm able to (input: list of ingredients, output: % coverage + total price per each supermarket)
- Algorithm 3.0 - Use fuzzy buzzy to match the input (potato) with the ingredients by similarity.

### 2.1 Algorithm 1.0
Build an algorithm able to: 

- [input]: ingredient

- [output]: price in each supermarket

In [4]:
# We are looking for the price of the ingredient that we are searching per each supermarket
supermarkets['aldinorth'][supermarkets['aldinorth']['Name'] == 'Nutella']['Price'].values[0]

3.79

In [5]:
# Function to get the prices with an ingredient as input
def search(ingredient:str):
    # New empty list
    result = []
    for market in supermarkets:
        a = supermarkets[market]
        try:
            price = a[a['Name'] == ingredient]['Price'].values[0]
            result.append(f'{market}: {price}')
        except:
            result.append(f'{market}: Not found')
    return result

# Try it with 'Nutella' because it is available in every supermarket
# Try with 'Coke'

In [6]:
# Testing the function
search('Nutella')

['aldinorth: 3.79',
 'aldisouth: Not found',
 'edeka: 1.77',
 'kaufland: 3.79',
 'lidl: 3.99',
 'rewe: 3.79']

In [7]:
# Testing the function
search('Coke')

['aldinorth: 2.89',
 'aldisouth: Not found',
 'edeka: 1.39',
 'kaufland: 11.4',
 'lidl: 0.99',
 'rewe: 0.99']

We can see that there are some inconsistencies with the 'Coca Cola' prices. The problem is that the size of the packs are not normalized. 

In [8]:
supermarkets['edeka'][supermarkets['edeka']['Name'] == 'Coke']

Unnamed: 0,Name,Price
0,Coke,1.39


In [9]:
supermarkets['edeka'][supermarkets['edeka']['Name'].str.contains('Coke')]

Unnamed: 0,Name,Price
0,Coke,1.39


### 2.2 Algorithm 2.0
Build an algorithm able to:

- [input]: list of ingredients (ing as it is in the dataset)
- [output]: % coverage + total price per each supermarket

In [10]:
def search_list(ingredients:list):
    # New empty list
    result = []
    
    for market in supermarkets:
        
        a = supermarkets[market]
        
        for ing in ingredients:    
            try:
                price = a[a['Name'] == ing]['Price'].values[0]
                result.append(f'{market}: {price} for the {ing}')
            except:
                result.append(f'{market}: Not found')
            
    return result

In [11]:
search_list(['Coke', 'Nutella'])

['aldinorth: 2.89 for the Coke',
 'aldinorth: 3.79 for the Nutella',
 'aldisouth: Not found',
 'aldisouth: Not found',
 'edeka: 1.39 for the Coke',
 'edeka: 1.77 for the Nutella',
 'kaufland: 11.4 for the Coke',
 'kaufland: 3.79 for the Nutella',
 'lidl: 0.99 for the Coke',
 'lidl: 3.99 for the Nutella',
 'rewe: 0.99 for the Coke',
 'rewe: 3.79 for the Nutella']

In [12]:
result = pd.DataFrame(columns=['Supermarket', 'Ingredient', 'Price', 'Coverage'])

In [13]:
result.append({'Supermarket': 'aldinorth', 'Ingredient': 'Coke', 'Price': 2.89, 'Coverage': 1}, ignore_index=True)

Unnamed: 0,Supermarket,Ingredient,Price,Coverage
0,aldinorth,Coke,2.89,1


In [14]:
result['Price'] = pd.to_numeric(result['Price'], errors='coerce')
result['Coverage'] = pd.to_numeric(result['Coverage'], errors='coerce')

In [15]:
result.dtypes

Supermarket    object
Ingredient     object
Price           int64
Coverage        int64
dtype: object

In [16]:
def search_list_grouped(ingredients:list):
    # New empty df
    result = pd.DataFrame(columns=['Supermarket', 'Ingredient', 'Price', 'Coverage'])

    for market in supermarkets:
        
        a = supermarkets[market]
        
        for ing in ingredients:    
            try:
                price = a[a['Name'] == ing]['Price'].values[0]
                df2 = {'Supermarket': market, 'Ingredient': ing, 'Price': price, 'Coverage': 1}
                result = result.append(df2, ignore_index=True)
            except:
                result = result.append({'Supermarket': market, 'Ingredient': ing, 'Price': np.nan, 'Coverage': 0}, ignore_index=True)
    
    # Change to numeric
    result['Price'] = pd.to_numeric(result['Price'], errors='coerce')
    result['Coverage'] = pd.to_numeric(result['Coverage'], errors='coerce') * 100
    
    # Group by 'Supermarket'
    results = result.groupby('Supermarket').agg({'Price':'sum', 'Coverage': 'mean'}).reset_index()
    results = results.sort_values('Price').reset_index(drop=True)
    
    # How to display it to users
#    for row in range(len(results)):
#        print(f"In {row['Supermarket']} you can find {row['Coverage']}% of the ingredients for row['Price']")
    
    return results

# 'result' dataframe contains detailes information
# 'results' dataframe contains grouped information

In [17]:
search_list_grouped(['Coke', 'Nutella', 'Vanilla', 'Tuna', 'Soap', 'Classic Tomato Sauce'])

Unnamed: 0,Supermarket,Price,Coverage
0,aldisouth,0.0,0.0
1,edeka,3.16,33.333333
2,rewe,4.78,33.333333
3,lidl,4.98,33.333333
4,aldinorth,6.68,33.333333
5,kaufland,15.19,33.333333


As we can see, if we do not type the ingredient exactly as it is in the dataset, our algorithm is not able to find it. In order to make it more useful, we are going to add fuzzy words and create a basic search engine.

### 2.3 Algorithm 3.0

Build an algorithm able to:

- [input]: list of ingredients (similar word)
- [output]: % coverage + total price per each supermarket

We are going to use the fuzzywuzzy ratio

In [18]:
# Add the packages
from fuzzywuzzy import fuzz

In [19]:
fuzz.ratio("Coca Cola", "Coca Cola")

100

In [20]:
fuzz.ratio("Coca Cola", "Coke")

31

In [21]:
fuzz.ratio("Coca Cola".lower(), "Coke".lower())

31

Extract the ingredient name

In [22]:
supermarkets['edeka']

Unnamed: 0,Name,Price
0,Coke,1.39
1,Nutella,1.77
2,Becel Gold 70% fat,1.49
3,Iglo fish fingers,2.99
5,"""Landliebe country cheese original",2.29
...,...,...
5042,Nivea Stay on Eye Pencil 6 Smoke,6.49
5043,Nivea Extreme Stay Lipstick 17 Extrem Pinkini,11.49
5044,Nivea Extreme Stay Lipstick 21 Extreme Cinnam,11.49
5045,Nivea Extreme Stay Lipstick 11 Extremely Beige,11.49


- After the numbers I can extract the rest
- Brands ? 
- Black list with words that I can exclude

In [23]:
input_ing = "Sunflower oil"

In [24]:
matches = []

for ing in supermarkets['aldinorth']['Name']:
    ratio = fuzz.ratio(ing.lower(), input_ing.lower())

    if ratio > 60:
        matches.append((ing, ratio))

In [25]:
matches

[('Buttella fine sunflower oil', 65)]

In [26]:
matches = sorted(matches, key=lambda x: x[1], reverse=True)

In [27]:
# Add the best match to a list
clean_list = []

clean_list.append(matches[0][0])

In [28]:
clean_list

['Buttella fine sunflower oil']

Our final algorithm should be able to: 
    
    Step 1:
    -------
    
    - [Input] - [List of ingredients (not necessarly exactly same as database)]
    - [Output] - list of supermarkets with total price and coverage
    
    Step 2:
    -------
    
    - [Input] - user selects one supermarket
    - [Output] - Shopping list with ingredient + price
    
    Step 3:
    -------
    
    - [Output] - list of 5 items that they might be interested in shopping
    


In [29]:
supermarkets['edeka']

Unnamed: 0,Name,Price
0,Coke,1.39
1,Nutella,1.77
2,Becel Gold 70% fat,1.49
3,Iglo fish fingers,2.99
5,"""Landliebe country cheese original",2.29
...,...,...
5042,Nivea Stay on Eye Pencil 6 Smoke,6.49
5043,Nivea Extreme Stay Lipstick 17 Extrem Pinkini,11.49
5044,Nivea Extreme Stay Lipstick 21 Extreme Cinnam,11.49
5045,Nivea Extreme Stay Lipstick 11 Extremely Beige,11.49


In [30]:
supermarkets['edeka'].dtypes

Name      object
Price    float64
dtype: object

In [31]:
supermarkets['edeka']['Name'] = supermarkets['edeka']['Name'].astype(str)

In [32]:
supermarkets['edeka']['Name'].value_counts()

Danone Actimel 4-pack raspberry, 4 x 100 g. 0.1% fat         1
Sanella baking is love ... margarine, 75% fat                1
Good & cheap pure cream cheese                               1
Bahlsen sweet pleasure                                       1
Knorr Fix Spaghetti Bolognese                                1
                                                            ..
Oro di Parma tomato paste with fresh garlic                  1
Kraft Steak & Grillketchup Chili                             1
Homann Rollmops with onion and mustard                       1
Rügen Fisch Light line Pangasius fillet in salsa dressing    1
Good & cheap premium ice cream vanilla                       1
Name: Name, Length: 4864, dtype: int64

In [39]:
def get_products(ingredients:list):
    
    # Create an empty list per each supermarket
    supermarkets_lists = {
            'aldinorth': [],
            'aldisouth': [],
            'edeka': [],
            'kaufland': [],
            'lidl': [],
            'rewe': []
        }
    
    for market in supermarkets:

        for ing in ingredients:
            
            # Empty dictionary where we will store the matching products
            matches = []   
        
            # For each product, iterate through each supermarket and generate a list
            for product in supermarkets[market]['Name']:
                ratio = fuzz.ratio(product.lower(), ing.lower())

                if ratio > 50:
                    matches.append((product, ratio))
                    matches = sorted(matches, key=lambda x: x[1], reverse=True)
                
            # If matches list is not empty
            if matches:
                supermarkets_lists[market].append(matches[0][0])
            else: 
                supermarkets_lists[market].append(np.nan)

    return supermarkets_lists

The function has to have different input depending on the supermarket. That means that we have to modify the search_list_grouped function, since it has been developed to be able to take only 1 input argument.

In [40]:
get_products(['meat', 'Coce', 'Sunflower oil', 'Garlic', 'Onion', 'Basmati rice'])

{'aldinorth': ['meatloaf',
  'Coke',
  'Buttella fine sunflower oil',
  nan,
  'Onion baguette',
  'Albona milk rice'],
 'aldisouth': [nan, nan, nan, nan, nan, nan],
 'edeka': [nan,
  'Coke',
  'Thomy sunflower oil',
  'garlic',
  'Onions',
  'Lien Ying Basmati rice'],
 'kaufland': [nan,
  'Coke',
  'Thomy sunflower oil',
  'garlic',
  'Onions',
  'K organic basmati rice'],
 'lidl': [nan,
  'Coke',
  'Mermaid fillets in sunflower oil',
  'garlic',
  'Onions',
  'Fairglobe basmati rice loose'],
 'rewe': [nan,
  'Coke',
  'Rewe shower oil',
  'garlic',
  'Onions',
  'Rewe Basmati rice']}

In [45]:
def get_list_products_price(ingredients:list):
    
    # Create a dictionary with the supermarkets and a list with the products
    supermarkets_p = {
        'aldinorth': get_products(ingredients)['aldinorth'],
        'aldisouth': get_products(ingredients)['aldisouth'],
        'edeka': get_products(ingredients)['edeka'],
        'kaufland': get_products(ingredients)['kaufland'],
        'lidl': get_products(ingredients)['lidl'],
        'rewe': get_products(ingredients)['rewe']
    }
    
    # New empty df
    result = pd.DataFrame(columns=['Ingredient', 'Supermarket', 'Product', 'Price', 'Coverage'])

    # Iterate and populate a df with the final result
    for market in supermarkets_p:
        
        # Iterate per each product in each supermarket
        s = supermarkets[market] # Dictionary which contains the detailed info of all products per supermarket
        
        ### NEW TO FIND HOW TO ADD THE NAN with THE COVERAGE 0.
        for prod, ing in zip(supermarkets_p[market], ingredients):    
            try:
                price = s[s['Name'] == prod]['Price'].values[0]
                df2 = {'Ingredient': ing, 'Supermarket': market, 'Product': prod, 'Price': price, 'Coverage': 1}
                result = result.append(df2, ignore_index=True)
            except:
                result = result.append({'Ingredient': ing, 'Supermarket': market, 'Product': np.nan, 'Price': np.nan, 'Coverage': 0}, ignore_index=True)
    
    # 'result' contains detailed information about product and price per each supermarket
    
    # Change to numeric
    result['Price'] = pd.to_numeric(result['Price'], errors='coerce')
    result['Coverage'] = pd.to_numeric(result['Coverage'], errors='coerce')
    
    return result


In [46]:
get_list_products_price(['oats', 'zuchinni', 'cherry tomatoes', 'pasta', 'meat', 'Coce', 'Sunflower oil', 'Onion', 'Basmati rice', 'Spaghetti', 'Miso soup', 'Tempeh', 'Soya Sauce'])

Unnamed: 0,Ingredient,Supermarket,Product,Price,Coverage
0,oats,aldinorth,Oranges,0.69,1
1,zuchinni,aldinorth,,,0
2,cherry tomatoes,aldinorth,Rocher chocolates,3.49,1
3,pasta,aldinorth,,,0
4,meat,aldinorth,meatloaf,1.89,1
...,...,...,...,...,...
73,Basmati rice,rewe,Rewe Basmati rice,2.19,1
74,Spaghetti,rewe,Rewe spaghetti,1.19,1
75,Miso soup,rewe,Felix goulash soup,2.29,1
76,Tempeh,rewe,peach,1.99,1


In [47]:
# We want to see:
#    1. Where we will be able to find all the products
#    2. Where can we buy them at the best price

def get_supermarket_choice(ingredients:list):
    
    result = get_list_products_price(ingredients) 
    
    # Group by 'Supermarket'
    results = result.groupby('Supermarket').agg({'Price':'sum', 'Coverage': 'mean'}).reset_index()
    results = results.sort_values('Price').reset_index(drop=True)
    
    return results

In [48]:
get_supermarket_choice(['vodka', 'oats', 'zuchinni', 'cherry tomatoes', 'pasta', 'meat', 'Coce', 'Sunflower oil', 'Onion', 'Basmati rice', 'Spaghetti', 'Miso soup', 'Tempeh', 'Soya Sauce'])

Unnamed: 0,Supermarket,Price,Coverage
0,aldisouth,0.0,0.0
1,lidl,12.16,0.714286
2,edeka,13.42,0.571429
3,aldinorth,18.21,0.785714
4,kaufland,22.97,0.714286
5,rewe,25.13,0.928571


In [49]:
# We want to see:
#    1. Detailed list of the product names and price
def get_shopping_list(ingredients:list):
    
    result = get_list_products_price(ingredients) 
    results = get_supermarket_choice(ingredients)
    
    # Ask the user where they would like to do the grocery shopping
    user_input = input('Where would you like to go shopping?\n')
    
    return result[result['Supermarket'] == user_input]

In [50]:
get_shopping_list(['vodka', 'oats', 'zuchinni', 'cherry tomatoes', 'pasta', 'meat', 'Coce', 'Sunflower oil', 'Onion', 'Basmati rice', 'Spaghetti', 'Miso soup', 'Tempeh', 'Soya Sauce'])

Where would you like to go shopping?
rewe


Unnamed: 0,Ingredient,Supermarket,Product,Price,Coverage
70,vodka,rewe,Vodka Schilkin,5.49,1
71,oats,rewe,Carrots,0.99,1
72,zuchinni,rewe,zucchini,0.89,1
73,cherry tomatoes,rewe,REWE cherry groats,3.99,1
74,pasta,rewe,papaya,1.79,1
75,meat,rewe,,,0
76,Coce,rewe,Coke,0.99,1
77,Sunflower oil,rewe,Rewe shower oil,1.29,1
78,Onion,rewe,Onions,0.49,1
79,Basmati rice,rewe,Rewe Basmati rice,2.19,1


### Create one function with the whole process

In [51]:
def demo(ingredients:list):
    
    # 1. Convert the ingredient list to product lists per each supermarket
    result = get_list_products_price(ingredients)
    
    # 2. Show an overview of their shopping list grouped by supermarket
    results = get_supermarket_choice(ingredients)
    
    print(results) # display as dataframe with print method..
    
    # 3. Ask the user where they would like to do the grocery shopping
    user_input = input('Where would you like to go shopping?\n')
    
    # 4. Show the user theis shopping list
    return result[result['Supermarket'] == user_input]
    
    # 5. Show products that they may be interested in buy

## HOW CAN I MAKE IT INTERACTIVE?
## Display -- > https://stackoverflow.com/questions/26873127/show-dataframe-as-table-in-ipython-notebook

In [52]:
demo(['tofu', 'black pepper', 'lemon' 'meatballs', 'Coce', 'Sunflower oil', 'Onion', 'Basmati rice', 'Spaghetti', 'Miso soup', 'Tempeh', 'Soya Sauce'])

  Supermarket  Price  Coverage
0   aldisouth   0.00  0.000000
1   aldinorth   9.93  0.818182
2        lidl  13.41  0.818182
3        rewe  14.22  0.909091
4       edeka  16.42  0.727273
5    kaufland  22.24  0.818182
Where would you like to go shopping?
rewe


Unnamed: 0,Ingredient,Supermarket,Product,Price,Coverage
55,tofu,rewe,,,0
56,black pepper,rewe,Yes! Snack peppers,0.75,1
57,lemonmeatballs,rewe,Yes! Meatballs,1.49,1
58,Coce,rewe,Coke,0.99,1
59,Sunflower oil,rewe,Rewe shower oil,1.29,1
60,Onion,rewe,Onions,0.49,1
61,Basmati rice,rewe,Rewe Basmati rice,2.19,1
62,Spaghetti,rewe,Rewe spaghetti,1.19,1
63,Miso soup,rewe,Felix goulash soup,2.29,1
64,Tempeh,rewe,peach,1.99,1


### Unify with the ML model