# Supermarkets modeling

The goal of this document....

In [14]:
import pandas as pd
import numpy as np

## 1. Import data

In [417]:
# import data
supermarkets = {
    'aldinorth': pd.read_pickle('data/products-clean/aldinorth-products-clean.pkl'),
    'aldisouth': pd.read_pickle('data/products-clean/aldisouth-products-clean.pkl'),
    'edeka': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
    #'kaufan': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
    #'rewe': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
    #'lidl': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
}

In [418]:
supermarkets['aldinorth'].head()

Unnamed: 0,Name,Price
0,Tamara extra strawberry jam,0.99
1,Nutella,3.79
2,Coke,2.89
3,Trader Joe's / Aldi Nord Lemon Iced Tea,0.49
4,Iglo fish fingers,3.49


## 2. Modeling to do's

- Algorithm 1.0 - Build an algorithm able to (input: ingredient, output: % coverage + total price per each supermarket)
- Algorithm 2.0 - Build an algorithm able to (input: list of ingredients, output: % coverage + total price per each supermarket)
- Algorithm 3.0 - Use fuzzy buzzy to match the input (potato) with the ingredients by similarity.

### 2.1 Algorithm 1.0
Build an algorithm able to: 

- [input]: ingredient

- [output]: price in each supermarket

In [18]:
# We are looking for the price of the ingredient that we are searching per each supermarket
supermarkets['aldinorth'][supermarkets['aldinorth']['Name'] == 'Nutella']['Price'].values[0]

3.79

In [27]:
# Function to get the prices with an ingredient as input
def search(ingredient:str):
    # New empty list
    result = []
    for market in supermarkets:
        a = supermarkets[market]
        try:
            price = a[a['Name'] == ingredient]['Price'].values[0]
            result.append(f'{market}: {price}')
        except:
            result.append(f'{market}: Not found')
    return result

# Try it with 'Nutella' because it is available in every supermarket
# Try with 'Coke'

In [28]:
# Testing the function
search('Nutella')

['aldinorth: 3.79', 'aldisouth: 3.89', 'edeka: 1.77']

In [29]:
# Testing the function
search('Coke')

['aldinorth: 2.89', 'aldisouth: 0.99', 'edeka: 1.39']

We can see that there are some inconsistencies with the 'Coca Cola' prices. The problem is that the size of the packs are not normalized. 

In [31]:
supermarkets['edeka'][supermarkets['edeka']['Name'] == 'Coke']

Unnamed: 0,Name,Price
0,Coke,1.39


In [39]:
supermarkets['edeka'][supermarkets['edeka']['Name'].str.contains('Coke')]

Unnamed: 0,Name,Price
0,Coke,1.39


### 2.2 Algorithm 2.0
Build an algorithm able to:

- [input]: list of ingredients (ing as it is in the dataset)
- [output]: % coverage + total price per each supermarket

In [40]:
def search_list(ingredients:list):
    # New empty list
    result = []
    
    for market in supermarkets:
        
        a = supermarkets[market]
        
        for ing in ingredients:    
            try:
                price = a[a['Name'] == ing]['Price'].values[0]
                result.append(f'{market}: {price} for the {ing}')
            except:
                result.append(f'{market}: Not found')
            
    return result

In [42]:
search_list(['Coke', 'Nutella'])

['aldinorth: 2.89 for the Coke',
 'aldinorth: 3.79 for the Nutella',
 'aldisouth: 0.99 for the Coke',
 'aldisouth: 3.89 for the Nutella',
 'edeka: 1.39 for the Coke',
 'edeka: 1.77 for the Nutella']

In [153]:
result = pd.DataFrame(columns=['Supermarket', 'Ingredient', 'Price', 'Coverage'])

In [154]:
result.append({'Supermarket': 'aldinorth', 'Ingredient': 'Coke', 'Price': 2.89, 'Coverage': 1}, ignore_index=True)

Unnamed: 0,Supermarket,Ingredient,Price,Coverage
0,aldinorth,Coke,2.89,1


In [157]:
result['Price'] = pd.to_numeric(result['Price'], errors='coerce')
result['Coverage'] = pd.to_numeric(result['Coverage'], errors='coerce')

In [158]:
result.dtypes

Supermarket    object
Ingredient     object
Price           int64
Coverage        int64
dtype: object

In [188]:
def search_list_grouped(ingredients:list):
    # New empty df
    result = pd.DataFrame(columns=['Supermarket', 'Ingredient', 'Price', 'Coverage'])

    for market in supermarkets:
        
        a = supermarkets[market]
        
        for ing in ingredients:    
            try:
                price = a[a['Name'] == ing]['Price'].values[0]
                df2 = {'Supermarket': market, 'Ingredient': ing, 'Price': price, 'Coverage': 1}
                result = result.append(df2, ignore_index=True)
            except:
                result = result.append({'Supermarket': market, 'Ingredient': ing, 'Price': np.nan, 'Coverage': 0}, ignore_index=True)
    
    # Change to numeric
    result['Price'] = pd.to_numeric(result['Price'], errors='coerce')
    result['Coverage'] = pd.to_numeric(result['Coverage'], errors='coerce') * 100
    
    # Group by 'Supermarket'
    results = result.groupby('Supermarket').agg({'Price':'sum', 'Coverage': 'mean'}).reset_index()
    results = results.sort_values('Price').reset_index(drop=True)
    
    # How to display it to users
#    for row in range(len(results)):
#        print(f"In {row['Supermarket']} you can find {row['Coverage']}% of the ingredients for row['Price']")
    
    return results

# 'result' dataframe contains detailes information
# 'results' dataframe contains grouped information

In [189]:
search_list_grouped(['Coke', 'Nutella', 'Vanilla', 'Tuna', 'Soap', 'Classic Tomato Sauce'])

Unnamed: 0,Supermarket,Price,Coverage
0,edeka,3.16,33.333333
1,aldisouth,4.88,33.333333
2,aldinorth,6.68,33.333333


As we can see, if we do not type the ingredient exactly as it is in the dataset, our algorithm is not able to find it. In order to make it more useful, we are going to add fuzzy words and create a basic search engine.

### 2.3 Algorithm 3.0

Build an algorithm able to:

- [input]: list of ingredients (similar word)
- [output]: % coverage + total price per each supermarket

We are going to use the fuzzywuzzy ratio

In [191]:
# Add the packages
from fuzzywuzzy import fuzz
import re

In [193]:
fuzz.ratio("Coca Cola", "Coca Cola")

100

In [195]:
fuzz.ratio("Coca Cola", "Coke")

31

In [199]:
fuzz.ratio("Coca Cola".lower(), "Coke".lower())

31

Extract the ingredient name

In [201]:
supermarkets['edeka']

Unnamed: 0,Name,Price
0,Coke,1.39
1,Nutella,1.77
2,Becel Gold 70% fat,1.49
3,Iglo fish fingers,2.99
4,Good & Cheap Landgasthof Goulash Pan,1.99
...,...,...
5043,Nivea Extreme Stay Lipstick 17 Extrem Pinkini,11.49
5044,Nivea Extreme Stay Lipstick 21 Extreme Cinnam,11.49
5045,Nivea Extreme Stay Lipstick 11 Extremely Beige,11.49
5046,Nivea Turbo Color Nail Polish 12 Turbo Red,7.89


- After the numbers I can extract the rest
- Brands ? 
- Black list with words that I can exclude

In [411]:
input_ing = "Sunflower oil"

In [412]:
matches = []

for ing in supermarkets['aldinorth']['Name']:
    ratio = fuzz.ratio(ing.lower(), input_ing.lower())

    if ratio > 60:
        matches.append((ing, ratio))

In [413]:
matches

[('Buttella fine sunflower oil', 65)]

In [414]:
matches = sorted(matches, key=lambda x: x[1], reverse=True)

In [415]:
# Add the best match to a list
clean_list = []

clean_list.append(matches[0][0])

In [416]:
clean_list

['Buttella fine sunflower oil']

Our final algorithm should be able to: 
    
    Step 1:
    -------
    
    - [Input] - [List of ingredients (not necessarly exactly same as database)]
    - [Output] - list of supermarkets with total price and coverage
    
    Step 2:
    -------
    
    - [Input] - user selects one supermarket
    - [Output] - Shopping list with ingredient + price
    
    Step 3:
    -------
    
    - [Output] - list of 5 items that they might be interested in shopping
    


In [436]:
supermarkets['aldinorth']

Unnamed: 0,Name,Price
0,Tamara extra strawberry jam,0.99
1,Nutella,3.79
2,Coke,2.89
3,Trader Joe's / Aldi Nord Lemon Iced Tea,0.49
4,Iglo fish fingers,3.49
...,...,...
2984,Moser Roth Fine Easter Eggs Fine Bitter 70%,3.65
2985,Moser Roth Fine Easter Eggs Noble Nougat,1.49
2986,Moser Roth Fine Easter Eggs Chocolate Candy,0.99
2987,Moser Roth Fine Easter Eggs Noble Collection w...,1.15


In [437]:
def shopping(ingredients:list):
    
            # ...
    supermarkets_lists = {
            'aldinorth': [],
            'aldisouth': [],
            'edeka': []
            #'kaufan': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
            #'rewe': pd.read_pickle('data/products-clean/edeka-products-clean.pkl')
            #'lidl':
        }
    
    for market in supermarkets:

        for ing in ingredients:
        
            # Empty dictionary where we will store the matching products
            matches = []   
        
            # Cada palabra, recorrer ingredientes del supermercado y generar lista
            for product in supermarkets[market]['Name']:
                ratio = fuzz.ratio(product.lower(), ing.lower())

                if ratio > 60:
                    matches.append((product, ratio))
                    matches = sorted(matches, key=lambda x: x[1], reverse=True)
                
            # If matches list is not empty
            if matches:
                supermarkets_lists[market].append(matches[0][0])
                        
    # Use the function .. to get the get the prices in the dif supermarkets
    return supermarkets_lists

# Cada supermercado

# Seleccionar la palabra con mejor puntuación

# Añadirla a una lista limpia que será el input de la siguiente función

The function has to have different input depending on the supermarket. That means that we have to modify the search_list_grouped function, since it has been developed to be able to take only 1 input argument.

In [438]:
shopping(['sausages', 'Coce', 'Sunflower oil', 'Garlic', 'Onion', 'Basmati rice'])

{'aldinorth': ['Coke', 'Buttella fine sunflower oil', 'Albona milk rice'],
 'aldisouth': ['Coke',
  'Bellasan sunflower oil',
  'garlic',
  'Onions',
  'RYZ Basmati rice'],
 'edeka': ['Coke',
  'Thomy sunflower oil',
  'garlic',
  'Onions',
  'Lien Ying Basmati rice']}

UNTIL HERE

In [439]:
####### COPY #######

def search_list_grouped(aldinorth=[]:list,
                        aldisouth=[]:list,
                        edeka=[]:list,
                        kaufan=[]:list,
                        lidl=[]:list,
                        rewe=[]:list):
    # New empty df
    result = pd.DataFrame(columns=['Supermarket', 'Ingredient', 'Price', 'Coverage'])

    for market in supermarkets:
        
        a = supermarkets[market]
        
        for ing in ingredients:    
            try:
                price = a[a['Name'] == ing]['Price'].values[0]
                df2 = {'Supermarket': market, 'Ingredient': ing, 'Price': price, 'Coverage': 1}
                result = result.append(df2, ignore_index=True)
            except:
                result = result.append({'Supermarket': market, 'Ingredient': ing, 'Price': np.nan, 'Coverage': 0}, ignore_index=True)
    
    # Change to numeric
    result['Price'] = pd.to_numeric(result['Price'], errors='coerce')
    result['Coverage'] = pd.to_numeric(result['Coverage'], errors='coerce') * 100
    
    # Group by 'Supermarket'
    results = result.groupby('Supermarket').agg({'Price':'sum', 'Coverage': 'mean'}).reset_index()
    results = results.sort_values('Price').reset_index(drop=True)
    
    # How to display it to users
#    for row in range(len(results)):
#        print(f"In {row['Supermarket']} you can find {row['Coverage']}% of the ingredients for row['Price']")
    
    return results

# 'result' dataframe contains detailes information
# 'results' dataframe contains grouped information

SyntaxError: invalid syntax (<ipython-input-439-107bc6770184>, line 3)