In [30]:
import pandas as pd
import pickle
import numpy as np
from fuzzywuzzy import fuzz
from sklearn.neighbors import NearestNeighbors

## 0. Setup

### 0.1 Import datasets

In [34]:
ingredient_features = pd.read_pickle('data/demo/ingredient_features.pkl')
unique_ingredients = pd.read_pickle('data/demo/unique_ingredients.pkl')

In [32]:
# import data
supermarkets = {
    'aldinorth': pd.read_pickle('data/products-clean/aldinorth-products-clean.pkl'),
    'aldisouth': pd.read_pickle('data/products-clean/aldisouth-products-clean.pkl'),
    'edeka': pd.read_pickle('data/products-clean/edeka-products-clean.pkl'),
    'kaufland': pd.read_pickle('data/products-clean/kaufland-products-clean.pkl'),
    'lidl': pd.read_pickle('data/products-clean/lidl-products-clean.pkl'),
    'rewe': pd.read_pickle('data/products-clean/rewe-products-clean.pkl')
}

### 0.2 Define Functions

#### 04_supermarkets-modeling

In [4]:
def get_products(ingredients:list):
    
    # Create an empty list per each supermarket
    supermarkets_lists = {
            'aldinorth': [],
            'aldisouth': [],
            'edeka': [],
            'kaufland': [],
            'lidl': [],
            'rewe': []
        }
    
    for market in supermarkets:

        for ing in ingredients:
            
            # Empty dictionary where we will store the matching products
            matches = []   
        
            # For each product, iterate through each supermarket and generate a list
            for product in supermarkets[market]['Name']:
                ratio = fuzz.ratio(product.lower(), ing.lower())

                if ratio > 50:
                    matches.append((product, ratio))
                    matches = sorted(matches, key=lambda x: x[1], reverse=True)
                
            # If matches list is not empty
            if matches:
                supermarkets_lists[market].append(matches[0][0])
            else: 
                supermarkets_lists[market].append(np.nan)

    return supermarkets_lists

In [5]:
def get_list_products_price(ingredients:list):
    
    # Create a dictionary with the supermarkets and a list with the products
    supermarkets_p = {
        'aldinorth': get_products(ingredients)['aldinorth'],
        'aldisouth': get_products(ingredients)['aldisouth'],
        'edeka': get_products(ingredients)['edeka'],
        'kaufland': get_products(ingredients)['kaufland'],
        'lidl': get_products(ingredients)['lidl'],
        'rewe': get_products(ingredients)['rewe']
    }
    
    # New empty df
    result = pd.DataFrame(columns=['Ingredient', 'Supermarket', 'Product', 'Price', 'Coverage'])

    # Iterate and populate a df with the final result
    for market in supermarkets_p:
        
        # Iterate per each product in each supermarket
        s = supermarkets[market] # Dictionary which contains the detailed info of all products per supermarket
        
        ### NEW TO FIND HOW TO ADD THE NAN with THE COVERAGE 0.
        for prod, ing in zip(supermarkets_p[market], ingredients):    
            try:
                price = s[s['Name'] == prod]['Price'].values[0]
                df2 = {'Ingredient': ing, 'Supermarket': market, 'Product': prod, 'Price': price, 'Coverage': 1}
                result = result.append(df2, ignore_index=True)
            except:
                result = result.append({'Ingredient': ing, 'Supermarket': market, 'Product': np.nan, 'Price': np.nan, 'Coverage': 0}, ignore_index=True)
    
    # 'result' contains detailed information about product and price per each supermarket
    
    # Change to numeric
    result['Price'] = pd.to_numeric(result['Price'], errors='coerce')
    result['Coverage'] = pd.to_numeric(result['Coverage'], errors='coerce')
    
    return result

In [6]:
def get_supermarket_choice(ingredients:list):
    
    result = get_list_products_price(ingredients) 
    
    # Group by 'Supermarket'
    results = result.groupby('Supermarket').agg({'Price':'sum', 'Coverage': 'mean'}).reset_index()
    results = results.sort_values('Price').reset_index(drop=True)
    
    return results

In [7]:
def get_shopping_list(ingredients:list):
    
    result = get_list_products_price(ingredients) 
    results = get_supermarket_choice(ingredients)
    
    # Ask the user where they would like to do the grocery shopping
    user_input = input('Where would you like to go shopping?\n')
    
    return result[result['Supermarket'] == user_input]

In [36]:
def get_translation(ingredients:list):
    
    clean_list = []
    
    # ...
    for ingredient in ingredients:

        matches = []

        for ing in unique_ingredients['ingredients']:
            ratio = fuzz.ratio(ing.lower(), ingredient.lower())

            if ratio > 50:
                matches.append((ing, ratio))
                matches = sorted(matches, key=lambda x: x[1], reverse=True)
                
        if matches:
            clean_list.append(matches[0][0])
        else: 
            clean_list.append(np.nan)
            
    return clean_list

In [45]:
def get_top10_recommendations(ingredients:list):
    
    ingredients = get_translation(ingredients)
    
    input_arr = np.array(ingredients)
    
    # Create empty DataFrame
    model = pd.DataFrame(columns=('Ingredient Position', 'Distance'))
    
    # Generate the model
    knn = model_knn.kneighbors(ingredient_features.loc[input_arr].values, n_neighbors=10)
    
    for i, ing in enumerate(ingredients):
        
        # Zip the ingredients position with the distance to the input 'ingredient'
        z = zip(knn[1][i].tolist(), knn[0][i].tolist())
        
        # Formate it as a list
        z_list = [(x, y) for x, y in z]
    
        # Create the DataFrame
        temp = pd.DataFrame(z_list, columns=('Ingredient Position', 'Distance'))
        
        # Drop the rows of the ingredients that are already in our list
        temp = temp.drop(temp[temp['Distance'] <= 0.001].index)

        # Concat the new list of (Position, Distance) with the final df
        model = pd.concat([model, temp])
        
    # Group by ingredient position and sort by distance
    model = model.groupby('Ingredient Position').agg({'Distance': 'mean'})
    model = model.sort_values(by='Distance', ascending=True).reset_index()
    
    # Print output header
    print(f'If you are buying {ingredients} you may also want to buy:')
    print(f'---------------------------------------------------------\n')
    
    for x in range(0, 9):
        # Print a list with the top 10 recommendations
        print(f'{x + 1}.{ingredient_features.iloc[model["Ingredient Position"][x]].name}')
    
    return

In [10]:
def demo(ingredients:list):
    
    # 1. Convert the ingredient list to product lists per each supermarket
    result = get_list_products_price(ingredients)
    
    # 2. Show an overview of their shopping list grouped by supermarket
    results = get_supermarket_choice(ingredients)
    
    print(results) # display as dataframe with print method..
    
    # 3. Ask the user where they would like to do the grocery shopping
    user_input = input('Where would you like to go shopping?\n')
    
    # 4. Show the user theis shopping list
    return result[result['Supermarket'] == user_input]
    
    # 5. Show products that they may be interested in buy

## HOW CAN I MAKE IT INTERACTIVE?
## Display -- > https://stackoverflow.com/questions/26873127/show-dataframe-as-table-in-ipython-notebook

## 0.3 Train KNN model

In [20]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

mat_ingredient_features = csr_matrix(ingredient_features)

model_knn = NearestNeighbors(metric="cosine",
                             algorithm="brute",
                             n_jobs=-1)

model_knn.fit(mat_ingredient_features)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

## 1. Ingredient list

In [47]:
ingredients = ['dijon mustard', 'dill weed']

# ['tofu', 'black pepper', 
#               'lemon', 'meatballs', 
#               'Coce', 'Sunflower oil', 
#               'Onion', 'Basmati rice', 
#               'Spaghetti', 'Miso soup', 
#               'Tempeh', 'Soya Sauce']

## 2. Which supermarket should we choose?

In [12]:
get_supermarket_choice(ingredients)

Unnamed: 0,Supermarket,Price,Coverage
0,aldinorth,9.45,0.818182
1,aldisouth,13.37,0.818182
2,lidl,13.41,0.818182
3,rewe,14.22,0.909091
4,edeka,16.42,0.727273
5,kaufland,22.24,0.818182


## 3. Can we get the Shopping List with the exact products?

In [50]:
get_shopping_list(ingredients)

Where would you like to go shopping?
rewe


Unnamed: 0,Ingredient,Supermarket,Product,Price,Coverage
10,dijon mustard,rewe,Lion mustard extra,1.49,1
11,dill weed,rewe,,,0


## 4. Customers that bought those items, also bought:

In [48]:
get_top10_recommendations(ingredients)

If you are buying ['Dijon mustard', 'dill weed'] you may also want to buy:
---------------------------------------------------------

1.sun-dried tomatoes chopped
2.zucchini cut in half lengthwise then sliced diagonally
3.fat-free chicken broth
4.fat-free sour cream
5.dry white wine
6.boneless skinless chicken breast halves
7.Dijon mustard
8.dill weed
9.oil
