In [1]:
import sys
import os
from os.path import join as osp
import numpy as np
from tqdm import tqdm

script_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(script_dir, '..'))
grand_dir = os.path.abspath(os.path.join(parent_dir, '..'))
sys.path.extend([parent_dir, grand_dir])
from constant import *

In [2]:
import pandas as pd
import ast
from transformers import AutoTokenizer, OpenAIGPTModel
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cuda')
tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
class Node:
    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None

def insert(root, value):
    if root is None:
        return Node(value)
    if value < root.value:
        root.left = insert(root.left, value)
    elif value > root.value:
        root.right = insert(root.right, value)
    return root

def build_bst(lst):
    root = None
    for sublist in lst:
        for item in sublist:
            root = insert(root, item)
    return root

def inorder_traversal(node, result_set):
    if node is not None:
        inorder_traversal(node.left, result_set)
        result_set.add(node.value)
        inorder_traversal(node.right, result_set)

def set_from_list_column(df, column_name):
    unique_set = set()
    root = build_bst(df[column_name])
    inorder_traversal(root, unique_set)
    return unique_set

def parse_string_to_list(string):
    result = ast.literal_eval(string)
    if isinstance(result[0], list):
        result = tokenizer.batch_decode(result)
    else:
        result = tokenizer.decode(result)
    return result

def data_preprocessing(lst):
    lst = ast.literal_eval(lst)
    text = ', '.join(lst)
    text = text.replace(' ', '-')
    embeddings = model.encode(text)
    return embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data_interactions_train = pd.read_csv(osp(PROJECT_ROOT, 'data/interactions_train.csv'))
data_interactions_train_rate = data_interactions_train.drop(columns=["user_id", "date", "recipe_id", "u"])
data_interactions_train_rate = data_interactions_train_rate.groupby("i", as_index=False).mean()
data_interactions_train_rate

Unnamed: 0,i,rating
0,0,4.50
1,1,4.00
2,2,5.00
3,3,3.80
4,4,4.50
...,...,...
160896,178255,5.00
160897,178256,5.00
160898,178257,5.00
160899,178261,5.00


In [4]:
data_pp_recipes = pd.read_csv(osp(PROJECT_ROOT, r'data/PP_recipes.csv'))
data_pp_recipes = data_pp_recipes.drop(columns=["name_tokens", "ingredient_tokens", "steps_tokens", "techniques", "ingredient_ids"])
data_pp_recipes

Unnamed: 0,id,i,calorie_level
0,424415,23,0
1,146223,96900,0
2,312329,120056,1
3,74301,168258,0
4,76272,109030,0
...,...,...,...
178260,323143,76862,1
178261,149114,145962,0
178262,34200,65066,2
178263,30618,77358,0


In [5]:
merged_df_1 = pd.merge(data_interactions_train_rate, data_pp_recipes, on='i', how='inner')
merged_df_1

Unnamed: 0,i,rating,id,calorie_level
0,0,4.50,40893,0
1,1,4.00,44394,0
2,2,5.00,85009,2
3,3,3.80,134728,1
4,4,4.50,200236,2
...,...,...,...,...
160896,178255,5.00,40514,0
160897,178256,5.00,190261,0
160898,178257,5.00,290157,0
160899,178261,5.00,492861,0


In [10]:
data_raw_recipes = pd.read_csv(osp(PROJECT_ROOT, r'data/RAW_recipes.csv'))
merged_df_2 = pd.merge(merged_df_1, data_raw_recipes, on='id', how='inner').drop(columns=["contributor_id", "submitted"])
merged_df_2.dropna(inplace=True, how='any')
merged_df_3 = merged_df_2.copy()
merged_df_3["nutrition"] = merged_df_3["nutrition"].apply(ast.literal_eval)
for column in ["tags", "steps", "ingredients"]:
    tqdm.pandas()
    merged_df_3[column] = merged_df_3[column].progress_map(data_preprocessing)
for column in ["name", "description"]:
    tqdm.pandas()
    merged_df_3[column] = merged_df_3[column].progress_map(model.encode)
for column in ["calorie_level", "minutes", "n_steps", "n_ingredients"]:
    merged_df_3[column] = merged_df_3["calorie_level"].astype(dtype=np.int32)
merged_df_3

100%|██████████| 157453/157453 [09:16<00:00, 282.93it/s]
100%|██████████| 157453/157453 [11:54<00:00, 220.46it/s]
100%|██████████| 157453/157453 [09:31<00:00, 275.66it/s]


Unnamed: 0,i,rating,id,calorie_level,name,minutes,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,0,4.500000,40893,0,white bean green chile pepper soup,0,"[0.028073952, -0.006903332, 0.014056096, 0.074...","[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]",0,"[-0.065314904, -0.027060697, -0.033178516, 0.0...",easy soup for the crockpot.,"[-0.030071119, -0.022510767, 0.022805776, -0.0...",0
2,2,5.000000,85009,2,baked potato toppings,2,"[-0.01025454, 0.018841082, 0.021573348, 0.0251...","[2786.2, 342.0, 134.0, 290.0, 161.0, 301.0, 42.0]",2,"[-0.0061680423, -0.06966538, -0.07100283, -0.0...",these toppings sure makes a nice change from p...,"[-0.0477214, -0.06811079, 0.015418942, 0.03810...",2
3,3,3.800000,134728,1,kfc honey bbq strips,1,"[0.017466733, -0.025005536, -0.0065729218, 0.0...","[316.0, 4.0, 40.0, 37.0, 78.0, 4.0, 10.0]",1,"[-0.036889493, -0.046186943, -0.04546317, 0.01...",these are so yummy and they do taste just like...,"[-0.03073587, -0.05026849, 0.008119164, 0.0174...",1
4,4,4.500000,200236,2,lamb stew with tomatoes chickpeas and spices,2,"[0.00435885, -0.019715838, 0.012276351, 0.0483...","[606.5, 65.0, 12.0, 34.0, 65.0, 83.0, 7.0]",2,"[-0.04397176, -0.066110075, -0.007693818, 0.02...",north african spices with a basic meat stew re...,"[-0.06495572, -0.052379385, 0.027239358, 0.040...",2
5,5,4.666667,254596,2,apple apricot pork chops crock pot,2,"[0.017859336, -0.018680792, 0.05262837, 0.0093...","[710.0, 44.0, 210.0, 7.0, 94.0, 50.0, 21.0]",2,"[0.04670352, -0.033276875, 0.0031858345, 0.045...","i'm not sure where i got this recipe, but it's...","[-0.02936025, -0.020462094, 0.012194063, 0.018...",2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
160896,178255,5.000000,40514,0,sun dried tomato bruschetta with goat cheese,0,"[0.027980803, 0.031470962, 0.03837033, 0.03320...","[94.6, 5.0, 4.0, 8.0, 8.0, 10.0, 3.0]",0,"[-0.08651913, -0.010349829, 0.0026058808, 0.01...",these are good. if you haven't tried goat chee...,"[-0.033260714, -0.012673882, 0.05784064, 0.039...",0
160897,178256,5.000000,190261,0,anise carrots,0,"[0.054065317, 0.01081251, -0.0032517803, 0.087...","[210.3, 18.0, 55.0, 13.0, 3.0, 36.0, 8.0]",0,"[-0.033476487, -0.0873968, -0.016068304, 0.087...",anise and carrots go so well together.,"[-0.046139833, -0.040238623, -0.014374842, 0.0...",0
160898,178257,5.000000,290157,0,mediterranean spice mix,0,"[-0.039675828, -0.016202087, 0.05217323, 0.022...","[10.9, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0]",0,"[-0.022726385, -0.07930781, 0.04042405, 0.0409...","sprinkle dry on new baby potatoes, meat or mix...","[-0.06696304, -0.050798386, 0.006047829, 0.002...",0
160899,178261,5.000000,492861,0,omani coffee,0,"[-0.0016710645, 0.020765956, 0.019067304, 0.03...","[16.3, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]",0,"[-0.020176716, -0.029519286, -0.033471283, 0.0...",the omani people are well known for their hosp...,"[0.0022629872, -0.022872927, 0.032064673, 0.12...",0


In [13]:
merged_df_3.to_csv('data/clean_data.csv', index=False)

In [18]:
merged_df_3["nutrition"] = merged_df_3["nutrition"].apply(ast.literal_eval)

In [19]:
merged_df_3["X"] = merged_df_3["name"] + merged_df_3["tags"] + merged_df_3["nutrition"] + merged_df_3["steps"] + merged_df_3["description"] + merged_df_3["ingredients"]
merged_df_3["y"] = merged_df_3["rating"]

ValueError: operands could not be broadcast together with shapes (384,) (7,) 