# DATA Integration
- The objective is to combine data collected from several different sources
- tag each record with veg/non-veg
- tag each food data with allergy by using corpus collected for allergy_data
- handling ingredients
- tagging seasonal data
- handling nutrition and tagging for dietary restrictions

In [None]:
# import all common necessary packages
import pandas as pd
import numpy as np
import json
import re
import ast

# for postgres SQL database
import psycopg2

import warnings
warnings.filterwarnings('ignore')

### Configuration

In [None]:
import logging

# Release metadata for this notebook.
VERSION = "v0.1.0"
RELEASE_DATE = "16 Jun 2023"

##### DATABASE RELATED #####
# Settings for a PostgreSQL server running on this machine.
DB_CONFIG_LOCAL = dict(
    dbname="food_db",
    host="localhost",
    port=5432,
    username="postgres",
    password="postgres",
)

# Settings for the cloud database; host and password are blank here
# (presumably to be filled in before this profile is selected).
DB_CONFIG_CLOUD = dict(
    dbname="food_db",
    host="",
    port=5432,
    username="postgres",
    password="",
)

# select which database (local or cloud)
DB_CONFIG = DB_CONFIG_LOCAL

### Database Functions

In [None]:
class DatabaseAccess:
    """Holds one psycopg2 connection built from a settings dictionary."""

    def __init__(self, db_config):
        """Connect to the database described by ``db_config``.

        ``db_config`` must provide the keys ``dbname``, ``username``,
        ``password``, ``host`` and ``port``.
        """
        connect_kwargs = {
            "database": db_config['dbname'],
            "user": db_config['username'],
            "password": db_config['password'],
            "host": db_config['host'],
            "port": db_config['port'],
        }
        self.conn = psycopg2.connect(**connect_kwargs)

    def getConnection(self):
        """Return the connection opened by the constructor."""
        return self.conn

In [None]:
# Open the notebook-wide database connection using the selected DB_CONFIG;
# the read_sql_query calls further down all reuse this one connection.
gbl_db_conn = DatabaseAccess(DB_CONFIG).getConnection()

### Read all data from Database

In [None]:
# Load each source table in full into its own DataFrame.

# recipes (the table name is spelled "raw_recipies" in the query)
df_recipes_sql = pd.read_sql_query('select * from "raw_recipies"',con=gbl_db_conn)

# user interactions are currently not loaded (query kept for reference)
#df_users_sql = pd.read_sql_query('select * from "raw_interactions"',con=gbl_db_conn)

# Indian food dataset
df_ind_food_sql = pd.read_sql_query('select * from "indian_food"', con=gbl_db_conn)

# Indian food 101 dataset
df_ind_food101_sql = pd.read_sql_query('select * from "indian_food101"',con=gbl_db_conn)


In [None]:
print("recipes shape: ", df_recipes_sql.shape)
print("india food shape: ", df_ind_food_sql.shape)
print("india food101 shape: ", df_ind_food101_sql.shape)

recipes shape:  (231637, 6)
india food shape:  (6871, 8)
india food101 shape:  (255, 10)


In [None]:
# Tag each dataset with the name of the source it came from, so the origin of
# every row is still known after the three frames are combined.

DATA_SOURCE_COL_NAME = "data_source"
df_recipes_sql[DATA_SOURCE_COL_NAME] = "raw_recipes"
df_ind_food_sql[DATA_SOURCE_COL_NAME] = "ind_food"
df_ind_food101_sql[DATA_SOURCE_COL_NAME] = "ind_food101"

In [None]:
df_recipes_sql.head()

Unnamed: 0,food_id,food_name,food_description,ingredients,nutrition,created_time,data_source
0,137739,arriba baked winter squash mexican style,autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",2023-06-17 00:04:53.882478,raw_recipes
1,31490,a bit different breakfast pizza,this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",2023-06-17 00:04:53.891878,raw_recipes
2,112140,all in the kitchen chili,this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",2023-06-17 00:04:53.892491,raw_recipes
3,59389,alouette potatoes,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",2023-06-17 00:04:53.892945,raw_recipes
4,44061,amish tomato ketchup for canning,my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",2023-06-17 00:04:53.893248,raw_recipes


In [None]:
df_ind_food_sql.head()

Unnamed: 0,food_id,food_name,food_description,ingredients,cuisine,course,diet,created_time,data_source
0,600000,Masala Karela,,"[Karela deseeded, Salt, Onion, table Gram flou...",Indian,Side Dish,Diabetic Friendly,2023-06-17 00:09:59.710310,ind_food
1,600001,Spicy Tomato Rice,,"[rice, tomatoes, teas Belle Bhat, salt per, te...",South Indian Recipes,Main Course,Vegetarian,2023-06-17 00:09:59.713008,ind_food
2,600002,Ragi Semiya Upma Ragi Millet Vermicelli Brea...,,"[Rice Vermicelli Noodles, Onion, Carrots ped, ...",South Indian Recipes,South Indian Breakfast,High Protein Vegetarian,2023-06-17 00:09:59.713339,ind_food
3,600003,Gongura Chicken Curry Andhra Style Gongura C...,,"[Chicken, Onion ped, Tomato ped, Green Chillie...",Andhra,Lunch,Non Vegeterian,2023-06-17 00:09:59.713584,ind_food
4,600004,Andhra Style Alam Pachadi Adrak Chutney,,"[table chana dal, table white urad dal, red ch...",Andhra,South Indian Breakfast,Vegetarian,2023-06-17 00:09:59.713810,ind_food


In [None]:
df_ind_food101_sql.head()

Unnamed: 0,id,food_id,food_name,food_description,ingredients,course,diet,state,region,created_time,data_source
0,1,700000,Balu shahi,,"[Maida flour, yogurt, oil, sugar]",dessert,vegetarian,West Bengal,East,2023-06-17 00:10:01.922589,ind_food101
1,2,700001,Boondi,,"[Gram flour, ghee, sugar]",dessert,vegetarian,Rajasthan,West,2023-06-17 00:10:01.926956,ind_food101
2,3,700002,Gajar ka halwa,,"[Carrots, milk, sugar, ghee, cashews, raisins]",dessert,vegetarian,Punjab,North,2023-06-17 00:10:01.927290,ind_food101
3,4,700003,Ghevar,,"[Flour, ghee, kewra, milk, clarified butter, s...",dessert,vegetarian,Rajasthan,West,2023-06-17 00:10:01.927599,ind_food101
4,5,700004,Gulab jamun,,"[Milk powder, plain flour, baking powder, ghee...",dessert,vegetarian,West Bengal,East,2023-06-17 00:10:01.927930,ind_food101


In [None]:
# Stack the three datasets row-wise into one frame with a fresh 0..n-1 index.
# The result has the union of all columns; a column that a source does not
# have is left empty (missing) for that source's rows.
df_merged = pd.concat([df_recipes_sql, df_ind_food_sql, df_ind_food101_sql], ignore_index=True, sort=False)

In [None]:
print("Merged dataset shape: ", df_merged.shape)
df_merged.head()

Merged dataset shape:  (238763, 13)


Unnamed: 0,food_id,food_name,food_description,ingredients,nutrition,created_time,data_source,cuisine,course,diet,id,state,region
0,137739,arriba baked winter squash mexican style,autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",2023-06-17 00:04:53.882478,raw_recipes,,,,,,
1,31490,a bit different breakfast pizza,this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",2023-06-17 00:04:53.891878,raw_recipes,,,,,,
2,112140,all in the kitchen chili,this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",2023-06-17 00:04:53.892491,raw_recipes,,,,,,
3,59389,alouette potatoes,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",2023-06-17 00:04:53.892945,raw_recipes,,,,,,
4,44061,amish tomato ketchup for canning,my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",2023-06-17 00:04:53.893248,raw_recipes,,,,,,


In [None]:
# Report whether any food_id occurs more than once in the merged data.
if df_merged['food_id'].is_unique:
    print("food_id is unique across dataset")
else:
    print("Found duplicate food id")

food_id is unique across dataset


## Tag each food with Veg/Non-Veg

In [None]:
# Load the non-vegetarian keyword table; check_non_veg reads its 'keywords'
# column to classify each food.
df_nonveg_sql = pd.read_sql_query('select * from "nonveg_keywords"',con=gbl_db_conn)

In [None]:
import math

def _lowered_fragments(value):
    """Return the lower-cased text pieces held by one field of a food record.

    A string yields one piece, a collection yields one piece per element, and
    a missing value (None or NaN) yields no pieces.
    """
    if value is None:
        return []
    if isinstance(value, str):
        return [value.lower()]
    if isinstance(value, float):
        # NaN is the only float that is not equal to itself.
        return [] if value != value else [str(value)]
    try:
        return [str(part).lower() for part in value if part is not None]
    except TypeError:
        # Not iterable: fall back to its text form.
        return [str(value).lower()]


def check_non_veg(row, non_veg_keywords=None):
    """Classify one food record as 'non-veg' or 'veg'.

    A record is 'non-veg' when any keyword occurs (case-insensitively, as a
    substring) in its food name, its description, or any of its ingredients.
    Ingredients are matched the same way as the text fields, so the keyword
    'chicken' also catches the ingredient 'Chicken thigh'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'food_name', 'food_description' and 'ingredients'.
        Missing values (None / NaN) are treated as empty text.
    non_veg_keywords : iterable of str, optional
        Keywords to look for. Defaults to the 'keywords' column of the
        module-level ``df_nonveg_sql`` table.

    Returns
    -------
    str
        'non-veg' or 'veg'.
    """
    if non_veg_keywords is None:
        non_veg_keywords = df_nonveg_sql['keywords']

    # Skip empty / non-text keywords: an empty string is a substring of every
    # text and would otherwise mark every record as non-veg.
    keywords = [k.lower() for k in non_veg_keywords if isinstance(k, str) and k]

    # Lower-case every field once instead of once per keyword.
    fragments = []
    for field in ('food_name', 'food_description', 'ingredients'):
        fragments.extend(_lowered_fragments(row[field]))

    if any(keyword in fragment for fragment in fragments for keyword in keywords):
        return 'non-veg'

    return 'veg'

# Classify every merged row; check_non_veg is called once per row (axis=1).
df_merged['veg_or_non_veg'] = df_merged.apply(check_non_veg, axis=1)

In [None]:
df_merged['veg_or_non_veg'].value_counts()

non-veg    128139
veg        110624
Name: veg_or_non_veg, dtype: int64

## Convert the 'ingredients' list to an 'ingredients_str' variable, which holds all ingredients as a single comma-separated string

In [None]:
# join each ingredient list into one comma-separated string
df_merged['ingredients_str'] = df_merged['ingredients'].apply(', '.join)

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources requested by this notebook; the downloader only
# reports "already up-to-date" when a package is present locally.
nltk.download('omw-1.4')

# Download nltk's wordnet and punkt packages
nltk.download('wordnet')
nltk.download('punkt')

# Single shared lemmatizer instance, used by lemmatize_text below
lemmatizer = WordNetLemmatizer()

# define a function to lemmatize a sentence
def lemmatize_text(text):
    """Tokenize ``text`` and return its lemmatized tokens joined by single spaces."""
    tokens = nltk.word_tokenize(text)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

# Lemmatize the comma-separated ingredient string of every row. Tokens are
# re-joined with spaces, so commas end up surrounded by spaces
# ("winter squash , mexican seasoning , ...").
df_merged['ingredients_str_lemmatized'] = df_merged['ingredients_str'].apply(lemmatize_text)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\biren\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\biren\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\biren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
df_merged['ingredients_str_lemmatized']

0         winter squash , mexican seasoning , mixed spic...
1         prepared pizza crust , sausage patty , egg , m...
2         ground beef , yellow onion , diced tomato , to...
3         spreadable cheese with garlic and herb , new p...
4         tomato juice , apple cider vinegar , sugar , s...
                                ...                        
238758             Glutinous rice , black sesame seed , gur
238759    Coconut milk , egg yolk , clarified butter , a...
238760    Cottage cheese , dry date , dried rose petal ,...
238761    Milk powder , dry fruit , arrowroot powder , a...
238762    Brown rice , fennel seed , grated coconut , bl...
Name: ingredients_str_lemmatized, Length: 238763, dtype: object

In [None]:
# store as list of word and ignore large ingredients text

def ingr_cleaning(text, length_limit=127):
    """Split comma-separated ingredient text into a list of trimmed names.

    Parameters
    ----------
    text : str
        Ingredient names separated by commas; whitespace around each name is
        removed.
    length_limit : int, optional
        Names with ``length_limit`` or more characters are discarded
        (default 127, i.e. names of up to 126 characters are kept).

    Returns
    -------
    list of str
        The remaining names in their original order. Pieces that are empty
        after trimming (empty input, doubled or trailing commas) are dropped
        instead of being stored as '' entries.
    """
    trimmed = (piece.strip() for piece in text.split(','))
    return [name for name in trimmed if name and len(name) < length_limit]

# Turn each lemmatized ingredient string back into a list of ingredient names.
df_merged['ingredients_clean'] = df_merged['ingredients_str_lemmatized'].apply(ingr_cleaning)

In [None]:
df_merged.head()

Unnamed: 0,food_id,food_name,food_description,ingredients,nutrition,created_time,data_source,cuisine,course,diet,id,state,region,veg_or_non_veg,ingredients_str,ingredients_str_lemmatized,ingredients_clean
0,137739,arriba baked winter squash mexican style,autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",2023-06-17 00:04:53.882478,raw_recipes,,,,,,,veg,"winter squash, mexican seasoning, mixed spice,...","winter squash , mexican seasoning , mixed spic...","[winter squash, mexican seasoning, mixed spice..."
1,31490,a bit different breakfast pizza,this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",2023-06-17 00:04:53.891878,raw_recipes,,,,,,,non-veg,"prepared pizza crust, sausage patty, eggs, mil...","prepared pizza crust , sausage patty , egg , m...","[prepared pizza crust, sausage patty, egg, mil..."
2,112140,all in the kitchen chili,this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",2023-06-17 00:04:53.892491,raw_recipes,,,,,,,non-veg,"ground beef, yellow onions, diced tomatoes, to...","ground beef , yellow onion , diced tomato , to...","[ground beef, yellow onion, diced tomato, toma..."
3,59389,alouette potatoes,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",2023-06-17 00:04:53.892945,raw_recipes,,,,,,,veg,"spreadable cheese with garlic and herbs, new p...","spreadable cheese with garlic and herb , new p...","[spreadable cheese with garlic and herb, new p..."
4,44061,amish tomato ketchup for canning,my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",2023-06-17 00:04:53.893248,raw_recipes,,,,,,,veg,"tomato juice, apple cider vinegar, sugar, salt...","tomato juice , apple cider vinegar , sugar , s...","[tomato juice, apple cider vinegar, sugar, sal..."


In [None]:
# Drop the two intermediate string columns that were only used to build 'ingredients_clean'.
df_merged = df_merged.drop(['ingredients_str_lemmatized','ingredients_str'], axis=1)

## Tag each food with Allergies

In [None]:
# read allergy corpus from db: one row per (allergy, list of trigger ingredients)
df_allergy_sql = pd.read_sql_query('select * from "allergy_data"',con=gbl_db_conn)

In [None]:
df_allergy_sql.head()

Unnamed: 0,allergy,ingredients,created_time
0,allium,"[asparagus, garlic, leek, nira, onion, shallot...",2023-06-17 00:10:20.198287
1,allium,[welsh],2023-06-17 00:10:20.200643
2,alpha-gal syndrome,"[cattle, deer, goat, horse, pig, sheep, rabbit]",2023-06-17 00:10:20.201501
3,aquagenic urticaria,[mineral water],2023-06-17 00:10:20.202126
4,beer,[hop],2023-06-17 00:10:20.202827


In [None]:
import re

def _merge_allergy_keywords(pairs):
    """Collect the lower-cased keywords of each allergy from (allergy, keywords) pairs.

    An allergy that appears in several pairs gets the union of their keywords,
    and allergies keep the order in which they were first seen.
    """
    merged = {}
    for allergy, keywords in pairs:
        bucket = merged.setdefault(allergy, [])
        if keywords is None:
            continue
        if isinstance(keywords, str):
            keywords = [keywords]
        for keyword in keywords:
            keyword = str(keyword).strip().lower()
            # An empty keyword would match every text, so it is skipped.
            if keyword and keyword not in bucket:
                bucket.append(keyword)
    return merged


def detect_allergy(ingr_list, allergy_keywords=None):
    """Return the allergies triggered by a list of ingredients.

    The ingredients are converted to lower-case text with every run of
    non-word characters replaced by a space; an allergy is reported when at
    least one of its keywords occurs in that text as a whole word or phrase.

    Parameters
    ----------
    ingr_list : list of str
        Ingredient names of one food.
    allergy_keywords : mapping or iterable of (allergy, keywords) pairs, optional
        Keywords per allergy. Defaults to the 'allergy' / 'ingredients'
        columns of the module-level ``df_allergy_sql`` table. An allergy that
        occurs in several rows contributes the keywords of all of them;
        building a plain dict from the rows would keep only the last row.

    Returns
    -------
    list of str
        Matching allergy names, in order of first appearance in the corpus.
    """
    if allergy_keywords is None:
        allergy_keywords = zip(df_allergy_sql['allergy'], df_allergy_sql['ingredients'])
    elif hasattr(allergy_keywords, 'items'):
        allergy_keywords = allergy_keywords.items()
    keywords_by_allergy = _merge_allergy_keywords(allergy_keywords)

    ingredients = str(ingr_list).lower()
    cleaned_ingr = re.sub(r'\W+', ' ', ingredients)  # Remove non-alphanumeric characters

    allergies = []
    for allergy, keywords in keywords_by_allergy.items():
        # Word boundaries keep 'hop' from matching inside 'chopped'.
        if any(re.search(r"\b" + re.escape(keyword) + r"\b", cleaned_ingr) for keyword in keywords):
            allergies.append(allergy)

    return allergies

# Tag every food with the allergies detected in its cleaned ingredient list.
df_merged['allergies'] = df_merged['ingredients_clean'].apply(detect_allergy)

In [None]:
df_merged['allergies'].value_counts()

[milk, lactose, dairy]                                                                               11377
[milk, lactose, sugar, dairy]                                                                        10162
[]                                                                                                    4969
[milk, lactose, nut, sugar, dairy]                                                                    4330
[milk, lactose, poultry, dairy]                                                                       3049
                                                                                                     ...  
[histamine, ltp, milk, lactose, nut, dairy]                                                              1
[gluten, hypersensitivity, nightshade, dairy]                                                            1
[hypersensitivity, nightshade, nut, sugar, dairy, sulfites, sulphites]                                   1
[citrus, cruciferous, hypersensitivit

In [None]:
# store empty nutritions as empty list
# Missing values are detected by value (None or any float NaN) rather than by
# identity with np.nan, which misses None and NaN objects other than np.nan.
df_merged["nutrition"] = df_merged["nutrition"].apply(
    lambda x: [] if x is None or (isinstance(x, float) and np.isnan(x)) else x
)

In [None]:
df_merged.head()

Unnamed: 0,food_id,food_name,food_description,ingredients,nutrition,created_time,data_source,cuisine,course,diet,id,state,region,veg_or_non_veg,ingredients_clean,allergies
0,137739,arriba baked winter squash mexican style,autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",2023-06-17 00:04:53.882478,raw_recipes,,,,,,,veg,"[winter squash, mexican seasoning, mixed spice...","[honey, milk, lactose, olive, squash, dairy]"
1,31490,a bit different breakfast pizza,this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",2023-06-17 00:04:53.891878,raw_recipes,,,,,,,non-veg,"[prepared pizza crust, sausage patty, egg, mil...","[gluten, milk, lactose, dairy]"
2,112140,all in the kitchen chili,this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",2023-06-17 00:04:53.892491,raw_recipes,,,,,,,non-veg,"[ground beef, yellow onion, diced tomato, toma...","[legume, ltp, milk, lactose, nightshade, dairy]"
3,59389,alouette potatoes,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",2023-06-17 00:04:53.892945,raw_recipes,,,,,,,veg,"[spreadable cheese with garlic and herb, new p...","[hypersensitivity, milk, lactose, olive, potat..."
4,44061,amish tomato ketchup for canning,my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",2023-06-17 00:04:53.893248,raw_recipes,,,,,,,veg,"[tomato juice, apple cider vinegar, sugar, sal...","[nightshade, oral, sugar]"


In [None]:
df_merged.shape

(238763, 16)

In [None]:
# save intermediate result (veg/non-veg and allergy tags) as CSV;
# the path is relative to the current working directory
df_merged.to_csv("./data/processed/food_data_with_allergy.csv")

### Handling Nutrition

#### Convert the 'nutrition' column to the 7 numerical nutrient columns - 'Fructose', 'Galactose', 'Glucose', 'Lactose', 'Maltose', 'Fat', 'Caffeine' - then convert these 7 numerical columns to categorical columns defining the levels 'High', 'Medium' and 'Low', and finally convert the combination of nutrient columns to a single dietary restrictions column ('dietary_restrictions')

**Labelling the Nutrients** <br>
The nutrients provided in the dataset have the following features / consequences: <br>


**Fructose:** Fructose is a type of sugar found in fruits and some sweeteners. Individuals with fructose intolerance may need to limit their intake of fructose-containing foods. <br>


**Galactose:** Galactose is a sugar found in dairy products. People with galactosemia have a deficiency in an enzyme that breaks down galactose. As a result, they need to avoid galactose and lactose, which is made up of galactose and glucose. <br>


**Glucose:** Glucose intolerance refers to difficulties in processing glucose, a simple sugar that is the primary source of energy for the body. Individuals with glucose intolerance may need to monitor their carbohydrate intake, including foods with high glucose content. <br>


**Lactose:** Lactose is a sugar found in dairy products. People with lactose intolerance have a deficiency in the enzyme lactase, which breaks down lactose. They may need to limit or avoid lactose-containing foods.<br>


**Maltose:** Maltose is a sugar found in grains and malted foods. Some individuals may have difficulty digesting maltose due to a deficiency in the enzyme maltase. These individuals may need to limit their intake of maltose-containing foods. <br>


**Fat:** A low-fat diet involves reducing the consumption of foods high in fat, including saturated and trans fats. This type of diet is often recommended for individuals with certain health conditions, such as heart disease or high cholesterol. <br>


**Caffeine:** Some individuals may be sensitive to caffeine, a natural stimulant found in various foods and beverages. They may need to limit or avoid caffeine-containing products to prevent adverse effects such as increased heart rate or insomnia.

In [None]:
import ast

# Convert the string values in the 'nutrition' column into actual lists
# (disabled: the values are used as lists directly below)
#data['nutrition'] = data['nutrition'].apply(ast.literal_eval)

# Split the 'nutrition' column into a new dataframe: one column per list
# position; rows with an empty list presumably end up as NaN in every column.
nutrition_data = df_merged['nutrition'].apply(pd.Series)

# Name the new columns. This assignment requires the split to have produced
# exactly seven columns.
# NOTE(review): the names assume the seven values are stored in this nutrient
# order - confirm against the data source.
nutrition_data.columns = ['Fructose', 'Galactose', 'Glucose', 'Lactose', 'Maltose', 'Fat', 'Caffeine']

# Drop the original 'nutrition' column (disabled: the column is kept)
#data = df_merged.drop('nutrition', axis=1)

# Concatenate the original dataframe with the new 'nutrition' dataframe.
# Re-running this cell appends a second copy of the seven columns.
df_merged = pd.concat([df_merged, nutrition_data], axis=1)

In [None]:
df_merged.head()

Unnamed: 0,food_id,food_name,food_description,ingredients,nutrition,created_time,data_source,cuisine,course,diet,...,veg_or_non_veg,ingredients_clean,allergies,Fructose,Galactose,Glucose,Lactose,Maltose,Fat,Caffeine
0,137739,arriba baked winter squash mexican style,autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",2023-06-17 00:04:53.882478,raw_recipes,,,,...,veg,"[winter squash, mexican seasoning, mixed spice...","[honey, milk, lactose, olive, squash, dairy]",51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,31490,a bit different breakfast pizza,this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",2023-06-17 00:04:53.891878,raw_recipes,,,,...,non-veg,"[prepared pizza crust, sausage patty, egg, mil...","[gluten, milk, lactose, dairy]",173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,112140,all in the kitchen chili,this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",2023-06-17 00:04:53.892491,raw_recipes,,,,...,non-veg,"[ground beef, yellow onion, diced tomato, toma...","[legume, ltp, milk, lactose, nightshade, dairy]",269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,59389,alouette potatoes,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",2023-06-17 00:04:53.892945,raw_recipes,,,,...,veg,"[spreadable cheese with garlic and herb, new p...","[hypersensitivity, milk, lactose, olive, potat...",368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,44061,amish tomato ketchup for canning,my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",2023-06-17 00:04:53.893248,raw_recipes,,,,...,veg,"[tomato juice, apple cider vinegar, sugar, sal...","[nightshade, oral, sugar]",352.9,1.0,337.0,23.0,3.0,0.0,28.0


**Define the threshold value of the nutrients:** <br>
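The threshold values for labeling nutrients as high, medium, or low depend on the specific food regulatory agencies and guidelines followed in each country. We have considered the following rule-based values, based on the Indian regulatory authority. Note that the categorisation functions in the code cell below currently use different cut-offs (High above 30, Medium from 5 to 30, Low otherwise; for caffeine High above 5, Medium from 2 to 5) - the two should be reconciled. <br>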


**Fructose:** <br>
**High:** Above 15 grams per serving. <br>
**Medium:** Between the lower threshold and upper threshold. <br>
**Low:** Below 5 grams per serving. <br>


**Galactose:** <br>
**High:** Above 10 grams per serving. <br>
**Medium:** Between the lower threshold and upper threshold. <br>
**Low:** Below 2 grams per serving.<br>


**Glucose:** <br>
**High:** Above 10 grams per serving. <br>
**Medium:** Between the lower threshold and upper threshold. <br>
**Low:** Below 2 grams per serving. <br>


**Lactose:** <br>
**High:** Above 12 grams per serving. <br>
**Medium:** Between the lower threshold and upper threshold. <br>
**Low:** Below 2 grams per serving. <br>


**Maltose:** <br>
**High:** Above 10 grams per serving. <br>
**Medium:** Between the lower threshold and upper threshold. <br>
**Low:** Below 2 grams per serving. <br>


**Fat:** <br>
**High:** Above 20 grams per serving. <br>
**Medium:** Between the lower threshold and upper threshold. <br>
**Low:** Below 3 grams per serving. <br>


**Caffeine:** <br>
**High:** Above 0.150 grams per serving. <br>
**Medium:** Between the lower threshold and upper threshold. <br>
**Low:** Below 0.050 grams per serving. <br>

In [None]:
# Defining the function for each nutrient
# (lower bound of 'Medium', upper bound of 'Medium') per nutrient.
# NOTE(review): these cut-offs differ from the thresholds listed in the
# notebook text above this cell - confirm which set is intended.
_NUTRIENT_THRESHOLDS = {
    'Fructose': (5, 30),
    'Galactose': (5, 30),
    'Glucose': (5, 30),
    'Lactose': (5, 30),
    'Maltose': (5, 30),
    'Fat': (5, 30),
    'Caffeine': (2, 5),
}


def _categorize_level(value, low, high):
    """Bucket one nutrient value into 'High', 'Medium' or 'Low'.

    Returns 'High' above ``high``, 'Medium' from ``low`` to ``high``
    (both inclusive) and 'Low' below ``low``. A missing value (None or NaN)
    returns None instead of 'Low', so foods without nutrition data are not
    reported as being low in that nutrient.

    Raises ValueError if ``value`` cannot be converted to a number.
    """
    if value is None:
        return None
    value = pd.to_numeric(value)
    if pd.isna(value):
        return None
    if value > high:
        return 'High'
    if value >= low:
        return 'Medium'
    return 'Low'


def categorize_fructose(value):
    """Return the fructose level ('High', 'Medium', 'Low', or None if missing)."""
    return _categorize_level(value, *_NUTRIENT_THRESHOLDS['Fructose'])


def categorize_galactose(value):
    """Return the galactose level ('High', 'Medium', 'Low', or None if missing)."""
    return _categorize_level(value, *_NUTRIENT_THRESHOLDS['Galactose'])


def categorize_glucose(value):
    """Return the glucose level ('High', 'Medium', 'Low', or None if missing)."""
    return _categorize_level(value, *_NUTRIENT_THRESHOLDS['Glucose'])


def categorize_lactose(value):
    """Return the lactose level ('High', 'Medium', 'Low', or None if missing)."""
    return _categorize_level(value, *_NUTRIENT_THRESHOLDS['Lactose'])


def categorize_maltose(value):
    """Return the maltose level ('High', 'Medium', 'Low', or None if missing)."""
    return _categorize_level(value, *_NUTRIENT_THRESHOLDS['Maltose'])


def categorize_fat(value):
    """Return the fat level ('High', 'Medium', 'Low', or None if missing)."""
    return _categorize_level(value, *_NUTRIENT_THRESHOLDS['Fat'])


def categorize_caffeine(value):
    """Return the caffeine level ('High', 'Medium', 'Low', or None if missing)."""
    return _categorize_level(value, *_NUTRIENT_THRESHOLDS['Caffeine'])

# Replace the numeric value in each nutrient column with its level label,
# using that nutrient's own categorisation function.
df_merged['Fructose'] = df_merged['Fructose'].apply(categorize_fructose)
df_merged['Galactose'] = df_merged['Galactose'].apply(categorize_galactose)
df_merged['Glucose'] = df_merged['Glucose'].apply(categorize_glucose)
df_merged['Lactose'] = df_merged['Lactose'].apply(categorize_lactose)
df_merged['Maltose'] = df_merged['Maltose'].apply(categorize_maltose)
df_merged['Fat'] = df_merged['Fat'].apply(categorize_fat)
df_merged['Caffeine'] = df_merged['Caffeine'].apply(categorize_caffeine)

In [None]:
def map_dietary_restrictions(row):
    """Derive the dietary-restriction labels of one food from its nutrient levels.

    ``row`` must provide the level columns 'Fructose', 'Galactose', 'Glucose',
    'Lactose', 'Maltose', 'Fat' and 'Caffeine'. A label is included when every
    column it depends on equals 'Low'. Returns the list of labels (possibly
    empty) in the order in which the rules are listed below.
    """
    # (label, columns that must all be 'Low')
    rules = (
        ('low sugar', ('Fructose', 'Galactose', 'Glucose', 'Maltose')),
        ('low fat', ('Fat',)),
        ('low carb', ('Glucose',)),
        ('low caffeine', ('Caffeine',)),
        ('low lactose', ('Galactose', 'Lactose')),
    )
    return [
        label
        for label, columns in rules
        if all(row[column] == 'Low' for column in columns)
    ]

# Build the 'dietary_restrictions' list for every row from its nutrient level columns.
df_merged['dietary_restrictions'] = df_merged.apply(map_dietary_restrictions, axis=1)

In [None]:
# The level columns were only needed to derive 'dietary_restrictions'; remove them again.
df_merged = df_merged.drop(['Fructose','Galactose','Glucose','Lactose','Maltose','Fat','Caffeine'], axis=1)

In [None]:
df_merged.columns

Index(['food_id', 'food_name', 'food_description', 'ingredients', 'nutrition',
       'created_time', 'data_source', 'cuisine', 'course', 'diet', 'id',
       'state', 'region', 'veg_or_non_veg', 'ingredients_clean', 'allergies',
       'dietary_restrictions'],
      dtype='object')

In [None]:
# store intermediate results (now including 'dietary_restrictions') as CSV;
# the path is relative to the current working directory
df_merged.to_csv("./data/processed/food_data_with_dietary_restriction.csv")

### Tag each food with Season name based on the season corpus

In [None]:
import re

def detect_season(text):
    """Return the seasons whose keywords appear as whole words in ``text``.

    Matching is case-insensitive. The result lists the matching seasons in the
    fixed order spring, summer, autumn, winter and is empty when no keyword is
    present. The word "cold" counts for both autumn and winter.
    """
    season_keywords = {
        "spring": ["spring", "bloom", "flowers", "warm"],
        "summer": ["summer", "hot", "sun", "beach"],
        "autumn": ["autumn", "fall", "leaves", "cold", "harvest", "rainy"],
        "winter": ["winter", "snow", "christmas", "cold", "freeze"]
    }

    # Replace punctuation by spaces, then collect the distinct lower-cased words.
    words = set(re.sub(r'\W+', ' ', text).lower().split())

    return [
        season
        for season, keywords in season_keywords.items()
        if any(keyword in words for keyword in keywords)
    ]

# Convert the 'food_description' column to string type so detect_season can
# run its regex on every row (missing descriptions become text as well).
df_merged['food_description'] = df_merged['food_description'].astype(str)

# Tag every food with the seasons mentioned in its description
df_merged['seasons'] = df_merged['food_description'].apply(detect_season)

In [None]:
# save final result (all tags, including 'seasons') as CSV;
# the path is relative to the current working directory
df_merged.to_csv("./data/processed/food_data_final.csv")

In [None]:
df_merged.columns

Index(['food_id', 'food_name', 'food_description', 'ingredients', 'nutrition',
       'created_time', 'data_source', 'cuisine', 'course', 'diet', 'id',
       'state', 'region', 'veg_or_non_veg', 'ingredients_clean', 'allergies',
       'dietary_restrictions', 'seasons'],
      dtype='object')

In [None]:
gbl_db_conn.close()

## finally save integrated data into Database

In [None]:
# The earlier connection was closed above, so open a fresh one; it is used by
# food_data_save_to_db for the inserts and by the validation query afterwards.
gbl_db_conn = DatabaseAccess(DB_CONFIG).getConnection()

def food_data_save_to_db(item, conn=None):
    """Insert one integrated food record into ``public.food_data`` and commit it.

    The insert is best-effort: a failure is printed and the record is skipped
    rather than raising. After a failure the transaction is rolled back so
    that later inserts on the same connection are not rejected because of the
    aborted transaction.

    Parameters
    ----------
    item : object
        Record exposing the attributes food_id, food_name, food_description,
        ingredients, ingredients_clean, nutrition, veg_or_non_veg, allergies,
        cuisine, course, diet, state, region, seasons, dietary_restrictions
        and data_source (for example one row of ``df_merged``).
    conn : DB-API connection, optional
        Connection to write to. Defaults to the module-level ``gbl_db_conn``.

    Returns
    -------
    None
    """
    if conn is None:
        conn = gbl_db_conn

    # Stays None when cursor() itself fails, so the cleanup below cannot hit
    # an unbound variable and hide the real error.
    cur = None
    try:
        cur = conn.cursor()
        cur.execute("INSERT INTO public.food_data (food_id, food_name, food_description, ingredients, ingredients_clean, nutrition, veg_or_non_veg, allergies, cuisine, course, diet, state, region, seasons, dietary_restrictions, data_source) \
                                                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", \
                                                (item.food_id, item.food_name, item.food_description, item.ingredients, item.ingredients_clean, item.nutrition, item.veg_or_non_veg, item.allergies, item.cuisine, item.course, item.diet, item.state, item.region, item.seasons, item.dietary_restrictions, item.data_source))
        conn.commit()
    except Exception as error:
        print("Error inserting data: ", error)
        try:
            # Leave the failed transaction so the next insert can proceed.
            conn.rollback()
        except Exception as rollback_error:
            print("Error rolling back: ", rollback_error)
    finally:
        if cur is not None:
            cur.close()

    return

In [None]:
# store all data into DB: one INSERT and one commit per row.
# food_data_save_to_db returns nothing, so 'res' carries no useful values.
res = df_merged.apply(food_data_save_to_db, axis=1)

In [None]:
# read db and validate stored dataset: the row count should equal the number
# of rows in df_merged
df_food_data_sql = pd.read_sql_query('select * from "food_data"',con=gbl_db_conn)
print(df_food_data_sql.shape)
df_food_data_sql.head()

(238763, 17)


Unnamed: 0,food_id,food_name,food_description,ingredients,ingredients_clean,nutrition,veg_or_non_veg,allergies,cuisine,course,diet,state,region,seasons,dietary_restrictions,data_source,created_time
0,137739,arriba baked winter squash mexican style,autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...","[winter squash, mexican seasoning, mixed spice...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",veg,"[honey, milk, lactose, olive, squash, dairy]",,,,,,[autumn],"[low fat, low lactose]",raw_recipes,2023-06-17 15:32:27.077620
1,31490,a bit different breakfast pizza,this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...","[prepared pizza crust, sausage patty, egg, mil...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",non-veg,"[gluten, milk, lactose, dairy]",,,,,,[],"[low carb, low caffeine]",raw_recipes,2023-06-17 15:32:27.083003
2,112140,all in the kitchen chili,this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...","[ground beef, yellow onion, diced tomato, toma...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",non-veg,"[legume, ltp, milk, lactose, nightshade, dairy]",,,,,,"[autumn, winter]",[],raw_recipes,2023-06-17 15:32:27.083932
3,59389,alouette potatoes,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...","[spreadable cheese with garlic and herb, new p...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",veg,"[hypersensitivity, milk, lactose, olive, potat...",,,,,,[],[],raw_recipes,2023-06-17 15:32:27.084828
4,44061,amish tomato ketchup for canning,my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...","[tomato juice, apple cider vinegar, sugar, sal...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",veg,"[nightshade, oral, sugar]",,,,,,[],[low fat],raw_recipes,2023-06-17 15:32:27.085716


In [None]:
gbl_db_conn.close()

In [None]:
#df_merged['seasons'] = np.nan
#df_merged["seasons"] = df_merged["seasons"].apply(lambda x: list([]) if x is np.nan else x)