# Food Recall with GenAI

In [None]:
# data
import pandas as pd
import re

# json
import json
from flatten_json import flatten

# web content
import requests
from bs4 import BeautifulSoup

# Openai and langchain
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage

In [5]:
df = pd.read_xml('./data/recalls_2024.xml')
print(df.shape)

(82, 8)


In [6]:
df.head()

Unnamed: 0,Brand,Company,Date,ProductDescription,ProductType,Reason,Url,Terminated
0,Great Value,"John B Sanfilippo & Son, Inc.",03/13/2024,Honey Roasted Cashews,Food & Beverages,Undeclared milk and coconut allergens,https://www.fda.gov/safety/recalls-market-with...,
1,Par Pharmaceutical,"Endo International, Par Pharmaceutical",03/12/2024,Treprostinil 20mg/20mL Injection,Drugs,Potential Presence of Silicone Particulate Matter,https://www.fda.gov/safety/recalls-market-with...,
2,Wesco Fresh,"Wesco, Inc.",03/11/2024,Mint No Bake Cookies,Food & Beverages,Undeclared Peanut Allergen,https://www.fda.gov/safety/recalls-market-with...,
3,KALO,"KALO Foods, LLC",03/08/2024,Single Slices of Carrot Cake & Chocolate Cake,Food & Beverages,Potential or Undeclared Allergen - Soy,https://www.fda.gov/safety/recalls-market-with...,
4,La Fiesta,La Fiesta Food Products,03/08/2024,Ground Cinnamon,Food & Beverages,Potential Metal Contaminant - Lead,https://www.fda.gov/safety/recalls-market-with...,


In [7]:
# Get first url of df
url = df['Url'][0]
print(url)

https://www.fda.gov/safety/recalls-market-withdrawals-safety-alerts/john-b-sanfilippo-son-inc-issues-allergy-alert-undeclared-coconut-and-milk-great-value-honey-roasted


# Get data from the webpage

In [9]:
# Get the content of the url
response = requests.get(url)
print(response.text)

<!DOCTYPE html>
<html  lang="en" dir="ltr" prefix="og: https://ogp.me/ns#">
  <head>
    <meta charset="utf-8" />
<script async src="https://www.googletagmanager.com/gtag/js?id=G-273DTKB5QW"></script>
<meta name="description" content="John B. Sanfilippo &amp; Son, Inc (JBSS) announced today it is voluntarily recalling a limited amount of 8.25 oz Great Value Honey Roasted Cashews, because it may contain undeclared coconut and milk. People who have an allergy or severe sensitivity to coconut or milk run the risk of serious or life-thre" />
<meta name="dcterms.title" content="John B. Sanfilippo &amp; Son, Inc Issues Allergy Alert on Undeclared Coconut and Milk in Great Value Honey Roasted Cashews 8.25 Oz" />
<meta name="dcterms.creator" content="Office of Regulatory Affairs" />
<meta name="dcterms.description" content="John B. Sanfilippo &amp; Son, Inc (JBSS) announced today it is voluntarily recalling a limited amount of 8.25 oz Great Value Honey Roasted Cashews, because it may contain u

In [10]:
# Clean of html tags
soup = BeautifulSoup(response.text, 'html.parser')
text = soup.get_text()
print(repr(text))



In [11]:
def get_text_from_url(url):
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text()
    
    # # Drop newlines and replace multiple spaces with single space
    for pattern in [r'\n', r'\s+']:
        text = re.sub(pattern, ' ', text)

    # Drop text before the first occurrence of 'Summary'
    text = text[text.find('Summary'):]

    return text

# Test the function
text = get_text_from_url(df.Url[0])

# Show text with repr to see newlines and spaces
print(repr(text))
print(len(text))

"Summary Company Announcement Date: March 12, 2024 FDA Publish Date: March 13, 2024 Product Type: Food & Beverages Reason for Announcement: Recall Reason Description Undeclared milk and coconut allergens Company Name: John B Sanfilippo & Son, Inc. Brand Name: Brand Name(s) Great Value Product Description: Product Description Honey Roasted Cashews Company Announcement FOR IMMEDIATE RELEASE – ELGIN, IL - MARCH 12, 2024 - John B. Sanfilippo & Son, Inc (JBSS) announced today it is voluntarily recalling a limited amount of 8.25 oz Great Value Honey Roasted Cashews, because it may contain undeclared coconut and milk. People who have an allergy or severe sensitivity to coconut or milk run the risk of serious or life-threatening allergic reactions if they consume this product from the impacted best if used by date listed below. Potential affected products include the following details: Description: Great Value Honey Roasted Cashews Best if used by Jul 08 2025 GH2 (located on the bottom of the 

In [14]:
# Keys for prompt
cols = ['Recall Summary', 'Company Name', 'Announcement Date', 'FDA Publish Date', 'Product Type', 'Brand Name', 'Product Description', 'Reason for Recall', 'Description', 'Best if used by', 'UPC', 'Packaging', 'Distribution', 'Impact', 'Recall Trigger', 'Consumer Advisory', 'Contact Information']

In [15]:
# Initialize chat model
model = init_chat_model("gpt-4o-mini", model_provider="openai")
messages = [
    SystemMessage(f"Summarize the recall information provided which was scraped from the recall notification webpage. Ignore text irrelevant to the recall. Return as JSON object with keys: {cols}. Do not nest any keys, it should be flat. Do not output ```json at the beginning of the output or ``` at the end."),
    HumanMessage(text),
]

In [16]:
output = model.invoke(messages)

In [17]:
# Print each part of langchain_core.messages.ai.AIMessage on a new line
for message in output:
    print(message[1])
    break

{
  "Recall Summary": "John B. Sanfilippo & Son, Inc. is voluntarily recalling a limited amount of 8.25 oz Great Value Honey Roasted Cashews due to undeclared coconut and milk allergens.",
  "Company Name": "John B Sanfilippo & Son, Inc.",
  "Announcement Date": "March 12, 2024",
  "FDA Publish Date": "March 13, 2024",
  "Product Type": "Food & Beverages",
  "Brand Name": "Great Value",
  "Product Description": "Honey Roasted Cashews",
  "Reason for Recall": "Undeclared milk and coconut allergens",
  "Description": "Product may contain undeclared coconut and milk, posing a risk for individuals with allergies.",
  "Best if used by": "Jul 08 2025",
  "UPC": "078742133348",
  "Packaging": "8.25 oz plastic can with a blue wrap around label",
  "Distribution": "Distributed in select Walmart stores and via Walmart.com in various states.",
  "Impact": "No adverse reactions have been reported to date.",
  "Recall Trigger": "Consumer report of finding coconut cashews in honey roasted cashew con

In [20]:
list(output)[0]

('content',
 '{\n  "Recall Summary": "John B. Sanfilippo & Son, Inc. is voluntarily recalling a limited amount of 8.25 oz Great Value Honey Roasted Cashews due to undeclared coconut and milk allergens.",\n  "Company Name": "John B Sanfilippo & Son, Inc.",\n  "Announcement Date": "March 12, 2024",\n  "FDA Publish Date": "March 13, 2024",\n  "Product Type": "Food & Beverages",\n  "Brand Name": "Great Value",\n  "Product Description": "Honey Roasted Cashews",\n  "Reason for Recall": "Undeclared milk and coconut allergens",\n  "Description": "Product may contain undeclared coconut and milk, posing a risk for individuals with allergies.",\n  "Best if used by": "Jul 08 2025",\n  "UPC": "078742133348",\n  "Packaging": "8.25 oz plastic can with a blue wrap around label",\n  "Distribution": "Distributed in select Walmart stores and via Walmart.com in various states.",\n  "Impact": "No adverse reactions have been reported to date.",\n  "Recall Trigger": "Consumer report of finding coconut cashew

In [None]:
# Get the generated text
gen_text = str(list(output)[0][1])

# Convert to json
result_json = json.loads(gen_text)

# Flatten the json
result_json = flatten(result_json)

# Create a dataframe
result_df = pd.DataFrame(result_json, index=[0])

# Ensure columns match keys
result_df = result_df[cols]

# Print transposed results
result_df.T

Unnamed: 0,0
Recall Summary,"John B. Sanfilippo & Son, Inc. is voluntarily ..."
Company Name,"John B Sanfilippo & Son, Inc."
Announcement Date,"March 12, 2024"
FDA Publish Date,"March 13, 2024"
Product Type,Food & Beverages
Brand Name,Great Value
Product Description,Honey Roasted Cashews
Reason for Recall,Undeclared milk and coconut allergens
Description,Product may contain undeclared coconut and mil...
Best if used by,Jul 08 2025


In [None]:
for url in df.Url.to_list():
    text = get_text_from_url(url)
    messages = [
        SystemMessage(f"Summarize the recall information provided which was scraped from the recall notification webpage. Ignore text irrelevant to the recall. Return as flattened JSON object with keys: {keys}. Do not output ```json at the beginning of the output or ``` at the end."),
        HumanMessage(text),
    ]
    output = model.invoke(messages)
    gen_text = str(list(output)[0][1])
    result_json = json.loads(gen_text)
    result_json = flatten_json(result_json)
    result_df = pd.concat([result_df, pd.DataFrame(result_json, index=[0])], ignore_index=True)

In [None]:
print(list(output)[0][1])

**Recall Summary:**

- **Company Name:** John B Sanfilippo & Son, Inc.
- **Announcement Date:** March 12, 2024
- **FDA Publish Date:** March 13, 2024
- **Product Type:** Food & Beverages
- **Brand Name:** Great Value
- **Product Description:** Honey Roasted Cashews
- **Reason for Recall:** Undeclared milk and coconut allergens
- **Affected Product Details:**
  - **Description:** Great Value Honey Roasted Cashews
  - **Best if used by:** July 08, 2025
  - **UPC:** 078742133348
  - **Packaging:** 8.25 oz plastic can with a blue wrap-around label
- **Distribution:** Sold in select Walmart stores across various states and via Walmart.com
- **Impact:** Risk of serious or life-threatening allergic reactions for individuals allergic to coconut or milk.
- **Recall Trigger:** Consumer report of finding coconut cashews in a container labeled as honey roasted cashews due to incorrect labeling during manufacturing.
- **Consumer Advisory:** Do not consume the product. Discard it or return it to Wal

In [22]:
text_str = """
**Recall Summary:**

- **Company Name:** John B Sanfilippo & Son, Inc.
- **Announcement Date:** March 12, 2024
- **FDA Publish Date:** March 13, 2024
- **Product Type:** Food & Beverages
- **Brand Name:** Great Value
- **Product Description:** Honey Roasted Cashews
- **Reason for Recall:** Undeclared milk and coconut allergens
- **Affected Product Details:**
  - **Description:** Great Value Honey Roasted Cashews
  - **Best if used by:** July 08, 2025
  - **UPC:** 078742133348
  - **Packaging:** 8.25 oz plastic can with a blue wrap-around label
- **Distribution:** Sold in select Walmart stores across various states and via Walmart.com
- **Impact:** Risk of serious or life-threatening allergic reactions for individuals allergic to coconut or milk.
- **Recall Trigger:** Consumer report of finding coconut cashews in a container labeled as honey roasted cashews due to incorrect labeling during manufacturing.
- **Consumer Advisory:** Do not consume the product. Discard it or return it to Walmart for a full refund.
- **Contact Information:** John B. Sanfilippo & Son, Inc. - 1-800-874-8734 (Monday - Friday, 8:00am – 5:00pm CDT) or via email at info@jbssinc.com.
"""

In [None]:
keys = re.findall(r'(?<=\*\*)(.*?)(?=[:])',text_str)[:-1]
print(keys)

['Recall Summary', 'Company Name', 'Announcement Date', 'FDA Publish Date', 'Product Type', 'Brand Name', 'Product Description', 'Reason for Recall', 'Affected Product Details', 'Description', 'Best if used by', 'UPC', 'Packaging', 'Distribution', 'Impact', 'Recall Trigger', 'Consumer Advisory', 'Contact Information']
