# Food Recalls

### Setup

In [1]:
# data
import pandas as pd
import re

# json
import json

# web content
import requests
from bs4 import BeautifulSoup

# Openai and langchain
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from dotenv import load_dotenv

In [2]:
# Load environment variables from .env file
# (presumably this is where the OpenAI API key for the chat model comes from — confirm against .env)
load_dotenv()

True

### Read in data

In [3]:
# Read the recall listing from the local XML file into a DataFrame
df = pd.read_xml('./data/recalls_2024.xml')

In [4]:
# Peek at the first two rows to check which columns were loaded
df.head(2)

Unnamed: 0,Brand,Company,Date,ProductDescription,ProductType,Reason,Url,Terminated
0,Great Value,"John B Sanfilippo & Son, Inc.",03/13/2024,Honey Roasted Cashews,Food & Beverages,Undeclared milk and coconut allergens,https://www.fda.gov/safety/recalls-market-with...,
1,Par Pharmaceutical,"Endo International, Par Pharmaceutical",03/12/2024,Treprostinil 20mg/20mL Injection,Drugs,Potential Presence of Silicone Particulate Matter,https://www.fda.gov/safety/recalls-market-with...,


### Inspect a recall URL

In [26]:
# Show the full URL of the second row (the table display above truncates it)
df.Url[1]

'https://www.fda.gov/safety/recalls-market-withdrawals-safety-alerts/par-pharmaceutical-issues-voluntary-nationwide-recall-one-lot-treprostinil-injection-due-potential'

### Extract and clean text

In [None]:
# Fetch the first recall page
url = df.Url[0]

# Use a timeout so a stalled connection cannot hang the kernel, and fail loudly
# on an HTTP error instead of parsing an error page further down
response = requests.get(url, timeout=30)
response.raise_for_status()

# Preview only the start of the HTML; the full page is long and bloats the notebook
print(response.text[:1000])

In [7]:
# Parse the fetched HTML with BeautifulSoup and strip the markup, keeping the text
soup = BeautifulSoup(markup=response.text, features='html.parser')
text = soup.get_text()

In [8]:
# Print the repr so newlines and other whitespace show up as escape sequences
print(repr(text))



In [25]:
# Collapse every run of whitespace (newlines included) into a single space
text = re.sub(r'\s+', ' ', text)

# Keep only the span from 'Summary' up to 'Company Contact Information'.
# str.find returns -1 when a marker is missing, which as a slice bound would
# silently start at / cut off the last character, so fall back to the text
# boundaries instead. This also makes the cell safe to re-run.
start = max(text.find('Summary'), 0)
end = text.find('Company Contact Information', start)
if end == -1:
    end = len(text)
text = text[start:end]
print(repr(text))

'Summary Company Announcement Date: March 12, 2024 FDA Publish Date: March 13, 2024 Product Type: Food & Beverages Reason for Announcement: Recall Reason Description Undeclared milk and coconut allergens Company Name: John B Sanfilippo & Son, Inc. Brand Name: Brand Name(s) Great Value Product Description: Product Description Honey Roasted Cashews Company Announcement FOR IMMEDIATE RELEASE – ELGIN, IL - MARCH 12, 2024 - John B. Sanfilippo & Son, Inc (JBSS) announced today it is voluntarily recalling a limited amount of 8.25 oz Great Value Honey Roasted Cashews, because it may contain undeclared coconut and milk. People who have an allergy or severe sensitivity to coconut or milk run the risk of serious or life-threatening allergic reactions if they consume this product from the impacted best if used by date listed below. Potential affected products include the following details: Description: Great Value Honey Roasted Cashews Best if used by Jul 08 2025 GH2 (located on the bottom of the 

### Structure data

In [None]:
# Initialize model (gpt-4o from openai)
model = init_chat_model("gpt-4o", model_provider="openai", temperature=0)

# cols to use
cols = ['company_announcement_date', 'fda_publish_date', 'product_type', 'reason_for_announcement', 'company_name', 'brand_name', 'product_description', 'recall_status', 'best_if_used_by', 'package_description', 'upc', 'distribution', 'consumer_contact_number', 'consumer_contact_email', 'no_adverse_reactions_reported', 'recall_initiation_reason', 'investigation_findings']

# One-shot example of the expected output, written out explicitly so this cell
# runs on a fresh kernel (it previously interpolated `results`, which is only
# defined by a later cell). Values are the first recall's parsed output.
example = {
    'company_announcement_date': 'March 12, 2024',
    'fda_publish_date': 'March 13, 2024',
    'product_type': 'Food & Beverages',
    'reason_for_announcement': 'Undeclared milk and coconut allergens',
    'company_name': 'John B Sanfilippo & Son, Inc.',
    'brand_name': 'Great Value',
    'product_description': 'Honey Roasted Cashews',
    'recall_status': '',
    'best_if_used_by': 'Jul 08 2025 GH2',
    'package_description': '8.25 oz plastic can with a blue wrap around label',
    'upc': '078742133348',
    'distribution': 'Select Walmart stores in AL, AR, CA, CO, CT, FL, GA, HI, IL, IN, KS, KY, LA, MA, MO, MS, NC, NE, NJ, NM, NV, NY, OH, OK, PA, SC, TN, TX, VT, WV and via Walmart.com',
    'consumer_contact_number': '1-800-874-8734',
    'consumer_contact_email': 'info@jbssinc.com',
    'no_adverse_reactions_reported': 'No adverse reactions have been reported to date.',
    'recall_initiation_reason': 'A consumer report of finding coconut cashews within a container labeled as honey roasted cashews.',
    # TODO(review): the saved notebook output truncates this value; paste the full text here
    'investigation_findings': '',
}

# Serialize with json.dumps so the example uses double quotes, matching the
# formatting rules given in the prompt
example_json = json.dumps(example)

# System prompt: instructions, one-shot example and output formatting rules
prompt = f"""

### Instructions
Summarize the recall information provided which was scraped from the recall notification webpage.
Ignore text irrelevant to the recall. Return as JSON object with keys: {cols}.

### Example
```{example_json}```

### Formatting
- Do not nest any keys, it should be flat.
- Do not output ```json at the beginning of the output or ``` at the end.
- Use " " for keys and values.
"""

# The system message carries the instructions; the human message carries the cleaned page text
messages = [SystemMessage(prompt), HumanMessage(text)]

# Generate response
response = model.invoke(messages)

In [31]:
# Parse the model's reply as JSON and display the resulting dict.
# json.loads raises if the reply is not valid JSON.
# NOTE(review): `results` is a single dict here but is rebound to a list by the batch loop further down.
results = json.loads(response.content)
results

{'company_announcement_date': 'March 12, 2024',
 'fda_publish_date': 'March 13, 2024',
 'product_type': 'Food & Beverages',
 'reason_for_announcement': 'Undeclared milk and coconut allergens',
 'company_name': 'John B Sanfilippo & Son, Inc.',
 'brand_name': 'Great Value',
 'product_description': 'Honey Roasted Cashews',
 'recall_status': '',
 'best_if_used_by': 'Jul 08 2025 GH2',
 'package_description': '8.25 oz plastic can with a blue wrap around label',
 'upc': '078742133348',
 'distribution': 'Select Walmart stores in AL, AR, CA, CO, CT, FL, GA, HI, IL, IN, KS, KY, LA, MA, MO, MS, NC, NE, NJ, NM, NV, NY, OH, OK, PA, SC, TN, TX, VT, WV and via Walmart.com',
 'consumer_contact_number': '1-800-874-8734',
 'consumer_contact_email': 'info@jbssinc.com',
 'no_adverse_reactions_reported': 'No adverse reactions have been reported to date.',
 'recall_initiation_reason': 'A consumer report of finding coconut cashews within a container labeled as honey roasted cashews.',
 'investigation_finding

In [20]:
# Recover the column names from the keys of the parsed result
cols = [key for key in results]
print(cols)

['company_announcement_date', 'fda_publish_date', 'product_type', 'reason_for_announcement', 'company_name', 'brand_name', 'product_description', 'recall_status', 'best_if_used_by', 'package_description', 'upc', 'distribution', 'consumer_contact_number', 'consumer_contact_email', 'no_adverse_reactions_reported', 'recall_initiation_reason', 'investigation_findings']


In [32]:
def extract_recall_text(html):
    """Return the recall announcement text contained in a recall page's HTML.

    Strips the markup, collapses all whitespace to single spaces and keeps the
    span from 'Summary' up to 'Company Contact Information'. When a marker is
    missing, the corresponding text boundary is used instead (str.find returns
    -1, which would otherwise act as a slice index into the last character).
    """
    page_text = BeautifulSoup(html, 'html.parser').get_text()
    page_text = re.sub(r'\s+', ' ', page_text)
    start = max(page_text.find('Summary'), 0)
    end = page_text.find('Company Contact Information', start)
    if end == -1:
        end = len(page_text)
    return page_text[start:end]


# Loop through all urls and collect one parsed result per recall page
fails = []          # urls that could not be fetched or whose reply was not valid JSON
fail_reasons = {}   # url -> repr of the error, kept for later inspection
results = []
for url in df.Url:
    # A single unreachable page should be recorded as a failure, not abort the whole run
    try:
        page = requests.get(url, timeout=30)
        page.raise_for_status()
    except requests.RequestException as err:
        fails.append(url)
        fail_reasons[url] = repr(err)
        continue

    text = extract_recall_text(page.text)
    messages = [SystemMessage(prompt), HumanMessage(text)]
    reply = model.invoke(messages)

    # Catch only parse failures; a bare except would also swallow KeyboardInterrupt
    try:
        result = json.loads(reply.content)
    except (json.JSONDecodeError, TypeError) as err:
        fails.append(url)
        fail_reasons[url] = repr(err)
        continue
    results.append(result)

In [33]:
# Report how many pages failed versus how many were parsed
print(f'Fails: {len(fails)}')
print(f'Results: {len(results)}')

Fails: 7
Results: 75


In [35]:
# Spot-check the first parsed result from the batch run
results[0]

{'company_announcement_date': 'March 12, 2024',
 'fda_publish_date': 'March 13, 2024',
 'product_type': 'Food & Beverages',
 'reason_for_announcement': 'Undeclared milk and coconut allergens',
 'company_name': 'John B Sanfilippo & Son, Inc.',
 'brand_name': 'Great Value',
 'product_description': 'Honey Roasted Cashews',
 'recall_status': '',
 'best_if_used_by': 'Jul 08 2025 GH2',
 'package_description': '8.25 oz plastic can with a blue wrap around label',
 'upc': '078742133348',
 'distribution': 'Select Walmart stores in AL, AR, CA, CO, CT, FL, GA, HI, IL, IN, KS, KY, LA, MA, MO, MS, NC, NE, NJ, NM, NV, NY, OH, OK, PA, SC, TN, TX, VT, WV and via Walmart.com',
 'consumer_contact_number': '1-800-874-8734',
 'consumer_contact_email': 'info@jbssinc.com',
 'no_adverse_reactions_reported': 'No adverse reactions have been reported to date.',
 'recall_initiation_reason': 'A consumer report of finding coconut cashews within a container labeled as honey roasted cashews.',
 'investigation_finding

In [36]:
# Create dataframe from results: one row per parsed result, one column per JSON key
df_results = pd.DataFrame(results)
df_results.head(2)

Unnamed: 0,company_announcement_date,fda_publish_date,product_type,reason_for_announcement,company_name,brand_name,product_description,recall_status,best_if_used_by,package_description,upc,distribution,consumer_contact_number,consumer_contact_email,no_adverse_reactions_reported,recall_initiation_reason,investigation_findings
0,"March 12, 2024","March 13, 2024",Food & Beverages,Undeclared milk and coconut allergens,"John B Sanfilippo & Son, Inc.",Great Value,Honey Roasted Cashews,,Jul 08 2025 GH2,8.25 oz plastic can with a blue wrap around label,078742133348,"Select Walmart stores in AL, AR, CA, CO, CT, F...",1-800-874-8734,info@jbssinc.com,No adverse reactions have been reported to date.,A consumer report of finding coconut cashews w...,A limited number of incorrect honey roasted ca...
1,"March 12, 2024","March 12, 2024",Drugs,Potential Presence of Silicone Particulate Matter,"Endo International, Par Pharmaceutical",Par Pharmaceutical,Treprostinil 20mg/20mL Injection,This recall is being conducted with the knowle...,04/2024,20mL multidose vials as sterile solutions in w...,NDC #42023-206-01,Nationwide to wholesalers and hospitals from J...,1-800-828-9393,,"To date, Par has not received any reports of a...",The product is being recalled due to the poten...,


In [37]:
# Save the structured results; index=False keeps the row index out of the CSV
df_results.to_csv('./data/recalls_2024_full_results.csv', index=False)