In [1]:
import requests
import time
import os
import json
import string
import sqlite3

# USDA FoodData Central API key, read from a local file so the secret is not
# stored in the notebook. The file is opened in a `with` block so the handle
# is closed right away; the lowercase `api_key` alias is kept as before.
with open("API_KEY.txt") as _key_file:
    API_KEY = api_key = _key_file.read().strip()

# API endpoints and the web page base for a single food's details.
BASE_URL = "https://api.nal.usda.gov/fdc/v1/foods/list"
FDC_DETAIL_PAGE_BASE = "https://fdc.nal.usda.gov/fdc-app.html#/food-details"
BASE_SEARCH_URL = "https://api.nal.usda.gov/fdc/v1/foods/search"

# Paging and retry settings used by the scraper.
PAGE_SIZE = 200
MAX_PAGE_LIMIT = 250  # 50,000 results / 200 per page = 250 pages
SLEEP_DELAY = 0.5     # seconds between requests
RETRY_DELAY = 30      # seconds to wait on error retry
MAX_RETRIES = 3
MAX_DEPTH = 2         # Maximum additional characters to append for subdivision

# Local files: the task-queue checkpoint and the SQLite database.
CHECKPOINT_FILE = "checkpoint.json"
db_path = "branded_foods.db"

In [2]:
def create_tables(db_path: str):
    """
    Create the scraper's SQLite schema if it does not exist yet.

    Tables:
      - food_items: one row per food (keyed by fdc_id) with descriptive fields
        (marketCountry, foodCategory, modifiedDate, dataSource, servingSizeUnit,
        servingSize, householdServingFullText), the four macro values and the
        derived pct_protein_cal.
      - food_nutrients: one row per nutrient record of a food.

    An index on food_nutrients(fdc_id) is also created so that per-food
    lookups and deletes do not scan the whole table.

    Args:
        db_path: Path of the SQLite database file (created if missing).

    The function is idempotent: every statement uses IF NOT EXISTS.
    """
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        # Main table: store basic item info and key nutrient macros plus additional fields.
        cur.execute('''
            CREATE TABLE IF NOT EXISTS food_items (
                fdc_id INTEGER PRIMARY KEY,
                description TEXT,
                brand_owner TEXT,
                gtin_upc TEXT,
                published_date TEXT,
                market_country TEXT,
                food_category TEXT,
                modified_date TEXT,
                data_source TEXT,
                serving_size_unit TEXT,
                serving_size REAL,
                household_serving_full_text TEXT,
                protein_g REAL,
                fat_g REAL,
                carbs_g REAL,
                calories_kcal REAL,
                pct_protein_cal REAL
            )
        ''')

        # Secondary table: store all nutrient records.
        cur.execute('''
            CREATE TABLE IF NOT EXISTS food_nutrients (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                fdc_id INTEGER,
                nutrient_id INTEGER,
                nutrient_name TEXT,
                amount REAL,
                unit_name TEXT,
                derivation_code TEXT,
                derivation_description TEXT,
                FOREIGN KEY(fdc_id) REFERENCES food_items(fdc_id)
            )
        ''')

        # SQLite does not index foreign-key columns automatically.
        cur.execute('''
            CREATE INDEX IF NOT EXISTS idx_food_nutrients_fdc_id
            ON food_nutrients (fdc_id)
        ''')

        conn.commit()
    finally:
        # Close the connection even when a statement above raises.
        conn.close()

In [3]:
def insert_branded_item(db_path: str, item: dict):
    """
    Insert (or replace) one Branded food item and its nutrient records.

    - Reads the macros from the item's 'foodNutrients' entries by 'nutrientId':
      1003 (protein), 1004 (fat), 1005 (carbohydrate) and 1008 (energy), taking
      the amount from each entry's 'value' key. A missing nutrient, or one whose
      value is None, leaves the macro at its default of 0.
    - When energy is missing or zero but at least one macro is non-zero,
      calories are estimated as protein*4 + fat*9 + carbs*4.
    - pct_protein_cal is the share of calories coming from protein, in percent
      (0 when protein or calories are zero).
    - The food_items row is written with INSERT OR REPLACE. Nutrient rows
      already stored for the same fdc_id are deleted first, so re-inserting an
      item does not accumulate duplicate food_nutrients rows.

    Args:
        db_path: Path of the SQLite database; the tables must already exist.
        item: One food dict as found in the 'foods' list of a search response.
    """
    # nutrientId -> food_items column it is mirrored onto.
    macro_columns = {
        1003: "protein_g",
        1004: "fat_g",
        1005: "carbs_g",
        1008: "calories_kcal",
    }
    macros = dict.fromkeys(macro_columns.values(), 0)

    fdc_id = item.get("fdcId")

    # Tolerate a missing/None nutrient list and skip entries that are not dicts.
    nutrients = [
        nut for nut in (item.get("foodNutrients") or [])
        if isinstance(nut, dict)
    ]

    for nut in nutrients:
        column = macro_columns.get(nut.get("nutrientId"))
        value = nut.get("value")
        if column is not None and value is not None:
            macros[column] = value

    protein_g = macros["protein_g"]
    fat_g = macros["fat_g"]
    carbs_g = macros["carbs_g"]
    calories_kcal = macros["calories_kcal"]

    # Estimate calories from the macros when the energy value is absent/zero.
    if not calories_kcal and (protein_g or fat_g or carbs_g):
        calories_kcal = protein_g * 4 + fat_g * 9 + carbs_g * 4

    pct_protein_cal = 0
    if protein_g and calories_kcal > 0:
        pct_protein_cal = (protein_g * 4 / calories_kcal) * 100

    item_row = (
        fdc_id,
        item.get("description"),
        item.get("brandOwner"),
        item.get("gtinUpc"),
        item.get("publishedDate"),
        item.get("marketCountry"),
        item.get("foodCategory"),
        item.get("modifiedDate"),
        item.get("dataSource"),
        item.get("servingSizeUnit"),
        item.get("servingSize"),
        item.get("householdServingFullText"),
        protein_g,
        fat_g,
        carbs_g,
        calories_kcal,
        pct_protein_cal,
    )

    nutrient_rows = [
        (
            fdc_id,
            nut.get("nutrientId"),
            nut.get("nutrientName"),
            nut.get("value"),
            nut.get("unitName"),
            nut.get("derivationCode"),
            nut.get("derivationDescription"),
        )
        for nut in nutrients
    ]

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        # Keeps the per-item DELETE below from scanning the whole table;
        # a no-op once the index exists.
        cur.execute('''
            CREATE INDEX IF NOT EXISTS idx_food_nutrients_fdc_id
            ON food_nutrients (fdc_id)
        ''')

        # Insert the main food item record into food_items.
        cur.execute('''
            INSERT OR REPLACE INTO food_items (
                fdc_id,
                description,
                brand_owner,
                gtin_upc,
                published_date,
                market_country,
                food_category,
                modified_date,
                data_source,
                serving_size_unit,
                serving_size,
                household_serving_full_text,
                protein_g,
                fat_g,
                carbs_g,
                calories_kcal,
                pct_protein_cal
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', item_row)

        # Replace, rather than append to, this food's nutrient records.
        cur.execute('DELETE FROM food_nutrients WHERE fdc_id = ?', (fdc_id,))
        cur.executemany('''
            INSERT INTO food_nutrients (
                fdc_id,
                nutrient_id,
                nutrient_name,
                amount,
                unit_name,
                derivation_code,
                derivation_description
            ) VALUES (?, ?, ?, ?, ?, ?, ?)
        ''', nutrient_rows)

        conn.commit()
    finally:
        # Uncommitted work is discarded if anything above raised.
        conn.close()

In [4]:
def load_checkpoint():
    """
    Read the saved task queue from CHECKPOINT_FILE.

    Returns:
        The decoded JSON content of the checkpoint, or None when the file does
        not exist or could not be read/parsed (the error is printed).
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return None

    try:
        with open(CHECKPOINT_FILE, "r") as checkpoint:
            queue = json.load(checkpoint)
            print("Checkpoint loaded with", len(queue), "tasks.")
            return queue
    except Exception as err:
        # Best effort: report the problem and fall through to "no checkpoint".
        print("Error loading checkpoint:", err)

    return None

In [5]:
def save_checkpoint(tasks):
    """
    Save the current task queue to the checkpoint file atomically.

    The queue is first written to a temporary sibling file and then moved over
    CHECKPOINT_FILE with os.replace, so an interrupt or a serialization error
    in the middle of the write leaves the previous checkpoint intact instead
    of a truncated file.

    Args:
        tasks: JSON-serializable task queue (a list of task dicts).

    Raises:
        TypeError / ValueError: if `tasks` cannot be serialized as JSON.
        OSError: if the file cannot be written or replaced.
    """
    tmp_path = CHECKPOINT_FILE + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(tasks, f)
    os.replace(tmp_path, CHECKPOINT_FILE)

In [6]:
def fetch_page(search_term, page_number):
    """
    Fetch one page of Branded search results with a POST to BASE_SEARCH_URL.

    Any exception (network error, HTTP error status, invalid JSON) counts as a
    failed attempt. Up to MAX_RETRIES attempts are made, waiting RETRY_DELAY
    seconds between them.

    The API key travels in the query string, so exception messages that quote
    the request URL would print it; the key is masked before printing.

    Args:
        search_term: Text sent as 'generalSearchInput'.
        page_number: Page to request.

    Returns:
        The decoded JSON response (dict), or None if every attempt failed.
    """
    body = {
        "generalSearchInput": search_term,
        "dataType": ["Branded"],
        "pageNumber": page_number,
        "pageSize": PAGE_SIZE,
        "sortBy": "fdcId",
        "sortOrder": "asc"
    }
    params = {"api_key": API_KEY}

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.post(BASE_SEARCH_URL, json=body, params=params, timeout=30)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            message = str(e)
            if API_KEY:
                # Keep the secret out of notebook output and logs.
                message = message.replace(API_KEY, "<API_KEY>")
            print(f"Error fetching page {page_number} for term '{search_term}': {message}")
            if attempt < MAX_RETRIES:
                print(f"Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
    return None

In [7]:
def subdivide_task(task):
    """
    Split a task into one child task per lowercase letter and digit.

    Each child's search term is the parent's term with one character appended,
    its depth is the parent's depth plus one, and it starts fresh at page 1
    with an unknown page count.

    Args:
        task: Task dict providing 'search_term' and 'depth'.

    Returns:
        list[dict]: The child tasks, letters first, then digits.
    """
    prefix = task["search_term"]
    child_depth = task["depth"] + 1
    return [
        {
            "search_term": prefix + suffix,
            "depth": child_depth,
            "current_page": 1,
            "total_pages": None,
            "completed": False
        }
        for suffix in string.ascii_lowercase + string.digits
    ]

In [15]:
def process_task(task):
    """
    Process a single task:
      - If total_pages is not set, fetch the task's current page to determine it.
        That response is reused for the first loop iteration instead of being
        requested a second time.
      - If total_pages exceeds MAX_PAGE_LIMIT and depth is below MAX_DEPTH, subdivide.
      - Otherwise, paginate from current_page and insert every item.
        Pagination never goes past MAX_PAGE_LIMIT: a task that is still too
        broad at MAX_DEPTH is truncated (with a printed warning) rather than
        requesting pages beyond the limit, which would fail and cause the task
        to be re-queued over and over.

    The task dict is updated in place ('total_pages', 'current_page',
    'completed') so that progress can be checkpointed by the caller.

    Returns:
      - False if processing failed (so the task can be retried)
      - A list of new tasks if the task was subdivided
      - True if the task was processed successfully.
    """
    search_term = task["search_term"]
    current_page = task["current_page"]
    prefetched = None  # response already fetched while determining total_pages

    # Determine total_pages if not set.
    if task["total_pages"] is None:
        prefetched = fetch_page(search_term, current_page)
        if prefetched is None:
            return False  # Could not fetch; do not update task.
        total_pages = prefetched.get("totalPages", 0)
        task["total_pages"] = total_pages
        print(f"Task '{search_term}' has {total_pages} pages at depth {task['depth']}")
    else:
        total_pages = task["total_pages"]

    # If too many pages, subdivide further (if depth allows).
    if total_pages > MAX_PAGE_LIMIT and task["depth"] < MAX_DEPTH:
        print(f"Task '{search_term}' at depth {task['depth']} too broad with {total_pages} pages; subdividing.")
        new_tasks = subdivide_task(task)
        task["completed"] = True  # Mark this task as subdivided.
        return new_tasks

    # Cannot subdivide any further: stay within the page limit.
    last_page = total_pages
    if total_pages > MAX_PAGE_LIMIT:
        print(f"Task '{search_term}' has {total_pages} pages at maximum depth {task['depth']}; "
              f"only the first {MAX_PAGE_LIMIT} pages will be fetched.")
        last_page = MAX_PAGE_LIMIT

    # Process pages.
    while current_page <= last_page:
        if prefetched is not None:
            data, prefetched = prefetched, None
        else:
            data = fetch_page(search_term, current_page)
            if data is None:
                print(f"Skipping page {current_page} for term '{search_term}' after retries.")
                return False  # Let the task remain for retry.
        foods = data.get("foods", [])
        print(f"Processing term '{search_term}', page {current_page}/{total_pages} with {len(foods)} items.")
        for item in foods:
            insert_branded_item(db_path, item)
        current_page += 1
        task["current_page"] = current_page
        time.sleep(SLEEP_DELAY)
    task["completed"] = True
    return True


In [16]:
def ScrapeUSDA():
    """
    Run the scrape: create the tables, then work through the task queue.

    The queue is loaded from the checkpoint; when there is none (or it is
    empty) it is seeded with one depth-0 task per lowercase letter and digit.
    After each processed task the queue is written back to the checkpoint.

    - A task that fails is moved to the back of the queue. When every task
      still in the queue has failed in a row, the run stops instead of
      retrying forever; the checkpoint keeps those tasks for a later run.
    - If processing is interrupted (e.g. KeyboardInterrupt) or raises, the
      in-flight task is put back at the front of the queue and the checkpoint
      is saved before the exception propagates, so its page progress is kept.
    """
    create_tables(db_path)

    # Load checkpoint from file; if empty, initialize with seed tasks.
    task_queue = load_checkpoint()
    if not task_queue:
        task_queue = [
            {
                "search_term": char,
                "depth": 0,
                "current_page": 1,
                "total_pages": None,
                "completed": False
            }
            for char in string.ascii_lowercase + string.digits
        ]
        save_checkpoint(task_queue)

    consecutive_failures = 0

    # Process tasks until the queue is empty.
    while task_queue:
        task = task_queue.pop(0)
        if task.get("completed"):
            continue

        try:
            result = process_task(task)
        except BaseException:
            # process_task updates the task in place; persist that progress.
            task_queue.insert(0, task)
            save_checkpoint(task_queue)
            raise

        if result is False:
            # Processing failed; put task back for a later retry.
            task_queue.append(task)
            consecutive_failures += 1
        else:
            consecutive_failures = 0
            if isinstance(result, list):
                # Task was subdivided; add new tasks.
                task_queue.extend(result)

        # Save checkpoint after processing each task.
        save_checkpoint(task_queue)

        if result is False and consecutive_failures >= len(task_queue):
            print(f"Scraping stopped: all {len(task_queue)} remaining tasks failed. "
                  "Re-run ScrapeUSDA() to resume from the checkpoint.")
            return

    print("Scraping complete.")


In [18]:
# Run the scrape. ScrapeUSDA() loads the task queue from CHECKPOINT_FILE when
# one exists and saves it after each processed task, so this cell can be
# re-run after an interruption.
ScrapeUSDA()

Task 'a' has 1 pages at depth 0
Processing term 'a', page 1/1 with 1 items.
Task 'b' has 39 pages at depth 0
Processing term 'b', page 1/39 with 200 items.
Processing term 'b', page 2/39 with 200 items.
Processing term 'b', page 3/39 with 200 items.


KeyboardInterrupt: 

# Quick database check: items ranked by protein share of calories

In [14]:
# List every stored item whose protein share of calories is positive,
# highest share first.
conn = sqlite3.connect(db_path)
cur = conn.cursor()
rows = cur.execute('''
    SELECT fdc_id, description, protein_g, calories_kcal, pct_protein_cal
    FROM food_items
    WHERE pct_protein_cal > 0
    ORDER BY pct_protein_cal DESC
''').fetchall()
conn.close()

# The connection is already closed; `rows` holds the full result set.
for fdc_id, desc, prot, cals, pct in rows:
    print(f"FDC: {fdc_id}, Desc: {desc}, Protein_g: {prot}, Cals: {cals}, Pct_Protein_Cals: {pct:.2f}%")


# Query a single ID

In [18]:
import requests
import json

# No API key is embedded here: API_KEY is loaded from API_KEY.txt by the
# configuration cell at the top of the notebook. A key that has been committed
# in a notebook should be treated as exposed and rotated.
#
# The single-food endpoint gets its own name so that the BASE_URL defined in
# the configuration cell (the /foods/list endpoint) is not overwritten.
FOOD_DETAIL_URL = "https://api.nal.usda.gov/fdc/v1/food/{}"

def query_food_item(fdc_id):
    """
    Queries the USDA FoodData Central API for a single food item.

    Args:
        fdc_id (int): The FoodData Central ID.

    Returns:
        dict: The JSON response with the food item details.

    Raises:
        requests.exceptions.HTTPError: if the API answers with an error status.
        requests.exceptions.RequestException: on network failures, including
            the 30-second timeout.
    """
    url = FOOD_DETAIL_URL.format(fdc_id)
    params = {"api_key": API_KEY, "format": ['abridged']}
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()  # raise an exception for HTTP errors
    return response.json()

def main():
    """
    Prompt for an FDC ID, look it up and pretty-print the JSON response.

    Prints a message and returns early when the input is not an integer or
    when the lookup raises.
    """
    try:
        requested_id = int(input("Enter a USDA FDC ID: ").strip())
    except ValueError:
        print("Please enter a valid numeric FDC ID.")
        return

    try:
        details = query_food_item(requested_id)
    except Exception as fetch_error:
        print(f"Error fetching data: {fetch_error}")
    else:
        # Pretty-print the JSON details
        print("\n--- USDA Food Item Details ---\n")
        print(json.dumps(details, indent=2))

# Start the interactive lookup only when this code runs as __main__.
if __name__ == "__main__":
    main()


Enter a USDA FDC ID: 345355

--- USDA Food Item Details ---

{
  "fdcId": 345355,
  "description": "Honeysuckle White Boneless Turkey Roast with Gravy Packet",
  "dataType": "Branded",
  "publicationDate": "2019-04-01",
  "brandOwner": "Cargill",
  "gtinUpc": "00642205513826",
  "foodNutrients": [
    {
      "number": "203",
      "name": "Protein",
      "amount": 23.0,
      "unitName": "G",
      "derivationCode": "LCGE",
      "derivationDescription": "Given by information provider as an exact value per 100 unit measure"
    },
    {
      "number": "204",
      "name": "Total lipid (fat)",
      "amount": 2.5,
      "unitName": "G",
      "derivationCode": "LCGE",
      "derivationDescription": "Given by information provider as an exact value per 100 unit measure"
    },
    {
      "number": "205",
      "name": "Carbohydrate, by difference",
      "amount": 1.0,
      "unitName": "G",
      "derivationCode": "LCGE",
      "derivationDescription": "Given by information provider 

# Query single page

In [89]:
def fetch_foods_list_get(page_number, page_size=200, max_retries=3, retry_delay=30):
    """
    Queries a single page of branded food items from USDA using a GET request.
    Implements retry logic to catch server errors.

    The API key is part of the query string, and HTTP/connection error messages
    quote the request URL, so the key is masked before any error is printed.

    Args:
        page_number (int): The page number to fetch.
        page_size (int): Number of items per page (default is 200).
        max_retries (int): Number of retry attempts in case of error.
        retry_delay (int): Seconds to wait between retries.

    Returns:
        list: A list of food items (dictionaries) from the response, or an empty list if all attempts fail
        or the response has no "foods" key.
    """
    url = "https://api.nal.usda.gov/fdc/v1/foods/search"
    params = {
        "api_key": API_KEY,
        "dataType": "Branded",      # Restrict to Branded items
        "pageNumber": page_number,
        "pageSize": page_size,
        "sortBy": "fdcId",
        "sortOrder": "asc"
    }

    def _masked(error):
        """Return the error text with the API key replaced by a placeholder."""
        text = str(error)
        return text.replace(API_KEY, "<API_KEY>") if API_KEY else text

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()  # Will raise an HTTPError for 4xx/5xx responses
            data = response.json()
            if "foods" in data:
                return data["foods"]
            print(f"Unexpected response structure on page {page_number}: {data}")
            return []
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error on page {page_number} (attempt {attempt}): {_masked(http_err)}")
        except Exception as err:
            print(f"Error on page {page_number} (attempt {attempt}): {_masked(err)}")
        if attempt < max_retries:
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    print(f"Failed to fetch page {page_number} after {max_retries} attempts.")
    return []

# Example usage:
# Page 251 is one past MAX_PAGE_LIMIT (250) from the configuration cell; the
# output recorded for this cell shows HTTP 500 on every attempt for that page.
if __name__ == "__main__":
    page = 251
    foods = fetch_foods_list_get(page)
    print(f"Fetched {len(foods)} items from page {page}")

HTTP error on page 251 (attempt 1): 500 Server Error: Internal Server Error for url: https://api.nal.usda.gov/fdc/v1/foods/search?api_key=<REDACTED>&dataType=Branded&pageNumber=251&pageSize=200&sortBy=fdcId&sortOrder=asc
Retrying in 30 seconds...
HTTP error on page 251 (attempt 2): 500 Server Error: Internal Server Error for url: https://api.nal.usda.gov/fdc/v1/foods/search?api_key=<REDACTED>&dataType=Branded&pageNumber=251&pageSize=200&sortBy=fdcId&sortOrder=asc
Retrying in 30 seconds...
HTTP error on page 251 (attempt 3): 500 Server Error: Internal Server Error for url: https://api.nal.usda.gov/fdc/v1/foods/search?api_key=<REDACTED>&dataType=Branded&pageNumber=251&pageSize=200&sortBy=fdcId&sortOrder=asc
Failed to fetch page 251 after 3 attempts.
Fetched 0 items from page 251


# Fetch Food list by page

In [17]:
def fetch_foods_list(page_number: int, page_size: int = 200) -> list:
    """
    Fetch a page of Branded foods from USDA using the /foods/list endpoint.

    The endpoint URL is spelled out here instead of being read from the global
    BASE_URL: the "Query a single ID" cell reassigns BASE_URL to the
    single-food URL template, which would silently send this request to the
    wrong address when the notebook is run top to bottom.

    Args:
        page_number: Page to request.
        page_size: Number of items per page (default 200).

    Returns:
        The decoded JSON response: a list of dicts, each representing a food
        item. An empty list means no more items.

    Raises:
        requests.exceptions.HTTPError: if the API answers with an error status.
        requests.exceptions.RequestException: on network failures, including
            the 30-second timeout.
    """
    # The USDA /foods/list endpoint requires a POST with JSON body specifying pageSize, pageNumber, etc.
    url = "https://api.nal.usda.gov/fdc/v1/foods/list"
    params = {
        "api_key": API_KEY
    }
    # Body for the POST request
    body = {
        "pageNumber": page_number,
        "pageSize": page_size,
        "sortBy": "fdcId",
        "sortOrder": "asc",
        "dataType": ["Branded"]
    }

    # Same timeout as the other fetch helpers, so a stalled request cannot hang the cell.
    response = requests.post(url, params=params, json=body, timeout=30)
    response.raise_for_status()
    return response.json()  # List of food items (or empty if no more pages)



In [35]:
# Fetch page 250 (200 items per page) with fetch_foods_list and display the
# first item of the returned list.
json3 = fetch_foods_list(250,200)
json3[0]

{'fdcId': 1877488,
 'description': 'THORNTONS, IRRESISTIBLES TRUFFLES, GINGERBREAD, GINGERBREAD',
 'dataType': 'Branded',
 'publicationDate': '2021-07-29',
 'brandOwner': 'LBB Imports, LLC',
 'gtinUpc': '813715013124',
 'foodNutrients': [{'number': '203',
   'name': 'Protein',
   'amount': 4.76,
   'unitName': 'G',
   'derivationCode': 'LCCS',
   'derivationDescription': 'Calculated from value per serving size measure'},
  {'number': '204',
   'name': 'Total lipid (fat)',
   'amount': 31.0,
   'unitName': 'G',
   'derivationCode': 'LCCS',
   'derivationDescription': 'Calculated from value per serving size measure'},
  {'number': '205',
   'name': 'Carbohydrate, by difference',
   'amount': 54.8,
   'unitName': 'G',
   'derivationCode': 'LCCS',
   'derivationDescription': 'Calculated from value per serving size measure'},
  {'number': '208',
   'name': 'Energy',
   'amount': 500,
   'unitName': 'KCAL',
   'derivationCode': 'LCCS',
   'derivationDescription': 'Calculated from value per s

# Fetch Results By Search GET

In [20]:
def fetch_page_get(search_term, page_number):
    """
    Fetch a page of results for a given search term and page number using a GET request.

    Up to MAX_RETRIES attempts are made, waiting RETRY_DELAY seconds between
    them; any exception counts as a failed attempt. The API key is part of the
    query string, so it is masked in error messages before they are printed.

    Args:
        search_term (str): The search query.
        page_number (int): The page number to fetch.

    Returns:
        dict: The JSON response from the API, or None on failure.
    """
    url = "https://api.nal.usda.gov/fdc/v1/foods/search"
    params = {
        "api_key": API_KEY,
        "generalSearchInput": search_term,
        "dataType": "Branded",  # using only Branded items
        "pageNumber": page_number,
        "pageSize": PAGE_SIZE,
        "sortBy": "fdcId",
        "sortOrder": "asc"
    }

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            message = str(e)
            if API_KEY:
                # Keep the secret out of notebook output and logs.
                message = message.replace(API_KEY, "<API_KEY>")
            print(f"Error fetching page {page_number} for term '{search_term}' using GET (attempt {attempt}): {message}")
            if attempt < MAX_RETRIES:
                print(f"Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
    return None

In [24]:
# Search for the term 'ab' with fetch_page_get (page 1) and display the raw
# response dict.
json2 = fetch_page_get('ab', 1)
json2

{'totalHits': 155,
 'currentPage': 1,
 'totalPages': 1,
 'pageList': [1],
 'foodSearchCriteria': {'dataType': ['Branded'],
  'generalSearchInput': 'ab',
  'pageNumber': 1,
  'sortBy': 'fdcId',
  'sortOrder': 'asc',
  'numberOfResultsPerPage': 50,
  'pageSize': 200,
  'requireAllWords': False,
  'foodTypes': ['Branded']},
 'foods': [{'fdcId': 1031871,
   'description': 'SOUR RASPBERRY CANDY FISH, SOUR RASPBERRY',
   'dataType': 'Branded',
   'gtinUpc': '7350010277767',
   'publishedDate': '2020-06-26',
   'brandOwner': 'KOLSVART AB',
   'ingredients': 'SUGAR, INVERT SUGAR SYRUP, WATER,CORN STARCH, GLUCOSE SYRUP, CITRIC ACID, TARTARIC ACID, CARROT CONCENTRATE, NATURAL RASPBERRY AROMA, COCONUT OIL, CARNAUBA WAX',
   'marketCountry': 'United States',
   'foodCategory': 'Candy',
   'modifiedDate': '2020-04-08',
   'dataSource': 'LI',
   'servingSizeUnit': 'g',
   'servingSize': 30.0,
   'householdServingFullText': '6 PCS',
   'tradeChannels': ['NO_TRADE_CHANNEL'],
   'allHighlightFields': '