# Web Scraping Recipes from websites

In [2]:
!pip install openai

Collecting openai
  Downloading openai-1.97.1-py3-none-any.whl.metadata (29 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.10.0-cp312-cp312-win_amd64.whl.metadata (5.3 kB)
Collecting tqdm>4 (from openai)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Downloading openai-1.97.1-py3-none-any.whl (764 kB)
   ---------------------------------------- 0.0/764.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/764.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/764.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/764.4 kB ? eta -:--:--
   --------------------------- ------------ 524.3/7


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import openai

# Set your OpenAI API key here
import os
openai.api_key = os.getenv("OPENAI_API_KEY")

def chat_with_gpt(messages, model="gpt-3.5-turbo"):

    response = openai.api_key.chat.completions.create(
        model=model,
        messages=messages
    )
    return response.choices[0].message['content']


In [11]:
import pandas as pd
import requests
import time

# Load the recipeData.csv into a DataFrame
df = pd.read_csv("recipeData.csv")

# Function to scrape a recipe URL and extract the main text (just use requests, no BeautifulSoup)
import random

def scrape_recipe_text(url):
    try:
        # Rotate user agents and add some headers to look more like a real browser
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1"
        ]
        headers = {
            "User-Agent": random.choice(user_agents),
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Connection": "keep-alive",
            "Referer": "https://www.google.com/"
        }
        # Add a small random delay to mimic human browsing
        time.sleep(random.uniform(1.2, 3.5))
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return ""
        # Just return the raw text content of the page
        return response.text
    except Exception as e:
        return ""



In [13]:
# Scrape the first row and print the output

first_row = df.iloc[0]
url = first_row['recipe_url']

if pd.isna(url) or not isinstance(url, str) or not url.startswith("http"):
    print("No valid URL in the first row.")
else:
    recipe_text = scrape_recipe_text(url)
    print("Scraped recipe text (truncated to 1000 chars):\n")
    print(recipe_text)  # Print only the first 1000 characters for readability


Scraped recipe text (truncated to 1000 chars):

q��ܧ�|���K�ǉ6D�R/d\����i��h(D"��♌�йD�!R�x&`��'��0��W���B����
�忲^���4�hm�>����.X�=�jV�.(��P�X�i<��Q���2�ӍmX3��Yv�֧2��X�|�B<o���m3:�f��rz�Rd1����F4���qR��1���x#M�-�揱�[���$��x3e�&X�v�w�9՟-F�dNX�Ey|�^�I���l�J��7�-/�>z�[��E�ʡ^�\"x�ϝi"4*#�����F�K��vz˛��l�HK�]��4���Y�ޙU�P�HȴԚBgE�V�y`��0�R��@�ik�_�k���f�k?*��;���\;���a�k������l��z�W�՚.�t����x2$4�A��`.�o�
͔��Ә����k���(╌�D�!4Od,o��� QY�>JE�/�*m}S�&P$
��o�eيH�p(����ҕ"aEY^F�U�f���%�-V1���,+�G��bŊ4OiS(V��kJ����4@K�3�&V�ƐQc/y��H�Zb��&�܋��^�A�|_�u�F�!��d���=��B_�����RO��z�ph3ׅ�qx8��Z����0�	�3X8�4��Mg�W8؁�U��ݟ�.�
5<p<NSz�Q��¹w=���]?vE�.ݶ�G�J6^,-��P��������yMw���/�} 'p�H�U������� %�8w!(����/��C�{>택��M@y���������hɹ[Z��hO���`f�hȥU(X�f�]g�������]��u�3m4��,��J�P�.A������f\k�f������u�i4ׄD�:�d����t����S�ekm�>U�^�S�J�Qkɾ���Kv��ՒV!W�G�^��#Yy����>���ÒV�\�<��U
Y?������Tr�e��V��`��ך%�fi��8����24&:��ݻ��i�ӊ0�P>���ѧ���%