In [None]:
!pip install pandas
!pip install langchain_experimental
!pip install lxml
!pip install beautifulsoup4
!pip install python-dotenv

The function `parse_worksheet` takes the HTML of a worksheet page (`doc`), parses it with BeautifulSoup, and returns a dictionary called `activity` with four keys: `title`, `activity_contents`, `categories`, and `thumbnail`. Each value is extracted from the target website's page by finding elements with specific class names. If the page does not have the expected structure, the function prints an error message and returns `None`.

In [None]:
from bs4 import BeautifulSoup
import re

def _tag_texts(category_group):
  """Return the text of every tag body found inside one category group."""
  tag_bodies = category_group.find_all(class_=re.compile("Tag-module_tagBody"))
  return [tag.next_element.get_text() for tag in tag_bodies]


def parse_worksheet(doc):
  """Extract the title, contents, categories and thumbnail of a worksheet page.

  Args:
    doc: HTML markup of a worksheet page (anything BeautifulSoup accepts).

  Returns:
    A dict with the keys 'title', 'activity_contents', 'categories' (itself a
    dict with 'brand' and 'subject' lists) and 'thumbnail', or None when the
    page does not contain the expected elements. Failures are reported with
    print() instead of being raised.
  """
  # Name the parser explicitly: without it BeautifulSoup emits a warning and
  # silently picks whichever parser is installed, so results can differ
  # between machines. lxml is installed by this notebook's setup cell.
  soup = BeautifulSoup(doc, 'lxml')
  try:
    activity_title = soup.h1.get_text()

    try:
      activity_contents = soup.find(class_='worksheet-module_contentDetailContainer_3Pou6').next_element.get_text()
    except AttributeError:
      print("Error: Couldn't find activity_contents element.")
      print("HTML content of the soup object:")
      # Only show the beginning of the page; dumping every failed page in
      # full floods the notebook output.
      print(soup.prettify()[:2000])
      return None

    # The first category group holds the brand tags, the second the subject
    # tags. A page with fewer than two groups raises IndexError, which is
    # reported by the generic handler below.
    category_groups = soup.find_all(class_=re.compile('CategoryTree-module_values'))
    categories = {
        'brand': _tag_texts(category_groups[0]),
        'subject': _tag_texts(category_groups[1]),
    }

    # The 'src' attribute is read from the element two steps after the
    # thumbnail container in document order.
    thumbnail = soup.find(class_="worksheet-module_thumbnail_3_tCk").next_element.next_element.get('src')

    activity = {
        'title': activity_title,
        'activity_contents': activity_contents,
        'categories': categories,
        'thumbnail': thumbnail,
    }
    return activity
  except Exception as e:
    # Best-effort scraping: any structural surprise yields None, not a crash.
    print("An error occurred:", e)
    return None

This code uses the Requests library to make HTTP requests and the BeautifulSoup library (through `parse_worksheet`) to parse the HTML responses. It retrieves lists of activities from the Education.com search endpoint, one result page at a time, and then extracts information about each activity, such as its title, contents, categories, and thumbnail image URL. The inner `for` loop iterates over each result in the `results` list, but only up to 50 results are processed per page. `time.sleep(4)` pauses the script for 4 seconds after each worksheet is downloaded, so it doesn't overwhelm the Education.com website with too many requests.
`urls[result['url']] = True` stores the URL of the current activity in the `urls` dictionary, so we can avoid duplicates.

In [None]:
from os import truncate
import requests
import time

# URLs already fetched (used as a set) and the successfully parsed activities.
urls = {}
activity_array = []

# Walk search result pages 2..12 and scrape every worksheet they list.
page_num = 2
while page_num < 13:

  url_query = f'https://content.education.com/search?path=%2Fresources%2F%3Fsort%3DdateCreated%26page%3D{page_num}&onlyValidUrls=false'
  # A timeout keeps a stalled connection from hanging the notebook forever.
  response = requests.get(url_query, timeout=30).json()
  print(f'got response from page {page_num}')
  page_num += 1
  results = response['results']
  for result in results[0:50]:
    if result['url'] in urls:
      continue

    web_doc = requests.get(f'https://www.education.com{result["url"]}', timeout=30).text
    print(result['url'])
    # make soup
    activity_obj = parse_worksheet(web_doc)
    # Remember the URL even when parsing failed so it is not fetched again.
    urls[result['url']] = True
    # parse_worksheet returns None for pages it cannot parse; skip those
    # instead of crashing on `None['url'] = ...`.
    if activity_obj is not None:
      activity_obj['url'] = result['url']
      activity_array.append(activity_obj)
    # Be polite to the site: wait between worksheet downloads.
    time.sleep(4)


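This cell uses the `csv` module to write the collected activities to `activities.csv`, then reads the file back and prints its lines to check the result.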

In [None]:
import csv

# Write one CSV row per scraped activity. The encoding is set explicitly so
# that non-ASCII text in titles or descriptions is written the same way on
# every platform instead of depending on the locale default.
with open('activities.csv', 'w', newline='', encoding='utf-8') as file:
  field = ["title", "activity_contents", "categories", "url","thumbnail"]
  writer = csv.DictWriter(file, fieldnames=field)
  writer.writeheader()
  for activity in activity_array:
    writer.writerow(activity)


# Read the file back (with the same encoding) and print it as a sanity check.
with open('activities.csv', 'r', encoding='utf-8') as file:
  for line in file:
    print(line.strip())