# LLM Based Web Scraping of Clark U websites

## Import Libraries

In [39]:
from google.colab import auth
auth.authenticate_user()

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Import required libraries for the ScriptCreatorGraph
import os
from scrapegraphai.graphs import ScriptCreatorGraph, SearchGraph

# The following code is specific to Google Colab and GCP
# Uncomment if running in Google Colab environment

'''
PROJECT_ID = "hackathon-454621"
BUCKET_NAME = "course_information"

from google.cloud import storage
import subprocess, os

storage_client = storage.Client(project=PROJECT_ID)

def upload_to_bucket(bucket_name: str, destination_blob_name: str, local_file_path: str):
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(local_file_path)
    print(f"✅ Uploaded {destination_blob_name} to gs://{bucket_name}/")

# 2️⃣ Install HTTrack
print("🔧 Installing HTTrack…")
subprocess.run(["apt-get", "update"], check=True, stdout=subprocess.DEVNULL)
subprocess.run(["apt-get", "install", "-y", "httrack"], check=True)

# 3️⃣ Mirror site into local temp directory
URL = "https://www.clarku.edu/academics/majors-and-minors/"            # ← CHANGE to your target
LOCAL_DIR = "/content/htmirror"        # temporary local staging

# Filters: only HTML pages + links; exclude images/docs/etc.
filters = [
    f"+*{URL}*",
    "-*.jpg", "-*.jpeg", "-*.png", "-*.gif", "-*.svg",
    "-*.css", "-*.js", "-*.ico",
    "-*.pdf", "-*.doc*", "-*.xls*", "-*.ppt*", "-*.zip", "-*.mp4", "-*.mp3"
]

cmd = ["httrack", URL, "-O", LOCAL_DIR] + filters + ["-v"]
print("🌐 Running HTTrack…")
subprocess.run(cmd, check=True)

# 4️⃣ Walk local mirror and upload every file to GCS
print("📤 Uploading files to GCS…")
for root, _, files in os.walk(LOCAL_DIR):
    for filename in files:
        local_path = os.path.join(root, filename)
        # strip the base mirror directory to preserve structure
        blob_name = os.path.relpath(local_path, LOCAL_DIR)
        upload_to_bucket(BUCKET_NAME, blob_name, local_path)

print("🎉 All done! Your website mirror is now in gs://{}/".format(BUCKET_NAME))
'''

ModuleNotFoundError: No module named 'google.colab'

## LLM Setup

In [35]:
# gemini_key = os.getenv('GEMINI_KEY')
# hf_api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
openai_key = os.getenv('OPENAI_API_KEY')

# Define the configuration for the graph
graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "openai/gpt-4o-mini",
    },
    "verbose": True,
    "headless": True,         # Reduced depth for testing
    "library": "playwright",  # Add the required library key
}

## Main Code Execution

In [36]:
script_creator_graph = ScriptCreatorGraph(
   prompt="Create a python Script to get all Majors and Minors offered by clark university as well as the reuirements for each",
   source="http://www.clarku.edu/academics/majors-and-minors/",
   config=graph_config,
)



# Create the SearchGraph instance
# search_graph = SearchGraph(
#    prompt="What are the all the majors and minors offered at Clark University and What are all requirements for each major and minor?",
#    config=graph_config,
# )

result = script_creator_graph.run()


--- Executing Fetch Node ---
--- (Fetching HTML from: http://www.clarku.edu/academics/majors-and-minors/) ---
--- Executing ParseNode Node ---
--- Executing GenerateScraper Node ---


In [37]:
print(result)


```python
import asyncio
from playwright.async_api import async_playwright
import json
import re

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("http://www.clarku.edu/academics/majors-and-minors/")
        
        content = await page.content()
        
        majors_and_minors = []
        
        # Regex to find major and minor details from the page content
        program_pattern = re.compile(r'<h3 class="program-finder__program-value program__title">(.*?)<\/h3>.*?<h3 class="program-finder__program-value program__degree">(.*?)<\/h3>', re.DOTALL)
        programs = program_pattern.findall(content)

        for program in programs:
            title, degree = program
            majors_and_minors.append({
                "title": title.strip(),
                "degree": degree.strip()
            })
        
        await browser.close()
        
        print(jso