In [2]:
from firecrawl import FirecrawlApp
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): FirecrawlApp() is later created without arguments, so presumably
# the Firecrawl API key is expected to come from here — confirm.
load_dotenv()

True

In [3]:
# Page handed to the scraper: GitHub's trending repositories listing.
base_url = "https://github.com/trending"

In [4]:
from typing import Optional, List
from pydantic import BaseModel, Field

# One trending-repository entry as requested from the extractor.
#
# Documented with comments rather than a class docstring on purpose: the JSON
# schema produced by model_json_schema() is sent with the scrape request, and
# a docstring would become part of that schema.
class GitHubRepository(BaseModel):
    # The only required field; every other field falls back to None.
    # NOTE(review): the description asks for "owner/repo", but the recorded
    # output in this notebook holds only the repo part (e.g. 'glance') with the
    # owner in repo_owner — confirm which form is intended.
    name: str = Field(
        description="Full name of the repository (owner/repo)",
    )
    description: Optional[str] = Field(
        default=None,
        description="Repository description",
    )
    language: Optional[str] = Field(
        default=None,
        description="Main programming language",
    )
    # Counts are typed as strings; the recorded output shows comma-formatted
    # values such as '16,243'.
    stars_count: Optional[str] = Field(
        default=None,
        description="Total number of stars",
    )
    stars_today: Optional[str] = Field(
        default=None,
        description="Stars gained today",
    )
    forks_count: Optional[str] = Field(
        default=None,
        description="Total number of forks",
    )
    repo_owner: Optional[str] = Field(
        default=None,
        description="Repository owner",
    )
    repo_url: Optional[str] = Field(
        default=None,
        description="Repository URL",
    )
    
    
# Top-level extraction target: a single "repositories" field holding a list of
# GitHubRepository entries. Its model_json_schema() output is what the scrape
# request passes as the extraction schema.
# Kept docstring-free so the generated schema stays exactly as it is.
class Repositories(BaseModel):
    repositories: List[GitHubRepository]
    


In [22]:
# Firecrawl client, created without explicit arguments.
# NOTE(review): presumably the API key is read from the environment prepared by
# load_dotenv() — confirm.
app = FirecrawlApp()

# Structured-extraction settings: a natural-language instruction plus the JSON
# schema generated from the Repositories model.
trending_extract_options = {
    "prompt": "Scrape the GitHub trending page and extract the repositories based on the schema provided.",
    "schema": Repositories.model_json_schema(),
}

# Ask only for the "extract" format of the page at base_url; the response is
# indexed like a dict in the following cells.
result = app.scrape_url(
    base_url,
    params={"formats": ["extract"], "extract": trending_extract_options},
)


In [23]:
# Inspect the top-level keys of the scrape response.
result.keys()

dict_keys(['metadata', 'extract'])

In [24]:
# Inspect the keys of the extracted payload; they should mirror the fields of
# the Repositories model.
result['extract'].keys()

dict_keys(['repositories'])

In [25]:
# List the page metadata keys returned alongside the extraction.
result['metadata'].keys()

dict_keys(['google-site-verification', 'twitter:creator:id', 'browser-errors-url', 'viewport', 'html-safe-nonce', 'apple-itunes-app', 'visitor-payload', 'twitter:card', 'request-id', 'ogDescription', 'og:image', 'og:site_name', 'twitter:site:id', 'expected-hostname', 'og:description', 'twitter:description', 'user-login', 'twitter:creator', 'twitter:image:width', 'hostname', 'color-scheme', 'og:url', 'route-action', 'ogUrl', 'github-keyboard-shortcuts', 'twitter:image:height', 'turbo-cache-control', 'og:image:width', 'title', 'ogSiteName', 'twitter:site', 'twitter:title', 'browser-stats-url', 'favicon', 'turbo-body-classes', 'octolytics-url', 'og:image:type', 'ogImage', 'route-controller', 'current-catalog-service-hash', 'route-pattern', 'og:title', 'og:image:height', 'description', 'visitor-hmac', 'twitter:image', 'theme-color', 'language', 'ogTitle', 'fb:app_id', 'scrapeId', 'sourceURL', 'url', 'statusCode'])

In [26]:
# Display the extracted repository records (a list of plain dicts).
result['extract']['repositories']

[{'name': 'glance',
  'description': 'A self-hosted dashboard that puts all your feeds in one place',
  'language': 'Go',
  'stars_count': '16,243',
  'stars_today': '2,039',
  'forks_count': '574',
  'repo_owner': 'glanceapp',
  'repo_url': 'https://github.com/glanceapp/glance'},
 {'name': 'ollama-deep-researcher',
  'description': 'Fully local web research and report writing assistant',
  'language': 'Python',
  'stars_count': '3,941',
  'stars_today': '464',
  'forks_count': '455',
  'repo_owner': 'langchain-ai',
  'repo_url': 'https://github.com/langchain-ai/ollama-deep-researcher'},
 {'name': 'kickstart.nvim',
  'description': 'A launch point for your personal nvim configuration',
  'language': 'Lua',
  'stars_count': '22,700',
  'stars_today': '109',
  'forks_count': '29,875',
  'repo_owner': 'nvim-lua',
  'repo_url': 'https://github.com/nvim-lua/kickstart.nvim'},
 {'name': 'RD-Agent',
  'description': 'Research and development (R&D) is crucial for the enhancement of industrial p