In [None]:
# Step 0: Install necessary libraries
# We need requests for API calls, pandas and pyarrow for data handling,
# beautifulsoup4 and lxml for web scraping the security token, and tqdm for a progress bar.
!pip install requests pandas pyarrow tqdm beautifulsoup4 lxml

import requests
import pandas as pd
from tqdm.notebook import tqdm
from google.colab import files
import time
from bs4 import BeautifulSoup


In [4]:

# --- Step 1: Define the GraphQL Query ---

SEARCH_TAGS_QUERY = """
query SearchTags($input: TagSearchInput!) {
  tags(input: $input) {
    page
    perPage
    total
    results {
      ...TagAttrs
      taggingCount
    }
  }
}

fragment TagAttrs on Tag {
  category
  createdAt
  creatorId
  id
  name
  namespace
  pendingRevisions
  slug
  status
  type
  description
}
"""

# --- Step 2: Function to Initialize Session and Get CSRF Token ---

def initialize_session(session, initial_url):
    """
    Visits a page to initialize a session and extract the CSRF token.
    The CSRF token is required for making secure POST requests.
    """
    print("Initializing session and fetching CSRF token...")
    try:
        response = session.get(initial_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')

        token_tag = soup.find('meta', attrs={'name': 'csrf-token'})

        if token_tag and 'content' in token_tag.attrs:
            csrf_token = token_tag['content']
            print("Successfully retrieved CSRF token.")
            session.headers.update({'X-CSRF-Token': csrf_token})
            return True
        else:
            print("Warning: Could not find CSRF token on the page.")
            return False

    except Exception as e:
        print(f"An error occurred during session initialization: {e}")
        return False

# --- Step 3: Function to Fetch All Tags with Pagination ---

def get_all_tags(session):
    """
    Uses the paginated GraphQL endpoint to fetch all oracle tags.
    """
    all_tags_data = []
    page = 1
    total_pages = 1 # We'll update this after the first request

    print("\nFetching all tags from GraphQL API (this may take a few moments)...")

    # Use tqdm for a progress bar over the pages
    pbar = tqdm(total=total_pages, desc="Fetching Pages")

    while page <= total_pages:
        print(f"Fetching page {page}...")
        variables = {"input": {"type": "ORACLE_CARD_TAG", "name": None, "page": page}}
        payload = {"query": SEARCH_TAGS_QUERY, "variables": variables}

        try:
            response = session.post("https://tagger.scryfall.com/graphql", json=payload)
            response.raise_for_status()
            data = response.json()

            tags_response = data.get('data', {}).get('tags')
            if not tags_response:
                print(f"Warning: No data returned for page {page}.")
                break

            results = tags_response.get('results', [])
            all_tags_data.extend(results)

            # Update pagination info from the first response
            if page == 1:
                per_page = tags_response.get('perPage', 100)
                total_tags = tags_response.get('total', 0)
                total_pages = (total_tags + per_page - 1) // per_page
                pbar.total = total_pages # Update the progress bar's total

            pbar.update(1) # Increment the progress bar
            page += 1

            # Be polite to the API
            if page <= total_pages:
                time.sleep(0.5)

        except Exception as e:
            print(f"An error occurred while fetching page {page}: {e}")
            break

    pbar.close()
    return all_tags_data

# --- Main Script Execution ---

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
})

# We visit a known page to get the session cookie and token
initialization_successful = initialize_session(session, "https://tagger.scryfall.com")

if initialization_successful:
    tag_data = get_all_tags(session)

    if tag_data:
        print(f"\nProcessing complete. Found a total of {len(tag_data)} tags. Creating DataFrame...")
        df = pd.DataFrame(tag_data)

        print("Sample of the collected data:")
        print(df.sample(min(5, len(df))))

        output_filename = "scryfall_all_tags.parquet"
        df.to_parquet(output_filename)
        print(f"\nSuccessfully saved all tag data to '{output_filename}'")

        print("\nTo download the file to your local machine, run the following code in a new cell:")
        print("from google.colab import files")
        print(f"files.download('{output_filename}')")
    else:
        print("No tag data was collected. The final dataframe is empty.")
else:
    print("\nCould not initialize session. Aborting script.")

Initializing session and fetching CSRF token...
Successfully retrieved CSRF token.

Fetching all tags from GraphQL API (this may take a few moments)...


Fetching Pages:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Fetching page 21...
Fetching page 22...
Fetching page 23...
Fetching page 24...
Fetching page 25...
Fetching page 26...
Fetching page 27...
Fetching page 28...
Fetching page 29...
Fetching page 30...
Fetching page 31...
Fetching page 32...
Fetching page 33...
Fetching page 34...
Fetching page 35...
Fetching page 36...
Fetching page 37...
Fetching page 38...
Fetching page 39...
Fetching page 40...

Processing complete. Found a total of 3923 tags. Creating DataFrame...
Sample of the collected data:
      category             createdAt                             creatorId  \
1888     False  2025-07-09T00

In [5]:
# Filter the DataFrame to include only rows where the 'description' column is not empty or None
tags_with_description = df[df['description'].notna() & (df['description'] != '')]

# Get the count of these tags
num_tags_with_description = len(tags_with_description)

print(f"Number of tags with a non-empty description: {num_tags_with_description}")

Number of tags with a non-empty description: 809
