# Crossref Data Ingestion

This notebook ingests raw works metadata from the Crossref REST API (`/works` endpoint) into a bronze Delta Live Tables (DLT) table.

In [None]:
import dlt
import requests
import logging
import json
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import StructType, StructField, StringType

# Request INFO-level output on the root logger (basicConfig is a no-op when
# the root logger already has handlers configured).
logging.basicConfig(level=logging.INFO)
# Module-level logger used for the ingestion progress messages below.
logger = logging.getLogger(__name__)

## Fetch Data from Crossref API

In [None]:
def fetch_recent_crossref_data(cursor=None, *, index_date="2024-09-23", rows=1000, timeout=60):
    """Fetch one page of Crossref works indexed on a single day.

    Args:
        cursor: Deep-paging cursor taken from the previous response's
            ``message["next-cursor"]``. ``None`` (or any falsy value) sends
            ``"*"``, which asks the API to start a new cursor.
        index_date: Day (``YYYY-MM-DD``) used for both the
            ``from-index-date`` and ``until-index-date`` filters.
        rows: Page size requested from the API.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; a
            ``(connect, read)`` tuple is also accepted by requests.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = "https://api.crossref.org/works"
    params = {
        "filter": f"from-index-date:{index_date},until-index-date:{index_date}",
        "rows": rows,
        "cursor": cursor or "*",
        "sort": "indexed",
        "order": "desc",
    }
    # requests waits forever by default; an explicit timeout keeps a stalled
    # connection from hanging the whole pipeline run.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

## Define DLT Table for Raw Crossref Data

In [None]:
@dlt.table(
    comment="Raw Crossref works data",
    table_properties={"quality": "bronze"}
)
def crossref_raw_data():
    """Build the bronze table of raw Crossref works.

    Pages through ``fetch_recent_crossref_data`` using the API cursor and
    returns a DataFrame with one row per work.

    Returns:
        A Spark DataFrame with the columns ``doi`` (string, nullable),
        ``message`` (the work serialized as a JSON string) and
        ``ingestion_timestamp`` (``current_timestamp()``).

    Raises:
        Whatever ``fetch_recent_crossref_data`` raises for a failed request.
    """
    def fetch_all_data():
        """Yield one ``{'doi', 'message'}`` record per work across all pages."""
        cursor = None
        page = 0
        total_rows = 0

        while True:
            data = fetch_recent_crossref_data(cursor)
            message = data.get("message", {})
            items = message.get("items", [])
            rows_this_page = len(items)
            page += 1

            for item in items:
                # .get() keeps a record without a DOI (stored as NULL) instead
                # of aborting the whole ingestion with a KeyError.
                yield {"doi": item.get("DOI"), "message": json.dumps(item)}

            total_rows += rows_this_page
            logger.info(
                "Fetched page: %s | Rows this page: %s | Total rows: %s | Cursor: %s",
                page, rows_this_page, total_rows, cursor,
            )

            new_cursor = message.get("next-cursor")
            # Crossref keeps returning a next-cursor after the last page, so
            # an empty page is the reliable end-of-results signal; the cursor
            # checks remain as an extra guard against looping forever.
            if not items or not new_cursor or new_cursor == cursor:
                logger.info(
                    "Pagination complete. Total pages: %s | Total rows: %s",
                    page, total_rows,
                )
                break
            cursor = new_cursor

    raw_data_schema = StructType([
        StructField("doi", StringType(), True),
        StructField("message", StringType(), True)
    ])

    # Materialize once on the driver so the row count is available without
    # running a Spark action (df.count()) inside the DLT table definition.
    records = list(fetch_all_data())

    df = spark.createDataFrame(records, schema=raw_data_schema) \
               .withColumn("ingestion_timestamp", current_timestamp())

    logger.info("Raw data ingestion complete. Total rows: %s", len(records))
    return df