# Crossref Data Processing

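This notebook parses the raw Crossref JSON records from the `crossref_raw_data` dataset into a structured table, `crossref_processed_data`, containing the DOI, title, authors, publication date, and ingestion timestamp of each work.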

In [None]:
import dlt
import logging
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, ArrayType

# Set up Python logging for this notebook.
# basicConfig configures the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by the code below.
logger = logging.getLogger(__name__)

## Define Schema for Parsed Data

In [None]:
# Schema applied by from_json to each raw Crossref record. Every field is
# nullable, so records lacking a field still parse (the field becomes null).
#
# NOTE(review): Crossref "published" objects appear to carry "date-parts"
# rather than "date-time" -- confirm against real payloads, otherwise the
# derived publication date will always be null.
parsed_schema = (
    StructType()
    # Work identifier.
    .add("DOI", StringType(), nullable=True)
    # Titles arrive as an array of strings.
    .add("title", ArrayType(StringType()), nullable=True)
    # One struct per contributor, holding family and given names.
    .add(
        "author",
        ArrayType(
            StructType()
            .add("family", StringType(), nullable=True)
            .add("given", StringType(), nullable=True)
        ),
        nullable=True,
    )
    # Publication info; only the "date-time" member is extracted.
    .add(
        "published",
        StructType().add("date-time", TimestampType(), nullable=True),
        nullable=True,
    )
)

## Define DLT Table for Processed Crossref Data

In [None]:
@dlt.table(
    comment="Processed Crossref works data.",
    table_properties={"quality": "gold"}
)
def crossref_processed_data():
    """Declare the processed Crossref table derived from ``crossref_raw_data``.

    Reads the ``crossref_raw_data`` dataset, parses its ``raw_data`` JSON
    string column with ``parsed_schema``, and flattens the result into
    top-level columns.

    Returns:
        A DataFrame with the columns:
          * ``DOI`` -- the parsed ``DOI`` field.
          * ``title`` -- the first element of the parsed ``title`` array.
          * ``author`` -- the parsed array of ``family``/``given`` structs.
          * ``published_date`` -- the parsed ``published.date-time`` value.
          * ``ingestion_timestamp`` -- passed through from the raw dataset.
    """
    raw_df = dlt.read("crossref_raw_data")

    # Step 1: parse the JSON payload into a single struct column.
    parsed_df = raw_df.select(
        from_json("raw_data", parsed_schema).alias("parsed_data"),
        "ingestion_timestamp"
    )

    # Step 2: project the nested struct members into flat, named columns.
    processed_df = parsed_df.select(
        col("parsed_data.DOI").alias("DOI"),
        col("parsed_data.title")[0].alias("title"),
        col("parsed_data.author").alias("author"),
        col("parsed_data.published.date-time").alias("published_date"),
        "ingestion_timestamp"
    )

    # This function only *declares* the dataflow; the pipeline runtime
    # materializes the table later. Spark actions such as count() must not be
    # called here: they force an extra evaluation of the upstream data while
    # the graph is being built and would report "complete" before anything
    # has been written. Hence no row count is computed or logged.
    logger.info("Defined transformation for crossref_processed_data.")

    return processed_df