In [None]:
import json
from datetime import datetime
from pyspark.sql import functions as F

#### Create `openalex.common.openalex_concepts_snapshot` in the same format as the API

In [None]:
# Load the API-format concepts table and replace the `id` column with the
# "https://openalex.org/C"-prefixed URL form of the original id value.
df_transformed = spark.read.table("openalex.common.concepts_api").withColumn(
    "id",
    F.concat(F.lit("https://openalex.org/C"), F.col("id")),
)

# Persist the result as the snapshot table, replacing both the data and the
# schema of any existing table with the same name.
(
    df_transformed.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("openalex.common.openalex_concepts_snapshot")
)

#### Export in JSON Lines format to S3

In [None]:
# Export settings shared by the cells below.
# `date_str` is taken from the local clock when this cell runs, so every later
# cell in the session targets the same dated snapshot folder.
entity_type = "concepts"
date_str = f"{datetime.now():%Y-%m-%d}"
s3_base_path = "s3://openalex-sandbox/snapshots/" + date_str
output_path = "/".join((s3_base_path, entity_type))

def export():
    """Write the concepts snapshot table to `output_path` as gzipped JSON.

    Reads `openalex.common.openalex_concepts_snapshot`, prints its row count,
    and writes it partitioned by a derived `_partition_date` column,
    overwriting whatever already exists at `output_path`.

    Relies on the notebook-level `spark` session and `output_path` variable.
    """
    print(f"Starting export to: {output_path}")

    snapshot = spark.read.table("openalex.common.openalex_concepts_snapshot")
    print(f"Total records: {snapshot.count():,}")

    # Partition key: updated_date when it parses as a date, otherwise
    # created_date, otherwise today's date when both are null.
    partition_date = F.coalesce(
        F.to_date("updated_date"),
        F.to_date("created_date"),
        F.current_date(),
    )

    # Collapse to one Spark partition before writing (the dataset is small),
    # then split the output into one directory per partition date.
    writer = (
        snapshot.withColumn("_partition_date", partition_date)
        .coalesce(1)
        .write
        .mode("overwrite")
        .option("compression", "gzip")
        .partitionBy("_partition_date")
    )
    writer.json(output_path)

    print("Export completed!")

# Run the export now; it reads the notebook-level `output_path` and `spark`.
export()

#### Rename files and clean up

In [None]:
def _remove_best_effort(path, recurse=False):
    """Delete `path` with dbutils; print a warning instead of raising on failure."""
    try:
        dbutils.fs.rm(path, recurse=recurse)
    except Exception as e:
        print(f"  Warning: could not remove {path}: {e}")


def rename_files_and_cleanup(output_path):
    """Rename partition directories and files for consistency.

    For every `_partition_date=<date>/` directory directly under
    `output_path`, the `.gz` files are moved to
    `updated_date=<date>/part_NNNN.gz`, numbered in name-sorted order.
    The remaining files in the old directory, the old directory itself, and
    any root-level entry whose name starts with `_` are then deleted.

    Moves are not guarded: a failing `dbutils.fs.mv` propagates to the caller.
    Deletions are best-effort: a failure is printed as a warning and
    processing continues.

    Args:
        output_path: Directory the export was written to. A trailing slash
            is tolerated.
    """
    source_prefix = "_partition_date="
    # Normalise so destination paths never contain a double slash.
    base_path = output_path.rstrip("/")

    partitions = dbutils.fs.ls(output_path)
    partitions_to_process = [p for p in partitions if p.name.startswith(source_prefix)]

    print(f"Found {len(partitions_to_process)} partitions to process")

    for partition in partitions_to_process:
        date_value = partition.name.replace(source_prefix, "").rstrip("/")
        new_partition_path = f"{base_path}/updated_date={date_value}/"

        files = dbutils.fs.ls(partition.path)
        json_files = sorted(
            (f for f in files if f.name.endswith('.gz')),
            key=lambda x: x.name,
        )

        # Move and rename files
        for idx, file_info in enumerate(json_files):
            new_name = f"part_{str(idx).zfill(4)}.gz"
            new_path = f"{new_partition_path}{new_name}"
            dbutils.fs.mv(file_info.path, new_path)
            print(f"  Moved {file_info.name} -> updated_date={date_value}/{new_name}")

        # Clean up metadata files (anything that is not a data file)
        for f in files:
            if not f.name.endswith('.gz'):
                _remove_best_effort(f.path)

        # Remove old partition directory
        _remove_best_effort(partition.path, recurse=True)

    # Clean up root-level Spark metadata
    print("\nCleaning up root metadata files...")
    try:
        root_files = dbutils.fs.ls(output_path)
        for f in root_files:
            if f.name.startswith("_"):
                dbutils.fs.rm(f.path, recurse=True)
                print(f"  Removed {f.name}")
    except Exception as e:
        print(f"  Warning: Could not clean up root files: {e}")

    print("\nDone!")

# Post-process the directory at the notebook-level `output_path`.
rename_files_and_cleanup(output_path)

#### Create manifest

In [None]:
def _prod_s3_url(path, entity):
    """Build the `s3://openalex/data...` URL for a listed file path.

    Everything from the first `/<entity>/` segment of `path` onward is
    appended to `s3://openalex/data`.

    Raises:
        ValueError: if `path` contains no `/<entity>/` segment. Without this
            check a failed search would silently yield a one-character
            suffix and a bogus URL.
    """
    marker = f"/{entity}/"
    idx = path.find(marker)
    if idx == -1:
        raise ValueError(
            f"Cannot build manifest URL: {path!r} does not contain {marker!r}"
        )
    return f"s3://openalex/data{path[idx:]}"


def create_manifest():
    """Create a manifest file with all file metadata.

    Lists every `updated_date=*` directory under
    `{s3_base_path}/{entity_type}`, and for each `.gz` file records its
    production URL, its size as reported by `dbutils.fs.ls`, and its line
    count as read by Spark. The manifest (entries sorted by URL, plus
    totals) is written as indented JSON to `<output_path>/manifest`,
    overwriting any existing file.

    Relies on the notebook-level `s3_base_path`, `entity_type`, `spark` and
    `dbutils`.

    Returns:
        dict: the manifest that was written, with keys `entries` and `meta`.

    Raises:
        ValueError: if a listed file path does not contain `/<entity_type>/`;
            nothing is written in that case.
    """
    output_path = f"{s3_base_path}/{entity_type}"

    print("\nCreating manifest...")

    # Newest partition first; this only affects the order of progress output,
    # because entries are re-sorted by URL below.
    partitions_to_process = sorted(
        (p for p in dbutils.fs.ls(output_path) if p.name.startswith("updated_date=")),
        key=lambda p: p.name,
        reverse=True,
    )

    entries = []
    total_content_length = 0
    total_record_count = 0

    for partition in partitions_to_process:
        for file_info in dbutils.fs.ls(partition.path):
            if not file_info.name.endswith('.gz'):
                continue

            # Build S3 URL for prod first so a malformed path fails before
            # the Spark job is started.
            s3_url = _prod_s3_url(file_info.path, entity_type)

            # Count records (one line of text per record)
            record_count = spark.read.text(file_info.path).count()

            entries.append({
                "url": s3_url,
                "meta": {
                    "content_length": file_info.size,
                    "record_count": record_count
                }
            })
            total_content_length += file_info.size
            total_record_count += record_count

            print(f"  {partition.name}{file_info.name}: {record_count:,} records, {file_info.size/(1024*1024):.2f} MB")

    entries.sort(key=lambda x: x["url"])

    manifest = {
        "entries": entries,
        "meta": {
            "content_length": total_content_length,
            "record_count": total_record_count
        }
    }

    manifest_path = f"{output_path}/manifest"
    manifest_json = json.dumps(manifest, indent=2)
    dbutils.fs.put(manifest_path, manifest_json, overwrite=True)

    print(f"\nManifest created: {manifest_path}")
    print(f"Total files: {len(entries)}")
    print(f"Total size (compressed): {total_content_length / (1024**2):.2f} MB")
    print(f"Total records: {total_record_count:,}")

    return manifest

# Build and write the manifest; as the cell's last expression, the returned dict is displayed.
create_manifest()