In [0]:
%pip install elasticsearch==8.19.0
%restart_python

In [0]:
%python
import uuid
from datetime import datetime, timedelta
from pyspark.sql import functions as F
from elasticsearch import Elasticsearch, helpers
import logging
import json

# Only warnings and above are emitted, each prefixed with a timestamp.
logging.basicConfig(format='[%(asctime)s]: %(message)s', level=logging.WARNING)
log = logging.getLogger(__name__)

# Elasticsearch endpoint, read from the "elastic" secret scope.
ELASTIC_URL = dbutils.secrets.get(scope="elastic", key="elastic_url")

# Source table to read and destination index to write.
CONFIG = dict(
    table_name="openalex.awards.awards_api",
    index_name="awards-v3",
)

# The "is_full_sync" widget value is compared case-insensitively against "true".
_full_sync_flag = dbutils.widgets.get("is_full_sync")
IS_FULL_SYNC = _full_sync_flag.lower() == "true"
print(f"IS_FULL_SYNC: {IS_FULL_SYNC}")

def send_partition_to_elastic(partition, index_name):
    """Bulk-index one partition of rows into Elasticsearch.

    A new client is created for each call and closed before the function
    returns, whether indexing succeeded or not.

    Args:
        partition: Iterable of rows. Each row must expose ``id`` (used as the
            document ``_id``) and ``_source`` (converted with
            ``asDict(True)`` to form the document body).
        index_name: Name of the Elasticsearch index to write to.

    Raises:
        Exception: Any failure while indexing is logged, printed and then
            re-raised so the caller sees the failure instead of the
            documents being dropped silently.
    """
    client = Elasticsearch(
        hosts=[ELASTIC_URL],
        max_retries=3,
        request_timeout=180
    )

    def generate_actions(op_type="index"):
        """Yield one bulk action per row in the partition."""
        for row in partition:
            yield {
                "_op_type": op_type,
                # Use the caller-supplied index so the documents land in the
                # same index that the log messages below report.
                "_index": index_name,
                "_id": row.id,
                "_source": row._source.asDict(True)
            }

    try:
        count = 0
        for success, info in helpers.parallel_bulk(
            client,
            generate_actions(),
            chunk_size=500,
            thread_count=4
        ):
            if not success:
                print(f"FAILED TO INDEX: {info}")
                raise Exception(f"Failed to index document: {info}")
            # Count only documents that were actually indexed.
            count += 1

        print(f"Successfully indexed {count} total documents to {index_name}")

    except Exception as e:
        log.error(f"Error indexing documents to {index_name}: {e}", stack_info=True, exc_info=True)
        print(f"Error indexing documents to {index_name}: {e}")
        # Propagate the failure; swallowing it here would mark the partition
        # as processed even though some documents never reached the index.
        raise
    finally:
        # Release the client's connections regardless of the outcome.
        client.close()

### Prepare Input

In [0]:

# Load the source table named in CONFIG.
awards_source = spark.read.table(CONFIG["table_name"])

# Incremental runs keep only rows whose updated_date falls within the last two days.
if IS_FULL_SYNC:
    awards_selected = awards_source
else:
    two_days_ago = (datetime.now() - timedelta(days=2)).strftime('%Y-%m-%d')
    awards_selected = awards_source.filter(F.col("updated_date") >= two_days_ago)
    print(f"Incremental sync: filtering to updated_date >= {two_days_ago}")

# Rewrite `id` with the OpenAlex URL prefix, then pack every column
# (including the rewritten `id`) into a single `_source` struct.
prefixed_id = F.concat(F.lit("https://openalex.org/G"), F.col("id"))
awards_docs = awards_selected.withColumn("id", prefixed_id).select(
    "id",
    F.struct(F.col("*")).alias("_source"),
)

# Full syncs are spread over more partitions than incremental ones.
num_partitions = 96 if IS_FULL_SYNC else 8
df = awards_docs.repartition(num_partitions)

record_count = df.count()
print(f"Total records to process: {record_count}")
display(df)

### Execute sync

In [0]:
print(f"\n=== Processing {CONFIG['table_name']} ===")


def send_partition_wrapper(partition):
    """Single-argument adapter that binds the configured index name."""
    return send_partition_to_elastic(
        partition,
        CONFIG['index_name']
    )


try:
    df.foreachPartition(send_partition_wrapper)

    print(f"Completed indexing {CONFIG['table_name']} to {CONFIG['index_name']}")

except Exception as e:
    print(f"Failed to process {CONFIG['table_name']}: {e}")
    log.error(f"Failed to process {CONFIG['table_name']}: {e}", stack_info=True, exc_info=True)
    # Re-raise so the run is reported as failed rather than falling through
    # to the completion message below.
    raise

# Reached only when the sync above finished without raising.
print("\nIndexing operation completed!")

In [0]:
# Short-lived client used only to refresh the target index.
client = Elasticsearch(
    hosts=[ELASTIC_URL],
    max_retries=3,
    request_timeout=180
)

try:
    refresh_response = client.indices.refresh(index=CONFIG['index_name'])
finally:
    # Close the client's connections even if the refresh call fails.
    client.close()

# Keep the refresh response as the cell's displayed output.
refresh_response