In [0]:
%pip install elasticsearch==8.19.0
# Restart the Python process after the install (presumably so the pinned elasticsearch
# client installed above is the version that later cells import — confirm on your runtime).
%restart_python

### Prepare Input

In [0]:
%sql
-- Ad-hoc preview of the raw award rows; this is the same table the aggregation query below reads from.
SELECT * FROM openalex.works.work_awards

In [0]:
# Build one Elasticsearch-ready row per award: column `id` is the document id and column
# `_source` is the nested document body. The query de-duplicates the raw award rows, collapses
# them to one row per award id (collecting the funded works), and reshapes the result.
df = spark.sql("""
WITH latest_work_awards AS (
  -- Step 1: De-duplicate records, keeping the most recently indexed row per (id, work_id).
  SELECT *
  FROM openalex.works.work_awards
  QUALIFY ROW_NUMBER() OVER (PARTITION BY id, work_id ORDER BY indexed_timestamp DESC) = 1
),
aggregated_awards AS (
  -- Step 2: Group by 'a.id' and alias the struct of first values as 'src'.
  -- NOTE: FIRST() has no ordering here, so when an award id has several rows (one per work_id,
  -- or one per matching funder because of the OR join below) the row that supplies 'src' is
  -- not deterministic.
  SELECT
    a.id,
    FIRST(
      STRUCT(
        a.award_id, a.title, a.native_id as doi, a.amount,
        a.start_date, a.end_date, a.planned_end_date, a.currency,
        f.funder_id, COALESCE(f.display_name, a.funder_name) as funder_name,
        a.funding_type, a.funder_scheme, a.funder_ids, a.description,
        a.lead_investigator, a.co_lead_investigator, a.investigators,
        a.primary_url as landing_page, a.doi_url, a.provenance, a.publisher,
        a.member as member_id, a.accepted_date, a.approved_date, a.issued_date,
        a.deposited_timestamp, a.created_timestamp, a.indexed_timestamp
      )
    ) AS src,

    -- collect_set removes duplicate work ids (e.g. those introduced by multiple funder matches).
    transform(collect_set(work_id), w -> CONCAT('https://openalex.org/W', w)) as works_funded_outputs,
    size(collect_set(work_id)) as works_funded_count

  FROM latest_work_awards a
  -- Inner join: awards whose funder_ids match no funder by DOI or ROR id are dropped.
  -- NOTE(review): the COALESCE fallback to a.funder_name above suggests a LEFT JOIN may have
  -- been intended — confirm before changing.
  JOIN openalex.mid.funder f
    ON a.funder_ids.doi = f.doi OR a.funder_ids.ror_id = f.ror_id
  GROUP BY a.id
)
-- Step 3: Format the final output for Elasticsearch.
SELECT
  CONCAT('https://openalex.org/G', id) as id,
  STRUCT(
    CONCAT('https://openalex.org/G', id) as id,
    src.award_id as funder_award_id,
    -- The filter must be applied to the /works endpoint; without the path the URL targets the API root.
    CONCAT('https://api.openalex.org/works?filter=awards.id:G', id) as works_funded_api_url,
    works_funded_outputs,
    works_funded_count,
    src.title,
    src.title as display_name,
    src.description,
    src.amount,
    src.currency,
    CONCAT('https://doi.org/', src.doi) as doi,

    STRUCT(
      CONCAT('https://openalex.org/F', src.funder_id) as id,
      src.funder_name as display_name,
      src.funder_ids.ror_id as ror,
      IF(src.funder_ids.doi IS NOT NULL, CONCAT('https://doi.org/', src.funder_ids.doi), NULL) as doi
    ) as funder,

    src.funding_type,
    src.funder_scheme as funder_award_scheme,
    src.start_date,
    src.end_date,

    -- src.lead_investigator,
    -- src.co_lead_investigator,
    -- revisit, normalize and combine all investigators into 1 with flatten(array(...))
    src.investigators as investigatorships,
    src.landing_page as landing_page_url,
    --src.doi_url,
    src.provenance,
    -- src.publisher,
    -- src.member_id,
    -- src.planned_end_date,
    -- src.accepted_date,
    -- src.approved_date,
    -- src.issued_date,
    -- src.deposited_timestamp,
    src.created_timestamp as updated_date, --for now
    date(src.created_timestamp) as created_date
    -- src.indexed_timestamp
  ) AS _source
FROM aggregated_awards;""")

# Materialise every award row on the driver; the indexing cell iterates over `rows`.
rows = df.collect()

print(f"Awards count: {len(rows)}")

In [0]:
from elasticsearch import Elasticsearch, helpers
import json

# Name of the index this notebook rebuilds, and the cluster endpoint, which is read from the
# "elastic" secret scope rather than being hard-coded in the notebook.
ELASTIC_INDEX = "awards-v1"
ELASTIC_URL = dbutils.secrets.get(scope="elastic", key="elastic_url")

# Connection settings for the shared client: a 180 second request timeout, with up to
# five retries that also apply to timed-out requests.
client_options = dict(
    hosts=[ELASTIC_URL],
    request_timeout=180,
    max_retries=5,
    retry_on_timeout=True,
)
client = Elasticsearch(**client_options)

def actions_from_spark(rows, op_type="index"):
    """Lazily yield one bulk-helper action dict per input row.

    Args:
        rows: Iterable of row objects exposing an ``id`` attribute and a
            ``_source`` attribute that has an ``asDict`` method.
        op_type: Value stored under ``"_op_type"`` in every action
            (defaults to ``"index"``).

    Yields:
        dict: ``{"_op_type", "_index", "_id", "_source"}`` where ``_index`` is
        ``ELASTIC_INDEX``, ``_id`` is ``row.id`` and ``_source`` is
        ``row._source.asDict(True)`` (for a PySpark Row the positional ``True``
        requests recursive conversion of nested rows).
    """
    yield from (
        {
            "_op_type": op_type,
            "_index": ELASTIC_INDEX,
            "_id": record.id,
            "_source": record._source.asDict(True),
        }
        for record in rows
    )

# Delete old index so the load below starts from an empty index.
# NOTE(review): no mapping is created here, so the index is presumably re-created implicitly
# by the first bulk request — confirm an index template / auto-create covers it.
if client.indices.exists(index=ELASTIC_INDEX):
    client.indices.delete(index=ELASTIC_INDEX)

# Keep only a handful of failure payloads so a bad run does not flood the cell output.
MAX_REPORTED_FAILURES = 10

ok = fail = 0
failure_samples = []
# raise_on_error=False makes streaming_bulk yield (False, info) for rejected documents instead
# of raising BulkIndexError on the first failing chunk; with the default (True) the `fail`
# counter and the summary below could never report a failure.
for success, info in helpers.streaming_bulk(client, actions_from_spark(rows),
    chunk_size=2000, request_timeout=60, max_retries=3, raise_on_error=False):
    if success:
        ok += 1
    else:
        fail += 1
        if len(failure_samples) < MAX_REPORTED_FAILURES:
            failure_samples.append(info)

print(f"Indexed ok={ok}, failed={fail}")

if fail:
    for failure in failure_samples:
        print(f"Bulk failure: {failure}")
    # Still fail the run (as the default raise_on_error behaviour did), but only after every
    # document has been attempted and the totals have been reported.
    raise RuntimeError(
        f"{fail} of {ok + fail} documents failed to index into {ELASTIC_INDEX}"
    )

In [0]:
# Refresh the index so the documents written by the bulk load are visible to the count below.
client.indices.refresh(index=ELASTIC_INDEX)

In [0]:
# Sanity check: number of documents now in the index; compare with the "Awards count" and
# "Indexed ok" figures printed by the earlier cells.
client.count(index=ELASTIC_INDEX)