### Install utility libraries, then import and register UDFs

In [0]:
%pip install /Volumes/openalex/default/libraries/openalex_dlt_utils-0.2.1-py3-none-any.whl
from nameparser import HumanName # Will be installed via pipeline libraries
from openalex.dlt.normalize import normalize_title_udf, udf_last_name_only
spark.udf.register("normalize_title_udf", normalize_title_udf)
spark.udf.register("udf_last_name_only", udf_last_name_only)

### Sync Tables from PROD

In [0]:
%sql
-- Incremental sync of openalex.mid.work from the federated Postgres table.
--
-- Two watermarks are read from the local copy before merging:
--   walden_paper_id          = highest paper_id already present
--   walden_full_updated_date = latest full_updated_date already present
-- The DEFAULT values are immediately overwritten by the SET statements.
--
-- NOTE(review): when openalex.mid.work is empty both MAX() results are NULL, so
-- the WHERE clause below selects nothing and no rows are synced. Confirm the
-- initial load happens elsewhere.
DECLARE OR REPLACE VARIABLE walden_paper_id BIGINT DEFAULT 9999999999;
DECLARE OR REPLACE VARIABLE walden_full_updated_date TIMESTAMP DEFAULT current_timestamp();

SET VARIABLE walden_paper_id = (SELECT MAX(paper_id) FROM openalex.mid.work);
SET VARIABLE walden_full_updated_date = (SELECT MAX(full_updated_date) FROM openalex.mid.work);

-- Upsert by paper_id: rows whose paper_id is above the watermark, plus rows
-- whose full_updated_date is newer than anything held locally.
MERGE INTO openalex.mid.work AS tgt
USING (
  SELECT pg.*
  FROM openalex_postgres.mid.work AS pg -- federated foreign table
  WHERE pg.paper_id > walden_paper_id
     OR pg.full_updated_date > walden_full_updated_date
) AS src
ON tgt.paper_id = src.paper_id
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *;

In [0]:
%sql
-- Incremental sync of openalex.mid.affiliation from the federated Postgres table.
--
-- The watermark is the latest updated_date already present locally; only source
-- rows strictly newer than it (with non-NULL paper_id and author_id) are merged.
--
-- NOTE(review): every source row has updated_date > MAX(target.updated_date),
-- so the `tgt.updated_date = src.updated_date` condition in ON looks
-- unsatisfiable. If so, WHEN MATCHED never fires, the statement only inserts,
-- and a re-synced affiliation is appended next to its older version instead of
-- replacing it. Confirm the intended key before changing this.
-- NOTE(review): `tgt.affiliation_id = src.affiliation_id` is not NULL-safe;
-- rows with a NULL affiliation_id can never match. Confirm whether `<=>` is wanted.
DECLARE OR REPLACE VARIABLE walden_affiliation_updated_date TIMESTAMP DEFAULT current_timestamp();
SET VARIABLE walden_affiliation_updated_date = (SELECT MAX(updated_date) FROM openalex.mid.affiliation);

MERGE INTO openalex.mid.affiliation AS tgt
USING (
  SELECT DISTINCT pg.*
  FROM openalex_postgres.mid.affiliation AS pg -- federated foreign table
  WHERE pg.updated_date > walden_affiliation_updated_date
    AND pg.paper_id IS NOT NULL
    AND pg.author_id IS NOT NULL
) AS src
ON  tgt.paper_id = src.paper_id
AND tgt.author_id = src.author_id
AND tgt.affiliation_id = src.affiliation_id
AND tgt.updated_date = src.updated_date -- TODO: target and source seem to contain many duplicates; investigate later
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *;

In [0]:
%sql
-- Incremental sync of openalex.mid.work_extra_ids from the federated Postgres table.
--
-- The watermark is the highest paper_id already present locally; only source
-- rows with a larger paper_id are merged.
--
-- NOTE(review): because every source row has paper_id > MAX(target.paper_id),
-- the paper_id equality in ON looks unsatisfiable, so WHEN MATCHED would never
-- fire. Extra ids added later to an already-synced paper are also not picked up
-- by this filter. Confirm this append-only behaviour is intended.
DECLARE OR REPLACE VARIABLE walden_work_extra_id BIGINT DEFAULT 9999999999;
SET VARIABLE walden_work_extra_id = (SELECT MAX(paper_id) FROM openalex.mid.work_extra_ids);

MERGE INTO openalex.mid.work_extra_ids AS tgt
USING (
  SELECT DISTINCT pg.*
  FROM openalex_postgres.mid.work_extra_ids AS pg -- federated foreign table
  WHERE pg.paper_id > walden_work_extra_id
    AND pg.paper_id IS NOT NULL
) AS src
ON  tgt.paper_id = src.paper_id
AND tgt.attribute_type = src.attribute_type
WHEN MATCHED THEN
  UPDATE SET tgt.attribute_value = src.attribute_value
WHEN NOT MATCHED THEN
  INSERT *;

### Update missing `normalized_title` and `match_author`

In [0]:
%sql
-- Backfill match_author on affiliation rows where it is still NULL.
-- original_author is wrapped in a one-element array of structs with a `name`
-- field, passed to udf_last_name_only, and the author_key of the first element
-- of the result is stored.
-- NOTE(review): rows for which the UDF yields NULL stay NULL and are selected
-- again on the next run. Verify how the UDF treats NULL or empty names.
UPDATE openalex.mid.affiliation
SET match_author =
      udf_last_name_only(
        array(named_struct('name', original_author))
      )[0].author_key
WHERE match_author IS NULL;

In [0]:
%sql
-- Backfill normalized_title on work rows where it is still NULL by applying
-- normalize_title_udf to original_title.
-- NOTE(review): rows for which the UDF yields NULL stay NULL and are selected
-- again on the next run. Verify how the UDF treats NULL titles.
UPDATE openalex.mid.work
SET normalized_title =
      normalize_title_udf(original_title)
WHERE normalized_title IS NULL;