[317] NiH to general (#358)

* modified dataset es schema

* updated mapping

* tidied json

* added batchable

* added batchable

* first commit changelog

* Update CHANGELOG.md

* Update CHANGELOG.md

* Update CHANGELOG.md

* docstrings

* added tests

* Update CHANGELOG.md
Joel Klinger committed May 27, 2021
1 parent f4f62b5 commit f0abf0b
Showing 11 changed files with 450 additions and 340 deletions.
61 changes: 61 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,61 @@
# CHANGELOG

# 9/4/21

* New fields corresponding to NiH project outputs, which are mainly IDs that can be linked to other open datasets (see the sketch after this list):

    * `terms_id_clinicalTrial`: Clinical trial IDs for this project that can be linked directly to `https://www.clinicaltrials.gov/ct2/show/{clinical_trial_id}`
    * `terms_id_pubmedId`: PubMed IDs for this project that can be linked directly to `https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}`
    * `terms_title_patent` and `terms_id_patent`: Patent titles and IDs. These would require linking to patstat for more information.
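
For illustration, a minimal sketch of building clickable links from these ID fields (`doc` and the ID values below are hypothetical):

```python
# A minimal sketch, assuming `doc` is a document fetched from the
# nih_v0 index; the ID values below are made up for illustration.
doc = {
    "terms_id_clinicalTrial": ["NCT00000000"],  # hypothetical IDs
    "terms_id_pubmedId": ["12345678"],          # hypothetical IDs
}
trial_links = [f"https://www.clinicaltrials.gov/ct2/show/{trial_id}"
               for trial_id in doc.get("terms_id_clinicalTrial") or []]
pubmed_links = [f"https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}"
                for pubmed_id in doc.get("terms_id_pubmedId") or []]
```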

* Deduplication according to the strategy implemented in [this PR](nestauk/nesta#300), which explains the following new fields of the format `terms_{X}_project`, with `X` one of the following (a bucketing sketch follows the examples below):

    * `exactDupeId`, `exactDupeTitle`: referring to IDs and titles of projects which we have removed from the dataset for being exact duplicates. Only distinct titles are provided, and any funding allocations and start and end dates are incorporated into the deduplicated record in the logical way (earliest start date, latest end date, sum of funding)
    * `nearDupeId`: referring to IDs of projects with similarity >= 0.8
    * `verySimilarId`: referring to IDs of projects with 0.8 > similarity >= 0.65
    * `similarId`: referring to IDs of projects with 0.65 > similarity >= 0.4

For example:

* [this project](https://search-general-wvbdcvn3kmpz6lvjo2nkx36pbu.eu-west-2.es.amazonaws.com/nih_v0/_search?q=_id:2180092) has several exact dupes and [one near dupe](https://search-general-wvbdcvn3kmpz6lvjo2nkx36pbu.eu-west-2.es.amazonaws.com/nih_v0/_search?q=_id:3297189), which itself has one exact dupe
* [this project](https://search-general-wvbdcvn3kmpz6lvjo2nkx36pbu.eu-west-2.es.amazonaws.com/nih_v0/_search?q=_id:100160) has [one very similar but non-identical project](https://search-general-wvbdcvn3kmpz6lvjo2nkx36pbu.eu-west-2.es.amazonaws.com/nih_v0/_search?q=_id:100296)
* [this project](https://search-general-wvbdcvn3kmpz6lvjo2nkx36pbu.eu-west-2.es.amazonaws.com/nih_v0/_search?q=_id:100074) and [this project](https://search-general-wvbdcvn3kmpz6lvjo2nkx36pbu.eu-west-2.es.amazonaws.com/nih_v0/_search?q=_id:7220147) are fairly similar

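The similarity bands above map onto the field suffixes as a simple bucketing, sketched below (illustrative only; the actual deduplication logic is in the PR linked above):

```python
def similarity_bucket(score):
    """Map a pairwise similarity score onto the `terms_{X}_project`
    suffixes described above. Illustrative sketch only: the real
    logic is implemented in nestauk/nesta#300."""
    if score >= 0.8:
        return "nearDupeId"
    if score >= 0.65:
        return "verySimilarId"
    if score >= 0.4:
        return "similarId"
    return None  # pairs below 0.4 similarity are not recorded
```
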
* An additional form of deduplication was discovered, as discussed in [this PR](nestauk/nesta#337): what NiH tell you is a primary key isn't really a primary key. Instead we impute a primary key based on the core ID, as per the PR, and consolidate projects accordingly (a minimal sketch follows).
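
A minimal sketch of that imputation (a hypothetical helper; `base_core_project_num` and `application_id` are field names taken from this commit's schema and ORM):

```python
def impute_primary_key(row):
    """Hypothetical sketch: prefer the core project number as the
    stable key, falling back to the full application ID. The actual
    consolidation logic is in nestauk/nesta#337."""
    return row.get("base_core_project_num") or row["application_id"]
```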

* MeSH terms are no longer supported, as discussed [here](https://github.com/nestauk/nesta/pull/328#discussion_r512646286), [here](https://data-analytic-nesta.slack.com/archives/CK76G6NDD/p1603801230010600) and elsewhere.

* The curation of `json_funding_project` is dealt with more gracefully, and so the sub-schema (i.e. in the mapping) has been updated, e.g. from:

```json
{
    "year": 1988,
    "cost_ref": 3654295,
    "start_date": 2011-07-31,
    "end_date": 1988-04-01
}
```

to

```json
{
    "year": 1988,
    "total_cost": 3654295,
    "project_end": "2011-07-31",
    "project_start": "1988-04-01"
}
```
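
For illustration, a hypothetical helper capturing the renames implied by the two examples (note the examples suggest the old `start_date`/`end_date` values were also swapped, hence the crossover):

```python
def migrate_funding_record(old):
    """Hypothetical sketch of the sub-schema change shown above;
    the real curation happens upstream of the mapping."""
    return {
        "year": old["year"],
        "total_cost": old["cost_ref"],
        "project_end": str(old["start_date"]),    # swapped in the old format
        "project_start": str(old["end_date"]),    # swapped in the old format
    }
```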

* Cleaning steps implemented in [this PR](nestauk/nesta#327), sketched after this list:

    * Split `;`-separated `terms` into an array
    * CAPS --> Camel Case
    * Address the bad data-quality issue highlighted in nestauk/nesta#51
    * Check that Greek characters (etc) parse OK
    * Check that question marks (from bad unicode parsing) are fixed
    * `"NULL"`, `""`, `"N/A"`, `[]` --> `null`


102 changes: 102 additions & 0 deletions nesta/core/batchables/general/nih/sql2es/run.py
@@ -0,0 +1,102 @@
"""
run.py (general.nih)
--------------------
Transfer pre-curated NiH data from MySQL
to Elasticsearch.
"""

from ast import literal_eval
import boto3
import json
import logging
import os
from datetime import datetime as dt

from nesta.core.luigihacks.elasticsearchplus import ElasticsearchPlus
from nesta.core.luigihacks.luigi_logging import set_log_level
from nesta.core.orms.orm_utils import db_session, get_mysql_engine
from nesta.core.orms.orm_utils import object_to_dict
from nesta.core.orms.general_orm import NihProject as Project


def datetime_to_date(row):
    """Strip the null time component from datetime fields and
    return them as plain dates.

    Args:
        row (dict): Row object optionally containing
            'project_start' and 'project_end' dict keys
    Returns:
        row (dict): Modified row, with the null time component
            stripped from the datetime fields.
    """
    for key in ['project_start', 'project_end']:
        if row[key] is None:
            continue
        date = dt.strptime(row[key], '%Y-%m-%dT00:00:00')
        row[key] = dt.strftime(date, '%Y-%m-%d')
    return row


def reformat_row(row):
    """Reformat MySQL data ready for parsing to Elasticsearch"""
    # Apply datetime --> date conversion on the row itself
    row = datetime_to_date(row)
    # Also apply datetime --> date conversion on subfields in
    # yearly_funds (datetime_to_date mutates each dict in place)
    for _row in row['yearly_funds']:
        datetime_to_date(_row)
    return row


def run():
    """The 'main' function"""
    # Extract env vars for this task
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']
    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # Database setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)

    # Elasticsearch setup: strans_kwargs points at the tier-1 schema
    # transformation (nih.json, updated in this commit), and no_commit
    # disables writes when running under AWSBATCHTEST
    logging.info('Connecting to ES')
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs={'filename': 'nih.json'},
                           null_empty_str=True,
                           do_sort=False)

    # Collect the batch file of project IDs from S3
    logging.info('Retrieving project IDs')
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    proj_ids = json.loads(obj.get()['Body'].read())
    logging.info(f"{len(proj_ids)} project IDs retrieved from s3")

    # Iterate over projects
    logging.info('Processing rows')
    with db_session(engine) as sess:
        _filter = Project.application_id.in_(proj_ids)
        query = sess.query(Project).filter(_filter)
        for obj in query.all():
            row = object_to_dict(obj)
            row = reformat_row(row)
            es.index(index=es_index, doc_type=es_type,
                     id=row.pop('application_id'), body=row)
    logging.info("Batch job complete.")


if __name__ == "__main__":
    set_log_level()
    logging.info('Starting...')
    run()
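
For reference, `run()` reads all of its parameters from `BATCHPAR_*` environment variables; a hypothetical local setup might look like the sketch below (all values illustrative, not taken from the production configuration):

```python
import os

# Illustrative values only: in production these are injected by the
# batch orchestration rather than set by hand.
os.environ.update({
    "BATCHPAR_test": "True",
    "BATCHPAR_bucket": "example-bucket",      # hypothetical S3 bucket
    "BATCHPAR_batch_file": "batch_ids.json",  # hypothetical S3 key
    "BATCHPAR_db_name": "dev",                # hypothetical database name
    "BATCHPAR_outinfo": "localhost",          # hypothetical ES host
    "BATCHPAR_out_port": "9200",
    "BATCHPAR_out_index": "nih_v0",           # index name from the changelog
    "BATCHPAR_out_type": "_doc",              # hypothetical doc type
    "BATCHPAR_entity_type": "paper",          # entity_type from nih.json
    "BATCHPAR_aws_auth_region": "eu-west-2",
})
```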
43 changes: 43 additions & 0 deletions nesta/core/batchables/general/nih/sql2es/tests/test_nih.py
@@ -0,0 +1,43 @@
import pytest
from unittest import mock
from nesta.core.batchables.general.nih.sql2es import run

def test_datetime_to_date_complete():
    row = run.datetime_to_date({'project_start': '2020-01-01T00:00:00',
                                'project_end': '2021-02-03T00:00:00'})
    assert row == {'project_start': '2020-01-01',
                   'project_end': '2021-02-03'}


def test_datetime_to_date_incomplete():
    # Null dates are allowed; keys other than project_start
    # and project_end are ignored
    row = run.datetime_to_date({'project_start': '2020-01-01T00:00:00',
                                'project_end': None,
                                'other': '2020-01-01T00:00:00'})
    assert row == {'project_start': '2020-01-01',
                   'project_end': None,
                   'other': '2020-01-01T00:00:00'}


def test_datetime_to_date_bad_date():
    with pytest.raises(ValueError):
        # A time value of 12:00:00 doesn't match the expected 00:00:00
        run.datetime_to_date({'project_start': '2020-01-01T12:00:00',
                              'project_end': None})


@mock.patch('nesta.core.batchables.general.nih.sql2es.run.datetime_to_date')
def test_reformat_row(mocked_dt2date):
    mocked_dt2date.side_effect = lambda x: x
    run.reformat_row({
        'project_start': '2020-01-01T00:00:00',
        'other': '2021-02-03T00:00:00',
        'yearly_funds': [
            {'project_start': '2020-01-01T00:00:00',
             'project_end': '2021-02-03T00:00:00'},
            {'project_start': '2020-01-01T00:00:00',
             'project_end': '2021-02-03T00:00:00'}
        ]
    })
    # Called once on the row itself and once per yearly_funds entry
    assert mocked_dt2date.call_count == 3
Binary file modified nesta/core/config/elasticsearch.yaml
4 changes: 3 additions & 1 deletion nesta/core/routines/projects/general/general_root.py
@@ -11,6 +11,7 @@
from nesta.core.luigihacks.misctools import find_filepath_from_pathstub as f3p

from nesta.core.orms.general_orm import CrunchbaseOrg # Already curated
from nesta.core.orms.general_orm import NihProject # Already curated
from nesta.core.orms.gtr_orm import Projects as GtrProject # Curated on ingestion
from nesta.core.orms.arxiv_orm import Article as ArxivArticle # Curated on ingestion
from nesta.core.orms.patstat_orm import ApplnFamilyEU as PatstatFamily # Curated on ingestion
@@ -26,7 +27,8 @@
'arxiv': ('article', ArxivArticle.id),
'companies': ('company', CrunchbaseOrg.id),
'patstat': ('patent', PatstatFamily.docdb_family_id), # <--- takes one week
'cordis': ('project', CordisProject.rcn)}
'cordis': ('project', CordisProject.rcn),
'nih': ('project', NihProject.application_id)}

def kwarg_maker(dataset, routine_id):
    env_files = list(f3p(f) for f in ENV_FILES) + [f3p(f'tier_1/datasets/{dataset}.json')]
34 changes: 19 additions & 15 deletions nesta/core/schemas/tier_1/datasets/nih.json
@@ -1,35 +1,39 @@
{
    "entity_type": "paper",
    "tier0_to_tier1": {
        "_booleanFlag_autotranslated_entity": "booleanFlag_autotranslated_entity",
        "_rank_rhodonite_abstract": "rank_rhodonite_abstract",
        "_terms_iso2lang_entity": "terms_iso2lang_entity",
        "_terms_of_countryTags": "terms_of_countryTags",
        "_terms_of_funders": "terms_of_funders",
        "_total_cost_usd2018": "_cost_usd2018_project",
        "abstract_text": "textBody_abstract_project",
        "city": "placeName_city_organisation",
        "continent": "id_of_continent",
        "base_core_project_num": "id_of_project",
        "clinicaltrial_ids": "terms_id_clinicalTrial",
        "clinicaltrial_titles": "terms_title_clinicalTrial",
        "continent_iso2": "id_of_continent",
        "coordinates": "coordinate_of_organisation",
        "country": "placeName_country_organisation",
        "country_alpha_2": "id_iso2_country",
        "country_alpha_3": "id_iso3_country",
        "country_numeric": "id_isoNumeric_country",
        "duplicate_abstract": "booleanFlag_duplicate_abstract",
        "full_project_num": "id_of_project",
        "country_iso2": "id_iso2_country",
        "country_mentions": "terms_of_countryTags",
        "currency": "currency_total_cost",
        "fairly_similar_ids": "terms_similarId_project",
        "fy": "year_fiscal_funding",
        "grouped_ids": "terms_exactDupeId_project",
        "grouped_titles": "terms_exactDupeTitle_project",
        "ic_name": "title_of_funders",
        "mesh_terms": "terms_mesh_abstract",
        "near_duplicate_ids": "terms_nearDupeId_project",
        "org_city": "placeName_city_organisation",
        "org_country": "placeName_country_organisation",
        "org_name": "title_of_organisation",
        "org_state": "id_state_organisation",
        "org_zipcode": "placeName_zipcode_organisation",
        "patent_ids": "terms_id_patent",
        "patent_titles": "terms_title_patent",
        "phr": "textBody_descriptive_project",
        "placeName_continent_organisation": "placeName_continent_organisation",
        "placeName_state_organisation": "placeName_state_organisation",
        "pmids": "terms_pubmedId_article",
        "project_end": "date_end_project",
        "project_start": "date_start_project",
        "project_terms": "terms_descriptive_project",
        "project_title": "title_of_project",
        "total_cost": "cost_total_project",
        "total_cost_currency": "currency_total_cost"
        "very_similar_ids": "terms_verySimilarId_project",
        "yearly_funds": "json_funding_project"
    }
}
