In [None]:
# Ingest country iso mapping data into Trino pipeline

In [3]:
from pcaf_env import *

ingest_table = 'pcaf_sovereign_map_country_iso'
models_table = 'pcaf_dbt_models'



using connect string: trino://MichaelTiemannOSC@trino-secure-odh-trino.apps.odh-cl2.apps.os-climate.org:443/osc_datacommons_dev
create schema if not exists osc_datacommons_dev.mdt_sandbox


In [4]:
# Show available schemas to ensure trino connection is set correctly
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
for row in schema_read.fetchall():
    print(row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('rmi',)
('sandbox',)
('wri_gppd',)


Load GDP file (updated sporadically from https://data.worldbank.org/indicator/NY.GDP.PCAP.PP.CD)

In [18]:
iso_country_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/GENERAL/PCAF_map_iso_country.csv')
iso_country_file.download_file(f'/tmp/PCAF_map_iso_country.csv')
df = pd.read_csv(f'/tmp/PCAF_map_iso_country.csv',sep=";",encoding='latin-1')
df = df.rename(columns=str.lower).convert_dtypes()
#df = df.drop(df[(df.ISO_CODE2=='CI')].index)
#df = df.drop(df[(df.ISO_CODE2=='KP')].index)
#df = df.drop(df[(df.ISO_CODE2=='LA')].index)

df


Unnamed: 0,iso_code2,iso_code3,country_name
0,AD,AND,Andorra
1,AE,ARE,United Arab Emirates
2,AE,ARE,United Arab Em
3,AF,AFG,Afghanistan
4,AG,ATG,Antigua and Barbuda
...,...,...,...
242,WS,WSM,Samoa
243,YE,YEM,Yemen
244,ZA,ZAF,South Africa
245,ZM,ZMB,Zambia


In [23]:
df = df.groupby(['iso_code2', 'iso_code3'], as_index=False).agg({'country_name': lambda s: max(s, key=len)})
df

Unnamed: 0,iso_code2,iso_code3,country_name
0,AD,AND,Andorra
1,AE,ARE,United Arab Emirates
2,AF,AFG,Afghanistan
3,AG,ATG,Antigua and Barbuda
4,AL,ALB,Albania
...,...,...,...
190,WS,WSM,Samoa
191,YE,YEM,Yemen
192,ZA,ZAF,South Africa
193,ZM,ZMB,Zambia


In [24]:
create_trino_table_and_dbt_metadata(ingest_table, df, ['iso_code2'], custom_meta_content='', custom_meta_fields='', verbose=True)

drop table if exists mdt_sandbox.pcaf_sovereign_map_country_iso_source
enforcing dataframe partition column order

verifying existence of table osc_datacommons_dev.mdt_sandbox.pcaf_sovereign_map_country_iso_source
create table if not exists osc_datacommons_dev.mdt_sandbox.pcaf_sovereign_map_country_iso_source (
    iso_code3 varchar,
    country_name varchar,
    iso_code2 varchar
) with (
    format = 'parquet',
    partitioning = array['iso_code2']
)

staging dataframe parquet to s3 osc-datacommons-s3-bucket-dev02
/tmp/ingest_temp_25c787d9/iso_code2=AD/144c8fb7ab7a428baa578ff7b5f637b0-0.parquet  -->  trino/ingest/ingest_temp_25c787d9/iso_code2=AD/144c8fb7ab7a428baa578ff7b5f637b0-0.parquet
/tmp/ingest_temp_25c787d9/iso_code2=AE/144c8fb7ab7a428baa578ff7b5f637b0-0.parquet  -->  trino/ingest/ingest_temp_25c787d9/iso_code2=AE/144c8fb7ab7a428baa578ff7b5f637b0-0.parquet
/tmp/ingest_temp_25c787d9/iso_code2=AG/144c8fb7ab7a428baa578ff7b5f637b0-0.parquet  -->  trino/ingest/ingest_temp_25c787d9/

In [25]:
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}_source
"""
pd.read_sql(sql, engine)


Unnamed: 0,iso_code3,country_name,iso_code2
0,CAF,Central African Republic,CF
1,ZWE,Zimbabwe,ZW
2,AZE,Azerbaijan,AZ
3,UGA,Uganda,UG
4,ITA,Italy,IT
...,...,...,...
190,PER,Peru,PE
191,BLR,Belarus,BY
192,MDG,Madagascar,MG
193,GEO,Georgia,GE


In [16]:
osc._do_sql(f"insert into {ingest_schema}.{models_table} values(\'{json.dumps(dbt_dict['models'])}\')", engine, verbose=True)

insert into mdt_sandbox.pcaf_dbt_models values('{"sf_wdi_gdp": {"description": "\"\\nThis indicator provides per capita values for gross domestic product (GDP) expressed in current international dollars converted by purchasing power parity (PPP) conversion factor. \\n\\nGDP is the sum of gross value added by all resident producers in the country plus any product taxes and minus any subsidies not included in the value of the products. conversion factor is a spatial price deflator and currency converter that controls for price level differences between countries. Total population is a mid-year population based on the de facto definition of population, which counts all residents regardless of legal status or citizenship.\"", "columns": {"rec_source": {"description": "\"API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv and API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv\""}, "data_provider": {"description": "\"WDI\"", "tags": ["WDI"]}, "country_iso_code": {"description": "\"ISO-3166 Country Code (alpha_3)\"", 

[(1,)]