In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib as mpl
import cartopy.crs as ccrs
import cartopy

import geopandas
import cartopy.io.shapereader as shpreader
import shapely.vectorized

import seaborn as sns

In [2]:
ipcc_regions = geopandas.read_file("../data/referenceRegions.dbf")
ipcc_regions.head()

Unnamed: 0,NAME,LAB,USAGE,geometry
0,Alaska/N.W. Canada [ALA:1],ALA,land,"POLYGON ((-105.00000 60.00000, -168.00000 60.0..."
1,Amazon [AMZ:7],AMZ,land,"POLYGON ((-66.40000 -20.00000, -79.70000 -1.20..."
2,Central America/Mexico [CAM:6],CAM,land,"POLYGON ((-68.80000 11.40000, -79.70000 -1.200..."
3,small islands regions Caribbean,CAR*,all,"POLYGON ((-68.80000 11.40000, -85.80000 25.000..."
4,Central Asia [CAS:20],CAS,land,"POLYGON ((60.00000 30.00000, 60.00000 50.00000..."


In [3]:
place_gridcells = pd.read_csv("../data/study_gridcell_all_2.5.csv")
place_gridcells.head()

Unnamed: 0,ndf_id,doc_id
0,7574.0,486888.0
1,7574.0,3323227.0
2,7574.0,1358994.0
3,7574.0,2356292.0
4,7574.0,1552852.0


In [4]:
degrees = 2.5
tdf = pd.read_csv(f'../data/study_da_6 - Temperature - upper_pred_{degrees}.csv')
tdf["da_var"] = "Temperature"
pdf = pd.read_csv(f'../data/study_da_6 - Precipitation - upper_pred_{degrees}.csv')
pdf["da_var"] = "Precipitation"

dadf = pd.concat([tdf,pdf])

dadf = dadf[pd.notna(dadf["gridcells"]) & dadf["gridcells"]>0]

dadf["da_trend_p"] = dadf["da_trend_cells"] / dadf["gridcells"]

dadf["da_trend_cat"] = None

dadf.loc[dadf['da_trend_p']==0,"da_trend_cat"] = "0==DA"
dadf.loc[dadf['da_data_cells']==0,"da_trend_cat"] = "NA"
dadf.loc[dadf['da_trend_p']>0,"da_trend_cat"] = "0<DA<0.5"
dadf.loc[dadf['da_trend_p']>0.5,"da_trend_cat"] = "DA>0.5"

places =  pd.read_csv('../data/place_df.csv')

In [5]:
tdf['temperature_da'] = tdf['da_trend_cells']
pdf['precip_da'] = tdf['da_trend_cells']

dadf = tdf[['id','temperature_da']].merge(pdf[['id','precip_da']], how="outer").fillna(0)

dadf['da'] = np.max(dadf[['temperature_da','precip_da']],axis=1)
dadf = dadf[['id','da']]

dadf.head()

Unnamed: 0,id,da
0,13201,0.0
1,1544528,1.0
2,1287688,0.0
3,245321,0.0
4,1549132,0.0


In [12]:
dadf.da.unique()

array([  0.,   1.,  10.,   2.,  67.,   3.,  12.,  13., 168.,  81.,   8.,
        33., 524.,   4., 570.,  17.,   5.,  25.,  15.,   7.,   9.,   6.,
        45., 206., 268.,  11.,  27.,  19.,  14.,  80., 392.,  18.,  34.,
        24.,  61.,  54.,  30., 108.,  47., 213.,  52.,  28.,  48.])

In [6]:
dadf.columns


Index(['id', 'da'], dtype='object')

In [7]:
cat_df = pd.read_csv('../data/1_predicted_category_documents.csv')
predictions = pd.read_csv('../data/1_document_relevance.csv')
df = dadf.merge(cat_df.merge(predictions), how="outer")

pred_cats = [c for c  in df.columns if "12 - " in c and " - mean_prediction" in c]
for c in pred_cats:
    print(df[c].sum())
    label = c.replace(" - mean_prediction",'')
    cs = [c, c.replace('mean_prediction','lower_pred'), c.replace('mean_prediction','upper_pred')]
    df.loc[df[label]==1,cs] = 1
    print(df[c].sum())
print(pred_cats)

pred_cats = [
    "12 - Terrestrial ES - mean_prediction",
    "12 - Coastal and marine Ecosystems - mean_prediction",
    "12 - Mountains, snow and ice - mean_prediction",
    "12 - Rivers, lakes, and soil moisture - mean_prediction",
    "12 - Human and managed - mean_prediction",
    "12 - Total"
]

pcols = [
    '0 - relevance - mean_prediction',
    '0 - relevance - lower_pred',
    '0 - relevance - upper_pred'
]

df.loc[df['relevant']==1,pcols]=1



12291.100000000002
12464.100000000002
10176.9
10457.9
4585.8
4744.8
11683.5
11976.5
29838.4
30281.4
['12 - Coastal and marine Ecosystems - mean_prediction', '12 - Human and managed - mean_prediction', '12 - Mountains, snow and ice - mean_prediction', '12 - Rivers, lakes, and soil moisture - mean_prediction', '12 - Terrestrial ES - mean_prediction']


In [8]:
ndf = pd.read_csv("../data/gridcell_studies_all_2.5.csv")

ndf.loc[ndf['LON']>180,"LON"]-=360
ndf['ipccreg'] = 0

#df = pd.DataFrame(columns=['IPCC region', 'Documents'])
index = pd.Index(ipcc_regions.NAME, name="IPCC Region")

table = pd.DataFrame(columns=['Documents'], index=index)

for i, row in ipcc_regions.iterrows():
    inplace = shapely.vectorized.contains(row.geometry,ndf['LON'],ndf['LAT'])
    idx = np.argwhere(inplace==True)
    
    ndf.loc[idx[:,0],"ipccreg"] = i+1
    dids = place_gridcells[place_gridcells['ndf_id'].isin(idx)]['doc_id'].unique()
    
    mid = df[(df['id'].isin(dids)) & (df["0 - relevance - mean_prediction"]>=0.5) ].shape[0]
    low = df[(df['id'].isin(dids)) & (df["0 - relevance - lower_pred"]>=0.5) ].shape[0]
    high = df[(df['id'].isin(dids)) & (df["0 - relevance - upper_pred"]>=0.5) ].shape[0]
    
    table.loc[row.NAME,"Documents"] = f"{mid} ({low}-{high})"
    
    break



In [9]:
ndf = pd.read_csv("../data/gridcell_studies_all_2.5.csv")

ndf.loc[ndf['LON']>180,"LON"]-=360
ndf['ipccreg'] = 0


index = pd.MultiIndex.from_product([ipcc_regions.NAME, [x.split(' - ')[1] for x in pred_cats]], names=['IPCC Region', 'Impact'])

table = pd.DataFrame(columns=['Documents'], index=index)

for i, row in ipcc_regions.iterrows():
    inplace = shapely.vectorized.contains(row.geometry,ndf['LON'],ndf['LAT'])
    idx = np.argwhere(inplace==True)
    
    ndf.loc[idx[:,0],"ipccreg"] = i+1
    dids = place_gridcells[place_gridcells['ndf_id'].isin(idx)]['doc_id'].unique()
    
    for j, pc in enumerate(pred_cats):
        
        if "Total" in pc:
            mid = df[(df['id'].isin(dids)) & (df["0 - relevance - mean_prediction"]>=0.5) ].shape[0]
            low = df[(df['id'].isin(dids)) & (df["0 - relevance - lower_pred"]>=0.5) ].shape[0]
            high = df[(df['id'].isin(dids)) & (df["0 - relevance - upper_pred"]>=0.5) ].shape[0]            
        else:
            mid = df[
                (df['id'].isin(dids)) & 
                (df["0 - relevance - mean_prediction"]>=0.5) &
                (df[pc] >= 0.5)
            ].shape[0]
            low = df[
                (df['id'].isin(dids)) & 
                (df["0 - relevance - lower_pred"]>=0.5) &
                (df[pc.replace('mean_prediction','lower_pred')]>=0.5)
            ].shape[0]
            high = df[
                (df['id'].isin(dids)) & 
                (df["0 - relevance - upper_pred"]>=0.5) &
                (df[pc.replace('mean_prediction','upper_pred')]>=0.5)
            ].shape[0]
    
        table.loc[(row.NAME,pc.split(' - ')[1]),"Documents"] = f"{mid} ({low}-{high})"

print(table.shape)
table.head(10)

(198, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Documents
IPCC Region,Impact,Unnamed: 2_level_1
Alaska/N.W. Canada [ALA:1],Terrestrial ES,1629 (1295-1942)
Alaska/N.W. Canada [ALA:1],Coastal and marine Ecosystems,364 (254-476)
Alaska/N.W. Canada [ALA:1],"Mountains, snow and ice",512 (403-609)
Alaska/N.W. Canada [ALA:1],"Rivers, lakes, and soil moisture",441 (317-551)
Alaska/N.W. Canada [ALA:1],Human and managed,183 (121-253)
Alaska/N.W. Canada [ALA:1],Total,3961 (3363-4534)
Amazon [AMZ:7],Terrestrial ES,52 (36-63)
Amazon [AMZ:7],Coastal and marine Ecosystems,313 (227-391)
Amazon [AMZ:7],"Mountains, snow and ice",30 (23-33)
Amazon [AMZ:7],"Rivers, lakes, and soil moisture",69 (37-101)


In [10]:
ndf = pd.read_csv("../data/gridcell_studies_all_2.5.csv")

ndf.loc[ndf['LON']>180,"LON"]-=360
ndf['ipccreg'] = 0

#df = pd.DataFrame(columns=['IPCC region', 'Documents'])
index = pd.Index(ipcc_regions.NAME, name="IPCC Region")

table = pd.DataFrame(columns=['D&A Trend', "nonD&A Trend", "NAD&A Trend", "Sensitivity","Detection"], index=index)

for i, row in ipcc_regions.iterrows():
    inplace = shapely.vectorized.contains(row.geometry,ndf['LON'],ndf['LAT'])
    idx = np.argwhere(inplace==True)
    
    ndf.loc[idx[:,0],"ipccreg"] = i+1
    dids = place_gridcells[place_gridcells['ndf_id'].isin(idx)]['doc_id'].unique()
    
    midids = df[(df['id'].isin(dids)) & (df["0 - relevance - mean_prediction"]>=0.5) ]['id']
    lowids = df[(df['id'].isin(dids)) & (df["0 - relevance - lower_pred"]>=0.5) ]['id']
    highids = df[(df['id'].isin(dids)) & (df["0 - relevance - upper_pred"]>=0.5) ]['id']
    
    #D&A Trend
    sub_dadf = dadf[dadf['da_trend_cat']=='DA>0.5']
    mid = sub_dadf[sub_dadf['id'].isin(midids)].shape[0]
    low = sub_dadf[sub_dadf['id'].isin(lowids)].shape[0]
    high = sub_dadf[sub_dadf['id'].isin(highids)].shape[0]
    
    table.loc[row.NAME,"D&A Trend"] = f"{mid} ({low}-{high})"
    
    #NO D&A Trend
    sub_dadf = dadf[dadf['da_trend_cat']=='0==DA']
    mid = sub_dadf[sub_dadf['id'].isin(midids)].shape[0]
    low = sub_dadf[sub_dadf['id'].isin(lowids)].shape[0]
    high = sub_dadf[sub_dadf['id'].isin(highids)].shape[0]
    
    table.loc[row.NAME,"nonD&A Trend"] = f"{mid} ({low}-{high})"
    
    #NO D&A Trend
    sub_dadf = dadf[dadf['da_trend_cat']=='NA']
    mid = sub_dadf[sub_dadf['id'].isin(midids)].shape[0]
    low = sub_dadf[sub_dadf['id'].isin(lowids)].shape[0]
    high = sub_dadf[sub_dadf['id'].isin(highids)].shape[0]
    
    table.loc[row.NAME,"NAD&A Trend"] = f"{mid} ({low}-{high})"
    
    #NO D&A Trend
    sub_dadf = dadf[dadf['da_trend_cat']=='NA']
    mid = sub_dadf[sub_dadf['id'].isin(midids)].shape[0]
    low = sub_dadf[sub_dadf['id'].isin(lowids)].shape[0]
    high = sub_dadf[sub_dadf['id'].isin(highids)].shape[0]
    
    table.loc[row.NAME,"NAD&A Trend"] = f"{mid} ({low}-{high})"
    
    #break
    
table.head(10)




KeyError: 'da_trend_cat'

In [None]:
dadf.da_trend_cat.unique()

In [None]:
ndf = pd.read_csv("../data/gridcell_studies_all_2.5.csv")
ndf[ndf['index']==7574].head()

In [None]:
place_gridcells = pd.read_csv("../data/study_gridcell_all_2.5.csv")
place_gridcells.head()

In [None]:
ipcc_regions

In [None]:
extra_cat_df = pd.read_csv('../data/1_predicted_category_documents_specific.csv')


In [None]:
merged_df = df.merge(extra_cat_df)

merged_df.head()

In [None]:
specific_impact_cats = [x for x in merged_df.columns if "18 -" in x and "mean" in x]

specific_impact_cats = [
    '18 - Food/Agriculture - mean_prediction',
    '18 - Livelihoods and wellbeing - mean_prediction',
    '18 - Health - mean_prediction',
    '18 - Displacement and migration - mean_prediction',
]

for c in specific_impact_cats:
    print(merged_df[c].sum())
    label = c.replace(" - mean_prediction",'')
    cs = [c, c.replace('mean_prediction','lower_pred'), c.replace('mean_prediction','upper_pred')]
    merged_df.loc[merged_df[label]==1,cs] = 1
    print(merged_df[c].sum())
print(pred_cats)

specific_impact_cats

In [None]:
from pycountry_convert import country_name_to_country_alpha3

In [None]:
country_dict = []
for x in """Algeria, Angola, Benin, Botswana, Burkina Faso, Burundi,  Cameroon, Chad, Congo, Ivory Coast, Djibouti, Egypt, Eritrea, Eswatini, Swaziland, Ethiopia, Gabon, Gambia, Ghana, Guinea, Kenya, Lesotho, Liberia, Libya, Malawi, Mali, Mauritania, Morocco, Mozambique, Namibia, Niger, Nigeria, Rwanda, Senegal, Sierra Leone, Somalia, Sudan, Tanzania, Togo, Tunisia, Uganda, Zambia, Zimbabwe""".split(', '):
    country_dict.append({"country": country_name_to_country_alpha3(x.strip()), "region": "Africa"})
    
for x in """Afghanistan, Armenia, Azerbaijan, Bahrain, Bangladesh, Bhutan, Brunei, Cambodia, China, Cyprus, Georgia, India, Indonesia, Iran, Iraq, Israel, Japan, Jordan, Kazakhstan, Kuwait, Kyrgyzstan, Laos, Lebanon, Malaysia, Mongolia, Myanmar, Nepal, South Korea, North Korea, Oman, Pakistan, Palestine, Philippines, Qatar, Russia, Saudi Arabia, Singapore, Sri Lanka, Syria, Taiwan, Tajikistan, Thailand, Turkey, Turkmenistan, United Arab Emirates, Uzbekistan, Vietnam, Yemen
""".split(', '):
    country_dict.append({"country": country_name_to_country_alpha3(x.strip()), "region": "Asia"})
    
for x in """Australia,  New Zealand""".split(', '):
    country_dict.append({"country": country_name_to_country_alpha3(x.strip()), "region": "Australasia"})
    
for x in """Belize, Costa Rica, El Salvador, Guatemala, Honduras, Nicaragua, Panama, Argentina, Bolivia, Brazil, Chile, Colombia, Ecuador, French Guiana, Guyana, Paraguay, Peru, Suriname, Uruguay, Venezuela
""".split(', '):
    country_dict.append({"country": country_name_to_country_alpha3(x.strip()), "region": "Central and South America"})
    
for x in """United States, Canada, Mexico, Greenland
""".split(', '):
    country_dict.append({"country": country_name_to_country_alpha3(x.strip()), "region": "North America"})
    
for x in """Albania, Andorra, Armenia, Austria, Azerbaijan, Belarus, Belgium, Bosnia and Herzegovina, Bulgaria, Croatia, Cyprus, Czech Republic, Denmark, Estonia, Finland, France, Georgia, Germany, Greece, Hungary, Iceland, Ireland, Italy, Kazakhstan, Kosovo, Latvia, Liechtenstein, Lithuania, Luxembourg, Malta, Moldova, Monaco, Montenegro, Netherlands, Macedonia, Norway, Poland, Portugal, Romania, Russia, San Marino, Serbia, Slovakia, Slovenia, Spain, Sweden, Switzerland, Turkey, Ukraine, United Kingdom, Vatican City
""".split(', '):
    try:
        country_dict.append({"country": country_name_to_country_alpha3(x.strip()), "region": "Europe"})
    except:
        if x.strip()=="Kosovo":
            country_dict.append({"country": "XKX", "region": "Europe"})
        elif x.strip()=="Vatican City":
            country_dict.append({"country": "VAT", "region": "Europe"})
    
for x in """Anguilla, Aruba, Antigua and Barbuda, Bahamas, Bahrain, Barbados, Bermuda, British Virgin Islands, Cayman Islands, Northern Mariana Islands, Belize, Comoros, Cuba, Dominica, Grenada, Guyana, Haiti, Jamaica, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, Cabo Verde, Curaçao, Comoros, Guinea, Maldives, Mauritius, São Tomé and Príncipe, Seychelles, Singapore, Cook Islands, Fiji, Kiribati, Marshall Islands, Micronesia, Nauru, Niue, Palau, Samoa, Solomon Islands, Seychelles, East Timor, Tonga, Tuvalu, Vanuatu, French Polynesia, Guadeloupe, Guam, Martinique, Montserrat, New Caledonia, Puerto Rico, Saint Martin, Turks and Caicos, U.S. Virgin Islands, Guinea-Bissau, Cabo Verde, Comoros, Madagascar, Mauritius, Sao Tome and Principe, Seychelles
""".split(', '):
    try:
        country_dict.append({"country": country_name_to_country_alpha3(x.strip()), "region": "Small Island States"})
    except:
        if x.strip()=="U.S. Virgin Islands":
            country_dict.append({"country": "VIR", "region": "Small Island States"})


In [None]:
region_df = pd.DataFrame.from_dict(country_dict)
regions = region_df.region.unique()
region_df.head()

In [None]:
places = pd.read_csv('../data/place_df.csv')

In [None]:
merged_df.head()

In [None]:
index = pd.MultiIndex.from_product([specific_impact_cats, ["Partially attributed","Not attributed"]], names=['IPCC Region', 'Impact'])

table = pd.DataFrame(columns=regions, index=index)

for region in regions:
    countries = region_df.loc[region_df['region']==region,"country"]
    place_ids = places.loc[places['country_predicted'].isin(countries),"doc_id"]
    for impact in specific_impact_cats:
        sub_df = merged_df.loc[
            (merged_df['id'].isin(place_ids)) & 
            (merged_df[impact]>0.5)
        ]
        table.loc[(impact,"Partially attributed"),region] = sub_df[sub_df["da"]>0].shape[0]
        table.loc[(impact,"Not attributed"),region] = sub_df.shape[0] - sub_df[sub_df["da"]>0].shape[0]

                                    
table.head(10)
table.to_excel('../data/human_regions.xlsx')

In [None]:
places