In [35]:
import pandas as pd
import numpy as np
import os
import sys

# Absolute path of the directory above the current working directory, and of
# the directory above that one.
parentdir = os.path.abspath('..')
grandparentdir = os.path.dirname(parentdir)

# Put both at the front of the import search path; grandparentdir ends up
# first and parentdir second.
sys.path[:0] = [grandparentdir, parentdir]

# Land cover: read the FAOSTAT export stored under row_data/Land_cover_data.
dir_data_landcover = parentdir + "/row_data/Land_cover_data/"
datalc = pd.read_csv(dir_data_landcover + "FAOSTAT_data_7-11-2022.csv")

# FAO area name -> name used in the rest of the pipeline.
# NOTE(review): the "C?te d'Ivoire" and 'T?rkiye' keys contain a literal "?";
# presumably that is how the accented names look once this CSV is read with
# the default encoding -- confirm against the file.
replace_dict_lc = {
    'Bolivia (Plurinational State of)': 'Bolivia',
    "C?te d'Ivoire": "Cote d'Ivoire",
    'China, Hong Kong SAR': 'Hong Kong SAR, China',
    'Republic of Korea': 'Korea',
    'Democratic Republic of the Congo': 'Congo, Dem. Rep.',
    'Congo': 'Congo, Rep.',
    'Gambia': 'Gambia, The',
    'Iran (Islamic Republic of)': 'Iran',
    "Democratic People's Republic of Korea": "Korea, Dem. People's Rep.",
    'Kyrgyzstan': 'Kyrgyz Republic',
    "Lao People's Democratic Republic": 'Lao PDR',
    'China, Macao SAR': 'Macao SAR, China',
    'Micronesia (Federated States of)': 'Micronesia, Fed. Sts.',
    'Slovakia': 'Slovak Republic',
    'Saint Kitts and Nevis': 'St. Kitts and Nevis',
    'Saint Lucia': 'St. Lucia',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'United Republic of Tanzania': 'Tanzania',
    'T?rkiye': 'Turkey',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'United States of America': 'United States',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
    'United States Virgin Islands': 'Virgin Islands (U.S.)',
    'Yemen': 'Yemen, Rep.',
}

# Every cell that is exactly equal to a key is replaced, whatever its column.
datalc = datalc.replace(replace_dict_lc)

# Only the rows whose Element is "Area from CCI_LC" are processed.
datalc = datalc.loc[datalc['Element'] == "Area from CCI_LC"]

# Land use: read the FAOSTAT export stored under row_data/Land_use_data.
dir_data_landuse = parentdir + "/row_data/Land_use_data/"
datalu = pd.read_csv(dir_data_landuse + "/FAOSTAT_data_7-11-2022.csv")

# FAO area name -> name used in the rest of the pipeline.
replace_dict_lu = {
    'Bolivia (Plurinational State of)': 'Bolivia',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'China, Hong Kong SAR': 'Hong Kong SAR, China',
    'Republic of Korea': 'Korea',
    'Democratic Republic of the Congo': 'Congo, Dem. Rep.',
    'Congo': 'Congo, Rep.',
    'Gambia': 'Gambia, The',
    'Iran (Islamic Republic of)': 'Iran',
    "Democratic People's Republic of Korea": "Korea, Dem. People's Rep.",
    'Kyrgyzstan': 'Kyrgyz Republic',
    "Lao People's Democratic Republic": 'Lao PDR',
    'China, Macao SAR': 'Macao SAR, China',
    'Micronesia (Federated States of)': 'Micronesia, Fed. Sts.',
    'Slovakia': 'Slovak Republic',
    'Saint Kitts and Nevis': 'St. Kitts and Nevis',
    'Saint Lucia': 'St. Lucia',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'United Republic of Tanzania': 'Tanzania',
    'Türkiye': 'Turkey',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'United States of America': 'United States',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
    'United States Virgin Islands': 'Virgin Islands (U.S.)',
    'Yemen': 'Yemen, Rep.',
    'Saint-Martin (French part)': 'St. Martin (French part)',
}

# Every cell that is exactly equal to a key is replaced, whatever its column.
datalu = datalu.replace(replace_dict_lu)

# Keep the two forest items and spread them into one column each, with one
# row per (Area, Year).
forest_items = ["Naturally regenerating forest", "Planted Forest"]
datalu = datalu.loc[datalu['Item'].isin(forest_items)]
datalu = datalu.pivot_table(
    index=["Area", "Year"], columns='Item', values='Value'
).reset_index()

# Share of each forest type in the sum of both. A missing planted-forest
# value counts as 0; a missing naturally-regenerating value is left as is.
planted_area = datalu['Planted Forest'].fillna(0)
natural_area = datalu['Naturally regenerating forest']
forest_total = planted_area + natural_area
datalu['Planted.Forest'] = planted_area / forest_total
datalu['Naturally.regenerating.forest'] = natural_area / forest_total

# Countries of interest, read relative to the current working directory.
countries = pd.read_csv("../row_data/Countries_ISO3.csv")
nations_of_interest = countries['Nation'].unique()

# Restrict land cover and land use to the areas listed in the Nation column.
datalc = datalc.loc[datalc['Area'].isin(nations_of_interest)]
datalu = datalu.loc[datalu['Area'].isin(nations_of_interest)]


# Attach the classification of every land-cover item, read from the
# items_classification.csv file stored next to the land-cover data.
items = pd.read_csv(dir_data_landcover + "items_classification.csv")
datalc = pd.merge(datalc, items, on="Item")

# Drop the rows whose "sisepuede_item" is empty. pandas loads blank CSV cells
# as NaN by default, so comparing against "" alone never removes them; test
# for missing values as well as for empty strings.
has_category = datalc["sisepuede_item"].notna() & (datalc["sisepuede_item"] != "")
datalc = datalc[has_category]

# Sum "Value" per Area, Year, sisepuede_item and Unit, then expose the
# category under the column name "Item".
data = (
    datalc.groupby(["Area", "Year", "sisepuede_item", "Unit"])
    .agg({"Value": "sum"})
    .reset_index()
    .rename(columns={'sisepuede_item': 'Item'})
)


# Rows of the general forest category, joined with the forest-type shares
# from the land-use table on Area and Year.
datafor = data[data["Item"] == "frac_lndu_initial_forests"].copy()
datafor = pd.merge(datafor, datalu, on=["Area", "Year"])

# Share columns that are no longer needed once the values are scaled.
forest_share_columns = ["Naturally.regenerating.forest", "Planted.Forest"]

# Value scaled by the naturally-regenerating share, labelled as primary.
datafor_1 = datafor.assign(
    Value=datafor["Value"] * datafor["Naturally.regenerating.forest"],
    Item="frac_lndu_initial_forests_primary",
).drop(columns=forest_share_columns)

# Value scaled by the planted share, labelled as secondary.
datafor_2 = datafor.assign(
    Value=datafor["Value"] * datafor["Planted.Forest"],
    Item="frac_lndu_initial_forests_secondary",
).drop(columns=forest_share_columns)

# Replace the general forest category with the two new tables.
data = pd.concat(
    [data[data["Item"] != "frac_lndu_initial_forests"], datafor_1, datafor_2],
    ignore_index=True,
)

# Cropland rows of the land-cover table.
dataAgg = data[data['Item'] == 'frac_lndu_initial_croplands'].copy()

# Harvested-area export from FAOSTAT; this file is read as ISO-8859-1.
dir_data_agg = parentdir + "/row_data/harvested_fao_data/"
data_agg = pd.read_csv(
    dir_data_agg + "FAOSTAT_data_7-11-2022.csv", encoding="ISO-8859-1"
)

# FAO area name -> name used in the rest of the pipeline.
replace_dict_lu = {
    'Bolivia (Plurinational State of)': 'Bolivia',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'China, Hong Kong SAR': 'Hong Kong SAR, China',
    'Republic of Korea': 'Korea',
    'Democratic Republic of the Congo': 'Congo, Dem. Rep.',
    'Congo': 'Congo, Rep.',
    'Gambia': 'Gambia, The',
    'Iran (Islamic Republic of)': 'Iran',
    "Democratic People's Republic of Korea": "Korea, Dem. People's Rep.",
    'Kyrgyzstan': 'Kyrgyz Republic',
    "Lao People's Democratic Republic": 'Lao PDR',
    'China, Macao SAR': 'Macao SAR, China',
    'Micronesia (Federated States of)': 'Micronesia, Fed. Sts.',
    'Slovakia': 'Slovak Republic',
    'Saint Kitts and Nevis': 'St. Kitts and Nevis',
    'Saint Lucia': 'St. Lucia',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'United Republic of Tanzania': 'Tanzania',
    'Türkiye': 'Turkey',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'United States of America': 'United States',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
    'United States Virgin Islands': 'Virgin Islands (U.S.)',
    'Yemen': 'Yemen, Rep.',
    'Saint-Martin (French part)': 'St. Martin (French part)',
    'Republic of Moldova': 'Moldova',
}

# Every cell that is exactly equal to a key is replaced, whatever its column.
data_agg = data_agg.replace(replace_dict_lu)
data_agg["Item"] = data_agg["Item"].str.replace("Maté", "Mate")

# Crop crosswalk fetched from GitHub; its columns are renamed by position.
cw = pd.read_csv("https://raw.githubusercontent.com/egobiernoytp/lac_decarbonization/main/ref/data_crosswalks/fao_crop_categories.csv")
cw.columns = ["Item", "cat_1", "sisepuede_item", "super_cat"]

# Join on the columns both tables share, then sum Value per crosswalk category.
data_agg = data_agg.merge(cw)
data_agg = data_agg.groupby(
    ['Area', 'Year', 'sisepuede_item', 'Unit'], as_index=False
)['Value'].sum()

# Total over all categories for each Area and Year.
data_totals_agg = (
    data_agg.groupby(["Area", "Year"])
    .agg({"Value": "sum"})
    .rename(columns={"Value": "total_crop_area"})
    .reset_index()
)

# Use the totals computed from the harvested-area data. aggTotals is the same
# object as data_totals_agg, so the rescaling below is visible through both names.
aggTotals = data_totals_agg

# Divide by 1,000 to have the same units as the land-cover values.
# NOTE(review): presumably converts ha to 1000 ha -- confirm the Unit column
# of the harvested-area file.
aggTotals['total_crop_area'] = aggTotals['total_crop_area'] / 1000

# Attach the harvested total to each cropland row.
dataAgg = pd.merge(dataAgg, aggTotals, on=["Area", "Year"])

# Positive part of (land-cover cropland - harvested total); 0 otherwise,
# including when the difference is missing.
cropland_excess = dataAgg['Value'] - dataAgg['total_crop_area']
dataCorrections = dataAgg[['Area', 'Year']].copy()
dataCorrections['Diff'] = np.where(cropland_excess > 0, cropland_excess, 0)

# The cropland value becomes the harvested total.
dataAgg = dataAgg.assign(Value=dataAgg['total_crop_area']).drop(
    columns=['total_crop_area']
)

# Swap the original cropland rows for the modified ones.
data = pd.concat(
    [data[data["Item"] != "frac_lndu_initial_croplands"], dataAgg],
    ignore_index=True,
)

# Bring the correction of each Area and Year next to every row, then add
# 0.33 of it to the "frac_lndu_initial_other" rows only.
data = pd.merge(data, dataCorrections, on=["Area", "Year"])
other_increment = np.where(
    data['Item'].isin(['frac_lndu_initial_other']), 0.33 * data['Diff'], 0
)
data['Value'] = data['Value'] + other_increment
data = data.drop(columns=['Diff'])

# Total Value per Area and Year, used as the denominator of the fractions.
data_totals = (
    data.groupby(["Area", "Year"], as_index=False)["Value"]
    .sum()
    .rename(columns={"Value": "total_area"})
)

# Attach the total to every row and express each Value as a fraction of it.
data = data.merge(data_totals, on=["Area", "Year"])
data["percent"] = data["Value"].div(data["total_area"])

data


Unnamed: 0,Area,Year,Item,Unit,Value,Naturally regenerating forest,Planted Forest,total_area,percent
0,Afghanistan,2011,frac_lndu_initial_forests_mangroves,1000 ha,0.000000,,,62558.297380,0.000000
1,Afghanistan,2011,frac_lndu_initial_grasslands,1000 ha,27921.014900,,,62558.297380,0.446320
2,Afghanistan,2011,frac_lndu_initial_other,1000 ha,30115.308580,,,62558.297380,0.481396
3,Afghanistan,2011,frac_lndu_initial_settlements,1000 ha,49.083500,,,62558.297380,0.000785
4,Afghanistan,2011,frac_lndu_initial_wetlands,1000 ha,63.933300,,,62558.297380,0.001022
...,...,...,...,...,...,...,...,...,...
13731,Zimbabwe,2019,frac_lndu_initial_settlements,1000 ha,55.814700,,,31024.068311,0.001799
13732,Zimbabwe,2019,frac_lndu_initial_wetlands,1000 ha,447.401500,,,31024.068311,0.014421
13733,Zimbabwe,2019,frac_lndu_initial_forests_primary,1000 ha,11577.979153,17382.65,108.0,31024.068311,0.373193
13734,Zimbabwe,2019,frac_lndu_initial_forests_secondary,1000 ha,71.935047,17382.65,108.0,31024.068311,0.002319


In [None]:
# Export one historical and one projected CSV per Item.
#
# Countries that lack a row for any (Item, Year) combination are reported and
# removed first. The projected series repeats, for every year after the last
# historical year up to 2050, the value of that last historical year.
nations = data['Area'].unique()
years_hist = data['Year'].unique()

# Membership is tested against the set of (Area, Year) pairs present for each
# Item instead of catching the error raised by indexing an empty selection,
# so unrelated errors are no longer swallowed.
incomplete_data_vect = []
for sisepuede_var in data['Item'].unique():
    sub_df = data[data['Item'] == sisepuede_var]
    available = set(zip(sub_df['Area'], sub_df['Year']))
    for c in nations:
        for y in years_hist:
            if (c, y) not in available:
                incomplete_data_vect.append(c)
                print('incomplete data for:', c, y, sisepuede_var)

data = data[~data['Area'].isin(set(incomplete_data_vect))]

nations = data['Area'].unique()
years_hist = data['Year'].unique()
last_year_hist = max(years_hist)
years_proj = range(last_year_hist + 1, 2051)

# ISO3 code of each nation; when a nation is listed more than once the first
# row wins, which is the row a positional [0] lookup would return.
iso3_by_nation = (
    countries.drop_duplicates('Nation').set_index('Nation')['iso_code3'].to_dict()
)

for sisepuede_var in data['Item'].unique():
    sub_df = data[data['Item'] == sisepuede_var]

    # percent per (Area, Year); setdefault keeps the first row of a pair.
    percent_by_key = {}
    for area, year, share in zip(sub_df['Area'], sub_df['Year'], sub_df['percent']):
        percent_by_key.setdefault((area, year), share)

    data_vect = []
    for c in nations:
        iso3 = iso3_by_nation[c]
        for y in years_hist:
            if (c, y) in percent_by_key:
                data_vect.append([c, iso3, y, percent_by_key[(c, y)]])
            else:
                print(c, y, sisepuede_var)
    df_historical = pd.DataFrame(data_vect, columns=['Nation', 'iso_code3', 'Year', sisepuede_var])
    df_historical.to_csv(grandparentdir+'/'+sisepuede_var+'/input_to_sisepuede/historical/'+sisepuede_var+'.csv')

    data_vect_proj = []
    for c in nations:
        iso3 = iso3_by_nation[c]
        # The value carried forward does not depend on the projected year,
        # so it is looked up once per nation.
        percent = percent_by_key[(c, last_year_hist)]
        data_vect_proj.extend([c, iso3, y, percent] for y in years_proj)

    df_projected = pd.DataFrame(data_vect_proj, columns=['Nation', 'iso_code3', 'Year', sisepuede_var])
    df_projected.to_csv(grandparentdir+'/'+sisepuede_var+'/input_to_sisepuede/projected/'+sisepuede_var+'.csv')

# Counted from the retained nations, so the summary also works when no Item
# was exported and df_projected was never created.
N_nations = len(nations)
print(f'Done for {N_nations} countries')
print('Missing values in:')
print(sorted(set(incomplete_data_vect)))