### Data preprocessing script for initial livestock data
- Author: Juan Antonio Robledo Lara
- Objective: This notebook allows the user to test the data preprocessing script for initial livestock data so it's easier to create new updates if necessary.
- Notes: Refer to the main script `initial_lvst_processing.py` to run a new update of the livestock data.

In [17]:
import pandas as pd
import os

In [18]:
# Folder with the raw inputs read below (FAOSTAT_data.csv, m49-countries.json,
# items_classification.csv). The path is relative, so it is resolved against the
# directory the notebook is run from.
raw_data_path = '../AFOLU/pop_lvst_data_raw'

In [19]:
# Read the FAOSTAT export from the raw-data folder and preview its first rows.
faostat_csv_path = os.path.join(raw_data_path, 'FAOSTAT_data.csv')
lvst_df = pd.read_csv(faostat_csv_path)
lvst_df.head()

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code (CPC),Item,Year Code,Year,Unit,Value,Flag,Flag Description,Note
0,QCL,Crops and livestock products,4,Afghanistan,5111,Stocks,2132.0,Asses,2011,2011,An,1466000,A,Official figure,
1,QCL,Crops and livestock products,4,Afghanistan,5111,Stocks,2132.0,Asses,2012,2012,An,1423000,A,Official figure,
2,QCL,Crops and livestock products,4,Afghanistan,5111,Stocks,2132.0,Asses,2013,2013,An,1451000,A,Official figure,
3,QCL,Crops and livestock products,4,Afghanistan,5111,Stocks,2132.0,Asses,2014,2014,An,1441000,A,Official figure,
4,QCL,Crops and livestock products,4,Afghanistan,5111,Stocks,2132.0,Asses,2015,2015,An,1481000,A,Official figure,


In [20]:
# Keep only the columns used downstream and expose 'Area' under the name 'Nation'.
relevant_columns = ['Area', 'Year', 'Item', 'Value']
lvst_relevant_df = lvst_df[relevant_columns].rename(columns={'Area': 'Nation'})
lvst_relevant_df.head()

Unnamed: 0,Nation,Year,Item,Value
0,Afghanistan,2011,Asses,1466000
1,Afghanistan,2012,Asses,1423000
2,Afghanistan,2013,Asses,1451000
3,Afghanistan,2014,Asses,1441000
4,Afghanistan,2015,Asses,1481000


In [21]:
# Load the m49 JSON lookup, which carries the ISO3 code next to each country name.
m49_json_path = os.path.join(raw_data_path, 'm49-countries.json')
df_json = pd.read_json(m49_json_path)
df_json.head()

Unnamed: 0,m49,ISO3,country_name_en,country_name_es,country_name_ar,country_name_ru,country_name_fr,country_name_zh
0,533,ABW,Aruba,Aruba,أَروبا,Аруба,Aruba,阿鲁巴岛
1,4,AFG,Afghanistan,Afganistán,أفغانستان,Афганистан,Afghanistan,阿富汗
2,24,AGO,Angola,Angola,أنغولا,Ангола,Angola,安哥拉
3,660,AIA,Anguilla,Anguila,أنغويلا,Ангилья,Anguilla,安圭拉
4,248,ALA,Åland Islands,Islas Åland,جزر ألاند,Аландские острова,Îles d'Åland,奥兰群岛


In [22]:
# Create a new column with the ISO3 country codes.
# The join key is the English country name, so any FAOSTAT area whose spelling
# differs from `country_name_en` comes out with a null ISO3.
lvst_merged_df = lvst_relevant_df.merge(df_json, how='left', left_on='Nation', right_on='country_name_en')

# A left join keeps one output row per livestock record only while every country
# name is unique in the lookup; a repeated name would silently duplicate records
# (and therefore animal counts), so fail loudly instead.
assert len(lvst_merged_df) == len(lvst_relevant_df), (
    'Merging with the m49 lookup changed the number of rows: '
    'check m49-countries.json for repeated country_name_en values'
)

lvst_merged_df = lvst_merged_df[['ISO3', 'Nation', 'Year', 'Item', 'Value']]
lvst_merged_df = lvst_merged_df.rename(columns={'ISO3': 'iso_code3'})

# Display the result
lvst_merged_df.head()

Unnamed: 0,iso_code3,Nation,Year,Item,Value
0,AFG,Afghanistan,2011,Asses,1466000
1,AFG,Afghanistan,2012,Asses,1423000
2,AFG,Afghanistan,2013,Asses,1451000
3,AFG,Afghanistan,2014,Asses,1441000
4,AFG,Afghanistan,2015,Asses,1481000


In [23]:
# Checking if there are missing iso_code3 values: a "Non-Null Count" for iso_code3
# lower than the total number of entries means some nations found no ISO3 match.
lvst_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21419 entries, 0 to 21418
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   iso_code3  20681 non-null  object
 1   Nation     21419 non-null  object
 2   Year       21419 non-null  int64 
 3   Item       21419 non-null  object
 4   Value      21419 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 836.8+ KB


In [24]:
# List the nations that were left without an ISO3 code by the merge.
missing_iso_mask = lvst_merged_df['iso_code3'].isnull()
iso_is_null_df = lvst_merged_df[missing_iso_mask]
iso_is_null_df['Nation'].unique()

array(['China, mainland', 'China, Taiwan Province of',
       'Netherlands (Kingdom of the)', 'Sudan (former)', 'Türkiye',
       'United Kingdom of Great Britain and Northern Ireland'],
      dtype=object)

In [25]:
# function to handle special cases

# FAOSTAT areas that are given no ISO3 code: the area name itself is used as the code.
NATIONS_KEPT_AS_CODE = ('China, mainland', 'China, Taiwan Province of', 'Sudan (former)')

# FAOSTAT area names that did not match the m49 lookup, with the ISO3 code to assign.
ISO3_BY_FAOSTAT_NAME = {
    'Netherlands (Kingdom of the)': 'NLD',
    'United Kingdom of Great Britain and Northern Ireland': 'GBR',
    'Türkiye': 'TUR',
}


def handle_special_cases(row):
    """Return the ``iso_code3`` value to use for one row of the merged frame.

    Parameters
    ----------
    row : mapping with the keys ``'Nation'`` and ``'iso_code3'``
        One record, as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The nation name itself for the areas in ``NATIONS_KEPT_AS_CODE``, the mapped
    code for the names in ``ISO3_BY_FAOSTAT_NAME``, and the row's existing
    ``iso_code3`` for every other nation.

    Notes
    -----
    The function never modifies ``row``. Mutating the object handed over by
    ``DataFrame.apply`` is unsupported in pandas and is not reliably written back
    to the frame, so any change to the ``Nation`` column must be done on the
    DataFrame itself.
    """
    nation = row['Nation']
    if nation in NATIONS_KEPT_AS_CODE:
        return nation  # Copy the Nation name to the iso_code3 column
    # Fall back to the existing ISO3 code if the nation is not a special case
    return ISO3_BY_FAOSTAT_NAME.get(nation, row['iso_code3'])

# Apply the function to the rows
lvst_merged_df['iso_code3'] = lvst_merged_df.apply(handle_special_cases, axis=1)

# Rename the Netherlands on the column itself: an assignment to `row` made inside
# `DataFrame.apply` is not reliably written back to the frame. This runs after the
# codes are assigned because the lookup above keys on the original FAOSTAT name.
lvst_merged_df['Nation'] = lvst_merged_df['Nation'].replace({'Netherlands (Kingdom of the)': 'Netherlands'})

In [26]:
# Report how many rows still lack an iso_code3 after the special cases were handled.
remaining_null_count = lvst_merged_df['iso_code3'].isnull().sum()
print(f'There are {remaining_null_count} null values')

There are 0 null values


In [27]:
# Sanity check on the frame, using Iran's swine / pigs as an example: every value should be 0.
is_iran = lvst_merged_df['iso_code3'] == 'IRN'
is_swine = lvst_merged_df['Item'] == 'Swine / pigs'
lvst_merged_df[is_iran & is_swine]

Unnamed: 0,iso_code3,Nation,Year,Item,Value
9393,IRN,Iran (Islamic Republic of),2011,Swine / pigs,0
9394,IRN,Iran (Islamic Republic of),2012,Swine / pigs,0
9395,IRN,Iran (Islamic Republic of),2013,Swine / pigs,0
9396,IRN,Iran (Islamic Republic of),2014,Swine / pigs,0
9397,IRN,Iran (Islamic Republic of),2015,Swine / pigs,0
9398,IRN,Iran (Islamic Republic of),2016,Swine / pigs,0
9399,IRN,Iran (Islamic Republic of),2017,Swine / pigs,0
9400,IRN,Iran (Islamic Republic of),2018,Swine / pigs,0
9401,IRN,Iran (Islamic Republic of),2019,Swine / pigs,0
9402,IRN,Iran (Islamic Republic of),2020,Swine / pigs,0


In [28]:
# Build a {FAO item -> SISEPUEDE file name} dictionary from items_classification.csv.
# It decides which livestock items are exported and how the inputs for sisepuede are named.
crosswalk_csv_path = os.path.join(raw_data_path, 'items_classification.csv')
crosswalk_df = pd.read_csv(crosswalk_csv_path)
crosswalk_df_dict = {
    fao_item: sisepuede_file
    for fao_item, sisepuede_file in zip(crosswalk_df['Item_Fao'], crosswalk_df['File_Sisepuede'])
}
crosswalk_df_dict

{'Buffalo': 'pop_lvst_initial_buffalo',
 'Cattle': 'pop_lvst_initial_cattle_nondairy',
 'Chickens': 'pop_lvst_initial_chickens',
 'Goats': 'pop_lvst_initial_goats',
 'Horses': 'pop_lvst_initial_horses',
 'Mules and hinnies': 'pop_lvst_initial_mules',
 'Swine / pigs': 'pop_lvst_initial_pigs',
 'Sheep': 'pop_lvst_initial_sheep'}

In [13]:
# Generate new csv files for each selected livestock item.
#
# For every FAO item in the crosswalk, two files are written under
# ../AFOLU/<sisepuede name>/input_to_sisepuede/ :
#   historical/<sisepuede name>.csv  the FAOSTAT series for that item
#   projected/<sisepuede name>.csv   the latest-year values repeated for every
#                                    later year up to PROJECTION_END_YEAR
PROJECTION_END_YEAR = 2050  # last projected year (inclusive)

for lvst_item, sisepuede_input_name in crosswalk_df_dict.items():

    # Generate historical input data.
    # `.copy()` makes the selection an independent frame, so the fill below edits
    # this selection rather than a slice of lvst_merged_df (SettingWithCopyWarning).
    historical_lvst_df = lvst_merged_df[lvst_merged_df.Item == lvst_item].copy()

    # Apply bfill and ffill within each country group (grouped by 'iso_code3') only if there are null values.
    # `transform` returns a Series aligned on the original row index; `groupby.apply`
    # may return a group-keyed index that does not line up when assigned back.
    if historical_lvst_df['Value'].isnull().any():
        historical_lvst_df['Value'] = (
            historical_lvst_df
            .groupby('iso_code3')['Value']
            .transform(lambda group: group.bfill().ffill())
        )

    # Rename and drop columns to match SISEPUEDE format
    historical_lvst_df = (
        historical_lvst_df
        .drop(columns=['Item'])
        .rename(columns={'Value': sisepuede_input_name})
    )

    # Save the new df in its directory, creating the folders when they do not exist yet.
    dir_path = f'../AFOLU/{sisepuede_input_name}/input_to_sisepuede'
    historical_dir = os.path.join(dir_path, 'historical')
    projected_dir = os.path.join(dir_path, 'projected')
    os.makedirs(historical_dir, exist_ok=True)
    os.makedirs(projected_dir, exist_ok=True)

    historical_lvst_df.to_csv(os.path.join(historical_dir, f'{sisepuede_input_name}.csv'), index=False)

    # Generate projected input data: take the rows of the latest year available for
    # this item and repeat them for each year after it.
    # NOTE(review): nations with no record in `max_year` are absent from the
    # projection, because only rows with Year == max_year are kept.
    max_year = historical_lvst_df.Year.max()
    projected_lvst_df = historical_lvst_df[historical_lvst_df.Year == max_year].drop(columns="Year")

    projected_years_df = pd.DataFrame({"Year": range(max_year + 1, PROJECTION_END_YEAR + 1)})
    projected_lvst_df = projected_lvst_df.merge(projected_years_df, how="cross")

    projected_lvst_df.to_csv(os.path.join(projected_dir, f'{sisepuede_input_name}.csv'), index=False)

    # Generate file for cattle non-dairy.
    # Original note: "Cattle,pop_lvst_initial_cattle_nondairy,I assume 50% of cattle in
    # FAO is nondairy. This just copies the cattle dairy to non-dairy. Perhaps we need
    # to change this to something else."
    # The crosswalk currently maps 'Cattle' straight to 'pop_lvst_initial_cattle_nondairy',
    # so the two files written above already are the non-dairy inputs. The extra copy is
    # only written when the crosswalk sends 'Cattle' to a different name.
    # NOTE(review): no 50% share is applied to the values anywhere in this cell, and no
    # cattle dairy file is produced with the current crosswalk - confirm this is intended.
    if lvst_item == 'Cattle' and sisepuede_input_name != 'pop_lvst_initial_cattle_nondairy':
        historical_lvst_df.to_csv(os.path.join(historical_dir, 'pop_lvst_initial_cattle_nondairy.csv'), index=False)
        projected_lvst_df.to_csv(os.path.join(projected_dir, 'pop_lvst_initial_cattle_nondairy.csv'), index=False)