### Data Preprocessing Notebook for Initial Livestock Data
- Author: Juan Antonio Robledo Lara
- Objective: This notebook allows the user to test the data preprocessing script for initial livestock data, so it's easier to create new updates if necessary.
- Notes: Refer to the main script `initial_lvst_preprocessing.py` to run a new update of the livestock data.

In [1]:
import pandas as pd
import os

In [2]:
# Folder that holds the raw livestock inputs (relative path).
raw_data_path = "../AFOLU/pop_lvst_raw_data"
# Name of the raw livestock CSV file inside that folder.
raw_data_file_name = "FAOSTAT_livestock_data.csv"

In [3]:
# Read the raw livestock CSV and preview its first rows.
lvst_raw_csv_path = os.path.join(raw_data_path, raw_data_file_name)
lvst_raw_df = pd.read_csv(lvst_raw_csv_path)
lvst_raw_df.head()

Unnamed: 0,Domain Code,Domain,Area Code (ISO3),Area,Element Code,Element,Item Code (CPC),Item,Year Code,Year,Unit,Value,Flag,Flag Description
0,QCL,Crops and livestock products,AFG,Afghanistan,5111,Stocks,2132.0,Asses,2011,2011,An,1466000,A,Official figure
1,QCL,Crops and livestock products,AFG,Afghanistan,5111,Stocks,2132.0,Asses,2012,2012,An,1423000,A,Official figure
2,QCL,Crops and livestock products,AFG,Afghanistan,5111,Stocks,2132.0,Asses,2013,2013,An,1451000,A,Official figure
3,QCL,Crops and livestock products,AFG,Afghanistan,5111,Stocks,2132.0,Asses,2014,2014,An,1441000,A,Official figure
4,QCL,Crops and livestock products,AFG,Afghanistan,5111,Stocks,2132.0,Asses,2015,2015,An,1481000,A,Official figure


In [4]:
# Keep only the columns used downstream and give the country columns
# their SISEPUEDE-style names.
lvst_df = (
    lvst_raw_df
    .loc[:, ['Area Code (ISO3)', 'Area', 'Year', 'Item', 'Value']]
    .rename(columns={'Area Code (ISO3)': 'iso_code3', 'Area': 'Nation'})
)
lvst_df.head()

Unnamed: 0,iso_code3,Nation,Year,Item,Value
0,AFG,Afghanistan,2011,Asses,1466000
1,AFG,Afghanistan,2012,Asses,1423000
2,AFG,Afghanistan,2013,Asses,1451000
3,AFG,Afghanistan,2014,Asses,1441000
4,AFG,Afghanistan,2015,Asses,1481000


In [5]:
# Check for missing values (in particular missing iso_code3 codes).
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line to the output.
lvst_df.info()
print(lvst_df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21419 entries, 0 to 21418
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   iso_code3  21419 non-null  object
 1   Nation     21419 non-null  object
 2   Year       21419 non-null  int64 
 3   Item       21419 non-null  object
 4   Value      21419 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 836.8+ KB
None
iso_code3    0
Nation       0
Year         0
Item         0
Value        0
dtype: int64


In [6]:
# Sanity check: the pig counts for Iran (IRN) are expected to be 0 in every year.
lvst_df.loc[(lvst_df["iso_code3"] == 'IRN') & (lvst_df["Item"] == 'Swine / pigs')]

Unnamed: 0,iso_code3,Nation,Year,Item,Value
9393,IRN,Iran (Islamic Republic of),2011,Swine / pigs,0
9394,IRN,Iran (Islamic Republic of),2012,Swine / pigs,0
9395,IRN,Iran (Islamic Republic of),2013,Swine / pigs,0
9396,IRN,Iran (Islamic Republic of),2014,Swine / pigs,0
9397,IRN,Iran (Islamic Republic of),2015,Swine / pigs,0
9398,IRN,Iran (Islamic Republic of),2016,Swine / pigs,0
9399,IRN,Iran (Islamic Republic of),2017,Swine / pigs,0
9400,IRN,Iran (Islamic Republic of),2018,Swine / pigs,0
9401,IRN,Iran (Islamic Republic of),2019,Swine / pigs,0
9402,IRN,Iran (Islamic Republic of),2020,Swine / pigs,0


In [7]:
# Build the FAO item -> SISEPUEDE file name crosswalk from items_classification.csv.
# It decides which livestock items are used, and under which name, to create
# the inputs for sisepuede.
crosswalk_path = os.path.join(raw_data_path, 'items_classification.csv')
cw_df = pd.read_csv(crosswalk_path)
cw_dict = {
    fao_item: sisepuede_file
    for fao_item, sisepuede_file in zip(cw_df['Item_Fao'], cw_df['File_Sisepuede'])
}
cw_dict

{'Buffalo': 'pop_lvst_initial_buffalo',
 'Cattle': 'pop_lvst_initial_cattle_nondairy',
 'Chickens': 'pop_lvst_initial_chickens',
 'Goats': 'pop_lvst_initial_goats',
 'Horses': 'pop_lvst_initial_horses',
 'Mules and hinnies': 'pop_lvst_initial_mules',
 'Swine / pigs': 'pop_lvst_initial_pigs',
 'Sheep': 'pop_lvst_initial_sheep'}

In [8]:
# Keep only the rows whose FAO item appears in the crosswalk file,
# then list the items that remain.
lvst_df = lvst_df.loc[lvst_df["Item"].isin(cw_df["Item_Fao"])]
lvst_df["Item"].unique()

array(['Cattle', 'Chickens', 'Goats', 'Horses', 'Mules and hinnies',
       'Sheep', 'Buffalo', 'Swine / pigs'], dtype=object)

In [9]:
# Create a new column with the sisepuede item names.
# lvst_df is the result of a boolean filter, so writing a column into it in place
# can raise SettingWithCopyWarning; assign() returns a new frame instead, which
# also keeps this cell safe to re-run.
# Every remaining Item is a key of cw_dict (the frame was filtered on the same
# crosswalk), so a plain dictionary lookup with map() is sufficient.
lvst_df = lvst_df.assign(sisepuede_item=lvst_df["Item"].map(cw_dict))
lvst_df.head()

Unnamed: 0,iso_code3,Nation,Year,Item,Value,sisepuede_item
24,AFG,Afghanistan,2011,Cattle,5524000,pop_lvst_initial_cattle_nondairy
25,AFG,Afghanistan,2012,Cattle,5244000,pop_lvst_initial_cattle_nondairy
26,AFG,Afghanistan,2013,Cattle,5235000,pop_lvst_initial_cattle_nondairy
27,AFG,Afghanistan,2014,Cattle,5349000,pop_lvst_initial_cattle_nondairy
28,AFG,Afghanistan,2015,Cattle,5261000,pop_lvst_initial_cattle_nondairy


In [10]:
# Reshape to wide format: one row per (iso_code3, Nation, Year) and one column
# per sisepuede item.
# pivot_table aggregates rows that share the same index/column combination with
# the mean (its default, spelled out below).
# An explicit groupby(...).mean() step was tried before and left disabled: it was
# reported as creating an error in the measurements and is not yet necessary.
# NOTE(review): if several FAO items ever map to the same sisepuede item, confirm
# that the mean is the intended aggregation.
lvst_pivot_df = (
    lvst_df
    .pivot_table(
        index=['iso_code3', 'Nation', 'Year'],
        columns='sisepuede_item',
        values='Value',
        aggfunc='mean',
    )
    .reset_index()
)
# The dairy cattle column is initialised as a copy of the non-dairy cattle column.
lvst_pivot_df["pop_lvst_initial_cattle_dairy"] = lvst_pivot_df["pop_lvst_initial_cattle_nondairy"].copy()
lvst_pivot_df.head()

sisepuede_item,iso_code3,Nation,Year,pop_lvst_initial_buffalo,pop_lvst_initial_cattle_nondairy,pop_lvst_initial_chickens,pop_lvst_initial_goats,pop_lvst_initial_horses,pop_lvst_initial_mules,pop_lvst_initial_pigs,pop_lvst_initial_sheep,pop_lvst_initial_cattle_dairy
0,AFG,Afghanistan,2011,,5524000.0,13378.0,7635000.0,181000.0,25000.0,,14262000.0,5524000.0
1,AFG,Afghanistan,2012,,5244000.0,13212.0,7311000.0,178000.0,24000.0,,13820000.0,5244000.0
2,AFG,Afghanistan,2013,,5235000.0,12053.0,7037000.0,171000.0,21000.0,,13141000.0,5235000.0
3,AFG,Afghanistan,2014,,5349000.0,11098.0,7059000.0,171000.0,24000.0,,13485000.0,5349000.0
4,AFG,Afghanistan,2015,,5261000.0,11863.0,7723000.0,173000.0,24500.0,,13218000.0,5261000.0


In [11]:
# Same sanity check after the pivot: the pig column for Iran (IRN), year by year.
lvst_pivot_df.loc[lvst_pivot_df["iso_code3"] == 'IRN', ['Year', 'pop_lvst_initial_pigs']]

sisepuede_item,Year,pop_lvst_initial_pigs
985,2011,0.0
986,2012,0.0
987,2013,0.0
988,2014,0.0
989,2015,0.0
990,2016,0.0
991,2017,0.0
992,2018,0.0
993,2019,0.0
994,2020,0.0


In [12]:
# Generate new csv files for each selected livestock item
sisepuede_var_names = [col for col in lvst_pivot_df.columns if "pop_lvst" in col]

for sisepuede_var_name in sisepuede_var_names:

    # Combinations missing after the pivot are written as 0. The fill is stored
    # back into lvst_pivot_df (as before) so later cells see the same values;
    # fillna is a no-op for columns without missing values, so no guard is needed.
    lvst_pivot_df[sisepuede_var_name] = lvst_pivot_df[sisepuede_var_name].fillna(0)

    # Save the new df in its directory.
    dir_path = f'../AFOLU/{sisepuede_var_name}/input_to_sisepuede'
    historical_file_path = os.path.join(dir_path, f"historical/{sisepuede_var_name}.csv")

    # to_csv does not create missing folders, so make sure the target exists
    # (needed the first time a new livestock variable is exported).
    os.makedirs(os.path.dirname(historical_file_path), exist_ok=True)

    lvst_pivot_df[["iso_code3", "Nation", "Year", sisepuede_var_name]].to_csv(
        historical_file_path, index=False
    )


In [13]:
# Generate projected input data
# The projection repeats the values of the last historical year for every year
# from max_year + 1 up to final_projection_year (inclusive).
final_projection_year = 2050
sise_vars = [i for i in lvst_pivot_df.columns if "pop_lvst" in i]

max_year = lvst_pivot_df.Year.max()

# NOTE(review): only rows whose Year equals the global max_year are kept, so a
# country without data for that year gets no projected rows — confirm this is intended.
projected_lvst_df = lvst_pivot_df[lvst_pivot_df.Year == max_year]

# Missing values are written as 0, matching the historical export. Filling here
# keeps this cell independent of whether the historical export cell already
# replaced them in lvst_pivot_df.
projected_lvst_df = (
    projected_lvst_df
    .drop(columns="Year")
    .fillna({sise_var: 0 for sise_var in sise_vars})
)

# Cross join: every kept row is repeated once per projected year.
projected_years_df = pd.DataFrame({"Year": range(max_year + 1, final_projection_year + 1)})
projected_lvst_df = projected_lvst_df.merge(projected_years_df, how="cross")


for sise_var in sise_vars:
    dir_path = f'../AFOLU/{sise_var}/input_to_sisepuede'
    projected_file_path = os.path.join(dir_path, f"projected/{sise_var}.csv")

    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(projected_file_path), exist_ok=True)

    projected_lvst_df[["iso_code3", "Nation", "Year", sise_var]].to_csv(
        projected_file_path, index=False
    )