In [29]:
import pandas as pd
import statsmodels.formula.api as smf
from scipy.stats.mstats import winsorize
import re
import numpy as np


In [3]:
df = pd.read_csv('../data/full_survey_data.csv')

  df = pd.read_csv('../data/full_survey_data.csv')


In [4]:
#  x ~ z
# creating the z variable and we only consider animals that potentially use crop residue as feed
# Step 1: Select relevant columns
name_cols = [f'livestock_name_{i}' for i in range(1, 6)]
died_cols = [f'livestock_died_{i}' for i in range(1, 6)]

# Step 2: Melt the livestock names and deaths
names_long = df.melt(id_vars='id_unique', value_vars=name_cols, 
                     var_name='animal_slot', value_name='livestock_name')

died_long = df.melt(id_vars='id_unique', value_vars=died_cols, 
                    var_name='animal_slot', value_name='livestock_died')

# Step 3: Make sure they align by extracting index number
names_long['slot'] = names_long['animal_slot'].str.extract(r'(\d+)').astype(int)
died_long['slot'] = died_long['animal_slot'].str.extract(r'(\d+)').astype(int)

# Step 4: Merge the two long DataFrames
long_df = pd.merge(
    names_long[['id_unique', 'slot', 'livestock_name']],
    died_long[['id_unique', 'slot', 'livestock_died']],
    on=['id_unique', 'slot']
)

# Optional: drop rows where animal name is missing
long_df = long_df.dropna(subset=['livestock_name']).reset_index(drop=True)

# Result: Each row is (id_unique, livestock_name, livestock_died)
print(long_df.head())

death_df = long_df[long_df.livestock_died.notna()]

         id_unique  slot livestock_name  livestock_died
0  bf_adn_2019_1_1     1          sheep             0.0
1  bf_adn_2019_2_1     1         cattle             0.0
2  bf_adn_2019_3_1     1           pigs             3.0
3  bf_adn_2019_4_1     1         cattle             0.0
4  bf_adn_2019_5_1     1         cattle             0.0


In [6]:
death_df 

Unnamed: 0,id_unique,slot,livestock_name,livestock_died
0,bf_adn_2019_1_1,1,sheep,0.0
1,bf_adn_2019_2_1,1,cattle,0.0
2,bf_adn_2019_3_1,1,pigs,3.0
3,bf_adn_2019_4_1,1,cattle,0.0
4,bf_adn_2019_5_1,1,cattle,0.0
...,...,...,...,...
81614,tz_glv_2017_522_1,5,pigs,0.0
81615,tz_glv_2017_562_1,5,donkeys_horses,0.0
81616,tz_glv_2017_889_1,5,chicken,3.0
81617,tz_glv_2017_892_1,5,rabbits,0.0


In [7]:
death_df.livestock_died.describe()

count    6.171200e+04
mean     7.112730e+01
std      1.610642e+04
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.000000e+00
max      4.000000e+06
Name: livestock_died, dtype: float64

In [8]:
# creating herd_size and renaming livestock head columns to be more succint
# ['cattle', 'chicken', 'goats', 'sheep', 'donkeys_horses', 'buffalo', 'pigs']
livestock_heads_cols = [
    'livestock_heads_sheep',
    'livestock_heads_cattle',
    'livestock_heads_pigs',
    'livestock_heads_goats',
    'livestock_heads_chicken',
    'livestock_heads_buffalo',
    'livestock_heads_donkeys_horses'
]

# ok sum of heads to get herd_size
herd_df = df[livestock_heads_cols + ['id_unique']].copy()


In [9]:
herd_df

Unnamed: 0,livestock_heads_sheep,livestock_heads_cattle,livestock_heads_pigs,livestock_heads_goats,livestock_heads_chicken,livestock_heads_buffalo,livestock_heads_donkeys_horses,id_unique
0,3.0,,,6.0,10.0,,,bf_adn_2019_1_1
1,3.0,2.0,,5.0,10.0,,,bf_adn_2019_2_1
2,,,1.0,,,,,bf_adn_2019_3_1
3,4.0,4.0,5.0,3.0,23.0,,,bf_adn_2019_4_1
4,4.0,2.0,,3.0,2.0,,,bf_adn_2019_5_1
...,...,...,...,...,...,...,...,...
54868,0.0,5.0,0.0,0.0,8.0,,0.0,zm_scn_2017_606_1
54869,0.0,12.0,0.0,4.0,0.0,,0.0,zm_scn_2017_607_1
54870,0.0,7.0,0.0,5.0,8.0,,0.0,zm_scn_2017_608_1
54871,0.0,1.0,0.0,0.0,0.0,,0.0,zm_scn_2017_609_1


In [10]:
herd_df = herd_df.rename(columns=lambda x: x.replace("livestock_heads_", "") if x.startswith("livestock_heads_") else x)
herd_df

Unnamed: 0,sheep,cattle,pigs,goats,chicken,buffalo,donkeys_horses,id_unique
0,3.0,,,6.0,10.0,,,bf_adn_2019_1_1
1,3.0,2.0,,5.0,10.0,,,bf_adn_2019_2_1
2,,,1.0,,,,,bf_adn_2019_3_1
3,4.0,4.0,5.0,3.0,23.0,,,bf_adn_2019_4_1
4,4.0,2.0,,3.0,2.0,,,bf_adn_2019_5_1
...,...,...,...,...,...,...,...,...
54868,0.0,5.0,0.0,0.0,8.0,,0.0,zm_scn_2017_606_1
54869,0.0,12.0,0.0,4.0,0.0,,0.0,zm_scn_2017_607_1
54870,0.0,7.0,0.0,5.0,8.0,,0.0,zm_scn_2017_608_1
54871,0.0,1.0,0.0,0.0,0.0,,0.0,zm_scn_2017_609_1


In [11]:
herd_df.describe()

Unnamed: 0,sheep,cattle,pigs,goats,chicken,buffalo,donkeys_horses
count,16808.0,25844.0,12974.0,21430.0,31787.0,1064.0,7057.0
mean,6.968646,6.315934,2.426083,7.173448,20.912165,1.808271,1.750744
std,23.690625,26.058973,11.312464,20.028181,297.970807,7.503332,1.540853
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,1.0,3.0,0.0,1.0
50%,3.0,2.0,0.0,3.0,9.0,1.0,1.0
75%,8.0,4.0,2.0,7.0,20.0,2.0,2.0
max,1822.0,1204.0,700.0,1000.0,50000.0,200.0,60.0


In [None]:
# Pivot to wide format
wide_death_df = (
    death_df.pivot_table(
        index="id_unique",                 # unique household ID
        columns="livestock_name",          # each animal becomes a column
        values="livestock_died",           # fill values with deaths
        aggfunc="sum"                      # if duplicates exist, sum them
    )
    .reset_index()
)

wide_death_df
# Flatten MultiIndex columns if needed
wide_df.columns.name = None


livestock_name,id_unique,alpaca,bees,buffalo,camel,cats,cattle,chicken,crocodile,dogs,...,guinea_pigs,llama,mice,otherpoultry,oxen,pigs,rabbits,rats,sheep,small_mammals
0,bf_adn_2019_10_1,,,,,,1.0,,,,...,,,,,,,,,2.0,
1,bf_adn_2019_11_1,,,,,,0.0,4.0,,,...,,,,,,,,,1.0,
2,bf_adn_2019_12_1,,,,,,0.0,1.0,,,...,,,,,,0.0,,,,
3,bf_adn_2019_13_1,,,,,,,3.0,,,...,,,,,,,,,,
4,bf_adn_2019_14_1,,,,,,0.0,7.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33362,zm_scn_2017_95_1,,,,,,0.0,,,,...,,,,,,,,,,
33363,zm_scn_2017_96_1,,,,,,1.0,,,,...,,,,,,,,,,
33364,zm_scn_2017_98_1,,,,,,0.0,,,,...,,,,,,,,,,
33365,zm_scn_2017_99_1,,,,,,1.0,,,,...,,,,,,,,,,


In [14]:

wide_death_df.columns.name = None
wide_death_df

Unnamed: 0,id_unique,alpaca,bees,buffalo,camel,cats,cattle,chicken,crocodile,dogs,...,guinea_pigs,llama,mice,otherpoultry,oxen,pigs,rabbits,rats,sheep,small_mammals
0,bf_adn_2019_10_1,,,,,,1.0,,,,...,,,,,,,,,2.0,
1,bf_adn_2019_11_1,,,,,,0.0,4.0,,,...,,,,,,,,,1.0,
2,bf_adn_2019_12_1,,,,,,0.0,1.0,,,...,,,,,,0.0,,,,
3,bf_adn_2019_13_1,,,,,,,3.0,,,...,,,,,,,,,,
4,bf_adn_2019_14_1,,,,,,0.0,7.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33362,zm_scn_2017_95_1,,,,,,0.0,,,,...,,,,,,,,,,
33363,zm_scn_2017_96_1,,,,,,1.0,,,,...,,,,,,,,,,
33364,zm_scn_2017_98_1,,,,,,0.0,,,,...,,,,,,,,,,
33365,zm_scn_2017_99_1,,,,,,1.0,,,,...,,,,,,,,,,


In [15]:
keep_cols = ["id_unique", "cattle", "chicken", "goats", "sheep", "donkeys_horses", "buffalo", "pigs"]

wide_death_df = wide_death_df[keep_cols]
wide_death_df

Unnamed: 0,id_unique,cattle,chicken,goats,sheep,donkeys_horses,buffalo,pigs
0,bf_adn_2019_10_1,1.0,,,2.0,,,
1,bf_adn_2019_11_1,0.0,4.0,0.0,1.0,,,
2,bf_adn_2019_12_1,0.0,1.0,,,,,0.0
3,bf_adn_2019_13_1,,3.0,0.0,,,,
4,bf_adn_2019_14_1,0.0,7.0,,,,,
...,...,...,...,...,...,...,...,...
33362,zm_scn_2017_95_1,0.0,,,,,,
33363,zm_scn_2017_96_1,1.0,,,,,,
33364,zm_scn_2017_98_1,0.0,,0.0,,,,
33365,zm_scn_2017_99_1,1.0,,,,,,


In [19]:
# Add '_deaths' to every animal column name except 'id_unique'
animal_cols = [col for col in wide_death_df.columns if col != 'id_unique']
wide_death_df = wide_death_df.rename(columns={col: f"{col}_deaths" for col in animal_cols})
wide_death_df.head()

Unnamed: 0,id_unique,cattle_size_deaths,chicken_size_deaths,goats_size_deaths,sheep_size_deaths,donkeys_horses_size_deaths,buffalo_size_deaths,pigs_size_deaths
0,bf_adn_2019_10_1,1.0,,,2.0,,,
1,bf_adn_2019_11_1,0.0,4.0,0.0,1.0,,,
2,bf_adn_2019_12_1,0.0,1.0,,,,,0.0
3,bf_adn_2019_13_1,,3.0,0.0,,,,
4,bf_adn_2019_14_1,0.0,7.0,,,,,


In [18]:
# Add '_size' to every animal column name except 'id_unique' in herd_df
animal_cols_herd = [col for col in herd_df.columns if col != 'id_unique']
herd_df = herd_df.rename(columns={col: f"{col}_size" for col in animal_cols_herd})
herd_df.head()

Unnamed: 0,sheep_size,cattle_size,pigs_size,goats_size,chicken_size,buffalo_size,donkeys_horses_size,id_unique
0,3.0,,,6.0,10.0,,,bf_adn_2019_1_1
1,3.0,2.0,,5.0,10.0,,,bf_adn_2019_2_1
2,,,1.0,,,,,bf_adn_2019_3_1
3,4.0,4.0,5.0,3.0,23.0,,,bf_adn_2019_4_1
4,4.0,2.0,,3.0,2.0,,,bf_adn_2019_5_1


In [20]:
merged_df = pd.merge(wide_death_df, herd_df, on='id_unique', how='inner')
merged_df.head()

Unnamed: 0,id_unique,cattle_size_deaths,chicken_size_deaths,goats_size_deaths,sheep_size_deaths,donkeys_horses_size_deaths,buffalo_size_deaths,pigs_size_deaths,sheep_size,cattle_size,pigs_size,goats_size,chicken_size,buffalo_size,donkeys_horses_size
0,bf_adn_2019_10_1,1.0,,,2.0,,,,5.0,4.0,,6.0,10.0,,
1,bf_adn_2019_11_1,0.0,4.0,0.0,1.0,,,,4.0,1.0,,1.0,4.0,,
2,bf_adn_2019_12_1,0.0,1.0,,,,,0.0,,1.0,2.0,,3.0,,
3,bf_adn_2019_13_1,,3.0,0.0,,,,,,,,2.0,6.0,,
4,bf_adn_2019_14_1,0.0,7.0,,,,,,,1.0,,,60.0,,


In [22]:
death_cols = [col for col in merged_df.columns if col.endswith('_size_deaths')]
rename_dict = {col: col.replace('_size_deaths', '_deaths') for col in death_cols}
merged_df = merged_df.rename(columns=rename_dict)
merged_df.head()

Unnamed: 0,id_unique,cattle_deaths,chicken_deaths,goats_deaths,sheep_deaths,donkeys_horses_deaths,buffalo_deaths,pigs_deaths,sheep_size,cattle_size,pigs_size,goats_size,chicken_size,buffalo_size,donkeys_horses_size
0,bf_adn_2019_10_1,1.0,,,2.0,,,,5.0,4.0,,6.0,10.0,,
1,bf_adn_2019_11_1,0.0,4.0,0.0,1.0,,,,4.0,1.0,,1.0,4.0,,
2,bf_adn_2019_12_1,0.0,1.0,,,,,0.0,,1.0,2.0,,3.0,,
3,bf_adn_2019_13_1,,3.0,0.0,,,,,,,,2.0,6.0,,
4,bf_adn_2019_14_1,0.0,7.0,,,,,,,1.0,,,60.0,,


In [23]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33367 entries, 0 to 33366
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id_unique              33367 non-null  object 
 1   cattle_deaths          19387 non-null  float64
 2   chicken_deaths         11927 non-null  float64
 3   goats_deaths           11015 non-null  float64
 4   sheep_deaths           7845 non-null   float64
 5   donkeys_horses_deaths  4189 non-null   float64
 6   buffalo_deaths         509 non-null    float64
 7   pigs_deaths            4437 non-null   float64
 8   sheep_size             9845 non-null   float64
 9   cattle_size            18427 non-null  float64
 10  pigs_size              6731 non-null   float64
 11  goats_size             13366 non-null  float64
 12  chicken_size           21368 non-null  float64
 13  buffalo_size           444 non-null    float64
 14  donkeys_horses_size    5727 non-null   float64
dtypes:

In [24]:
merged_df.describe()

Unnamed: 0,cattle_deaths,chicken_deaths,goats_deaths,sheep_deaths,donkeys_horses_deaths,buffalo_deaths,pigs_deaths,sheep_size,cattle_size,pigs_size,goats_size,chicken_size,buffalo_size,donkeys_horses_size
count,19387.0,11927.0,11015.0,7845.0,4189.0,509.0,4437.0,9845.0,18427.0,6731.0,13366.0,21368.0,444.0,5727.0
mean,211.2585,8.882452,5.07163,11.881963,6.026737,0.078585,1.200361,9.720569,7.360341,3.811172,8.733054,22.201703,2.790541,1.800943
std,28730.3,75.803785,381.144741,753.120721,313.550646,0.373472,5.303861,30.150712,30.42386,12.615593,24.454992,112.619663,3.264668,1.608193
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,5.0,1.0,1.0
50%,0.0,2.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,2.0,4.0,10.0,2.0,1.0
75%,0.0,8.0,2.0,2.0,0.0,0.0,1.0,10.0,4.0,4.0,8.0,20.0,3.0,2.0
max,4000000.0,7096.0,40000.0,65000.0,20000.0,4.0,240.0,1822.0,1204.0,600.0,1000.0,10000.0,40.0,60.0


In [25]:
#winsorize chicken_size, donkeys_horses_deaths, sheep_deaths, goats_deaths, chicken_deaths, cattle_deaths
test_df = merged_df.copy()

cols_to_winsorize = [
    "chicken_size",
    "donkeys_horses_deaths",
    "sheep_deaths",
    "goats_deaths",
    "chicken_deaths",
    "cattle_deaths"
]

for col in cols_to_winsorize:
    if col in test_df.columns:
        test_df[col] = winsorize(test_df[col], limits=[0, 0.05])



In [26]:
test_df.describe()

  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(


Unnamed: 0,cattle_deaths,chicken_deaths,goats_deaths,sheep_deaths,donkeys_horses_deaths,buffalo_deaths,pigs_deaths,sheep_size,cattle_size,pigs_size,goats_size,chicken_size,buffalo_size,donkeys_horses_size
count,19387.0,11927.0,11015.0,7845.0,4189.0,509.0,4437.0,9845.0,18427.0,6731.0,13366.0,21368.0,444.0,5727.0
mean,211.2585,8.882452,5.07163,11.881963,6.026737,0.078585,1.200361,9.720569,7.360341,3.811172,8.733054,22.201703,2.790541,1.800943
std,28730.3,75.803785,381.144741,753.120721,313.550646,0.373472,5.303861,30.150712,30.42386,12.615593,24.454992,112.619663,3.264668,1.608193
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,5.0,1.0,1.0
50%,0.0,2.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,2.0,4.0,10.0,2.0,1.0
75%,0.0,8.0,2.0,2.0,0.0,0.0,1.0,10.0,4.0,4.0,8.0,20.0,3.0,2.0
max,4000000.0,7096.0,40000.0,65000.0,20000.0,4.0,240.0,1822.0,1204.0,600.0,1000.0,10000.0,40.0,60.0


Unnamed: 0,id_unique,cattle_deaths,chicken_deaths,goats_deaths,sheep_deaths,donkeys_horses_deaths,buffalo_deaths,pigs_deaths,sheep_size,cattle_size,pigs_size,goats_size,chicken_size,buffalo_size,donkeys_horses_size
0,bf_adn_2019_10_1,1.0,,,2.0,,,,5.0,4.0,,6.0,10.0,,
1,bf_adn_2019_11_1,0.0,4.0,0.0,1.0,,,,4.0,1.0,,1.0,4.0,,
2,bf_adn_2019_12_1,0.0,1.0,,,,,0.0,,1.0,2.0,,3.0,,
3,bf_adn_2019_13_1,,3.0,0.0,,,,,,,,2.0,6.0,,
4,bf_adn_2019_14_1,0.0,7.0,,,,,,,1.0,,,60.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33362,zm_scn_2017_95_1,0.0,,,,,,,0.0,6.0,0.0,0.0,0.0,,0.0
33363,zm_scn_2017_96_1,1.0,,,,,,,0.0,5.0,0.0,0.0,18.0,,0.0
33364,zm_scn_2017_98_1,0.0,,0.0,,,,,0.0,6.0,0.0,9.0,13.0,,0.0
33365,zm_scn_2017_99_1,1.0,,,,,,,0.0,12.0,0.0,0.0,6.0,,0.0


In [50]:

test_df = merged_df.copy()

cols_to_winsorize = [
    "chicken_size",
    "donkeys_horses_deaths",
    "sheep_deaths",
    "goats_deaths",
    "chicken_deaths",
    "cattle_deaths"
]

for col in cols_to_winsorize:
    if col in test_df.columns:
        # Copy column
        series = test_df[col].copy()
        mask = series.notna()

        # Pick limit depending on column name
        if "_deaths" in col:
            limit = [0, 0.0008]   # cap top 0.08%
        else:
            limit = [0, 0.05]   # cap top 5%

        # Winsorize non-missing values
        winsorized = winsorize(series[mask].to_numpy(), limits=limit)

        # Assign back
        test_df.loc[mask, col] = winsorized

In [51]:
test_df.describe()

Unnamed: 0,cattle_deaths,chicken_deaths,goats_deaths,sheep_deaths,donkeys_horses_deaths,buffalo_deaths,pigs_deaths,sheep_size,cattle_size,pigs_size,goats_size,chicken_size,buffalo_size,donkeys_horses_size
count,19387.0,11927.0,11015.0,7845.0,4189.0,509.0,4437.0,9845.0,18427.0,6731.0,13366.0,21368.0,444.0,5727.0
mean,0.81663,7.848998,1.411711,1.677757,0.139651,0.078585,1.200361,9.720569,7.360341,3.811172,8.733054,14.745601,2.790541,1.800943
std,5.07181,22.395267,4.136537,3.774788,0.625643,0.373472,5.303861,30.150712,30.42386,12.615593,24.454992,14.232939,3.264668,1.608193
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,5.0,1.0,1.0
50%,0.0,2.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,2.0,4.0,10.0,2.0,1.0
75%,0.0,8.0,2.0,2.0,0.0,0.0,1.0,10.0,4.0,4.0,8.0,20.0,3.0,2.0
max,114.0,430.0,80.0,50.0,13.0,4.0,240.0,1822.0,1204.0,600.0,1000.0,53.0,40.0,60.0


In [52]:
#  Year
test_df = test_df.merge(
    df[["id_unique", "year"]],
    on="id_unique",
    how="left"
)

# country
test_df = test_df.merge(
    df[["id_unique", "country"]],
    on="id_unique",
    how="left"
)

test_df.head()

Unnamed: 0,id_unique,cattle_deaths,chicken_deaths,goats_deaths,sheep_deaths,donkeys_horses_deaths,buffalo_deaths,pigs_deaths,sheep_size,cattle_size,pigs_size,goats_size,chicken_size,buffalo_size,donkeys_horses_size,year,country
0,bf_adn_2019_10_1,1.0,,,2.0,,,,5.0,4.0,,6.0,10.0,,,2019,burkina_faso
1,bf_adn_2019_11_1,0.0,4.0,0.0,1.0,,,,4.0,1.0,,1.0,4.0,,,2019,burkina_faso
2,bf_adn_2019_12_1,0.0,1.0,,,,,0.0,,1.0,2.0,,3.0,,,2019,burkina_faso
3,bf_adn_2019_13_1,,3.0,0.0,,,,,,,,2.0,6.0,,,2019,burkina_faso
4,bf_adn_2019_14_1,0.0,7.0,,,,,,,1.0,,,53.0,,,2019,burkina_faso


In [53]:
test_df.describe()

Unnamed: 0,cattle_deaths,chicken_deaths,goats_deaths,sheep_deaths,donkeys_horses_deaths,buffalo_deaths,pigs_deaths,sheep_size,cattle_size,pigs_size,goats_size,chicken_size,buffalo_size,donkeys_horses_size,year
count,19387.0,11927.0,11015.0,7845.0,4189.0,509.0,4437.0,9845.0,18427.0,6731.0,13366.0,21368.0,444.0,5727.0,33367.0
mean,0.81663,7.848998,1.411711,1.677757,0.139651,0.078585,1.200361,9.720569,7.360341,3.811172,8.733054,14.745601,2.790541,1.800943,2019.612581
std,5.07181,22.395267,4.136537,3.774788,0.625643,0.373472,5.303861,30.150712,30.42386,12.615593,24.454992,14.232939,3.264668,1.608193,1.698316
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,5.0,1.0,1.0,2018.0
50%,0.0,2.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,2.0,4.0,10.0,2.0,1.0,2020.0
75%,0.0,8.0,2.0,2.0,0.0,0.0,1.0,10.0,4.0,4.0,8.0,20.0,3.0,2.0,2021.0
max,114.0,430.0,80.0,50.0,13.0,4.0,240.0,1822.0,1204.0,600.0,1000.0,53.0,40.0,60.0,2023.0


In [75]:
import statsmodels.formula.api as smf

animals = ["cattle", "chicken", "goats", "sheep", "donkeys_horses", "buffalo", "pigs"]

for animal in animals:
    size_col = f"{animal}_size"
    deaths_col = f"{animal}_deaths"
    
    print(f"\n--- {animal.capitalize()} Model with Year & Country FEs ---")
    formula = f"{size_col} ~ {deaths_col} + C(year) + C(country)"
    model = smf.ols(formula=formula, data=test_df).fit()
    print(model.summary())


--- Cattle Model with Year & Country FEs ---
                            OLS Regression Results                            
Dep. Variable:            cattle_size   R-squared:                       0.260
Model:                            OLS   Adj. R-squared:                  0.258
Method:                 Least Squares   F-statistic:                     157.7
Date:                Thu, 18 Sep 2025   Prob (F-statistic):               0.00
Time:                        21:47:18   Log-Likelihood:                -78830.
No. Observations:               16660   AIC:                         1.577e+05
Df Residuals:                   16622   BIC:                         1.580e+05
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [56]:
#  Year
merged_df = merged_df.merge(
    df[["id_unique", "year"]],
    on="id_unique",
    how="left"
)

# country
merged_df = merged_df.merge(
    df[["id_unique", "country"]],
    on="id_unique",
    how="left"
)

In [74]:
for animal in animals:
    size_col = f"{animal}_size"
    deaths_col = f"{animal}_deaths"
    
    print(f"\n--- {animal.capitalize()} Model with Year & Country FEs ---")
    formula = f"{size_col} ~ {deaths_col} + C(year) + C(country)"
    model = smf.ols(formula=formula, data=merged_df).fit()
    print(model.summary())


--- Cattle Model with Year & Country FEs ---
                            OLS Regression Results                            
Dep. Variable:            cattle_size   R-squared:                       0.140
Model:                            OLS   Adj. R-squared:                  0.138
Method:                 Least Squares   F-statistic:                     73.09
Date:                Thu, 18 Sep 2025   Prob (F-statistic):               0.00
Time:                        21:47:02   Log-Likelihood:                -80081.
No. Observations:               16660   AIC:                         1.602e+05
Df Residuals:                   16622   BIC:                         1.605e+05
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [61]:

new_test_df = merged_df.copy()

cols_to_winsorize = merged_df.columns

for col in cols_to_winsorize:
    if col in new_test_df.columns:
        # Copy column
        series = new_test_df[col].copy()
        mask = series.notna()

        # Pick limit depending on column name
        if "_deaths" in col:
            limit = [0, 0.0008]   # cap top 0.08%
        else:
            limit = [0, 0.05]   # cap top 5%

        # Winsorize non-missing values
        winsorized = winsorize(series[mask].to_numpy(), limits=limit)

        # Assign back
        new_test_df.loc[mask, col] = winsorized

In [62]:
new_test_df.describe()

Unnamed: 0,cattle_deaths,chicken_deaths,goats_deaths,sheep_deaths,donkeys_horses_deaths,buffalo_deaths,pigs_deaths,sheep_size,cattle_size,pigs_size,goats_size,chicken_size,buffalo_size,donkeys_horses_size,year
count,19387.0,11927.0,11015.0,7845.0,4189.0,509.0,4437.0,9845.0,18427.0,6731.0,13366.0,21368.0,444.0,5727.0,33367.0
mean,0.81663,7.848998,1.411711,1.677757,0.139651,0.078585,1.150778,7.207009,4.034732,3.072798,6.290663,14.745601,2.513514,1.70997,2019.608655
std,5.07181,22.395267,4.136537,3.774788,0.625643,0.373472,3.807417,7.312426,4.946483,3.469166,6.549051,14.232939,1.93163,1.051192,1.691624
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,5.0,1.0,1.0,2018.0
50%,0.0,2.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,2.0,4.0,10.0,2.0,1.0,2020.0
75%,0.0,8.0,2.0,2.0,0.0,0.0,1.0,10.0,4.0,4.0,8.0,20.0,3.0,2.0,2021.0
max,114.0,430.0,80.0,50.0,13.0,4.0,70.0,28.0,20.0,13.0,26.0,53.0,8.0,4.0,2022.0


In [63]:
#  Year
new_test_df = new_test_df.merge(
    df[["id_unique", "year"]],
    on="id_unique",
    how="left"
)

# country
new_test_df = new_test_df.merge(
    df[["id_unique", "country"]],
    on="id_unique",
    how="left"
)

In [66]:
new_test_df.drop(columns=["year_y", "country_y"], inplace=True)
new_test_df.rename(columns={"year_x": "year", "country_x": "country"}, inplace=True)

In [67]:
new_test_df

Unnamed: 0,id_unique,cattle_deaths,chicken_deaths,goats_deaths,sheep_deaths,donkeys_horses_deaths,buffalo_deaths,pigs_deaths,sheep_size,cattle_size,pigs_size,goats_size,chicken_size,buffalo_size,donkeys_horses_size,year,country
0,bf_adn_2019_10_1,1.0,,,2.0,,,,5.0,4.0,,6.0,10.0,,,2019,burkina_faso
1,bf_adn_2019_11_1,0.0,4.0,0.0,1.0,,,,4.0,1.0,,1.0,4.0,,,2019,burkina_faso
2,bf_adn_2019_12_1,0.0,1.0,,,,,0.0,,1.0,2.0,,3.0,,,2019,burkina_faso
3,bf_adn_2019_13_1,,3.0,0.0,,,,,,,,2.0,6.0,,,2019,burkina_faso
4,bf_adn_2019_14_1,0.0,7.0,,,,,,,1.0,,,53.0,,,2019,burkina_faso
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33362,vn_crp_2020_5_1,0.0,,,,,,,0.0,6.0,0.0,0.0,0.0,,0.0,2017,vietnam
33363,vn_crp_2020_5_1,1.0,,,,,,,0.0,5.0,0.0,0.0,18.0,,0.0,2017,vietnam
33364,vn_crp_2020_5_1,0.0,,0.0,,,,,0.0,6.0,0.0,9.0,13.0,,0.0,2017,vietnam
33365,vn_crp_2020_5_1,1.0,,,,,,,0.0,12.0,0.0,0.0,6.0,,0.0,2017,vietnam


In [73]:

for animal in animals:
    size_col = f"{animal}_size"
    deaths_col = f"{animal}_deaths"
    
    print(f"\n--- {animal.capitalize()} Model with Year & Country FEs ---")
    formula = f"{size_col} ~ {deaths_col} + C(year) + C(country)"
    model = smf.ols(formula=formula, data=new_test_df).fit()
    print(model.summary())


--- Cattle Model with Year & Country FEs ---
                            OLS Regression Results                            
Dep. Variable:            cattle_size   R-squared:                       0.351
Model:                            OLS   Adj. R-squared:                  0.350
Method:                 Least Squares   F-statistic:                     257.0
Date:                Thu, 18 Sep 2025   Prob (F-statistic):               0.00
Time:                        21:46:32   Log-Likelihood:                -46964.
No. Observations:               16660   AIC:                         9.400e+04
Df Residuals:                   16624   BIC:                         9.428e+04
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

In [None]:
# do this in the begining instead
new_test_df['country'] = new_test_df['country'].replace('Mali', 'mali')
merged_df['country'] = merged_df['country'].replace('Mali', 'mali')
test_df['country'] = test_df['country'].replace('Mali', 'mali')