In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import itertools

def get_row_from_metadata(metadata, covariate_name):
    """
    Extracts a specific row from the metadata DataFrame based on the covariate name.

    :param metadata: DataFrame containing metadata.
    :param covariate_name: Name of the covariate to extract.
    :return: Row corresponding to the specified covariate name.
    """
    return metadata.loc[metadata['variable_name'] == covariate_name].squeeze()

def all_rows_from_metadata_containing(metadata, substring):
    """
    Extracts all rows from the metadata DataFrame that contain a specific substring in the variable name.

    :param metadata: DataFrame containing metadata.
    :param substring: Substring to search for in the variable names.
    :return: DataFrame containing all rows with variable names that contain the substring.
    """
    return metadata[metadata['variable_name'].str.contains(substring, na=False)].reset_index(drop=True)

def all_column_names_containing(df, substring):
    """
    Extracts all column names from the DataFrame that contain a specific substring.

    :param df: DataFrame to search for column names.
    :param substring: Substring to search for in the column names.
    :return: List of column names containing the specified substring.
    """
    return [col for col in df.columns if substring in col]


def find_equivalent_columns(data, summary, numeric_tolerance=1e-6, categorical_threshold=0.99):
    """
    Find pairs of columns in a DataFrame that are informationally equivalent.
    
    Parameters:
    -----------
    data : pandas DataFrame
        The DataFrame to analyze
    numeric_tolerance : float, default 1e-6
        Tolerance for considering numeric columns equal or proportional
    categorical_threshold : float, default 0.99
        Threshold for considering categorical columns equivalent (percentage match)
    
    Returns:
    --------
    list of tuples
        Each tuple contains (col1, col2, relationship_type)
        where relationship_type is one of: 'identical', 'proportional', 'categorical_equivalent'
    """
    equivalent_pairs = []
    columns = data.columns
    
    # Get column types
    
    numeric_cols = summary[summary.data_type == 'numeric'].variable_name.tolist()
    categorical_cols = summary[summary.data_type == 'categorical'].variable_name.tolist()

    # Identify constant columns
    constant_cols = []
    for col in columns:
        unique_values = data[col].dropna().unique()
        if len(unique_values) <= 1:
            constant_cols.append(col)

    # Print constant columns if verbose
    if len(constant_cols) > 0:
        print("Constant columns:")
        for col in constant_cols:
            print(col)
        print()
    

    # Remove constant columns from numeric and categorical lists
    numeric_cols = [col for col in numeric_cols if col not in constant_cols]
    categorical_cols = [col for col in categorical_cols if col not in constant_cols]

    # remove missingness-indicator columns
    missingness_cols = list(
        set(all_column_names_containing(data, '_missing') + 
        all_column_names_containing(data, '_m'))
    )
    numeric_cols = [col for col in numeric_cols if col not in missingness_cols]
    categorical_cols = [col for col in categorical_cols if col not in missingness_cols]
    
    # Check numeric columns for equality or proportionality
    for col1, col2 in itertools.combinations(numeric_cols, 2):

        # Check for identical values first
        if data[col1].equals(data[col2]):
            equivalent_pairs.append((col1, col2, 'identical'))
            continue
            
        # Check for identical values where neither is zero
        valid_mask = ~data[col1].isna() & ~data[col2].isna()
        if np.allclose(data.loc[valid_mask, col1], data.loc[valid_mask, col2], 
                      rtol=numeric_tolerance, atol=numeric_tolerance):
            equivalent_pairs.append((col1, col2, 'nearly_identical'))
            continue
        
        # For rows with zeros, check if the columns are exactly equal
        zero_mask = (data[col1] == 0) | (data[col2] == 0)
        non_zero_mask = ~zero_mask & valid_mask
        
        # Check if the columns have the same values where zeros are present
        if zero_mask.any():
            zero_equality = (data.loc[zero_mask & valid_mask, col1] == 
                             data.loc[zero_mask & valid_mask, col2]).all()
        else:
            zero_equality = True
            
        # Check for proportional relationship in non-zero values
        if non_zero_mask.sum() > 10:  # Require at least some non-zero values
            ratios = data.loc[non_zero_mask, col2] / data.loc[non_zero_mask, col1]
            ratio_std = ratios.std()
            
            # If standard deviation of ratios is very small, columns are proportional
            if ratio_std < numeric_tolerance and zero_equality:
                ratio = ratios.mean()
                equivalent_pairs.append((col1, col2, f'proportional (factor: {ratio:.4f})'))
    
    # Create a list of all columns to check for categorical equivalence
    # This includes both explicit categorical columns and numeric columns
    all_potential_categorical_cols = categorical_cols + numeric_cols
    
    # Check all columns for equivalent categorical mappings
    for col1, col2 in itertools.combinations(all_potential_categorical_cols, 2):
        # Skip if identical columns or already identified as identical or proportional
        if col1 == col2 or any((col1, col2, rel) in equivalent_pairs for rel in 
                               ['identical', 'nearly_identical', 'proportional']):
            continue
            
        # Get unique values for both columns
        unique_vals1 = data[col1].dropna().unique()
        unique_vals2 = data[col2].dropna().unique()
        
        # Skip if columns have different number of unique values
        if len(unique_vals1) != len(unique_vals2):
            continue
            
        # Skip if too many unique values (likely not categorical)
        if len(unique_vals1) > 100:  # Arbitrary threshold, adjust as needed
            continue
            
        # Create a mapping table between values in both columns
        mapping_df = data[[col1, col2]].dropna().drop_duplicates()
        
        # Check if mapping is one-to-one (each value in col1 maps to exactly one value in col2)
        is_one_to_one = True
        
        # Check col1 -> col2 mapping
        for val in unique_vals1:
            corresponding_vals = data.loc[data[col1] == val, col2].dropna().unique()
            if len(corresponding_vals) != 1:
                is_one_to_one = False
                break
                
        # Check col2 -> col1 mapping
        if is_one_to_one:
            for val in unique_vals2:
                corresponding_vals = data.loc[data[col2] == val, col1].dropna().unique()
                if len(corresponding_vals) != 1:
                    is_one_to_one = False
                    break
        
        if is_one_to_one:
            # If we create a new column using the mapping, it should match the original
            val_mapping = dict(zip(mapping_df[col1], mapping_df[col2]))
            
            # Apply mapping and handle NaN values
            mapped_values = data[col1].map(val_mapping)
            
            # Count matches (ignoring NaN values)
            valid_mask = ~data[col1].isna() & ~data[col2].isna()
            if valid_mask.sum() > 0:
                match_percentage = (mapped_values == data[col2])[valid_mask].mean()
                
                if match_percentage >= categorical_threshold:
                    # Determine if both are numeric or mixed types
                    if col1 in numeric_cols and col2 in numeric_cols:
                        relationship = 'numeric_categorical_equivalent'
                    else:
                        relationship = 'categorical_equivalent'
                    equivalent_pairs.append((col1, col2, relationship))
    
    return equivalent_pairs

"""
Done in this notebook
- Ensure that missingness-indicator columns exist.
    - You probably can't conclusively check that all are included, because the data you get will not necessarily reveal which columns had missingness, but check that there are some missingness columns, and none for categorical data.
- Ensure there are no NaNs in the data.
- Ensure column names:
    - In data: "hhid" (if household ID is included), "consumption_per_capita_per_day", "hh_wgt".
    - Consumption: Check mean and std for sanity. In a poor country, the mean should be low-mid single digits: e.g., in Uganda, the mean is $3.80/day.
- Check for columns that indicate units:
    - If they are present, the corresponding numeric field should be standardized, e.g., all area units adjusted to square meters.
- Check that metadata and the dataset itself match:
    - Every column in data is described in metadata and vice versa. It's also OK if `hhid` is not in the data at all.
- In metadata:
    - "variable_name".
    - "data_type", with permitted values "numeric" and "categorical".
    - "geographic_indicator".
- Scan datatypes:
    - In particular, make sure nothing is numeric which should be categorical.
    - Ensure categorical-type columns have the appropriate type even if the categories are encoded as integers (if a column is binary, with no missing values, it can be numeric or categorical).
    - IDs of all kinds are strings even if they appear numeric.
- Check for duplication
- Check feasibility of stratification
""";

## Read in

In [None]:
data_path = Path('/data/eop')

data, summary, proposed_stratifier = None, None, None

country = 'Nigeria'  # Change this to the desired country

if country == 'Burkina Faso':
    data = pd.read_parquet(
        data_path / 'burkina_faso' / 'cleaned' / 'burkinafaso_final_data.parquet'
    )
    summary = pd.read_parquet(
        data_path / 'burkina_faso' / 'cleaned' / 'summary.parquet'
    )
    proposed_stratifier = 'region'
elif country == 'Cote dIvoire':
    data = pd.read_parquet(
        data_path / 'cote_divoire' / 'cleaned' / 'cotedivoire_cleaned_data.parquet'
    )

    summary = pd.read_parquet(
        data_path / 'cote_divoire' / 'cleaned' / 'summary.parquet'
    )
    proposed_stratifier = 'region'
elif country == 'Guinea-Bissau':
    data = pd.read_parquet(
        data_path / 'guinea-bissau' / 'cleaned' / 'final_gb_dataset.parquet'
    )
    summary = pd.read_parquet(
        data_path / 'guinea-bissau' / 'cleaned' / 'summary.parquet'
    )
    proposed_stratifier = 'region'
elif country == 'Mali':
    data = pd.read_parquet(
        data_path / 'mali' / 'cleaned' / 'final_mali_dataset.parquet'
    )
    summary = pd.read_parquet(
        data_path / 'mali' / 'cleaned' / 'summary.parquet'
    )
    proposed_stratifier = 'region'
elif country == 'Somalia':
    data = pd.read_parquet(
        data_path / 'somalia' / 'cleaned' / 'somalia_lsms_final.parquet'
    )
    summary = pd.read_parquet(
        data_path / 'somalia' / 'cleaned' / 'summary.parquet'
    )
    proposed_stratifier = 'region'
elif country == 'Albania': 
    data = pd.read_parquet(
        data_path / 'albania' / 'cleaned' / 'albania_all.parquet'
    )
    summary = pd.read_parquet(
        data_path / 'albania' / 'cleaned' / 'summary.parquet'
    )
elif country == 'Uganda':
    data = pd.read_parquet(
        data_path / 'uganda' / 'cleaned' / 'uganda_full.parquet'
    )
    summary = pd.read_parquet(
        data_path / 'uganda' / 'cleaned' / 'summary.parquet'
    )
    proposed_stratifier='region'
elif country == 'Malawi':
    data = pd.read_parquet(data_path / 'malawi/cleaned/malawi_2019.parquet')
    summary = pd.read_parquet(data_path / 'malawi/cleaned/summary.parquet')
    summary.rename(columns={'description': 'variable_description'}, inplace=True)

elif country == 'Togo':
    data = pd.read_parquet('/data/eop/Togo 2018-19/clean/final_togo.parquet')
    summary = pd.read_parquet('/data/eop/Togo 2018-19/clean/summary.parquet')
    proposed_stratifier = 'cluster_id'
elif country == 'Togo_only_cdr':
    data = pd.read_parquet('/data/eop/Togo 2018-19/clean/cdr_features/togo.parquet')
    summary = pd.read_parquet('/data/eop/Togo 2018-19/clean/cdr_features/summary.parquet')

elif country == 'Togo_survey_and_cdr':
    data = pd.read_parquet('/data/eop/Togo 2018-19/clean/cdr_features_and_survey_predictors/togo.parquet')
    summary = pd.read_parquet('/data/eop/Togo 2018-19/clean/cdr_features_and_survey_predictors/summary.parquet')
    proposed_stratifier = 'cluster_id'

elif country == 'Ethiopia':
    data = pd.read_parquet('/data/eop/Ethiopia 2018-19/clean/final_ethiopia.parquet')
    summary = pd.read_parquet('/data/eop/Ethiopia 2018-19/clean/summary.parquet')
    proposed_stratifier = 'region_zone'
elif country == 'Nigeria':
    data = pd.read_parquet('/data/eop/Nigeria 2018-19/clean/final_nigeria.parquet')
    summary = pd.read_parquet('/data/eop/Nigeria 2018-19/clean/summary.parquet')
    proposed_stratifier = 'ea_id'

elif country == 'Kenya':
    data = pd.read_parquet('/data/eop/kenya/cleaned/kenya.parquet')
    summary = pd.read_parquet('/data/eop/kenya/cleaned/summary.parquet')
elif country == 'Tanzania':
    data = pd.read_parquet('/data/eop/Tanzania_2020-21/cleaned/tanzania_data.parquet')
    summary = pd.read_parquet('/data/eop/Tanzania_2020-21/cleaned/summary.parquet')
    proposed_stratifier = 'region'
elif country == 'Madagascar':
    data = pd.read_parquet('/data/eop/Madagascar 2010-11/cleaned/madagascar_data.parquet')
    summary = pd.read_parquet('/data/eop/Madagascar 2010-11/cleaned/summary.parquet')
    proposed_stratifier = 'REGION'
else:
    raise ValueError('Invalid country name')


if 'variable_description' not in summary.columns:
    summary['variable_description'] = summary['variable_name']

print(f'Read in: {country}')

Read in: Ethiopia


In [26]:
print(f'country: {country}')
print('nullity: ')
display(data.isna().mean().sort_values(ascending=False).head(2))
# Empty string may or may not be a problem.

print('empty string')
display(data.isin(['']).mean().sort_values(ascending=False).head(2))
print('Number of samples:')
print(data.shape[0])

country: Ethiopia
nullity: 


hhid                       0.0
distance_to_bus_station    0.0
dtype: float64

empty string


hhid                       0.0
distance_to_bus_station    0.0
dtype: float64

Number of samples:
6770


## Missingness columns

In [27]:
# Missingness columns (assumes _missing suffix)
print(f'country: {country}')

missingness_columns_missing = [
    c for c in data.columns if ('missing' in c) 
]
missingness_columns_m = [
    c for c in data.columns if ('_m' in c) 
]
with_missingness = [
    c[:-8] for c in missingness_columns_missing
] + [
    c[:-2] for c in missingness_columns_m
]
missingness_columns = missingness_columns_missing + missingness_columns_m
for c in missingness_columns:
    if not (c in summary.variable_name.values):
        print(f"Missingness column {c} not in summary")
    
relevant_summary = summary[summary.variable_name.isin(with_missingness)]
print('categorical columns with missingness indicators:')

print(relevant_summary.data_type.value_counts())
display(relevant_summary[relevant_summary.data_type == 'categorical'])

# print numerical columns with no missingness indicators
print('numerical columns with no missingness indicators:')
print(summary[
    (summary.data_type == 'numeric') 
    & (~summary.variable_name.isin(with_missingness))
    & (~summary.variable_name.str.endswith('_missing'))
    & (~summary.variable_name.str.endswith('_m'))
].variable_name)

country: Ethiopia
categorical columns with missingness indicators:
data_type
numeric    25
Name: count, dtype: int64


Unnamed: 0,variable_name,data_type,geographic_indicator_coarser,geographic_indicator_finer,geographic_indicator,variable_description


numerical columns with no missingness indicators:
3                              hh_wgt
9                             hh_size
10                       num_children
11                 num_young_children
12                         num_elders
13                num_children_school
14                    num_adult_males
15                  num_adule_females
19                           head_age
31                          num_rooms
52           num_owned_Kerosene_stove
53                  num_owned_CD_Deck
54           num_owned_Satellite_Dish
55                 num_owned_Sofa_set
56                  num_owned_Bicycle
57               num_owned_Motorcycle
58                num_owned_Cart_hand
59              num_owned_Cart_animal
60           num_owned_Sewing_machine
61        num_owned_Weaving_equipment
62           num_owned_Mitad_Electric
63       num_owned_Cylinder_gas_stove
64      num_owned_Energy_saving_stove
65             num_owned_Refrigerator
66              num_owned_Private_car


## Consumption, weights, hh size, poverty rate

In [28]:
print(f'country: {country}')
assert 'consumption_per_capita_per_day' in data.columns
assert 'headcount_adjusted_hh_wgt' in data.columns
assert pd.api.types.is_numeric_dtype(data['consumption_per_capita_per_day']), "'consumption_per_capita_per_day' is not numeric"
assert pd.api.types.is_numeric_dtype(data['headcount_adjusted_hh_wgt']), "'headcount_adjusted_hh_wgt' is not numeric"
if not 'hh_size' in data.columns:
    print('Warning: Missing hh_size')
else:
    assert np.isclose(data.hh_size * data.hh_wgt, data.headcount_adjusted_hh_wgt).all()
    assert pd.api.types.is_numeric_dtype(data['hh_size']), "'hh_size' is not numeric"

for col in ['headcount_adjusted_hh_wgt_missing', 'consumption_per_capita_per_day_missing', 'hh_wgt_missing']:
    if col in data.columns:
        assert data[col].sum() == 0, f"{col} has missing values"

print('mean:', data.consumption_per_capita_per_day.mean())
print('std:', data.consumption_per_capita_per_day.std())

count_poor = (
    data[data.consumption_per_capita_per_day < 2.15].headcount_adjusted_hh_wgt
).sum()

total = (
    data.headcount_adjusted_hh_wgt
).sum()
rate = count_poor / total

print('rate:',rate)
# To crosscheck: https://docs.google.com/spreadsheets/d/11wGVZadIZMvR2oXoDtSfjJVvixyv3ievuUOF4k_1HNY/edit?gid=0#gid=0

country: Ethiopia
mean: 4.681175409246381
std: 4.812618484963736
rate: 0.4560640131118927


## Suspiciously named columns

In [29]:
print(f'country: {country}')

# Suspicious data
print('containing the word "unit":')
display(
    summary[
        (
            summary.variable_name.str.contains('unit')
            | summary.variable_description.str.contains('unit')
        ) & (
            ~summary.variable_name.str.contains('community')
        )
    ]
)

print('containing the word "consumption":')
display(
    summary[
        summary.variable_name.str.contains('consumption')
        | summary.variable_description.str.contains('consumption')
    ]
)

# Print variables whose name contains "id" or "code" and are listed as numeric in the summary
print('variables with "id" or "code" and listed numeric:')

filtered_variables = summary[
    (summary["variable_name"].str.contains("id|code", case=False, na=False)) &
    (summary["data_type"] == "numeric")
]

# Print the name and description of the filtered variables
for _, row in filtered_variables.iterrows():
    print(f"Name: {row['variable_name']}, Description: {row['variable_description']}")

country: Ethiopia
containing the word "unit":


Unnamed: 0,variable_name,data_type,geographic_indicator_coarser,geographic_indicator_finer,geographic_indicator,variable_description


containing the word "consumption":


Unnamed: 0,variable_name,data_type,geographic_indicator_coarser,geographic_indicator_finer,geographic_indicator,variable_description
138,consumption_per_capita_per_day,numeric,False,False,False,consumption_per_capita_per_day


variables with "id" or "code" and listed numeric:


## Summary correctness: Matches data, format

In [30]:
print(f'country: {country}')

# check that metadata and data match
data_columns = set(data.columns)

summary_variable_names = set(summary['variable_name'])
missing_in_data = summary_variable_names - data_columns
missing_in_summary = data_columns - summary_variable_names

print("Variables in summary but not in data:", missing_in_data)
print("Columns in data but not in summary:", missing_in_summary)

country: Ethiopia
Variables in summary but not in data: set()
Columns in data but not in summary: {'headcount_adjusted_hh_wgt'}


In [31]:
print(f'country: {country}')
# Check that "summary" fits the required format
required_columns = {
    "variable_name", "data_type", "geographic_indicator", "geographic_indicator_coarser"
    }
summary_columns = set(summary.columns)

missing_columns = required_columns - summary_columns
if missing_columns:
    print(f"Missing required columns in summary: {missing_columns}")

# Ensure "data_type" has only permitted values
permitted_data_types = {"numeric", "categorical"}
for _, row in summary.iterrows():
    if row["data_type"] not in permitted_data_types:
        print(
            f"Invalid data_type '{row['data_type']}' for variable '{row['variable_name']}'. "
            f"Description: {row['variable_description']}"
        )

# Ensure "geographic_indicator_coarser", "geographic_indicator_finer" is boolean or 0-1
for _, row in summary.iterrows():
    for c in ["geographic_indicator", "geographic_indicator_coarser", "geographic_indicator_finer"]:
        if c not in row:
            continue
        if row[c] not in [0, 1, True, False, None]:
            print(
                f"Invalid {c} '{row[c]}' for variable '{row['variable_name']}'. "
                f"Description: {row['variable_description']}"
            )

country: Ethiopia


## Data types

In [32]:
print(f'country: {country}')

# Check that numeric columns in summary are actually numeric in data
numeric_columns = summary[summary["data_type"] == "numeric"]["variable_name"]
for col in numeric_columns:
    if col in data.columns and not pd.api.types.is_numeric_dtype(data[col]):
        description = summary.loc[summary["variable_name"] == col, "variable_description"].values[0]
        print(f"BAD: numeric in summary, non-numeric in data: '{col}'; {description}")

# Check that categorical columns in summary are actually categorical in data (less important)
categorical_columns = summary[summary["data_type"] == "categorical"]["variable_name"]
for col in categorical_columns:
    if col in data.columns and not pd.api.types.is_categorical_dtype(data[col]):
        description = summary.loc[summary["variable_name"] == col, "variable_description"].values[0]
        print(f"categorical in summary, numeric in data: '{col}'; {description}")

country: Ethiopia
categorical in summary, numeric in data: 'hhid'; hhid
categorical in summary, numeric in data: 'ea_id'; ea_id
categorical in summary, numeric in data: 'rural_urban'; rural_urban
categorical in summary, numeric in data: 'region_code'; region_code
categorical in summary, numeric in data: 'woreda_code'; woreda_code
categorical in summary, numeric in data: 'city_code'; city_code
categorical in summary, numeric in data: 'subcity_code'; subcity_code
categorical in summary, numeric in data: 'kebele_code'; kebele_code
categorical in summary, numeric in data: 'max_edu_adult'; max_edu_adult
categorical in summary, numeric in data: 'max_edu_adult_female'; max_edu_adult_female
categorical in summary, numeric in data: 'head_gender'; head_gender
categorical in summary, numeric in data: 'religion'; religion
categorical in summary, numeric in data: 'head_marital_status'; head_marital_status
categorical in summary, numeric in data: 'head_agri_7d'; head_agri_7d
categorical in summary, 

  if col in data.columns and not pd.api.types.is_categorical_dtype(data[col]):


## Duplicate columns

In [33]:
# Check for duplicate information. Don't do if there are too many columns.
print(f'country: {country}')

find_equivalent_columns(data, summary)

country: Ethiopia


[]

## Geography and stratification

In [34]:
print(f'country: {country}')

print('geographic indicators:')
display(summary[summary.geographic_indicator])
for _, row in summary[summary.geographic_indicator].iterrows():
    print(row.variable_name)
    print(data[row.variable_name].nunique())

if False:
    partially_represented = summary[
        (summary.geographic_indicator_finer) & ~(summary.geographic_indicator_coarser)
    ]
    if len(partially_represented) == 0:
        print('No partially represented geo level')
    else:
        assert len(partially_represented) == 1
        print('partially represented:')
        print(partially_represented.variable_name.values[0])
        print('partially represented value counts:')
        print(data[partially_represented.variable_name.values[0]].value_counts())

country: Ethiopia
geographic indicators:


Unnamed: 0,variable_name,data_type,geographic_indicator_coarser,geographic_indicator_finer,geographic_indicator,variable_description
1,ea_id,categorical,False,False,True,ea_id
4,region_code,categorical,True,True,True,region_code
5,woreda_code,categorical,False,False,True,woreda_code
6,city_code,categorical,False,False,True,city_code
7,subcity_code,categorical,False,False,True,subcity_code
8,kebele_code,categorical,False,False,True,kebele_code
164,region_zone,categorical,False,True,True,region_zone


ea_id
535
region_code
11
woreda_code
27
city_code
5
subcity_code
11
kebele_code
41
region_zone
105


In [37]:
# Stratification
print(f'country: {country}')
print('proposed stratifier:', proposed_stratifier)
print('count per unit')
display(data.groupby(proposed_stratifier, observed=True).size().reset_index(name='count').sort_values('count', ascending=False))

print('weights per unit')
display(data.groupby(proposed_stratifier,  observed=True).hh_wgt.nunique().sort_values())

print('regions per weight class')
display(data.groupby('hh_wgt', observed=True)[proposed_stratifier].nunique().sort_values())

print('count per unit x weight')
display(
    data.groupby(['hh_wgt', proposed_stratifier],  observed=True)
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

country: Ethiopia
proposed stratifier: region_zone
count per unit


Unnamed: 0,region_zone,count
36,DIRE DAWA_1,579
41,HARAR_1,550
37,GAMBELA_1,248
33,BENISHANGUL GUMUZ_2,164
32,BENISHANGUL GUMUZ_1,160
...,...,...
57,OROMIA_24,15
58,OROMIA_28,15
60,OROMIA_30,15
62,OROMIA_35,15


weights per unit


region_zone
OROMIA_1                1
OROMIA_23               1
OROMIA_24               1
BENISHANGUL GUMUZ_4     1
OROMIA_28               1
                       ..
TIGRAY_2               18
AFAR_2                 19
GAMBELA_1              22
HARAR_1                50
DIRE DAWA_1            50
Name: hh_wgt, Length: 105, dtype: int64

regions per weight class


hh_wgt
7.594256       1
2482.531727    1
2483.181945    1
2514.627852    1
2538.286831    1
              ..
802.221053     2
1231.423933    2
8256.395349    2
1858.280556    3
6905.952500    4
Name: region_zone, Length: 660, dtype: int64

count per unit x weight


Unnamed: 0,hh_wgt,region_zone,count
97,151.035714,DIRE DAWA_1,60
320,1220.930769,ADDIS ABABA_2,30
183,278.439990,DIRE DAWA_1,30
262,802.221053,BENISHANGUL GUMUZ_2,20
88,138.328128,HARAR_1,20
...,...,...,...
226,511.552037,SNNP_10,1
298,1127.846219,OROMIA_6,1
296,1107.151793,OROMIA_4,1
30,57.886410,AFAR_4,1
