In [238]:
import ast
from pathlib import Path

import pandas as pd
import pyreadstat

In [284]:
# Load the metadata table; later cells read its variable_name, module_name,
# variable_description, data_type and columns fields.
nolan_metadata = pd.read_csv(
    filepath_or_buffer='/Users/leo/Documents/gpl/eop/overseen_work/nolan/uganda/uganda_metadata.csv',
)

In [285]:
# Load the household-level data table (the "_TESTING" CSV export).
nolan_household_data = pd.read_csv(
    filepath_or_buffer='/Users/leo/Documents/gpl/eop/overseen_work/nolan/uganda/uganda_household_data_TESTING.csv',
)

In [286]:
# Variables removed from both the household data and its metadata.
columns_to_drop = [
    # geo codes
    'dc_2018', 'cc_2018', 'sc_2018', 'pc_2018',
    # *_count columns
    'working_for_pay_count', 'business_owner_count',
    # total_* columns
    'total_laborers', 'total_armed_forces', 'total_managers',
    'total_professionals', 'total_associate_professionals',
    'total_clerical_workers', 'total_service/sales',
    'total_agricultural_workers', 'total_craftsmen', 'total_operators',
    'total_elementary_occupation_workers',
    # remaining variables
    'entid_unps', 'h12q01_1',
    'h12q02_twodigit', 'h12q02_threedigit', 'h12q02_fourdigit',
    'regurb',
    'h9q27',  # ownership type of parcel
]

# Keep only the metadata rows whose variable is not being dropped.
nolan_metadata = nolan_metadata.loc[
    ~nolan_metadata['variable_name'].isin(columns_to_drop)
]
# Raises KeyError if any listed column is absent from the data.
nolan_household_data = nolan_household_data.drop(columns=columns_to_drop)

In [287]:
# Express the parcel area (h9q24b) in square meters.
# h9q24a holds a unit code; each code maps to the number of sq meters in one unit.
# NOTE(review): 10000 / 4046.86 / 0.092903 look like hectare / acre / sq foot — confirm
# against the questionnaire. Codes not listed here map to NaN.
sq_meters_per_unit = {1: 1, 2: 10000, 3: 4046.86, 4: 0.092903}
unit_factor = nolan_household_data['h9q24a'].map(sq_meters_per_unit)
nolan_household_data['h9q24b'] = nolan_household_data['h9q24b'] * unit_factor

# The unit column is no longer needed once the area is in a single unit.
nolan_household_data = nolan_household_data.drop(columns=['h9q24a'])
nolan_metadata = nolan_metadata[nolan_metadata['variable_name'] != 'h9q24a']

is_parcel_area = nolan_metadata['variable_name'] == 'h9q24b'
nolan_metadata.loc[is_parcel_area, 'variable_description'] = 'Area of parcel (in sq meters)'

In [288]:
# Variables whose metadata data_type is overridden to 'categorical'.
force_categorical = [
    's1aq02a',
    's1aq03a',
    's1aq04a',
    'h9q23',
    'district',
]
is_forced_categorical = nolan_metadata['variable_name'].isin(force_categorical)
nolan_metadata.loc[is_forced_categorical, 'data_type'] = 'categorical'

In [289]:
# Lookup from the string codes stored in h12q02_onedigit to readable labels.
# Keys are strings (e.g. "11111.0"), so the column must hold strings for a match;
# any value that is not a key becomes NaN after .map().
h12q02_oneDigit_mapper = {
    "11111.0": "Agriculture, Forestry, and Fishing",
    "21111.0": "Mining and Quarrying",
    "31111.0": "Manufacturing",
    "41111.0": "Electricity, Gas, Steam, and Air Conditioning Supply",
    "51111.0": "Water Supply; Sewerage, Waste Management, and Remediation",
    "61111.0": "Construction",
    "71111.0": "Wholesale and Retail Trade; Repair of Motor Vehicles",
    "81111.0": "Transportation and Storage",
    "91111.0": "Accommodation and Food Service Activities",
    "101111.0": "Information and Communication",
    "111111.0": "Financial and Insurance Activities",
    "121111.0": "Real Estate Activities",
    "131111.0": "Professional, Scientific, and Technical Activities",
    "141111.0": "Administrative and Support Service Activities",
    "151111.0": "Public Administration and Defence; Compulsory Social Security",
    "161111.0": "Education",
    "171111.0": "Human Health and Social Work Activities",
    "181111.0": "Arts, Entertainment, and Recreation",
    "191111.0": "Other Service Activities",
    "201111.0": "Activities of Households as Employers; Undifferentiated Goods",
    "211111.0": "Activities of Extraterritorial Organizations and Bodies",
    "missing": "missing",
}

# Replace the codes with their labels in place of the original column.
onedigit_codes = nolan_household_data['h12q02_onedigit']
nolan_household_data['h12q02_onedigit'] = onedigit_codes.map(h12q02_oneDigit_mapper)

In [290]:
# Directory holding the raw Stata files.
path_to_raw = Path('/Users/leo/Documents/gpl/eop/data/uganda/UGA_2019_UNPS_v03_M_STATA14')

# Read HH/gsec10_3.dta with value formats applied, so coded answers come back
# as their labels (the code below compares against 'Yes').
gsec10_3_file = path_to_raw / 'HH' / 'gsec10_3.dta'
hh10_3, hh10_3_meta = pyreadstat.read_dta(gsec10_3_file, apply_value_formats=True)


def convert_to_binary(value):
    """Recode a yes/no answer as an integer flag.

    Strings become 1 when they equal "yes" ignoring case, and 0 otherwise.
    Anything that is not a string (e.g. NaN, numbers, None) is returned unchanged.
    """
    if not isinstance(value, str):
        return value
    return int(value.lower() == "yes")

# Keep only the rows where s10q14 is 'Yes', then recode the Yes/No answers to 1/0.
filtered_data = hh10_3[hh10_3['s10q14'] == 'Yes'].copy()

for col in ['s10q14', 's10q15a', 's10q15b', 's10q15c']:
    filtered_data[col] = filtered_data[col].apply(convert_to_binary)

# Start from one row per household id present in the raw module (including
# households with no 'Yes' rows), indexed by hhid for the joins below.
result_df = pd.DataFrame()
result_df['hhid'] = hh10_3['hhid'].unique()
result_df.set_index('hhid', inplace=True)

# `name in series` tests the Series *index labels*, not its values, so the
# membership check is done against a set of the existing variable names.
known_variable_names = set(nolan_metadata['variable_name'])
new_metadata_rows = []

# For each fuel type (s10q13) and each use, add a "<fuel>_<use>" indicator column.
for fuel_type in filtered_data['s10q13'].unique():
    fuel_data = filtered_data[filtered_data['s10q13'] == fuel_type]

    for use_col, use_name in [('s10q15a', 'cooking'), ('s10q15b', 'lighting'), ('s10q15c', 'heating')]:
        col_name = f"{fuel_type}_{use_name}"

        # Describe the new column in the metadata, but only once per name so
        # that re-running this cell does not append duplicate rows.
        if col_name not in known_variable_names:
            new_metadata_rows.append({
                'variable_name': col_name,
                'variable_description': f"Household uses {fuel_type} for {use_name}",
                'data_type': 'numeric',
            })
            known_variable_names.add(col_name)

        # hhid-indexed frame holding just this fuel/use indicator.
        temp_df = fuel_data[['hhid', use_col]].copy()
        temp_df.columns = ['hhid', col_name]
        temp_df.set_index('hhid', inplace=True)

        # NOTE(review): if a household has more than one row for the same fuel,
        # this left join repeats that household's row in result_df — confirm
        # that (hhid, s10q13) is unique in gsec10_3.
        result_df = result_df.join(temp_df, how='left')

# Append all new metadata rows in a single concat instead of once per column.
if new_metadata_rows:
    nolan_metadata = pd.concat(
        [nolan_metadata, pd.DataFrame(new_metadata_rows)],
        ignore_index=True,
    )

# Fill NaN values with 0 (households that don't use a specific fuel for a specific purpose)
result_df.fillna(0, inplace=True)

# Reset index to make hhid a regular column
result_df.reset_index(inplace=True)



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  hh10_3, hh10_3_meta = pyreadstat.read_dta(path_to_raw / 'HH' / 'gsec10_3.dta', apply_value_formats=True)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pan

In [291]:

# Variable names that the metadata attributes to module gsec10_3.
from_gsec10_3 = nolan_metadata['module_name'] == 'gsec10_3'
to_drop = nolan_metadata.loc[from_gsec10_3, 'variable_name'].values

# Remove those variables from the data and from the metadata alike.
nolan_household_data = nolan_household_data.drop(columns=to_drop)
nolan_metadata = nolan_metadata[~nolan_metadata['variable_name'].isin(to_drop)]


In [292]:
# Attach the columns of result_df to the household data by hhid; every
# household row is kept (left join), unmatched rows get NaN in the new columns.
nolan_household_data = pd.merge(
    nolan_household_data,
    result_df,
    on='hhid',
    how='left',
)

In [293]:
# Some metadata rows carry a 'columns' field holding the repr of a list of
# column names. For each such list with more than one entry, add a metadata
# row for every "<...>_missing" column it names, copied from the parent row.
missing_indicator_rows = []

for _, row in nolan_metadata.iterrows():
    try:
        # literal_eval only parses Python literals; unlike eval() it cannot run
        # code or resolve a string to a variable that happens to exist in the kernel.
        expanded_columns = ast.literal_eval(row['columns'])
    except (ValueError, SyntaxError, TypeError):
        # Not a parseable literal (e.g. NaN or a bare name): nothing to expand.
        continue

    # Only multi-entry containers describe expanded columns.
    if not isinstance(expanded_columns, (list, tuple, set, frozenset)):
        continue
    if len(expanded_columns) == 1:
        continue

    for variable_name in expanded_columns:
        if isinstance(variable_name, str) and variable_name.endswith('_missing'):
            new_row = row.copy()
            new_row['variable_name'] = variable_name
            new_row['variable_description'] = f"{row['variable_description']} (missing)"
            new_row['data_type'] = 'categorical'
            missing_indicator_rows.append(new_row)

# Append everything in one concat rather than growing the frame row by row.
if missing_indicator_rows:
    nolan_metadata = pd.concat(
        [nolan_metadata, pd.DataFrame(missing_indicator_rows)],
        ignore_index=True,
    )

# The helper field is not part of the published metadata.
nolan_metadata = nolan_metadata.drop(columns='columns')

In [None]:
# Write the cleaned household table.
nolan_household_data.to_parquet(
    '/Users/leo/Documents/gpl/eop/data/uganda/cleaned/uganda_full.parquet',
    index=False,
)

# The published metadata calls the data_type column 'type'.
nolan_metadata = nolan_metadata.rename(columns={'data_type': 'type'})

# Write the four published metadata columns, in this order.
nolan_metadata.loc[
    :, ['variable_name', 'module_name', 'variable_description', 'type']
].to_csv(
    '/Users/leo/Documents/gpl/eop/data/uganda/cleaned/uganda_metadata.csv',
    index=False,
)

In [None]:
# Write the same four metadata columns as a parquet file.
# Relies on the 'type' column, i.e. data_type must already have been renamed.
nolan_metadata.loc[
    :, ['variable_name', 'module_name', 'variable_description', 'type']
].to_parquet(
    '/Users/leo/Documents/gpl/eop/data/uganda/cleaned/summary.parquet',
    index=False,
)

region
3.0    862
1.0    782
4.0    729
2.0    701
Name: count, dtype: int64