# Demographic metadata creation notebook

In [2]:
import json

import pandas as pd

In [3]:
# Location of the raw inputs, relative to this notebook
RAW_DATA_DIR = "../../raw_data/"
demographics_by_district_path = f"{RAW_DATA_DIR}demographics_by_district.csv"

# Read the per-district demographics table and preview its first rows
demographics_df = pd.read_csv(demographics_by_district_path)
demographics_df.head()

Unnamed: 0,District,Land Area (sq km),"Year-end Permanent Population (10,000 persons)","Permanent Registered Population (10,000)","Permanent Non-registered Population (10,000)",Population Density (persons per sq km),"Household Registration Population (10,000 persons)","Men (10,000 persons)","Women (10,000 persons)",Sex Ratio (equal men and women = 100),Employee Compensation Payable (100 million yuan),Average Number of Employed Persons (person),Average Housing Price (per sq/m),GDP (10000 yuan)
0,Futian,78.66,150.17,95.35,54.82,19091,98.97,49.4,49.57,99.65,552.17,557197,48902.66667,35572870
1,Luohu,78.75,100.4,59.18,41.22,12749,61.19,30.54,30.65,99.61,181.27,174393,48108.0,19724939
2,Yantian,74.91,22.65,6.66,15.98,3024,7.2,3.62,3.58,101.11,31.56,30965,50334.33333,5375327
3,Nanshan,187.47,135.63,81.02,54.61,7235,85.79,45.55,40.24,113.21,462.45,329039,49847.83333,38452711
4,Bao'an,396.61,301.71,47.75,253.96,7607,50.94,26.07,24.87,104.76,164.28,186976,50647.66667,30038215


In [5]:
# Check column dtypes and non-null counts before renaming anything
demographics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   District                                            10 non-null     object 
 1   Land Area (sq km)                                   10 non-null     float64
 2   Year-end Permanent Population (10,000 persons)      10 non-null     float64
 3   Permanent Registered Population (10,000)            10 non-null     float64
 4   Permanent Non-registered Population (10,000)        10 non-null     float64
 5   Population Density (persons per sq km)              10 non-null     int64  
 6   Household Registration Population (10,000 persons)  10 non-null     float64
 7   Men (10,000 persons)                                10 non-null     float64
 8   Women (10,000 persons)                              10 non-null     float64
 9   Se

## Clean up column names

In [17]:
# Map each raw column header to a compact identifier. The rename is keyed on
# the source header rather than on column position, so a reordered raw CSV
# can never silently attach a name to the wrong column.
column_names_dict = {
    "District": "DistrictName",
    "Land Area (sq km)": "AreaKm2",
    "Year-end Permanent Population (10,000 persons)": "YearEndPermanentPop10k",
    "Permanent Registered Population (10,000)": "RegisteredPermanentPop10k",
    "Permanent Non-registered Population (10,000)": "NonRegisteredPermanentPop10k",
    "Population Density (persons per sq km)": "PopDensityPerKm2",
    "Household Registration Population (10,000 persons)": "HouseholdRegisteredPop10k",
    "Men (10,000 persons)": "MalePop10k",
    "Women (10,000 persons)": "FemalePop10k",
    "Sex Ratio (equal men and women = 100)": "SexRatio100",
    "Employee Compensation Payable (100 million yuan)": "EmployeeCompensation100MYuan",
    "Average Number of Employed Persons (person)": "AvgEmployedPersons",
    "Average Housing Price (per sq/m)": "AvgHousingPricePerSqM",
    "GDP (10000 yuan)": "GDPin10000Yuan",
}

# Kept for any later cell that expects the list of cleaned names
column_names = list(column_names_dict.values())

# errors="raise" turns a missing or re-worded source header into a KeyError
# instead of leaving that column quietly un-renamed.
new_demographics_df = demographics_df.rename(
    columns=column_names_dict, errors="raise"
)

new_demographics_df.columns

Index(['DistrictName', 'AreaKm2', 'YearEndPermanentPop10k',
       'RegisteredPermanentPop10k', 'NonRegisteredPermanentPop10k',
       'PopDensityPerKm2', 'HouseholdRegisteredPop10k', 'MalePop10k',
       'FemalePop10k', 'SexRatio100', 'EmployeeCompensation100MYuan',
       'AvgEmployedPersons', 'AvgHousingPricePerSqM', 'GDPin10000Yuan'],
      dtype='object')


## Generate metadata

In [29]:
def generate_metadata(df: pd.DataFrame, n_samples: int = 5) -> pd.DataFrame:
    """Summarise every column of ``df`` as one row of a metadata table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    n_samples : int, default 5
        Maximum number of distinct non-null example values kept per column.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column labels of ``df``, with these columns:

        - ``dtype``: the column's dtype.
        - ``n_unique``: number of distinct non-null values.
        - ``total_missing``: number of null values.
        - ``is_numeric``: result of ``pandas.api.types.is_numeric_dtype``.
        - ``is_categorical``: True for categorical, object and string dtypes.
        - ``sample_values``: list of up to ``n_samples`` distinct non-null
          values, in order of first appearance.

    Raises
    ------
    ValueError
        If ``n_samples`` is negative.
    """
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative")

    types = pd.api.types

    meta_df = pd.DataFrame(index=df.columns)
    meta_df['dtype'] = df.dtypes
    meta_df['n_unique'] = df.nunique()
    meta_df['total_missing'] = df.isnull().sum()
    meta_df['is_numeric'] = [types.is_numeric_dtype(t) for t in df.dtypes]
    # Text columns may be stored as object or as a dedicated pandas string
    # dtype depending on pandas version/settings; treat both as categorical.
    meta_df['is_categorical'] = [
        isinstance(t, pd.CategoricalDtype)
        or types.is_object_dtype(t)
        or types.is_string_dtype(t)
        for t in df.dtypes
    ]
    # Build the per-column samples explicitly; dtype=object keeps each list as
    # a single cell instead of letting pandas expand it into a 2-D structure.
    samples = [
        col.dropna().unique()[:n_samples].tolist() for _, col in df.items()
    ]
    meta_df['sample_values'] = pd.Series(samples, index=df.columns, dtype=object)
    return meta_df

# Generate metadata
meta_df = generate_metadata(new_demographics_df)

# Export to CSV under its own filename: "all_demographics_by_district.csv" is
# the path used for the renamed dataset itself, so writing the metadata there
# would leave only one of the two tables on disk.
# index_label names the index column (the described column's name) in the file.
meta_df.to_csv(
    "all_demographics_by_district_metadata.csv", index_label="column_name"
)
meta_df.head(20)

Unnamed: 0,dtype,n_unique,total_missing,is_numeric,is_categorical,sample_values
DistrictName,object,10,0,False,True,"[Futian, Luohu, Yantian, Nanshan, Bao'an]"
AreaKm2,float64,10,0,True,False,"[78.66, 78.75, 74.91, 187.47, 396.61]"
YearEndPermanentPop10k,float64,10,0,True,False,"[150.17, 100.4, 22.65, 135.63, 301.71]"
RegisteredPermanentPop10k,float64,10,0,True,False,"[95.35, 59.18, 6.66, 81.02, 47.75]"
NonRegisteredPermanentPop10k,float64,10,0,True,False,"[54.82, 41.22, 15.98, 54.61, 253.96]"
PopDensityPerKm2,int64,10,0,True,False,"[19091, 12749, 3024, 7235, 7607]"
HouseholdRegisteredPop10k,float64,10,0,True,False,"[98.97, 61.19, 7.2, 85.79, 50.94]"
MalePop10k,float64,10,0,True,False,"[49.4, 30.54, 3.62, 45.55, 26.07]"
FemalePop10k,float64,10,0,True,False,"[49.57, 30.65, 3.58, 40.24, 24.87]"
SexRatio100,float64,10,0,True,False,"[99.65, 99.61, 101.11, 113.21, 104.76]"


## Save renamed dataset and export metadata

In [32]:
# Persist the renamed dataset. index=False keeps the auto-generated RangeIndex
# out of the file, so re-reading it does not yield a spurious "Unnamed: 0" column.
new_demographics_df.to_csv("all_demographics_by_district.csv", index=False)

# Companion JSON: column dtypes (cast to str so they are JSON-serialisable)
# plus the list of district names.
dtypes = new_demographics_df.dtypes.astype(str)
district_names = new_demographics_df["DistrictName"].tolist()

with open("all_demographics_by_district.json", "w") as f:
    json.dump({
        "dtypes": dtypes.to_dict(),
        "districts": district_names
    }, f)