In [7]:
import pandas as pd
excel_path = 'data/map/namdinh/data.xlsx'
xls = pd.ExcelFile(excel_path)
all_data_with_sheet_name = []

# Iterate through each sheet, add a column for 'varname' to store the sheet name, and then append to the list
for sheet_name in xls.sheet_names:
    df_sheet = pd.read_excel(xls, sheet_name=sheet_name)
    # For each sheet, melt and then add the sheet name as 'varname'
    df_sheet_melted = df_sheet.melt(id_vars=["NAME"], var_name="year", value_name="value")
    df_sheet_melted["varname"] = sheet_name
    all_data_with_sheet_name.append(df_sheet_melted)

# Concatenate all the melted DataFrames with the sheet name included
long_df_with_varname = pd.concat(all_data_with_sheet_name, ignore_index=True)

long_df_with_varname

Unnamed: 0,NAME,year,value,varname
0,Thị trấn Mỹ Lộc,2016,4943.0,pop
1,Xã Mỹ Hà,2016,7589.0,pop
2,Xã Mỹ Tiến,2016,5151.0,pop
3,Xã Mỹ Thắng,2016,8313.0,pop
4,Xã Mỹ Trung,2016,5245.0,pop
...,...,...,...,...
4063,Xã Hải Chính,2021,25.0,ph
4064,Xã Hải Xuân,2021,79.0,ph
4065,Xã Hải Châu,2021,1.0,ph
4066,Xã Hải Triều,2021,37.0,ph


In [13]:
# Assuming long_df_with_varname is already created as per your setup

# Separate the DataFrames based on 'varname' for 'ph' and 'hh' using 'contains'
df_ph = long_df_with_varname[long_df_with_varname['varname'].str.contains('ph')].copy()
df_hh = long_df_with_varname[long_df_with_varname['varname'].str.contains('hh')].copy()

# Perform operations directly on the copy to avoid SettingWithCopyWarning
df_ph['year'] = df_ph['year']
df_hh['year'] = df_hh['year']

# Merge the 'ph' and 'hh' DataFrames on 'NAME' and 'year'
df_merged = pd.merge(df_ph, df_hh, on=['NAME', 'year'], suffixes=('_ph', '_hh'))

# Calculate the new variable 'ph/hh * 100'
df_merged['perph'] = (df_merged['value_ph'] / df_merged['value_hh'])

# Create a DataFrame for the new variable
df_new_var = df_merged[['NAME', 'year', 'perph']].copy()
df_new_var['varname'] = 'ph/hh * 100'

# Ensure the original DataFrame is not just a view to avoid the warning
long_df_with_varname_filtered = long_df_with_varname[~long_df_with_varname['varname'].str.contains('ph|hh')].copy()

# Concatenate the DataFrame with the new variable to the filtered original DataFrame
final_df = pd.concat([long_df_with_varname_filtered, df_new_var[['NAME', 'year', 'varname', 'perph']].rename(columns={'perph': 'value'})], ignore_index=True)

# Note: Ensure to use '.copy()' when slicing DataFrames to avoid the SettingWithCopyWarning when setting new values.


In [14]:
final_df

Unnamed: 0,NAME,year,value,varname
0,Thị trấn Mỹ Lộc,2016,4943.000000,pop
1,Xã Mỹ Hà,2016,7589.000000,pop
2,Xã Mỹ Tiến,2016,5151.000000,pop
3,Xã Mỹ Thắng,2016,8313.000000,pop
4,Xã Mỹ Trung,2016,5245.000000,pop
...,...,...,...,...
2701,Xã Hải Chính,2021,0.016869,ph/hh * 100
2702,Xã Hải Xuân,2021,0.035666,ph/hh * 100
2703,Xã Hải Châu,2021,0.000433,ph/hh * 100
2704,Xã Hải Triều,2021,0.023825,ph/hh * 100
