# In [None]:
import pandas as pd

# Read the three input CSVs into DataFrames.
# The order of the paths below matches the order of the names on the left.
socioeconomic_df, health_burden_df, risk_factor_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/mnt/data/socioeconomic_dataset.csv",
        "/mnt/data/health_burden_dataset.csv",
        "/mnt/data/risk_factor_dataset.csv",
    )
)

# --- Steps 1 & 2: Align the join keys, one dataset at a time ---
# For each frame: rename its key column(s) to the shared lowercase names,
# then trim surrounding whitespace from 'country' and lowercase it, so that
# differences in padding or capitalisation do not prevent a match when merging.

# Socioeconomic data: keys are expected under 'Country' and 'Year'.
socioeconomic_df.rename({'Country': 'country', 'Year': 'year'}, axis='columns', inplace=True)
socioeconomic_df['country'] = socioeconomic_df['country'].str.strip().str.lower()

# Health burden data: the country key is expected under 'location'.
health_burden_df.rename({'location': 'country'}, axis='columns', inplace=True)
health_burden_df['country'] = health_burden_df['country'].str.strip().str.lower()

# Risk factor data: same 'location' -> 'country' rename.
risk_factor_df.rename({'location': 'country'}, axis='columns', inplace=True)
risk_factor_df['country'] = risk_factor_df['country'].str.strip().str.lower()

# --- Step 3: Subset COPD-specific rows from burden and risk datasets ---
# Each frame is filtered with a case-insensitive substring match on its
# `cause` column, and the result is copied so later changes do not touch the
# source frame.
#
# `na=False` treats rows whose `cause` is missing as "no match". Without it
# the mask can contain NaN for those rows, and boolean indexing with such a
# mask raises ValueError instead of simply excluding them.
copd_burden_df = health_burden_df[
    health_burden_df['cause'].str.contains(
        "chronic obstructive pulmonary disease", case=False, na=False
    )
].copy()

copd_risk_df = risk_factor_df[
    risk_factor_df['cause'].str.contains(
        "chronic obstructive pulmonary disease", case=False, na=False
    )
].copy()

# --- Step 4: Reshape the COPD burden rows from long to wide ---
# Result: one row per (country, year) and one column per distinct `measure`.
# Each cell is the mean of `val` over the rows sharing that key and measure.
# reset_index() turns the (country, year) index back into ordinary columns
# so the frame can be merged on them.
burden_pivot = pd.pivot_table(
    copd_burden_df,
    values='val',
    index=['country', 'year'],
    columns='measure',
    aggfunc='mean',
).reset_index()

# --- Step 5: Reshape the COPD risk-factor rows from long to wide ---
# Result: one row per (country, year) and one column per distinct
# `risk_factor`. Each cell is the mean of `val` over the rows sharing that
# key and risk factor. reset_index() restores (country, year) as ordinary
# columns so the frame can be merged on them.
risk_pivot = pd.pivot_table(
    copd_risk_df,
    values='val',
    index=['country', 'year'],
    columns='risk_factor',
    aggfunc='mean',
).reset_index()

# --- Step 6: Attach the COPD burden and risk columns to the socioeconomic rows ---
# Both joins are left joins keyed on (country, year): every socioeconomic row
# is kept, and rows with no match in a pivot get missing values in that
# pivot's columns.
merged_df = (
    socioeconomic_df
    .merge(burden_pivot, how='left', on=['country', 'year'])
    .merge(risk_pivot, how='left', on=['country', 'year'])
)

# --- Quick look at the merged result: dimensions, then the first rows ---
print("Merged DataFrame shape:", merged_df.shape)
print(merged_df.head(n=5))
