In [14]:
from pathlib import Path
import pandas as pd

# Directory that holds one CSV per report year (2015.csv ... 2024.csv).
data_dir = Path("data")
years = range(2015, 2025)  # World Happiness Report data: 2015-2024 inclusive

# Build the path table first so it is the single place where file names are derived.
file_paths = {year: data_dir / f"{year}.csv" for year in years}

# Fail fast, naming every missing file at once instead of stopping at the first one.
missing_files = [str(path) for path in file_paths.values() if not path.is_file()]
if missing_files:
    raise FileNotFoundError(f"Missing yearly CSV file(s): {missing_files}")

# Load each year's raw CSV as-is; the column layout differs between years.
raw_by_year = {year: pd.read_csv(path) for year, path in file_paths.items()}

# Sanity check: print (rows, columns) for every year.
for y, df in raw_by_year.items():
    print(y, df.shape)

2015 (158, 12)
2016 (157, 13)
2017 (155, 12)
2018 (156, 9)
2019 (156, 9)
2020 (153, 11)
2021 (149, 11)
2022 (146, 11)
2023 (137, 11)
2024 (143, 11)


In [15]:
# Read every yearly CSV listed in `file_paths` into a {year: DataFrame} dict.
raw_by_year = {}
for year, path in file_paths.items():
    raw_by_year[year] = pd.read_csv(path)

# Report each frame's dimensions in ascending year order.
for year in sorted(raw_by_year.keys()):
    shape = raw_by_year[year].shape
    rows, cols = shape[0], shape[1]
    print(f"{year}: {rows} rows, {cols} cols")

2015: 158 rows, 12 cols
2016: 157 rows, 13 cols
2017: 155 rows, 12 cols
2018: 156 rows, 9 cols
2019: 156 rows, 9 cols
2020: 153 rows, 11 cols
2021: 149 rows, 11 cols
2022: 146 rows, 11 cols
2023: 137 rows, 11 cols
2024: 143 rows, 11 cols


In [17]:
# Column names differ slightly from year to year, so every yearly file is
# mapped onto one fixed schema made of the 10 columns below.
STANDARD_COLS = [
    "year", "country", "rank", "score",
    "gdp_per_capita", "social_support", "healthy_life_expectancy",
    "freedom", "generosity", "corruption"
]

# Raw column name for each standard column, grouped by the years that share a layout.
_RAW_COLUMNS_BY_YEARS = (
    # 2015-2016: "Happiness Score"/"Happiness Rank", "Economy (GDP per Capita)", ...
    ((2015, 2016), {
        "country": "Country",
        "rank": "Happiness Rank",
        "score": "Happiness Score",
        "gdp_per_capita": "Economy (GDP per Capita)",
        "social_support": "Family",
        "healthy_life_expectancy": "Health (Life Expectancy)",
        "freedom": "Freedom",
        "generosity": "Generosity",
        "corruption": "Trust (Government Corruption)",
    }),
    # 2017: column names use dots in place of spaces and parentheses.
    ((2017,), {
        "country": "Country",
        "rank": "Happiness.Rank",
        "score": "Happiness.Score",
        "gdp_per_capita": "Economy..GDP.per.Capita.",
        "social_support": "Family",
        "healthy_life_expectancy": "Health..Life.Expectancy.",
        "freedom": "Freedom",
        "generosity": "Generosity",
        "corruption": "Trust..Government.Corruption.",
    }),
    # 2018-2019: "Country or region", "Overall rank", "Score".
    ((2018, 2019), {
        "country": "Country or region",
        "rank": "Overall rank",
        "score": "Score",
        "gdp_per_capita": "GDP per capita",
        "social_support": "Social support",
        "healthy_life_expectancy": "Healthy life expectancy",
        "freedom": "Freedom to make life choices",
        "generosity": "Generosity",
        "corruption": "Perceptions of corruption",
    }),
)

# 2020-2024 layout ("Country name", "Happiness score", ...). It is also the
# fallback for any year not listed above.
_RAW_COLUMNS_DEFAULT = {
    "country": "Country name",
    "rank": "Happiness Rank",
    "score": "Happiness score",
    "gdp_per_capita": "Economy (GDP per Capita)",
    "social_support": "Social support",
    "healthy_life_expectancy": "Healthy life expectancy",
    "freedom": "Freedom to make life choices",
    "generosity": "Generosity",
    "corruption": "Perceptions of corruption",
}


def _raw_columns_for_year(year: int) -> dict:
    """Return the {standard column: raw column} mapping that applies to `year`."""
    for layout_years, raw_columns in _RAW_COLUMNS_BY_YEARS:
        if year in layout_years:
            return raw_columns
    return _RAW_COLUMNS_DEFAULT


def standardize_year_df(raw_df: pd.DataFrame, year: int) -> pd.DataFrame:
    """
    Convert one year's raw DataFrame to the standard schema (STANDARD_COLS).

    Parameters
    ----------
    raw_df : pd.DataFrame
        Raw per-year frame loaded from `{year}.csv`. It is not modified;
        the function works on a copy.
    year : int
        Target year. 2015-2016, 2017 and 2018-2019 each have their own column
        layout; every other year is read with the 2020-2024 layout.

    Returns
    -------
    pd.DataFrame
        Frame with exactly the 10 STANDARD_COLS columns, in that order, with
        `year` filled with the given value and the index of `raw_df` kept.

    Raises
    ------
    KeyError
        If any raw column required by the year's layout is absent. The message
        names the year and lists every missing column.
    """
    df = raw_df.copy()
    # Strip stray whitespace from column names; non-string labels are left as-is.
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]

    raw_columns = _raw_columns_for_year(year)

    # Check up front so one error reports all missing columns, not just the first.
    missing = [raw for raw in raw_columns.values() if raw not in df.columns]
    if missing:
        raise KeyError(f"year={year}: missing raw column(s) {missing}")

    std_df = pd.DataFrame({
        "year": year,
        **{std_col: df[raw_col] for std_col, raw_col in raw_columns.items()},
    })
    return std_df[STANDARD_COLS]

In [4]:
standardized_by_year = {}

# A column-name mismatch surfaces as KeyError from standardize_year_df.
# Print the failing year and the columns that year's frame actually has,
# then re-raise so the cell still stops on the error.
for year in years:
    try:
        std_df = standardize_year_df(raw_by_year[year], year)
    except KeyError as err:
        print(f"[KeyError] year={year} missing column: {err}")
        print("Columns:", list(raw_by_year[year].columns))
        raise
    else:
        standardized_by_year[year] = std_df

# Stack all standardized yearly frames into one analysis table with a fresh index.
happiness_df = pd.concat(
    [standardized_by_year[key] for key in standardized_by_year],
    ignore_index=True,
)

happiness_df.head()

Unnamed: 0,year,country,rank,score,gdp_per_capita,social_support,healthy_life_expectancy,freedom,generosity,corruption
0,2015,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.29678,0.41978
1,2015,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.4363,0.14145
2,2015,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.34139,0.48357
3,2015,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.34699,0.36503
4,2015,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.45811,0.32957


In [5]:
# Columns used as numbers must share a numeric dtype; mixed types would
# break later analysis and plotting.
numeric_cols = [
    "rank",
    "score",
    "gdp_per_capita",
    "social_support",
    "healthy_life_expectancy",
    "freedom",
    "generosity",
    "corruption",
]

# Values that cannot be parsed as numbers become NaN (errors="coerce").
happiness_df = happiness_df.assign(
    **{
        name: pd.to_numeric(happiness_df[name], errors="coerce")
        for name in numeric_cols
    }
)

print("shape:", happiness_df.shape)
print()
print(happiness_df.dtypes)

shape: (1510, 10)

year                         int64
country                        str
rank                         int64
score                      float64
gdp_per_capita             float64
social_support             float64
healthy_life_expectancy    float64
freedom                    float64
generosity                 float64
corruption                 float64
dtype: object


In [6]:
# Per-column summary of missing values: absolute count and share of all rows.
missing_count = happiness_df.isna().sum()
missing_summary_df = missing_count.to_frame("missing_count")
missing_summary_df = missing_summary_df.assign(
    missing_ratio=(missing_summary_df["missing_count"] / len(happiness_df)).round(4)
)
# Columns with the most missing values come first.
missing_summary_df = missing_summary_df.sort_values(
    ["missing_count", "missing_ratio"], ascending=False
)
missing_summary_df

Unnamed: 0,missing_count,missing_ratio
healthy_life_expectancy,4,0.0026
corruption,4,0.0026
gdp_per_capita,3,0.002
social_support,3,0.002
freedom,3,0.002
generosity,3,0.002
year,0,0.0
country,0,0.0
rank,0,0.0
score,0,0.0


In [7]:
# Check how much data each year contributes: row count and distinct countries.
rows_by_year = happiness_df.groupby("year")
year_rows_df = rows_by_year.size().to_frame("rows")
year_countries_df = rows_by_year["country"].nunique().to_frame("unique_countries")

# Both frames are indexed by year, so an index join lines them up side by side.
year_summary_df = year_rows_df.join(year_countries_df)
year_summary_df

Unnamed: 0_level_0,rows,unique_countries
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2015,158,158
2016,157,157
2017,155,155
2018,156,156
2019,156,156
2020,153,153
2021,149,149
2022,146,146
2023,137,137
2024,143,143


In [8]:
# Save the combined 2015-2024 table next to the yearly source files.
output_path = data_dir.joinpath("happiness_2015_2024.csv")
# index=False: the row index is not written to the file.
happiness_df.to_csv(output_path, index=False)
print("Saved:", output_path)

Saved: data\happiness_2015_2024.csv
