In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import os

# --- 1. Load the labor share file ---
input_file = os.path.join('..', 'Data', 'pwt_labor_share.csv')
df = pd.read_csv(input_file)
# Normalise the headers (trim whitespace, lower-case) so the column names
# used below match regardless of how the CSV was exported.
df.columns = df.columns.str.strip().str.lower()

# The file is wide: four identifying columns ("iso code", "country",
# "variable code", "variable name") followed by one column per year.

# --- 2. Melt into long format ---
# Every non-identifying column becomes a (year, laborshare) row.
df_melt = pd.melt(
    df,
    id_vars=["iso code", "country", "variable code", "variable name"],
    var_name="year",
    value_name="laborshare",
)
# Coerce both columns to numbers; anything unparseable becomes NaN.
df_melt = df_melt.assign(
    year=pd.to_numeric(df_melt["year"], errors="coerce"),
    laborshare=pd.to_numeric(df_melt["laborshare"], errors="coerce"),
)

# Keep only 1991–2019 (inclusive) as the sample used for the regression step.
# Rows whose year is NaN fail the range test and are excluded as well.
in_fit_window = df_melt["year"].between(1991, 2019)
df_melt_valid = df_melt.loc[in_fit_window].copy()

# --- 3. Extrapolate 2020–2023 using linear regression for each country ---
def extrapolate_laborshare(group):
    """
    Extrapolate one country's labor share to 2020–2023 with a linear trend.

    Fits laborshare = a*year + b by least squares on the observed rows of
    `group` and evaluates the fitted line at 2020, 2021, 2022 and 2023.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single country. Must contain the columns "iso code",
        "country", "variable code", "variable name", "year" and "laborshare".

    Returns
    -------
    pandas.DataFrame
        Four rows (one per predicted year) carrying the identifying columns
        of the first row of `group`, plus "year" and "laborshare".
        An empty DataFrame is returned when fewer than two distinct years
        have a non-missing labor share, because no line can be fitted.
    """
    # Rows with a missing year or labor share must be removed before fitting:
    # np.polyfit does not skip NaN, so a single gap would turn every
    # prediction into NaN (or make the least-squares solver fail).
    observed = group.dropna(subset=["year", "laborshare"])

    # A line needs observations at two different years at least.
    if observed["year"].nunique() < 2:
        return pd.DataFrame()

    x = observed["year"].to_numpy(dtype=float)
    y = observed["laborshare"].to_numpy(dtype=float)
    # Degree-1 fit; coefficients come back highest power first: [a, b]
    a, b = np.polyfit(x, y, 1)

    # Years to predict
    future_years = np.array([2020, 2021, 2022, 2023])
    y_pred = a * future_years + b

    # Scalars are broadcast against the four predicted years.
    df_pred = pd.DataFrame({
        "iso code": group["iso code"].iloc[0],
        "country": group["country"].iloc[0],
        "variable code": group["variable code"].iloc[0],
        "variable name": group["variable name"].iloc[0],
        "year": future_years,
        "laborshare": y_pred
    })
    return df_pred

# Collect one prediction frame per country, skipping every country for which
# extrapolate_laborshare returned an empty frame.
pred_list = []
for ctry, grp in df_melt_valid.groupby("country"):
    df_p = extrapolate_laborshare(grp)
    if df_p.empty:
        continue
    pred_list.append(df_p)

df_predicted = pd.concat(pred_list, ignore_index=True)

# Stack the predictions underneath the full melted data (not just the
# 1991–2019 subset), so every original year is kept alongside 2020–2023.
df_extended = pd.concat([df_melt, df_predicted], ignore_index=True)

# --- 4. Pivot back to wide format with new columns for 2020–2023 ---
id_cols = ["iso code", "country", "variable code", "variable name"]
df_extended_wide = pd.pivot_table(
    df_extended,
    index=id_cols,
    columns="year",
    values="laborshare",
    aggfunc="first",  # take the first value found for an (ids, year) pair
).reset_index()

# After the pivot the year columns carry numeric labels while the identifying
# columns carry string labels; use that to pick out the years and order them.
year_cols = [c for c in df_extended_wide.columns if isinstance(c, (int, float))]
year_cols_sorted = sorted(year_cols)
df_extended_wide = df_extended_wide[id_cols + year_cols_sorted]

# --- 5. Plot each country's labor share from 2019–2023, highlight 2020–2023 ---
# Define the output path
pdf_filename = os.path.join('..', 'Result', 'Plots', 'laborshare_extrapolated_plots.pdf')

# Create the directory that actually receives the PDF ('../Result/Plots').
# Creating only '../Result' is not enough: the PDF write fails with
# FileNotFoundError when the 'Plots' subfolder does not exist yet.
os.makedirs(os.path.dirname(pdf_filename), exist_ok=True)

# One page per country that received an extrapolation.
with PdfPages(pdf_filename) as pdf:
    for ctry in df_predicted["country"].unique():
        # We'll get data for 2019–2023 from df_extended (long format),
        # ordered by year so the line is drawn left to right.
        subset = df_extended[
            (df_extended["country"] == ctry)
            & (df_extended["year"] >= 2019)
            & (df_extended["year"] <= 2023)
        ].sort_values("year")
        if subset.empty:
            continue

        subset_2019 = subset[subset["year"] == 2019]
        subset_future = subset[subset["year"] >= 2020]

        fig = plt.figure(figsize=(7, 5))
        # Plot 2019 in one color (blue), 2020–2023 in another (red)
        # If 2019 doesn't exist, this won't plot a separate point
        if not subset_2019.empty:
            plt.plot(subset_2019["year"], subset_2019["laborshare"], marker='o', color='blue', label='2019 Actual')
        # Plot 2020–2023
        plt.plot(subset_future["year"], subset_future["laborshare"], marker='o', color='red', label='2020–2023 Extrapolated')

        plt.title(f"Labor Share for {ctry} (2019–2023)")
        plt.xlabel("Year")
        plt.ylabel("Labor Share")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        # Write this figure as a page, then release it so figures do not
        # accumulate in memory across countries.
        pdf.savefig(fig)
        plt.close(fig)

# --- 6. Save the updated wide file with new columns 2020–2023 to Result folder ---
# Build the Result directory path once and make sure it exists before writing.
result_dir = os.path.join('..', 'Result')
os.makedirs(result_dir, exist_ok=True)

output_path = os.path.join(result_dir, 'pwt_labor_share_extended.csv')
# index=False: the row index is just a counter and is not written out.
df_extended_wide.to_csv(output_path, index=False)

# Report where both outputs were written.
print(
    f"✅ Done! Extended labor share file saved to: {output_path}",
    f"✅ Plots saved to: {pdf_filename}",
    sep="\n",
)


✅ Done! Extended labor share file saved to: ../Result/pwt_labor_share_extended.csv
✅ Plots saved to: ../Result/Plots/laborshare_extrapolated_plots.pdf
