In [160]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

OUTDIR = Path("./prepared")
OUTDIR.mkdir(parents=True, exist_ok=True)

In [161]:
df_yields = pd.read_csv("./data/Yields.csv")
print(df_yields.shape)

n_crops = df_yields["Crop"].nunique(dropna=True)
n_munis = df_yields["Municipality"].nunique(dropna=True)
print(f"- number of different crops: {n_crops}") # 53 crops expected
print(f"- number of different municipalities: {n_munis}") # 99 municipalities expected
df_yields.head()

(44448, 9)
- number of different crops: 53
- number of different municipalities: 99


Unnamed: 0,Year,Municipality,Crop,Variety,Farms,Acres,Yield/Acre,t,i
0,1996,ALEXANDER,ARGENTINE CANOLA,INNOVATOR (HCN 92) (LT),3,575.0,0.579,1,1
1,1996,ALEXANDER,ARGENTINE CANOLA,QUANTUM (91-21864 NA),6,1125.0,0.78,1,1
2,1996,ALEXANDER,ARGENTINE CANOLA,45A71 (NS1471)(ST),4,1012.0,0.756,1,1
3,1996,ALEXANDER,BARLEY,ROBUST,9,1247.0,1.607,1,1
4,1996,ALEXANDER,CANARYSEED,KEET,3,535.0,0.184,1,1


In [162]:
df_weather = pd.read_csv("./data/Weather Reanalysis.csv")
print(df_weather.shape)

n_munis = df_weather["Municipality"].nunique(dropna=True)
print(f"- number of different municipalities: {n_munis}") # 84 municipalities expected
df_weather.head()

(490896, 9)
- number of different municipalities: 84


Unnamed: 0,Municipality,Date/Time,Year,Month,Day,Max_Temp,Min_Temp,Mean_Temp,Ptol
0,ALEXANDER,1/1/96,1996,1,1,-10.4164,-21.008989,-15.712695,-1.37e-10
1,ALEXANDER,1/2/96,1996,1,2,-15.238113,-18.910885,-17.074499,-1.37e-10
2,ALEXANDER,1/3/96,1996,1,3,-13.913316,-21.768749,-17.841032,0.001088851
3,ALEXANDER,1/4/96,1996,1,4,-22.517648,-28.702039,-25.609844,-1.37e-10
4,ALEXANDER,1/5/96,1996,1,5,-23.918762,-30.488448,-27.203605,-1.37e-10


In [163]:
YEARS = range(1996, 2012) # 1996–2011
LAST5 = list(range(2007, 2012))             # 2007–2011

def normalize_yields_per_crop(y):
    """
    The yield for each crop in the crop mix of a given municipality was then rescaled within the interval [0,1]
    based on its maximal yield in 1996–2011
    """
    y = y.copy()
    y["Yield/Acre"] = pd.to_numeric(y["Yield/Acre"], errors="coerce")
    # filter to study window before computing mins/maxs (as per paper)
    y = y[y["Year"].between(min(YEARS), max(YEARS))]
    
    g = y.groupby("Crop")["Yield/Acre"]
    mins = g.transform("min")
    maxs = g.transform("max")
    rng  = maxs - mins
    # vectorized, stays aligned; sets 0 when rng == 0
    y["YieldNorm"] = np.where(rng != 0, (y["Yield/Acre"] - mins) / rng, 0.0)
    return y


def compute_recent5_weights(y, last5=LAST5):
    """
    We aggregate yields for a crop mix that is representative of the recent five years for each municipality.
    The mix includes all varieties for each crop in the mix.
    """
    w = (y[y["Year"].isin(last5)]
         .groupby(["Municipality", "Crop"], as_index=False)["Acres"].sum())

    # Ensure numeric and clean
    w["Acres"] = pd.to_numeric(w["Acres"], errors="coerce")
    w = w.replace([np.inf, -np.inf], np.nan).dropna(subset=["Acres"])
    w = w[w["Acres"] > 0]

    # Use transform for aligned indexing (prevents the TypeError)
    muni_totals = w.groupby("Municipality")["Acres"].transform("sum")
    w["Weight"] = w["Acres"] / muni_totals

    # Re-normalize inside each municipality to guard against fp drift
    w["Weight"] = w.groupby("Municipality")["Weight"].transform(lambda s: s / s.sum())

    return w[["Municipality", "Crop", "Weight"]].copy()

# def build_portfolio_yield(y_norm, weights):
#     """
#     To make yield risks comparable across crops, we first normalize yields for each crop over 1996–2011.
#     We then build, for every municipality, a representative crop-mix yield that uses a fixed set of acreage weights based on the most recent five years.
#     The mix includes all varieties within each crop.
#     """
#     """
#     Portfolio yield for every Municipality and Year:
#     Y^{mix}_{i,t} = sum_c w_{i,c} * y'_{i,t,c}
#     where y' is min–max normalized per Crop and w_{i,c} are the 5-year acreage shares.
#     """
#     m = y_norm.merge(weights, on=["Municipality","Crop"], how="inner")
#     port = (m.assign(weighted=lambda df: df["YieldNorm"] * df["Weight"])
#               .groupby(["Municipality","Year"], as_index=False)["weighted"].sum()
#               .rename(columns={"weighted":"YieldPortfolio"}))
    
#     return port

def collapse_varieties_to_crop(y_norm):
    """
    One row per (Municipality, Year, Crop), combining all varieties.
    Prefer acreage-weighted mean of YieldNorm; fallback to simple mean if Acres missing.
    """
    y = y_norm.copy()
    y["Acres"] = pd.to_numeric(y["Acres"], errors="coerce")

    def agg(group):
        yn = group["YieldNorm"].astype(float)
        w  = group["Acres"].fillna(0).astype(float)
        if w.sum() > 0:
            val = (yn * w).sum() / w.sum()
        else:
            val = yn.mean()  # no acres → simple mean across varieties
        return pd.Series({"YieldNormCrop": val})

    crop_level = (y.groupby(["Municipality","Year","Crop"], as_index=False)
                    .apply(agg)
                    .reset_index(drop=True))
    return crop_level

def build_portfolio_yield(y_norm, weights):
    # 1) collapse to crop-level first (no duplicate varieties)
    y_crop = collapse_varieties_to_crop(y_norm)
    # 2) merge crop-level with weights (one row per muni-crop)
    m = y_crop.merge(weights, on=["Municipality","Crop"], how="inner")
    # 3) weighted sum over crops per (Municipality, Year)
    port = (m.assign(weighted=lambda df: df["YieldNormCrop"] * df["Weight"])
              .groupby(["Municipality","Year"], as_index=False)["weighted"].sum()
              .rename(columns={"weighted":"YieldPortfolio"}))
    # 4) numerical safety
    port["YieldPortfolio"] = port["YieldPortfolio"]
    return port


In [164]:
y_norm = normalize_yields_per_crop(df_yields)
weights = compute_recent5_weights(y_norm)     # uses Acres in 2007–2011
y_port = build_portfolio_yield(y_norm, weights)
y_port.to_csv(OUTDIR / "yields_portfolio.csv", index=False)
y_port.head()

  .apply(agg)


Unnamed: 0,Municipality,Year,YieldPortfolio
0,ALEXANDER,1996,0.381934
1,ALEXANDER,1997,0.359387
2,ALEXANDER,1998,0.365427
3,ALEXANDER,1999,0.415961
4,ALEXANDER,2000,0.279473


In [165]:
GROW_START, GROW_END = ("05-01", "10-31")   # May–Oct

def parse_date_us_2digit_year(s):
    # Robustly parse strings like "1/1/96" -> 1996-01-01
    return pd.to_datetime(s, format="%m/%d/%y", errors="coerce")

def load_weather(e):
    # Parse date
    e["__date"] = parse_date_us_2digit_year(e["Date/Time" ])

    # Filter to study window
    e = e[e["Year"].between(min(YEARS), max(YEARS))].copy()

    # Growing season mask (May–Oct by month/day)
    e["__mmdd"] = e["__date"].dt.strftime("%m-%d")
    season = (e["__mmdd"] >= GROW_START) & (e["__mmdd"] <= GROW_END)
    e = e.loc[season].copy()

    return e

def weather_indices(e,
                thresholds=dict(hot=30.0, frost=0.0, wet=0.001, gdd_base=5.0, gdd_cap=30.0)):
    """
    Build annual (Municipality, Year) indices for May–Oct:
      - Averages: mean Tmin/Tmax/Tmean
      - Extremes: min Tmin, max Tmax, max daily precip
      - Totals/Counts: precip total, #hot days (Tmax>=hot), #frost days (Tmin<=frost), #wet days (P>=wet)
      - Degree-days: capped GDD using daily mean
    """
    e = e.copy()

    # Degree-days (cap at gdd_cap and floor at base)
    tmean = e["Mean_Temp"]
    base = thresholds["gdd_base"]; cap = thresholds["gdd_cap"]
    gdd = np.clip(tmean - base, 0, cap - base)
    e["GDD"] = gdd

    # Extreme-day counts
    e["HotDays"]   = (e["Max_Temp"] >= thresholds["hot"]).astype(int)
    e["FrostDays"] = (e["Min_Temp"] <= thresholds["frost"]).astype(int)
    e["WetDays"]   = (e["Ptol"] >= thresholds["wet"]).astype(int)

    # Aggregations
    grp = e.groupby(["Municipality", "Year"])
    idx = grp.agg({
        "Min_Temp": ["mean", "min"],
        "Max_Temp": ["mean", "max"],
        "Mean_Temp":["mean"],
        "Ptol": ["sum", "max"],
        "GDD":    ["sum"],
        "HotDays":["sum"],
        "FrostDays":["sum"],
        "WetDays":["sum"],
    }).reset_index()

    # Flatten columns
    idx.columns = ["Municipality","Year",
                   "Tmin_mean","Tmin_min",
                   "Tmax_mean","Tmax_max",
                   "Tmean_mean",
                   "Ptot_sum","Ptot_max",
                   "GDD_sum",
                   "HotDays_sum","FrostDays_sum","WetDays_sum"]
    return idx

In [166]:
weather_idx_raw = weather_indices(df_weather)
weather_idx_raw.to_csv(OUTDIR / "weather_indices_raw.csv", index=False)
weather_idx_raw.head()

Unnamed: 0,Municipality,Year,Tmin_mean,Tmin_min,Tmax_mean,Tmax_max,Tmean_mean,Ptot_sum,Ptot_max,GDD_sum,HotDays_sum,FrostDays_sum,WetDays_sum
0,ALEXANDER,1996,-2.231028,-37.795774,5.486426,34.526901,1.627699,0.77778,0.047799,1786.386633,5,189,112
1,ALEXANDER,1997,-0.277886,-32.508271,7.44873,31.53468,3.585422,0.745016,0.035828,1811.883024,2,186,107
2,ALEXANDER,1998,1.876897,-28.898511,9.551834,30.971862,5.714365,0.6733,0.040896,2024.97022,3,153,111
3,ALEXANDER,1999,1.274463,-30.357541,8.90404,33.457402,5.089251,0.655147,0.035641,1822.989375,5,165,119
4,ALEXANDER,2000,0.006244,-30.738687,7.698553,30.337081,3.852398,0.888865,0.043473,1813.705321,2,161,125


In [167]:
yields_weather = y_port.merge(weather_idx_raw, on=["Municipality","Year"], how="inner")
yields_weather.to_csv(OUTDIR / "yields_weather.csv", index=False)
yields_weather.head()

Unnamed: 0,Municipality,Year,YieldPortfolio,Tmin_mean,Tmin_min,Tmax_mean,Tmax_max,Tmean_mean,Ptot_sum,Ptot_max,GDD_sum,HotDays_sum,FrostDays_sum,WetDays_sum
0,ALEXANDER,1996,0.381934,-2.231028,-37.795774,5.486426,34.526901,1.627699,0.77778,0.047799,1786.386633,5,189,112
1,ALEXANDER,1997,0.359387,-0.277886,-32.508271,7.44873,31.53468,3.585422,0.745016,0.035828,1811.883024,2,186,107
2,ALEXANDER,1998,0.365427,1.876897,-28.898511,9.551834,30.971862,5.714365,0.6733,0.040896,2024.97022,3,153,111
3,ALEXANDER,1999,0.415961,1.274463,-30.357541,8.90404,33.457402,5.089251,0.655147,0.035641,1822.989375,5,165,119
4,ALEXANDER,2000,0.279473,0.006244,-30.738687,7.698553,30.337081,3.852398,0.888865,0.043473,1813.705321,2,161,125
