In [23]:
import pandas as pd
import numpy as np

# Pré-processando dados

## Lendo arquivos

In [24]:
regime_df = pd.read_csv("../data/preprocessed/regime_data.csv")
crisis_df = pd.read_csv("../data/preprocessed/crisis_data.csv")

In [25]:
regime_df["country_name"] = regime_df["country_name"].str.upper()

## Gerando visão agregada em `crisis_df`

Para cada intervalo de tempo (`gov_start`, `gov_end`), geraremos as seguintes visões agregadas relativas ao banco de crises:

1. Para variáveis indicadoras: 'crisis_banking', 'crisis_systemic', 'default_domestic_debt', 'default_sovereign_debt_tp2', 'crisis_currency', 'crisis_currency'
    * **Soma** das variáveis
    * **Média ponderada**, onde o peso para o i-ésimo ano é $w_i=\frac{i}{\sum i}$
2. Para `ref_inflation`, o valor do último ano

### Encontrando os intervalos de tempo no banco de regimes:

In [26]:
intervals_df = regime_df[["gov_start", "gov_duration", "country_name", "gov_id"]]
intervals_df["gov_end"] = intervals_df["gov_start"] + intervals_df["gov_duration"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intervals_df["gov_end"] = intervals_df["gov_start"] + intervals_df["gov_duration"]


In [27]:
gov_id_per_row = []

for index, value in crisis_df.iterrows():

    df_ = intervals_df[intervals_df["country_name"] == value["country_name"]]
    start_interval = df_[df_["gov_start"] <= value["ref_year"]]["gov_start"].max()

    gov_id_per_row.append(
        df_[df_["gov_start"] == start_interval]["gov_id"].values
    )

In [28]:
crisis_df["gov_id"] = [x[0] if x else np.nan for x in gov_id_per_row]

  crisis_df["gov_id"] = [x[0] if x else np.nan for x in gov_id_per_row]


Mantendo apenas países que estejam no dataset de crises

In [29]:
crisis_df = crisis_df.dropna(subset="gov_id")

### Gerando as variáveis agregadas

In [30]:
indicating_cols = ['crisis_banking', 'crisis_systemic', 'default_domestic_debt', 'default_sovereign_debt_tp2', 'crisis_currency', 'crisis_currency']

In [31]:
crisis_df["pos_order"] = (
    crisis_df
    .groupby("gov_id")["ref_year"]
    .rank(method="min")
)

In [32]:
aggr_ponderado = (
    crisis_df
    .groupby("gov_id")
    .agg({"pos_order" : "sum"})
    .rename({"pos_order" : "sum_pos_order"}, axis=1)
    .merge(crisis_df, left_index=True, right_on="gov_id")
)

for col in indicating_cols:
    aggr_ponderado[col] = (aggr_ponderado[col] * aggr_ponderado["pos_order"]) / aggr_ponderado["sum_pos_order"]

aggr_ponderado = (
    aggr_ponderado    
    .groupby("gov_id")
    .agg({col : "sum" for col in indicating_cols})
    .rename({ col : f"pond_{col}" for col in indicating_cols}, axis=1)
)

In [33]:
aggr_df = crisis_df.groupby("gov_id")

In [34]:
crisis_aggr = (
    aggr_df
    .agg({ col : "sum" for col in indicating_cols})
    .merge(
        aggr_df.nth(-1)[["ref_inflation"]], 
        left_index=True, right_index=True
    )
    .rename({"ref_inflation" : "last_ref_inflation"}, axis=1)
    .rename({ col : f"sum_{col}" for col in indicating_cols}, axis=1)
    .merge(aggr_ponderado, left_index=True, right_index=True)
)

### Juntando com dados de regime

In [None]:
joined_df = regime_df.merge(crisis_aggr, left_on="gov_id", right_index=True)

In [None]:
joined_df = joined_df[joined_df["gov_id"] != "MILITARY"]

In [None]:
joined_df = joined_df.set_index("gov_id")

In [None]:
#joined_df["gov_observed_death"] = [1 if x else 0 for x in joined_df["gov_observed_death"]]

In [None]:
joined_df.to_csv("../data/intermediate/aggregated.csv")