___
# README: Prepare data for TWFE and Staggered DiD

- This notebook prepares the data for estimating causal effects via TWFE (two-way fixed effects) and staggered difference-in-differences à la de Chaisemartin & D'Haultfoeuille (2020).
- Before running the next cells, please upload the following .csv file to the Files section in Google Colab (left panel):
  - unified-data-20240723.csv
- The .csv file can be found [here](https://drive.google.com/drive/folders/1-UbPqDkB9ZBFNmAti3Gt936HVUyfNrfh?usp=drive_link).

___

In [None]:
import numpy as np
import pandas as pd

# Display every column of wide DataFrames (None removes the column limit).
pd.set_option("display.max_columns",None)

In [None]:
def get_custom_interval(year):
    """Return the label of the 10-year bin, anchored at 1945, that contains ``year``.

    Bins run 1945-1954, 1955-1964, and so on (years before 1945 fall into
    earlier bins on the same grid). The label has the form ``'[start-end]'``
    with both bounds inclusive.
    """
    # Number of whole decades between 1945 and ``year`` (floored).
    bin_index, _ = divmod(year - 1945, 10)
    lower = 1945 + bin_index * 10
    upper = lower + 9
    return f'[{lower}-{upper}]'

## 1. Load and process data

In [None]:
# Input file (see the README cell: it must be uploaded before running) and the
# columns read from it.
PATH_IDEA = "unified-data-20240723.csv"
IDEA_COLS = ["year","Country","type","CV_idea",
             "Turnout_idea","in_idea","Population"]

# Keep only the records flagged with in_idea == 1; the flag is constant after
# the filter, so the column is dropped.
idea = (
    pd.read_csv(PATH_IDEA, usecols=IDEA_COLS)
    .query("in_idea == 1")
    .drop(columns=["in_idea"])
)

print(idea.shape)

# Election-to-election change in CV_idea.
# The diff is taken within each (Country, type) series. Grouping by Country
# alone would diff the first record of one election type against the last
# record of the other type (rows are ordered by type before year), producing
# spurious +1/-1 changes on the first record of the second type. With the
# per-series grouping, the first record of every series is NaN instead.
idea["cv_change"] = (
    idea
    .sort_values(["Country","type","year"], ascending=[True,True,True])
    .groupby(["Country","type"])["CV_idea"]
    .diff()
)

# Happy path
# ==========
# Inspect one series as a sanity check: Uruguay's legislative elections.
filt_example = (idea["Country"]=="Uruguay") & (idea["type"]=="Legislative")

(
    idea[filt_example]
    .sort_values("year", ascending=True)
    .head(10)
)


(3008, 6)


Unnamed: 0,year,Country,type,CV_idea,Turnout_idea,Population,cv_change
1425,1946,Uruguay,Legislative,0.0,67.43,2281000,
1457,1950,Uruguay,Legislative,0.0,70.91,2407000,0.0
1492,1954,Uruguay,Legislative,0.0,67.87,2579000,0.0
1530,1958,Uruguay,Legislative,0.0,71.3,2762000,0.0
1562,1962,Uruguay,Legislative,0.0,76.63,2914000,0.0
1603,1966,Uruguay,Legislative,0.0,74.28,2750000,0.0
1651,1971,Uruguay,Legislative,1.0,91.9,2920000,1.0
1815,1984,Uruguay,Legislative,1.0,87.87,2990000,0.0
498,1989,Uruguay,Legislative,1.0,88.67,3077000,0.0
644,1994,Uruguay,Legislative,1.0,91.44,3167000,0.0


In [None]:
# Ugly path
# =========
# (Country, type) pairs flagged for inspection, presumably the series in which
# cv_change takes the value -1 (verify against the data).
opp_direction = [
    ['Uruguay', 'Presidential'],        # Correction: cv_change = -1 for Uruguay
    ['Netherlands', 'Legislative'],
    ['Switzerland', 'Legislative'],
    ['Austria', 'Presidential'],
    ['Philippines', 'Legislative'],
    ['Guatemala', 'Presidential'],
    ['Guatemala', 'Legislative'],
    ['Bulgaria', 'Presidential'],       # Correction: cv_change = -1 for Bulgaria
    ['Venezuela', 'Legislative'],
    ['Venezuela', 'Presidential'],
    ['Italy', 'Legislative'],
    ['Dominican Republic', 'Legislative'],
    ['Dominican Republic', 'Presidential'],
    ['Chile', 'Legislative'],
    ['Chile', 'Presidential'],
    ['Fiji', 'Legislative'],
    ['Cyprus', 'Presidential'],
    ['Cyprus', 'Legislative']
]

# Position in opp_direction of the pair to inspect (7 -> Bulgaria, Presidential).
n = 7
cntr, kind = opp_direction[n]

# Records of the selected series in chronological order.
idea.loc[idea["Country"].eq(cntr) & idea["type"].eq(kind)].sort_values("year").head(25)

Unnamed: 0,year,Country,type,CV_idea,Turnout_idea,Population,cv_change
551,1992,Bulgaria,Presidential,0.0,75.17,8540000,-1.0
669,1996,Bulgaria,Presidential,0.0,62.295,8364000,0.0
827,2001,Bulgaria,Presidential,0.0,48.275,7932984,0.0
990,2006,Bulgaria,Presidential,0.0,43.365,7385367,0.0
1163,2011,Bulgaria,Presidential,0.0,50.27,7093635,0.0
1343,2016,Bulgaria,Presidential,1.0,53.36,7144653,1.0
3010,2021,Bulgaria,Presidential,1.0,36.545,6919180,0.0


In [None]:
# Correcting encoding errors shown above
# ======================================
# Set cv_change to 0 on two presidential records: Uruguay 1946 and Bulgaria 1992.

is_presidential = idea["type"].eq("Presidential")
uruguay_1946 = idea["Country"].eq("Uruguay") & idea["year"].eq(1946)
bulgaria_1992 = idea["Country"].eq("Bulgaria") & idea["year"].eq(1992)

idea.loc[is_presidential & (uruguay_1946 | bulgaria_1992), "cv_change"] = 0

In [None]:
# Creating lead and lag columns per country
# =========================================

# Presidential elections
# ======================
# Mapping country -> index labels of its presidential records.
groups_pres = idea[idea["type"]=="Presidential"].groupby(["Country"]).groups

proc_data_pres = {}             # country -> its records with lag/lead columns added
always_treated_pres = []        # CV_idea equals 1 in every record
never_treated_pres = []         # CV_idea equals 0 in every record
treated_pres = []               # distinct CV_idea values are exactly 0 then 1
adopt_or_abandon_cv_pres = {}   # country -> chronological CV_idea values joined by "-"

for country, row_labels in groups_pres.items():

    # This country's presidential records in chronological order.
    sorted_data = idea.loc[row_labels, :].sort_values("year", ascending=True).copy()

    # lag{n}: cv_change shifted n records later in time; remaining gaps are
    # back-filled from the next available record.
    for n in range(11):
        sorted_data[f"lag{n}"] = sorted_data["cv_change"].shift(n).bfill()

    # lead{n}: cv_change shifted n records earlier in time; remaining gaps are
    # forward-filled from the previous available record.
    for n in range(1, 11):
        sorted_data[f"lead{n}"] = sorted_data["cv_change"].shift(-n).ffill()

    proc_data_pres[country] = sorted_data

    # Distinct CV_idea values in order of first appearance. The comparisons are
    # order-sensitive: a series that goes from 1 to 0 matches none of them.
    cv_values = list(sorted_data["CV_idea"].unique())
    if cv_values == [1]:
        always_treated_pres.append(country)
    elif cv_values == [0]:
        never_treated_pres.append(country)
    elif cv_values == [0, 1]:
        treated_pres.append(country)

    adopt_or_abandon_cv_pres[country] = "-".join(
        str(value) for value in sorted_data["CV_idea"].values
    )

In [None]:
# Countries whose presidential CV_idea sequence starts with 1.0 and ends with 0.0.
abandon_cv_pres = []
for c, s in adopt_or_abandon_cv_pres.items():
    if not (s.startswith("1.0") and s.endswith("0.0")):
        continue
    print(f"Abandon cv (presidential): {c}")
    abandon_cv_pres.append(c)

Abandon cv (presidential): Austria
Abandon cv (presidential): Chile
Abandon cv (presidential): Cyprus
Abandon cv (presidential): Dominican Republic
Abandon cv (presidential): Guatemala
Abandon cv (presidential): Venezuela


In [None]:
# Legislative elections
# =====================
# Mapping country -> index labels of its legislative records.
groups_legi = idea[idea["type"]=="Legislative"].groupby(["Country"]).groups

proc_data_legi = {}             # country -> its records with lag/lead columns added
always_treated_legi = []        # CV_idea equals 1 in every record
never_treated_legi = []         # CV_idea equals 0 in every record
treated_legi = []               # distinct CV_idea values are exactly 0 then 1
adopt_or_abandon_cv_legi = {}   # country -> chronological CV_idea values joined by "-"

for country, row_labels in groups_legi.items():

    # This country's legislative records in chronological order.
    sorted_data = idea.loc[row_labels, :].sort_values("year", ascending=True).copy()

    # lag{n}: cv_change shifted n records later in time; remaining gaps are
    # back-filled from the next available record.
    for n in range(11):
        sorted_data[f"lag{n}"] = sorted_data["cv_change"].shift(n).bfill()

    # lead{n}: cv_change shifted n records earlier in time; remaining gaps are
    # forward-filled from the previous available record.
    for n in range(1, 11):
        sorted_data[f"lead{n}"] = sorted_data["cv_change"].shift(-n).ffill()

    proc_data_legi[country] = sorted_data

    # Distinct CV_idea values in order of first appearance. The comparisons are
    # order-sensitive: a series that goes from 1 to 0 matches none of them.
    cv_values = list(sorted_data["CV_idea"].unique())
    if cv_values == [1]:
        always_treated_legi.append(country)
    elif cv_values == [0]:
        never_treated_legi.append(country)
    elif cv_values == [0, 1]:
        treated_legi.append(country)

    adopt_or_abandon_cv_legi[country] = "-".join(
        str(value) for value in sorted_data["CV_idea"].values
    )

In [None]:
# Countries whose legislative CV_idea sequence starts with 1.0 and ends with 0.0.
abandon_cv_legi = []
for c, s in adopt_or_abandon_cv_legi.items():
    if not (s.startswith("1.0") and s.endswith("0.0")):
        continue
    print(f"Abandon cv (legislative): {c}")
    abandon_cv_legi.append(c)

Abandon cv (legislative): Chile
Abandon cv (legislative): Cyprus
Abandon cv (legislative): Dominican Republic
Abandon cv (legislative): Fiji
Abandon cv (legislative): Guatemala
Abandon cv (legislative): Italy
Abandon cv (legislative): Netherlands
Abandon cv (legislative): Switzerland
Abandon cv (legislative): Venezuela


In [None]:
# Concatenating all datasets
# ==========================

# Column order of the combined panel.
final_cols = ["year","Country","type","CV_idea","Turnout_idea","cv_change",
              "lag0","lag1","lag2","lag3","lag4","lag5","lag6","lag7","lag8",
              "lag9","lag10","lead1","lead2","lead3","lead4","lead5","lead6",
              "lead7","lead8","lead9","lead10","Population"]

# Gather the per-country frames and concatenate once. Calling pd.concat inside
# the loop re-copies the accumulated frame on every iteration, and seeding it
# with an empty DataFrame relies on concat ignoring empty inputs when inferring
# dtypes (deprecated in recent pandas).
# Always-treated countries are excluded; countries that abandoned CV are kept
# (to drop them too, also test membership in abandon_cv_pres / abandon_cv_legi).
frames = [
    proc_data_pres[subject]
    for subject in groups_pres.keys()
    if subject not in always_treated_pres
]
frames += [
    proc_data_legi[subject]
    for subject in groups_legi.keys()
    if subject not in always_treated_legi
]

if frames:
    df = pd.concat(frames, axis=0)
    # final_cols first, followed by any other column in its existing order;
    # a final_cols entry absent from the data becomes an all-NaN column.
    extra_cols = [col for col in df.columns if col not in final_cols]
    df = df.reindex(columns=final_cols + extra_cols)
else:
    # Nothing to stack: fall back to an empty frame with the expected columns.
    df = pd.DataFrame(columns=final_cols)

In [None]:
# Create 10 years bin
# ===================
# Label every record with the 10-year interval that contains its year.

df["interval"] = df["year"].map(get_custom_interval)

In [None]:
# Creating country id column
# ==========================
# Integer id per country: its position in the sorted list of distinct country
# names found in either election type.

all_countries = np.unique(list(groups_legi.keys()) + list(groups_pres.keys()))
cid = {name: idx for idx, name in enumerate(all_countries)}

df["cid"] = df["Country"].map(cid)

In [None]:
# (rows, columns) of the combined panel built above.
df.shape

(2497, 30)

In [None]:
# Records with lack of info about population
# ==========================================
# A missing population is encoded as the string "-".

df.loc[df["Population"].eq("-")].shape

(34, 30)

In [None]:
# Rows whose population is not the "-" placeholder: size, then a short preview.
has_population = df["Population"].ne("-")
print(df[has_population].shape)
df[has_population].head(3)

(2463, 30)


Unnamed: 0,year,Country,type,CV_idea,Turnout_idea,cv_change,lag0,lag1,lag2,lag3,lag4,lag5,lag6,lag7,lag8,lag9,lag10,lead1,lead2,lead3,lead4,lead5,lead6,lead7,lead8,lead9,lead10,Population,interval,cid
2310,2004,Afghanistan,Presidential,0.0,83.66,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,0.0,0.0,,,,,,,,25697635,[1995-2004],0
2477,2009,Afghanistan,Presidential,0.0,38.8,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,0.0,0.0,,,,,,,,28483631,[2005-2014],0
2652,2014,Afghanistan,Presidential,0.0,36.285,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,0.0,0.0,,,,,,,,31822848,[2005-2014],0


## 2. Save processed data

In [None]:
# Write only the rows whose population is not the "-" placeholder; the index is not saved.
df.loc[df["Population"].ne("-")].to_csv("data-final-assessment.csv", index=False)