## Cleaning V-Dem Episodes of Regime Transformation Dataset

### 1. Load dataset

In [1]:
import yaml

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
try:
    with open('../config.yaml') as file:
        config = yaml.safe_load(file)
except:
    print("Yaml file not found.")

In [24]:
# Load dataset

vdem_df = pd.read_csv(config['input_data']['vdem_episodes_of_regime_transformation_dataset'])
vdem_df.head()

Unnamed: 0.1,Unnamed: 0,country_id,country_text_id,country_name,year,v2x_regime,v2x_polyarchy,v2x_polyarchy_codelow,v2x_polyarchy_codehigh,reg_start_year,...,aut_ep_start_year,aut_ep_end_year,aut_pre_ep_year,aut_ep_termination,aut_ep_prch,aut_ep_pbr,aut_ep_subreg,aut_ep_outcome,aut_ep_outcome_agg,aut_ep_censored
0,1,3,MEX,Mexico,1900,0.0,0.124,0.112,0.135,1900.0,...,,,0,,,,,0,0,0
1,2,3,MEX,Mexico,1901,0.0,0.109,0.096,0.12,1900.0,...,,,0,,,,,0,0,0
2,3,3,MEX,Mexico,1902,0.0,0.109,0.096,0.12,1900.0,...,,,0,,,,,0,0,0
3,4,3,MEX,Mexico,1903,0.0,0.109,0.096,0.12,1900.0,...,,,0,,,,,0,0,0
4,5,3,MEX,Mexico,1904,0.0,0.109,0.096,0.12,1900.0,...,,,0,,,,,0,0,0


### 2. Data cleaning

#### 2.1 Long version of data cleaning -- step by step

In [28]:
# Set list of Council of Europe member states (i.e. sample used in the judicial autonomy dataset)
council_of_europe_countries = ['Albania', 'Armenia', 'Austria', 'Azerbaijan', 'Belgium',
       'Bulgaria', 'Bosnia and Herzegovina', 'Switzerland', 'Cyprus',
       'Czechia', 'Germany', 'Spain', 'Estonia', 'Finland', 'France',
       'United Kingdom', 'Georgia', 'Greece', 'Croatia', 'Hungary',
       'Ireland', 'Iceland', 'Italy', 'Lithuania', 'Latvia', 'Moldova',
       'North Macedonia', 'Malta', 'Montenegro', 'Norway', 'Poland',
       'Portugal', 'Romania', 'Russia', 'Serbia', 'Slovakia', 'Slovenia',
       'Sweden', 'Türkiye', 'Ukraine']

In [55]:
# Filter dataset for defined countries
vdem_country = vdem_df[vdem_df['country_name'].isin(council_of_europe_countries)]

# Rename Turkey to fit judicial autonomy dataset
vdem_country.loc[:, 'country_name'] = vdem_country['country_name'].replace({'Türkiye': 'Turkey'})

# Filter for years 2000-2022
vdem_country_year = vdem_country.loc[(vdem_country['year'] >= 2000) & (vdem_country['year'] <= 2022)]

# Define which columns to keep
columns_to_keep = ['country_name', 'year', 'reg_id', 'reg_type', 'v2x_regime', 'v2x_polyarchy', 'reg_trans', 'row_regch_event', 'dem_ep']

# Drop all other columns
vdem_country_year_cleaned = vdem_country_year[columns_to_keep]

# Rename country column to match judicial autonomy dataset
vdem_country_year_cleaned = vdem_country_year_cleaned.rename(columns={'country_name': 'country'})

vdem_cleaned_manually = vdem_country_year_cleaned.copy()
#vdem_cleaned_manually

#### 2.2 Short version of data cleaning -- user-defined function

In [56]:
# Define function to clean raw data of V-Dem Episodes of Regime Transformation

def cleaning_vdem_ert_data(vdem_data):
    
    # Step 1: Create copy of dataframe
    vdem = vdem_data.copy()

    # Step 2: Clean countries
    # Set list of Council of Europe member states (i.e. sample used in the judicial autonomy dataset)
    council_of_europe_countries = ['Albania', 'Armenia', 'Austria', 'Azerbaijan', 'Belgium',
       'Bulgaria', 'Bosnia and Herzegovina', 'Switzerland', 'Cyprus',
       'Czechia', 'Germany', 'Spain', 'Estonia', 'Finland', 'France',
       'United Kingdom', 'Georgia', 'Greece', 'Croatia', 'Hungary',
       'Ireland', 'Iceland', 'Italy', 'Lithuania', 'Latvia', 'Moldova',
       'North Macedonia', 'Malta', 'Montenegro', 'Norway', 'Poland',
       'Portugal', 'Romania', 'Russia', 'Serbia', 'Slovakia', 'Slovenia',
       'Sweden', 'Türkiye', 'Ukraine']

    # Filter dataset for defined countries
    vdem_country = vdem[vdem['country_name'].isin(council_of_europe_countries)]

    # Rename Turkey to match judicial autonomy dataset
    vdem_country.loc[:, 'country_name'] = vdem_country['country_name'].replace({'Türkiye': 'Turkey'})

    # Step 3: Filter for years 2000-2022
    vdem_country_year = vdem_country.loc[(vdem_country['year'] >= 2000) & (vdem_ert_country['year'] <= 2022)]

    # Step 4: Drop columns
    # Define which columns to keep
    columns_to_keep = ['country_name', 'year', 'reg_id', 'reg_type', 'v2x_regime', 'v2x_polyarchy', 
                        'reg_trans', 'row_regch_event', 'dem_ep']

    # Drop all other columns
    vdem_cleaned = vdem_country_year[columns_to_keep]

    # Rename country column to match judicial autonomy dataset
    vdem_cleaned = vdem_cleaned.rename(columns={'country_name': 'country'})
    
    return vdem_cleaned


In [57]:
vdem_cleaned = cleaning_vdem_ert_data(vdem_df)
vdem_cleaned

Unnamed: 0,country,year,reg_id,reg_type,v2x_regime,v2x_polyarchy,reg_trans,row_regch_event,dem_ep
348,Sweden,2000,SWE_1921_2023,1.0,3.0,0.914,0.0,0.0,0
349,Sweden,2001,SWE_1921_2023,1.0,3.0,0.914,0.0,0.0,0
350,Sweden,2002,SWE_1921_2023,1.0,3.0,0.914,0.0,0.0,0
351,Sweden,2003,SWE_1921_2023,1.0,3.0,0.915,0.0,0.0,0
352,Sweden,2004,SWE_1921_2023,1.0,3.0,0.915,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...
19369,Hungary,2018,HUN_2018_2023,0.0,1.0,0.482,-1.0,-1.0,0
19370,Hungary,2019,HUN_2018_2023,0.0,1.0,0.472,0.0,0.0,0
19371,Hungary,2020,HUN_2018_2023,0.0,1.0,0.465,0.0,0.0,0
19372,Hungary,2021,HUN_2018_2023,0.0,1.0,0.456,0.0,0.0,0


In [36]:
# Check datatypes
vdem_cleaned.dtypes

country_id           int64
country_name        object
year                 int64
v2x_regime         float64
v2x_polyarchy      float64
reg_id              object
reg_type           float64
reg_trans          float64
row_regch_event    float64
dem_ep               int64
dtype: object

In [62]:
vdem_cleaned.group_by('reg_type')['country'].sum()

AttributeError: 'DataFrame' object has no attribute 'group_by'