## Cleaning Judicial Autonomy Dataset (long)

### 1. Load dataset

In [1]:
import yaml

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
try:
    with open('../config.yaml') as file:
        config = yaml.safe_load(file)
except:
    print("Yaml file not found.")

In [3]:
# Load dataset q1
q1_df = pd.read_csv(config['input_data']['q1_dataset'])
q1_df.head()

Unnamed: 0,username,country,country_code,question_no,uid,year,answer,value_na,value
0,ALB22uFF4m,Albania,ALB,1,q1c1_apjufc,2000,Mixed system [0.5],0.5,0.5
1,ALB22uFF4m,Albania,ALB,1,q1c1_apjufc,2001,Mixed system [0.5],0.5,0.5
2,ALB22uFF4m,Albania,ALB,1,q1c1_apjufc,2002,Mixed system [0.5],0.5,0.5
3,ALB22uFF4m,Albania,ALB,1,q1c1_apjufc,2003,Mixed system [0.5],0.5,0.5
4,ALB22uFF4m,Albania,ALB,1,q1c1_apjufc,2004,Mixed system [0.5],0.5,0.5


**Output -- DataFrame:** `q1_df`

### 2. Data cleaning

#### 2.1 Long version of data cleaning -- step by step

In [4]:
# Pivot long dataframe to wide dataframe
q1_pivoted = q1_df.pivot(index=['username', 'country', 'country_code', 'year'], 
                      columns=['uid'], 
                      values=['value'])

# Reset index and flattening multi-level column names
# Use lambda function to access tuple storing the multi-level column names
q1_pivoted.reset_index(inplace=True)
q1_pivoted.columns = [col[1] if col[1] else col[0] for col in q1_pivoted.columns]
#q1_pivoted

In [5]:
# Filter out countries / users
remove_usernames = ('ADMIN123', 'ALB22uFF4m','BEL22cEw8t', 'BIH22q2nOU', 'DNK22KFh1N', 'MNE22N8NJv', 'NLD22Ba53p', 'SRB22L4wbh')
q1_countries_cleaned = q1_pivoted[~q1_pivoted['username'].isin(remove_usernames)]
#q1_countries_cleaned

In [6]:
# Replace country names
q1_countries_cleaned.loc[:,'country'] = q1_countries_cleaned['country'].replace({
    "Czech Republic": "Czechia",
    "Republic of Albania": "Albania",
    "Republic of Serbia": "Serbia",
    "Bosnia and Herzegovina (BiH)": "Bosnia and Herzegovina",
    "Montenegro (MON)": "Montenegro",
    "Kingdom of Belgium": "Belgium"})
q1_countries_cleaned

Unnamed: 0,username,country,country_code,year,q1c1_apjuac,q1c1_apjufc,q1c1_apjuhc,q1c1_appealac,q1c1_appealfc,q1c1_appealhc,...,q1c4_casealloc,q1c4_competence,q1c4_manbudget,q1c4_reasondecis,q1c4_regbudget,q1c4_sameright,q1c4_subj,q1c4_whochair,q1c4_whocharge,q1c4_whoselect
46,ALB33wGG5n,Albania,ALA,2000,0.5,0.5,0.0,0.0,0.0,0.0,...,1.0,0.00,1.0,0.0,1.0,0.0,3.0,,0.5,0.5
47,ALB33wGG5n,Albania,ALA,2001,0.5,0.5,0.0,0.0,0.0,0.0,...,1.0,0.00,1.0,0.0,1.0,0.0,3.0,,0.5,0.5
48,ALB33wGG5n,Albania,ALA,2002,0.5,0.5,0.0,0.0,0.0,0.0,...,1.0,0.00,1.0,0.0,1.0,0.0,3.0,,0.5,0.5
49,ALB33wGG5n,Albania,ALA,2003,0.5,0.5,0.0,0.0,0.0,0.0,...,1.0,0.00,1.0,0.0,1.0,0.0,3.0,,0.5,0.5
50,ALB33wGG5n,Albania,ALA,2004,0.5,0.5,0.0,0.0,0.0,0.0,...,1.0,0.00,1.0,0.0,1.0,0.0,3.0,,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,UKR22Gf5Kx,Ukraine,UKR,2018,0.5,0.5,0.5,1.0,1.0,1.0,...,0.5,0.67,0.0,0.0,0.0,1.0,,1.0,1.0,1.0
1100,UKR22Gf5Kx,Ukraine,UKR,2019,0.5,0.5,0.5,1.0,1.0,1.0,...,0.5,0.67,0.0,0.0,0.0,1.0,,1.0,1.0,1.0
1101,UKR22Gf5Kx,Ukraine,UKR,2020,0.5,0.5,0.5,1.0,1.0,1.0,...,0.5,0.67,0.0,0.0,0.0,1.0,,1.0,1.0,1.0
1102,UKR22Gf5Kx,Ukraine,UKR,2021,0.5,0.5,0.5,1.0,1.0,1.0,...,0.5,0.67,0.0,0.0,0.0,1.0,,1.0,1.0,1.0


In [7]:
q1_countries_cleaned.shape

(920, 62)

In [8]:
# Drop columns that contain '_subj' (i.e. subjective answers)
q1_subj_cleaned = q1_countries_cleaned.drop(list(q1_countries_cleaned.filter(regex='_subj')), axis=1)

In [9]:
# Check for missing values across columns (in percentage) 
q1_check_nan = q1_subj_cleaned.isna().mean().sort_values(ascending=False)*100
q1_check_nan.head()

q1c2_retireage    85.108696
q1c2_jubonus      51.195652
q1c3_immunlift    31.304348
q1c3_evalints     25.326087
q1c4_whochair     19.891304
dtype: float64

In [10]:
# Filter out all columns with >20% of missing values
q1_columns_cleaned = q1_subj_cleaned.drop(columns = ['q1c2_retireage', 'q1c2_jubonus', 'q1c3_immunlift', 'q1c3_evalints'])

In [11]:
q1_cleaned_manually = q1_columns_cleaned.copy()

**Output -- DataFrame:** `q1_cleaned_manually`

#### 2.2 Short version of data cleaning -- user-defined function

In [12]:
# Define function to clean raw dataset

def cleaning_judicial_autonomy_data(qx_df):

    # Step 1: Create copy of dataframe
    df = qx_df.copy()
    
    # Step 2: Adjust dataframe shape
    # Pivot long dataframe to wide dataframe
    df_pivoted = df.pivot(index=['username', 'country', 'country_code', 'year'], 
                          columns=['uid'], 
                          values=['value'])
    
    # Reset index and flattening multi-level column names
    df_pivoted.reset_index(inplace=True)
    df_pivoted.columns = [col[1] if col[1] else col[0] for col in df_pivoted.columns]

    # Step 3: Clean country and user names
    # Remove usernames
    remove_usernames = ('ADMIN123', 
                        'ALB22uFF4m',
                        'BEL22cEw8t', 
                        'BIH22q2nOU', 
                        'DNK22KFh1N', 
                        'MNE22N8NJv', 
                        'NLD22Ba53p', 
                        'SRB22L4wbh')
    df_countries_cleaned = df_pivoted[~df_pivoted['username'].isin(remove_usernames)]

    # Replace country names
    df_countries_cleaned.loc[:,'country'] = df_countries_cleaned['country'].replace({
        'Czech Republic': 'Czechia',
        'Republic of Albania': 'Albania',
        'Republic of Serbia': 'Serbia',
        'Bosnia and Herzegovina (BiH)': 'Bosnia and Herzegovina',
        'Montenegro (MON)': 'Montenegro',
        'Kingdom of Belgium': 'Belgium'})
    
    # Step 4: Remove columns
    # Drop columns that contain '_subj'
    # Drop columns with more than 20% values missing values ['q1c2_jubonus', 'q1c2_retireage', 'q1c3_evalints', 'q1c3_immunlift']
    columns_nan_percentage = df_countries_cleaned.isna().mean()*100
    columns_nan_20_percent = columns_nan_percentage[columns_nan_percentage > 20].index
    df_cleaned = df_countries_cleaned.drop(columns=columns_nan_20_percent)

    return df_cleaned

**Milestone -- function:** `cleaning_judicial_autonomy_data()`

In [13]:
# Clean dataset
q1_cleaned = cleaning_judicial_autonomy_data(q1_df) 

**Output -- DataFrame:** `q1_cleaned`

In [14]:
q1_cleaned['country'].unique()

array(['Albania', 'Armenia', 'Austria', 'Azerbaijan', 'Belgium',
       'Bulgaria', 'Bosnia and Herzegovina', 'Switzerland', 'Cyprus',
       'Czechia', 'Germany', 'Spain', 'Estonia', 'Finland', 'France',
       'United Kingdom', 'Georgia', 'Greece', 'Croatia', 'Hungary',
       'Ireland', 'Iceland', 'Italy', 'Lithuania', 'Latvia', 'Moldova',
       'North Macedonia', 'Malta', 'Montenegro', 'Norway', 'Poland',
       'Portugal', 'Romania', 'Russia', 'Serbia', 'Slovakia', 'Slovenia',
       'Sweden', 'Turkey', 'Ukraine'], dtype=object)

### 3. Alternative option: recode fuzzy values as binary values

In [15]:
# Define function to recode all fuzzy values as binary values (conservative coding, i.e. value <= 0.5 as 0.0)

def recoding_fuzzy_to_binary(qx_df):

    df_fuzzy_values_recoded = qx_df.copy()
    float_columns = df_fuzzy_values_recoded.select_dtypes(include=float).columns

    for col in float_columns:
        df_fuzzy_values_recoded[col] = df_fuzzy_values_recoded[col].apply(lambda value: 0.0 if value <= 0.5 else 1.0)

    return df_fuzzy_values_recoded

**Output -- function:** `recoding_fuzzy_to_binary()`

In [16]:
# Recode fuzzy values
q1_fuzzy_binary_recoded = recoding_fuzzy_to_binary(q1_df)

In [17]:
# Clean dataset
q1_cleaned_binary = cleaning_judicial_autonomy_data(q1_fuzzy_binary_recoded) 
q1_cleaned_binary

Unnamed: 0,username,country,country_code,year,q1c1_apjuac,q1c1_apjufc,q1c1_apjuhc,q1c1_appealac,q1c1_appealfc,q1c1_appealhc,...,q1c4_casealloc,q1c4_competence,q1c4_manbudget,q1c4_reasondecis,q1c4_regbudget,q1c4_sameright,q1c4_subj,q1c4_whochair,q1c4_whocharge,q1c4_whoselect
46,ALB33wGG5n,Albania,ALA,2000,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
47,ALB33wGG5n,Albania,ALA,2001,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
48,ALB33wGG5n,Albania,ALA,2002,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
49,ALB33wGG5n,Albania,ALA,2003,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
50,ALB33wGG5n,Albania,ALA,2004,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,UKR22Gf5Kx,Ukraine,UKR,2018,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
1100,UKR22Gf5Kx,Ukraine,UKR,2019,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
1101,UKR22Gf5Kx,Ukraine,UKR,2020,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
1102,UKR22Gf5Kx,Ukraine,UKR,2021,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0


**Output -- DataFrame:** `q1_cleaned_binary`

### 4. Create micro- and macro-indicators

#### 4.1 Micro-indicators

#### 4.1.1 Subset dataset to create micro-indicators

In [18]:
def create_micro_indicators_dict(qx_cleaned):

    subsets = {

    # 1 -- Actors involved in appointment procedures of judges
        'q1_micro_appointment_judges': qx_cleaned[[
                                        'country',
                                        'year',
                                        'q1c1_apjufc',
                                        'q1c1_apjuac',
                                        'q1c1_apjuhc']],
        
    # 2 -- Actors involved in appointment procedures of court presidents
 
        'q1_micro_appointment_court_presidents': qx_cleaned[[
                                        'country',
                                        'year',
                                        'q1c1_appresidfc',
                                        'q1c1_appresidac',
                                        'q1c1_appresidhc']],
    
    # 3 -- Veto powers during appointment procedures
        'q1_micro_appointment_veto': qx_cleaned[[
                                        'country',
                                        'year',
                                        'q1c1_exvetofc',
                                        'q1c1_exvetoac',
                                        'q1c1_exvetohc']],
    
    # 4 -- Selection and appointment criteria for judges - predetermined by law
        'q1_micro_selection_predetermined_law': qx_cleaned[[
                                        'country',
                                        'year',
                                        'q1c1_critfclaw', 
                                        'q1c1_critaclaw',
                                        'q1c1_crithclaw']],

    # 5 -- Selection and appointment criteria for judges - in accordance to international standards
        'q1_micro_selection_intl_standards': qx_cleaned[[
                                        'country',
                                        'year', 
                                        'q1c1_critfcints',
                                        'q1c1_critacints', 
                                        'q1c1_crithcints',
                                        'q1c1_probju']],

    # 6 -- Transparency and mechanisms for appeal in appointment procedures of judges
    'q1_micro_transparency_appeal': qx_cleaned[['country',
                                              'year',
                                              'q1c1_transplaw', 
                                              'q1c1_appealfc',
                                              'q1c1_appealac',
                                              'q1c1_appealhc']],
    # 7 -- Tenure and term in office of judges
    'q1_micro_judge_tenure': qx_cleaned[['country',
                                  'year',
                                  'q1c2_termfcju', 
                                  'q1c2_termacju',
                                  'q1c2_termpresid', 
                                  'q1c2_termhcju']],
    
    # 8 -- Immunity and non-transferability of judges
    'q1_micro_judge_immunity': qx_cleaned[['country',
                                    'year',
                                    'q1c2_juabsimmun', 
                                    'q1c2_jufuncimmun',
                                    'q1c2_juremove', 
                                    'q1c2_jutransf']],

    # 9 -- Salaries and bonuses of judges
    'q1_micro_judge_salary': qx_cleaned[['country',
                                  'year',
                                  'q1c2_jusalary',
                                  'q1c2_jupension']],
        
    # 10 -- Disciplinary proceedings against judges - predetermined by law
    'q1_micro_disciplinary_proceedings_law': qx_cleaned[['country',
                                              'year',
                                              'q1c3_fairtrial', 
                                              'q1c3_disciplaw',
                                              'q1c3_discipints',
                                              'q1c3_sanctscale']],
        
    # 11 -- Disciplinary proceedings against judges - actors involved
    'q1_micro_disciplinary_proceedings_actors': qx_cleaned[['country',
                                              'year',
                                              'q1c3_discipbody',
                                              'q1c3_initdiscip', 
                                              'q1c3_decdiscip',
                                              'q1c3_appealdiscip']],
        
    # 12 -- Conflict of interest, recusal from cases and evaluation of judges
    'q1_micro_conflict_recusal_evaluation': qx_cleaned[['country',
                                          'year',
                                          'q1c3_judisclos', 
                                          'q1c3_jurestrict',
                                          'q1c3_jurecuse']],
                                          
    # 13 -- Composition of the judicial self-governing bodies
    'q1_micro_judicial_self_governance_bodies': qx_cleaned[['country',
                                             'year',
                                             'q1c4_whocharge', 
                                             'q1c4_whoselect',
                                             'q1c4_whochair']], 
        
    # 14 -- Competences and functioning of the judicial self-governing bodies
    'q1_micro_judicial_self_governance_competences': qx_cleaned[['country',
                                             'year',
                                             'q1c4_competence',
                                             'q1c4_sameright', 
                                             'q1c4_reasondecis']], 
    
    # 15 -- Administration, functioning and budget of courts                                         
    'q1_micro_courts_administration': qx_cleaned[['country',
                                           'year',
                                           'q1c4_casealloc', 
                                           'q1c4_regbudget',
                                           'q1c4_manbudget']]
    }

    return subsets
    

**Output -- function:** `create_micro_indicators_dict()`

In [19]:
created_micro_indicators = create_micro_indicators_dict(q1_cleaned)

In [20]:
created_micro_indicators_binary = create_micro_indicators_dict(q1_cleaned_binary)

**Output -- dictionary:** `created_micro_indicators` `created_micro_indicators_binary`

#### 4.1.2 Fill in missing values per micro-indicator (row-wise mode)

In [21]:
# Function to fill in missing values in each micro-indicator by row mode

def get_row_mode(row):
    """
    Step 1: Calculate the mode for the row. If multiple modes, take the first one
    Step 2: Replace NaN values in the row with the mode
    (Step 3: Replace NaN values with 0 in case there is no mode
    """
    row_numeric = row[2:]
    numeric_modes = row_numeric.mode()
    mode_value = numeric_modes.iloc[0] if not numeric_modes.empty else np.nan

    return row.fillna(mode_value) 
    

In [22]:
# Function to fill in missing values in each micro-indicator by row mode

def get_row_mode(row):
    """
    Step 1: Check if row is completely NaN and replace with 0 if true.
    Step 2: Otherwise, calculate the mode for the row. If multiple modes, take the first one.
    Step 3: Replace NaN values in the row with the mode.
    """
    row_numeric = row[2:]  # Assuming this indexes your numeric part correctly
    
    # Check if all values in the row are NaN and replace with 0
    if row_numeric.isna().all():
        return row.fillna(0)
    else:
        numeric_modes = row_numeric.mode()
        mode_value = numeric_modes.iloc[0] if not numeric_modes.empty else np.nan
        return row.fillna(mode_value) 

In [23]:
# Apply function to all micro-indicator subsets

def fill_na_per_micro_indicators(subsets):
    filled_subsets = {}
    for key, subset in subsets.items():
        filled_subset = subset.apply(get_row_mode, axis=1)
        filled_subsets[key] = filled_subset
    return filled_subsets
    

**Output -- function:** `fill_na_per_micro_indicators()`

In [24]:
filled_na_micro_indicators = fill_na_per_micro_indicators(created_micro_indicators)

In [25]:
filled_na_micro_indicators_binary = fill_na_per_micro_indicators(created_micro_indicators_binary)

**Output -- dictionary:** `filled_na_micro_indicators` `filled_na_micro_indicators_binary`

#### 4.1.3 Calculate micro-indicator measure (row-wise mean)

In [26]:
# Create micro-indicator measurement (by mean)

def calculate_micro_indicators_mean(subsets):
    
    calc_means_subsets = {}
    
    for key, subset in subsets.items():
        calc_means_subsets[key] = subset.copy()
        column_name = f"{key}_ind_measure"
        calc_means_subsets[key][column_name] = subset.select_dtypes(include=float).mean(axis=1).round(2)
        
    return calc_means_subsets
    

**Output -- function:** `calculate_micro_indicators_mean()`

In [27]:
calc_micro_indicators = calculate_micro_indicators_mean(filled_na_micro_indicators)

In [28]:
calc_micro_indicators_binary = calculate_micro_indicators_mean(filled_na_micro_indicators_binary)

**Output -- dictionary:** `calc_micro_indicators` `calc_micro_indicators_binary`

### 4.2. Macro-indicators

#### 4.2.1 Merge dataset to create macro-indicators

In [29]:
# CHECK: rename "subset" ???

In [30]:
# Create macro-indicator measurement (by mean)

def create_macro_indicators_dict(subset):

    # 1 -- Merge micro-indicators for macro-indicator 'appointment_procedures'
        temporary_merge_app = pd.merge(subset['q1_micro_appointment_judges'], 
                                       subset['q1_micro_appointment_court_presidents'], 
                                       how='outer', on=['country', 'year'])
        temporary_merge_app2 = pd.merge(temporary_merge_app, 
                                        subset['q1_micro_appointment_veto'], 
                                        how='outer', on=['country', 'year'])

    # 2 -- Merge micro-indicators for macro-indicator 'selection_criteria'
        temporary_merge_sel = pd.merge(subset['q1_micro_selection_predetermined_law'],
                                       subset['q1_micro_selection_intl_standards'],
                                        how='outer', on=['country', 'year'])
        temporary_merge_sel2 = pd.merge(temporary_merge_sel, 
                                        subset['q1_micro_transparency_appeal'],                                        
                                        how='outer', on=['country', 'year'])
   
    # 3 -- Merge micro-indicators for macro-indicator 'professional_rights'
        temp_merge_rights = pd.merge(subset['q1_micro_judge_tenure'], 
                                     subset['q1_micro_judge_immunity'], 
                                     how='outer', on=['country', 'year'])
        temp_merge_rights2 = pd.merge(temp_merge_rights, 
                                      subset['q1_micro_judge_salary'],
                                      how='outer', on=['country', 'year'])

    # 4 -- Merge micro-indicators for macro-indicator 'professional_obligations'
        temp_merge_obl = pd.merge(subset['q1_micro_disciplinary_proceedings_law'], 
                                  subset['q1_micro_disciplinary_proceedings_actors'], 
                                  how='outer', on=['country', 'year'])
        temp_merge_obl2 = pd.merge(temp_merge_obl, 
                                   subset['q1_micro_conflict_recusal_evaluation'], 
                                   how='outer', on=['country', 'year'])
        
    # 5 -- Merge micro-indicators for macro-indicator 'judicial_administration'
        temp_merge_adm = pd.merge(subset['q1_micro_judicial_self_governance_bodies'], 
                                  subset['q1_micro_judicial_self_governance_competences'], 
                                  how='outer', on=['country', 'year'])
        temp_merge_adm2 = pd.merge(temp_merge_adm,
                                   subset['q1_micro_courts_administration'],
                                   how='outer', on=['country', 'year'])
    
        macro_indicators = {'q1_macro_appointment_procedures': temporary_merge_app2, 
                            'q1_macro_selection_criteria': temporary_merge_sel2, 
                            'q1_macro_professional_rights': temp_merge_rights2,
                            'q1_macro_professional_obligations': temp_merge_obl2,
                            'q1_macro_judicial_administration': temp_merge_adm2}

        return macro_indicators
    

**Output -- function:** `create_macro_indicators_dict()`

In [31]:
created_macro_indicators = create_macro_indicators_dict(calc_micro_indicators)

In [32]:
created_macro_indicators_binary = create_macro_indicators_dict(calc_micro_indicators_binary)

**Output -- dictionary:** `created_macro_indicators` `created_macro_indicators_binary`

#### 4.2.2 Aggregate indicator measures to macro indicator (by mean)

In [33]:
def aggregate_to_macro_indicators(subsets):
    calc_micro_ind_measure = {}
    for key, subset in subsets.items():
        calc_micro_ind_measure[key] = subset.copy()
        column_name = f"{key}_ind_measure" # macro indicator measure
        ind_measure_columns = [col for col in subset.columns if col.endswith('_ind_measure')]
        calc_micro_ind_measure[key][column_name] = subset[ind_measure_columns].mean(axis=1).round(2)

    return calc_micro_ind_measure


**Output -- function:** `aggregate_to_macro_indicators`

In [34]:
agg_macro_indicators = aggregate_to_macro_indicators(created_macro_indicators)

In [35]:
agg_macro_indicators_binary = aggregate_to_macro_indicators(created_macro_indicators_binary)

**Output -- dictionary:** `agg_macro_indicators` `agg_macro_indicators_binary`

In [36]:
agg_macro_indicators['q1_macro_appointment_procedures']

Unnamed: 0,country,year,q1c1_apjufc,q1c1_apjuac,q1c1_apjuhc,q1_micro_appointment_judges_ind_measure,q1c1_appresidfc,q1c1_appresidac,q1c1_appresidhc,q1_micro_appointment_court_presidents_ind_measure,q1c1_exvetofc,q1c1_exvetoac,q1c1_exvetohc,q1_micro_appointment_veto_ind_measure,q1_macro_appointment_procedures_ind_measure
0,Albania,2000,0.5,0.5,0.0,0.33,0.5,0.5,0.0,0.33,1.0,1.0,1.0,1.0,0.55
1,Albania,2001,0.5,0.5,0.0,0.33,0.5,0.5,0.0,0.33,1.0,1.0,1.0,1.0,0.55
2,Albania,2002,0.5,0.5,0.0,0.33,0.5,0.5,0.0,0.33,1.0,1.0,1.0,1.0,0.55
3,Albania,2003,0.5,0.5,0.0,0.33,0.5,0.5,0.0,0.33,1.0,1.0,1.0,1.0,0.55
4,Albania,2004,0.5,0.5,0.0,0.33,0.5,0.5,0.0,0.33,1.0,1.0,1.0,1.0,0.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,United Kingdom,2018,0.5,0.5,0.5,0.50,0.5,0.5,0.5,0.50,0.0,0.0,0.0,0.0,0.33
916,United Kingdom,2019,0.5,0.5,0.5,0.50,0.5,0.5,0.5,0.50,0.0,0.0,0.0,0.0,0.33
917,United Kingdom,2020,0.5,0.5,0.5,0.50,0.5,0.5,0.5,0.50,0.0,0.0,0.0,0.0,0.33
918,United Kingdom,2021,0.5,0.5,0.5,0.50,0.5,0.5,0.5,0.50,0.0,0.0,0.0,0.0,0.33


### 5 Create Judicial Autonomy Index

#### 5.1 Merge dataset to create index

In [37]:
def create_index_dataset(subset):

    temporary_merge_index = pd.merge(subset['q1_macro_appointment_procedures'], 
                                     subset['q1_macro_selection_criteria'], 
                                     how='outer', on=['country', 'year'])
    temporary_merge_index2 = pd.merge(temporary_merge_index, 
                                     subset['q1_macro_professional_rights'], 
                                     how='outer', on=['country', 'year'])
    temporary_merge_index3 = pd.merge(temporary_merge_index2, 
                                     subset['q1_macro_professional_obligations'], 
                                     how='outer', on=['country', 'year'])
    index_merged = pd.merge(temporary_merge_index3, 
                            subset['q1_macro_judicial_administration'], 
                            how='outer', on=['country', 'year'])
    
    return index_merged

**Output -- function:** `create_index_dataset()`

In [38]:
created_index = create_index_dataset(agg_macro_indicators)

In [39]:
created_index_binary = create_index_dataset(agg_macro_indicators_binary)

**Output -- DataFrame:** `created_index` `created_index_binary`

#### 5.2 Aggregate macro indicator measures to overall index (by mean)

In [40]:
def aggregate_overall_index(subset):

    # calculate the mean of all macro-indicator measures
    calc_overall_index = subset.copy()
    ind_measure_columns = [col for col in subset.columns if col.startswith('q1_macro')]
    calc_overall_index['index_measure'] = subset[ind_measure_columns].mean(axis=1).round(2)

    # add a new column 'country_year' as UID
    calc_overall_index['country_year'] = calc_overall_index['country'] + "_" + calc_overall_index['year'].astype('string')
    
    return calc_overall_index
    

**Output -- function:** `aggregate_overall_index()`

In [41]:
agg_overall_index = aggregate_overall_index(created_index)

In [42]:
agg_overall_index_binary = aggregate_overall_index(created_index_binary)

**Output -- DataFrames:** `agg_overall_index` `agg_overall_index_binary` 

### 6 Create and export subsets

In [43]:
# Set structure for indicators-index subset: 
# overall index
# macro-indicator

index_macro_columns = [
    'country',
    'year',
    'country_year', 
    
    # 0 -- overall index
    'index_measure', 

    # 1 -- appointment_procedures
    'q1_macro_appointment_procedures_ind_measure',

    # 2 -- selection_criteria
     'q1_macro_selection_criteria_ind_measure',

    # 3 -- professional_rights
     'q1_macro_professional_rights_ind_measure',

    # 4 -- professional_obligations
    'q1_macro_professional_obligations_ind_measure', 

    # 5 -- judicial_administration
    'q1_macro_judicial_administration_ind_measure']

In [44]:
# Dataset with macro-indicators and overall index
dataset_index_macro = agg_overall_index[index_macro_columns]

# Export dataset
dataset_index_macro.to_csv(config['output_data']['q1_output_index_macro']) # index=False

# Display dataset
# dataset_index_macro.head(10)

**Output -- DataFrame:** `dataset_index_macro`

In [45]:
# Set structure for indicators-index subset: 
# overall index
# macro-indicator
# micro-indicator

index_macro_micro_columns = [
    'country',
    'year',
    'country_year', 
    
    # 0 -- overall index
    'index_measure', 

    # 1 -- appointment_procedures
    'q1_macro_appointment_procedures_ind_measure',
    'q1_micro_appointment_judges_ind_measure',
    'q1_micro_appointment_court_presidents_ind_measure',
    'q1_micro_appointment_veto_ind_measure',

    # 2 -- selection_criteria
     'q1_macro_selection_criteria_ind_measure',
     'q1_micro_selection_predetermined_law_ind_measure',
     'q1_micro_selection_intl_standards_ind_measure',
     'q1_micro_transparency_appeal_ind_measure',

    # 3 -- professional_rights
     'q1_macro_professional_rights_ind_measure',
     'q1_micro_judge_tenure_ind_measure',
     'q1_micro_judge_immunity_ind_measure',
     'q1_micro_judge_salary_ind_measure',

    # 4 -- professional_obligations
    'q1_macro_professional_obligations_ind_measure', 
    'q1_micro_disciplinary_proceedings_law_ind_measure',
    'q1_micro_disciplinary_proceedings_actors_ind_measure',
    'q1_micro_conflict_recusal_evaluation_ind_measure',

    # 5 -- judicial_administration
    'q1_macro_judicial_administration_ind_measure',
    'q1_micro_judicial_self_governance_bodies_ind_measure',
    'q1_micro_judicial_self_governance_competences_ind_measure',
    'q1_micro_courts_administration_ind_measure']
    

In [46]:
# Create dataset with micro-indicators, macro-indicators, overall index
dataset_index_macro_micro = agg_overall_index[index_macro_micro_columns]

# Export dataset
dataset_index_macro_micro.to_csv(config['output_data']['q1_output_index_macro_micro']) # index=False

# Display dataset
dataset_index_macro_micro.head(10)

Unnamed: 0,country,year,country_year,index_measure,q1_macro_appointment_procedures_ind_measure,q1_micro_appointment_judges_ind_measure,q1_micro_appointment_court_presidents_ind_measure,q1_micro_appointment_veto_ind_measure,q1_macro_selection_criteria_ind_measure,q1_micro_selection_predetermined_law_ind_measure,...,q1_micro_judge_immunity_ind_measure,q1_micro_judge_salary_ind_measure,q1_macro_professional_obligations_ind_measure,q1_micro_disciplinary_proceedings_law_ind_measure,q1_micro_disciplinary_proceedings_actors_ind_measure,q1_micro_conflict_recusal_evaluation_ind_measure,q1_macro_judicial_administration_ind_measure,q1_micro_judicial_self_governance_bodies_ind_measure,q1_micro_judicial_self_governance_competences_ind_measure,q1_micro_courts_administration_ind_measure
0,Albania,2000,Albania_2000,0.52,0.55,0.33,0.33,1.0,0.5,0.67,...,0.75,0.0,0.65,0.62,0.67,0.67,0.5,0.5,0.0,1.0
1,Albania,2001,Albania_2001,0.52,0.55,0.33,0.33,1.0,0.5,0.67,...,0.75,0.0,0.65,0.62,0.67,0.67,0.5,0.5,0.0,1.0
2,Albania,2002,Albania_2002,0.52,0.55,0.33,0.33,1.0,0.5,0.67,...,0.75,0.0,0.65,0.62,0.67,0.67,0.5,0.5,0.0,1.0
3,Albania,2003,Albania_2003,0.52,0.55,0.33,0.33,1.0,0.5,0.67,...,0.75,0.0,0.65,0.62,0.67,0.67,0.5,0.5,0.0,1.0
4,Albania,2004,Albania_2004,0.55,0.55,0.33,0.33,1.0,0.5,0.67,...,0.75,0.0,0.76,0.62,0.67,1.0,0.5,0.5,0.0,1.0
5,Albania,2005,Albania_2005,0.55,0.55,0.33,0.33,1.0,0.5,0.67,...,0.75,0.0,0.76,0.62,0.67,1.0,0.5,0.5,0.0,1.0
6,Albania,2006,Albania_2006,0.55,0.55,0.33,0.33,1.0,0.5,0.67,...,0.75,0.0,0.76,0.62,0.67,1.0,0.5,0.5,0.0,1.0
7,Albania,2007,Albania_2007,0.53,0.55,0.33,0.33,1.0,0.42,0.67,...,0.75,0.0,0.76,0.62,0.67,1.0,0.5,0.5,0.0,1.0
8,Albania,2008,Albania_2008,0.58,0.55,0.33,0.33,1.0,0.61,1.0,...,0.5,0.5,0.76,0.62,0.67,1.0,0.5,0.5,0.0,1.0
9,Albania,2009,Albania_2009,0.58,0.55,0.33,0.33,1.0,0.61,1.0,...,0.5,0.5,0.76,0.62,0.67,1.0,0.5,0.5,0.0,1.0


**Output -- DataFrame:** `dataset_index_macro_micro`

In [47]:
# Create dataset with micro-indicators, macro-indicators, overall index, username, country_code

# Subset 'username', 'country', 'country_code', 'year' from original dataset
df_username_country_code_columns = q1_cleaned[['username', 'country', 'country_code', 'year']]
dataset_index_macro_micro = agg_overall_index[index_macro_micro_columns]

# Merge both datasets
dataset_index_macro_micro_add_info = pd.merge(df_username_country_code_columns, 
                                              dataset_index_macro_micro,
                                              how='outer', on=['country', 'year'])

# Export dataset
dataset_index_macro_micro_add_info.to_csv(config['output_data']['q1_output_index_macro_micro_add_info']) # index=False

# Display dataset
# dataset_index_macro_micro_add_info.head(10)

**Output -- DataFrame:** `dataset_index_macro_micro_add_info`

In [48]:
# dataset for specific years

### 7 Export clean index dataset

### TESTS

In [49]:
# Micro-indicators means (after filling in NAs with row mode)

indicators_mean_subsets_no_nan = calc_micro_indicators_mean(filled_subsets)
indicators_mean_subsets_no_nan['q1_appointment_judges']['q1_appointment_judges_ind_measure'].value_counts().sort_index(ascending=False)

NameError: name 'calc_micro_indicators_mean' is not defined

In [None]:
# Micro-indicators means (without filling in NAs with row mode)

indicators_mean_subsets_nan = calc_micro_indicators_mean(subset_indicators)
indicators_mean_subsets_nan['q1_appointment_judges']['indicators_mean'].value_counts().sort_index(ascending=False)

In [None]:
# Micro-indicators means (after recoding fuzzy values to binary and filling in NAs with row mode)

indicators_mean_subsets_no_nan_fuzzy_recoded = calc_micro_indicators_mean(filled_subsets_fuzzy_recoded)
indicators_mean_subsets_no_nan_fuzzy_recoded['q1_appointment_judges']['indicators_mean'].value_counts().sort_index(ascending=False)

In [None]:
# Plot distribution of values in mean_subsets column

def distribution_of_mean_values(subsets):
    
    # Set the grid size and the figure for subplots
    n_subsets = len(subsets)
    ncols = 5
    nrows = 3
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, nrows * 3))
    fig.tight_layout(pad=3.0)  # spacing between plots

    # Set title of the plot
    fig.suptitle('Distribution of Mean Values Across Subsets (Q1)', fontsize=16, y=1.025)

    # Flatten axes array for easy indexing, if necessary
    axs = axs.flatten() if n_subsets > 1 else [axs]
    
    # Create a histogram for each subset
    for idx, (key, subset) in enumerate(subsets.items()):
        sns.histplot(subset['indicators_mean'], bins=10, label=key, ax=axs[idx])        
        axs[idx].set_title(f'{key}', fontsize=10) # adjust subplot title
        axs[idx].legend().remove() # remove subplot legend
        axs[idx].set_xlabel('') # remove x-axis label
        axs[idx].set_xticks(np.arange(start=0, stop=1.2, step=0.2)) # adjust ticks on x-axis

    plt.show()


#### Distribution of Measurement for each Micro-Indicator (after filling in NAs with row mode)

In [None]:
distribution_of_mean_values(indicators_mean_subsets_no_nan)

#### Distribution of Measurement for each Micro-Indicator (without filling in NAs with row mode)

In [None]:
distribution_of_mean_values(indicators_mean_subsets_nan)

#### Distribution of Measurement for each Micro-Indicator (after recoding fuzzy values to binary and filling in NAs with row mode)

In [None]:
distribution_of_mean_values(indicators_mean_subsets_no_nan_fuzzy_recoded)

In [None]:
q1_appointment_judges = filled_subsets['q1_appointment_judges']
q1_appointment_court_presidents = filled_subsets['q1_appointment_court_presidents']
q1_appointment_veto = filled_subsets['q1_appointment_veto']
q1_selection_predetermined_law = filled_subsets['q1_selection_predetermined_law']
q1_selection_intl_standards = filled_subsets['q1_selection_intl_standards']
q1_transparency_appeal = filled_subsets['q1_transparency_appeal']
q1_judge_tenure = filled_subsets['q1_judge_tenure']
q1_judge_immunity = filled_subsets['q1_judge_immunity']
q1_judge_salary = filled_subsets['q1_judge_salary']
q1_disciplinary_proceedings_law = filled_subsets['q1_disciplinary_proceedings_law']
q1_disciplinary_proceedings_actors = filled_subsets['q1_disciplinary_proceedings_actors']
q1_conflict_recusal_evaluation = filled_subsets['q1_conflict_recusal_evaluation']
q1_judicial_self_governance = filled_subsets['q1_judicial_self_governance']
q1_judicial_self_governance = filled_subsets['q1_judicial_self_governance']
q1_courts_administration = filled_subsets['q1_courts_administration']

In [None]:
q1_appointment_judges.isna().sum()

### NOTES

In [None]:
subset_indicators['q1_appointment_actors'].nunique()

In [None]:
import numpy as np
import pandas as pd

def fill_na_with_row_mode2(row):
    """
    Step 1: Calculate the mode for the numerical columns in the row. 
    Step 2: If multiple modes, take the first one
    Step 3: Replace NaN values in the row with the mode
    """
    # Focus on float dtype for mode calculation
    mode_values = row.mode(dropna=True)
    if not mode_values.empty:
        mode_value = mode_values.iloc[0]
    else:
        mode_value = np.nan
    # Replace NaN values in the original row with the mode, return modified row
    return row.fillna(mode_value)

def fill_na_in_micro_indicators2(subsets):
    """
    Apply fill_na_with_row_mode function across subsets in the dictionary to fill NaN
    """
    filled_subsets = {}
    for key, subset in subsets.items():
        # Apply the function row-wise (axis=1)
        filled_subset = subset.apply(fill_na_with_row_mode2, axis=1)
        filled_subsets[key] = filled_subset
    return filled_subsets

In [None]:
filled_subsets = fill_na_in_micro_indicators2(subset_indicators)


In [None]:
# Apply the function row-wise (axis=1)
q1_appointment_actors_filled = subset_indicators['q1_appointment_actors'].apply(fill_na_with_row_mode, axis=1)
subset_indicators['q1_appointment_actors'].isna().sum(), q1_appointment_actors_filled.isna().sum()

q1_appointment_actors_filled['q1c1_exvetoac'].value_counts()

In [None]:
mode_value_test = subset_indicators['q1_appointment_actors'].select_dtypes(include=float).mode().iloc[0]
mode_value_test

### Check for missing values in each subset

In [None]:
q1_courts_administration.isna().sum().sort_values(ascending=False)

In [None]:
cols_with_nan = q1_courts_administration.columns[q1_courts_administration.isna().any()].tolist()
missing_counts = q1_courts_administration.groupby('country')[cols_with_nan].apply(lambda x: x.isna().sum()).reset_index()
#numeric_cols = missing_counts.select_dtypes(exclude='object').columns
missing_counts

In [None]:
missing_counts = q1_courts_administration.groupby('country').apply(lambda x: x.isna().sum()).reset_index()
missing_counts

### Check and deal with missing values

In [None]:

cols_with_nan = q1_cleaned.columns[q1_cleaned.isna().any()].tolist()
missing_counts = q1_cleaned.groupby('country')[cols_with_nan].apply(lambda x: x.isna().sum()).reset_index()
numeric_cols = missing_counts.select_dtypes(exclude='object').columns

# Summing up NaN counts across these columns for each row to calculate the 'total_nan'
missing_counts['total_nan'] = missing_counts[numeric_cols].sum(axis=1)
sorted_missing_counts = missing_counts.sort_values(by='total_nan', ascending=False)
sorted_missing_counts

#missing_counts['total_nan'] = missing_counts.dtypes(exclude='object').sum(axis=1)
#missing_counts.dtypes

In [None]:
# Check datatypes and missing values
q1_subj_cleaned.info(verbose=True)

In [None]:
q1_subj_cleaned.isna().sum().sort_values(ascending=False)

In [None]:
print(f"Number of columns that contain at least one missing value: {q1_subj_cleaned.isnull().any(axis = 0).sum()}")
print(f"Number of rows that contain at least one missing value: {q1_subj_cleaned.isnull().any(axis = 1).sum()}")

In [None]:
q1_subj_cleaned.loc[:, q1_subj_cleaned.isna().any()]
#df2.loc[:, df2.isnull().any()])

In [None]:
q1c4_nan = q1_subj_cleaned[q1_subj_cleaned['q1c4_manbudget'].isna()]
q1c4_nan

q1c2_retireage       783
q1c2_jubonus         471
q1c3_immunlift       288
q1c3_evalints        233
q1c4_whochair        183
q1c4_sameright        82
q1c4_competence       56
q1c4_reasondecis      56
q1c1_exvetoac         41
q1c4_whocharge        33
q1c4_whoselect        31
q1c4_casealloc        29
q1c1_apjufc           23
q1c1_appresidfc       23
q1c4_manbudget        23
q1c2_jutransf         23
q1c3_initdiscip       21
q1c2_termacju         18
q1c1_appresidac       18
q1c1_apjuac           18
q1c1_appealac         18
q1c1_critacints       18
q1c1_critaclaw        18
q1c1_appresidhc        2
q1c1_transplaw         2

### Dataset Q2

In [None]:
# Load dataset q2
#q2_df = pd.read_csv(config['input_data']['q2_dataset'])
#q2_df.head()

In [None]:
#q2_cleaned = cleaning_judicial_autonomy_data(q2_df)
#q2_cleaned.to_csv(config['output_data']['q2_dataset'])
#q2_cleaned  