In [18]:
import pandas as pd
import numpy as np

# Defined set of regional/aggregate names to exclude
EXCLUDED_ENTITIES = {
    'World', 'Africa', 'Asia', 'Europe', 'Americas', 'North America', 
    'South America', 'Oceania', 'European Union', 'European Union (27)', 
    'Middle East and North Africa', 'High-income countries', 
    'Low-income countries', 'Lower-middle-income countries', 
    'Upper-middle-income countries', 'East Asia & Pacific (IDA & IBRD countries)', 
    'Latin America & Caribbean (IDA & IBRD countries)', 
    'Sub-Saharan Africa (IDA & IBRD countries)', 'OECD members', 
    'Non-OECD members', 'IDA countries', 'IBRD countries', 
    'Arab World', 'Central Europe and the Baltics', 'East Asia & Pacific', 
    'Europe & Central Asia', 'Euro area', 'Fragile and conflict affected situations', 
    'Heavily indebted poor countries (HIPC)', 'Least developed countries: UN classification',
    'Middle East & North Africa', 'North America', 'Other small states', 
    'Pacific island small states', 'Post-demographic dividend', 'Pre-demographic dividend',
    'Small states', 'South Asia', 'Sub-Saharan Africa', 
    'Upper middle income', 'Upper-middle-income countries', 'Least developed countries',
    'East Asia and Pacific (WB)', 'Europe and Central Asia (WB)', 
    'Latin America and Caribbean (WB)', 
    'Middle East, North Africa, Afghanistan and Pakistan (WB)', 
    'North America (WB)', 'South Asia (WB)', 'Sub-Saharan Africa (WB)'
}

# --- 1) Democracy Averages (Rounded to 2 decimal places) ---
df_democracy = pd.read_csv('democracy-index-eiu.csv')

# Filter out regions
df_countries_democracy = df_democracy[~df_democracy['Entity'].isin(EXCLUDED_ENTITIES)]

# Calculate the mean Democracy score
democracy_averages = df_countries_democracy.groupby('Entity')['Democracy score'].mean().reset_index()
democracy_averages.columns = ['Country', 'Average_Democracy_Score']

# Round the score to two digits after the dot
democracy_averages['Average_Democracy_Score'] = democracy_averages['Average_Democracy_Score'].round(2)

# Sort and save the result
democracy_averages = democracy_averages.sort_values(by='Average_Democracy_Score', ascending=False)
democracy_averages.to_csv('country_democracy_averages_rounded.csv', index=False)


# --- 2) GDP Average for Last 10 Available Years (Convert to Billions, round to 2 decimal places) ---
df_gdp = pd.read_csv('GDP_our_world_in_data.csv')

# Filter out regions
df_countries_gdp = df_gdp[~df_gdp['Entity'].isin(EXCLUDED_ENTITIES)]

# Define the function to get the GDP average for the last 10 years
def gdp_last_10_average(group):
    """Sorts data by year and returns the mean GDP of the last 10 available years."""
    # Sort by year in descending order
    group_sorted = group.sort_values(by='Year', ascending=False)
    # Take the top 10 available rows and calculate the mean of ny_gdp_mktp_kd
    return group_sorted.head(10)['ny_gdp_mktp_kd'].mean()

# Group by Entity and apply the custom function
gdp_averages = df_countries_gdp.groupby('Entity').apply(gdp_last_10_average).reset_index()
gdp_averages.columns = ['Country', 'Average_GDP_Last_10_Years_Raw']

# Convert the raw GDP value to Billions of USD (divide by 10^9) and round to 2 decimal places
gdp_averages['Average_GDP_Last_10_Years_Billion_USD'] = (
    gdp_averages['Average_GDP_Last_10_Years_Raw'] / 1e9
).round(2)

# Drop the raw value column
gdp_averages = gdp_averages.drop(columns=['Average_GDP_Last_10_Years_Raw'])

# Sort and save the result
gdp_averages = gdp_averages.sort_values(by='Average_GDP_Last_10_Years_Billion_USD', ascending=False)
gdp_averages.to_csv('country_gdp_last_10_averages_billions.csv', index=False)

  gdp_averages = df_countries_gdp.groupby('Entity').apply(gdp_last_10_average).reset_index()


In [19]:
import pandas as pd
import numpy as np

# Load the previously calculated average files
df_democracy_avg = pd.read_csv('country_democracy_averages_rounded.csv')
df_gdp_avg = pd.read_csv('country_gdp_last_10_averages_billions.csv')

# --- 1. Imputation Data ---
# Note: These values are based on reasoned guessing using external context.

imputed_democracy_data = pd.DataFrame({
    'Country': ['Brunei', 'Kosovo', 'South Sudan', 'Somalia'],
    'Average_Democracy_Score': [2.00, 5.50, 1.50, 1.00]
})

imputed_gdp_data = pd.DataFrame({
    'Country': ['North Korea', 'Taiwan', 'Venezuela'],
    'Average_GDP_Last_10_Years_Billion_USD': [40.00, 700.00, 150.00]
})

# --- 2. Merge and Finalize Imputed Democracy Data ---
# Concatenate the new data and remove duplicates (keeping the imputed row where country names matched the previous missing list)
df_democracy_imputed = pd.concat([df_democracy_avg, imputed_democracy_data], ignore_index=True)

# Drop duplicates that may occur if the imputed countries existed in the raw data but were filtered out previously
df_democracy_imputed = df_democracy_imputed.drop_duplicates(subset=['Country'], keep='last')

# Sort the final list
df_democracy_imputed = df_democracy_imputed.sort_values(by='Average_Democracy_Score', ascending=False).reset_index(drop=True)

# Save the final imputed list
df_democracy_imputed.to_csv('country_democracy_averages_imputed.csv', index=False)


# --- 3. Merge and Finalize Imputed GDP Data ---
# Concatenate the new data
df_gdp_imputed = pd.concat([df_gdp_avg, imputed_gdp_data], ignore_index=True)

# Drop duplicates (same rationale as above)
df_gdp_imputed = df_gdp_imputed.drop_duplicates(subset=['Country'], keep='last')

# Sort the final list
df_gdp_imputed = df_gdp_imputed.sort_values(by='Average_GDP_Last_10_Years_Billion_USD', ascending=False).reset_index(drop=True)

# Save the final imputed list
df_gdp_imputed.to_csv('country_gdp_averages_imputed.csv', index=False)

print("\nImputation complete.")
print("The final Democracy Averages (Imputed) are saved to: country_democracy_averages_imputed.csv")
print("The final GDP Averages (Imputed) are saved to: country_gdp_averages_imputed.csv")


Imputation complete.
The final Democracy Averages (Imputed) are saved to: country_democracy_averages_imputed.csv
The final GDP Averages (Imputed) are saved to: country_gdp_averages_imputed.csv


In [15]:
import pandas as pd

# 1. Load the Datasets
counts_df = pd.read_csv('filtered_country_counts.csv')
gdp_df = pd.read_csv('country_gdp_averages_imputed.csv')
demo_df = pd.read_csv('country_democracy_averages_imputed.csv')

# 2. Define Naming Corrections (GDP/Demo -> Counts)
# We map standard World Bank/EIU names to the names found in your text analysis file.
name_corrections = {
    "Turkey": "Türkiye",
    "Russian Federation": "Russia",
    "Egypt, Arab Rep.": "Egypt",
    "Iran, Islamic Rep.": "Iran",
    "Syrian Arab Republic": "Syria",
    "Venezuela, RB": "Venezuela",
    "Yemen, Rep.": "Yemen",
    "Korea, Rep.": "South Korea",
    "Korea, Dem. People's Rep.": "North Korea",
    "Lao PDR": "Laos",
    "Bahamas, The": "Bahamas",
    "Gambia, The": "Gambia",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Slovak Republic": "Slovakia",
    "Congo, Dem. Rep.": "Congo",        # Matches the high-freq "Congo" in your counts
    "Congo, Rep.": "Congo",             # Optional, merging both Congos to the main text entry
    "Czech Republic": "Czechia",
    "East Timor": "Timor-Leste",
    "Cote d'Ivoire": "Côte d'Ivoire",
    "Brunei Darussalam": "Brunei",
    "Micronesia, Fed. Sts.": "Micronesia",
    "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
    "St. Kitts and Nevis": "Saint Kitts and Nevis",
    "St. Lucia": "Saint Lucia",
    "Cape Verde": "Cabo Verde",
    "Viet Nam": "Vietnam",              # Standardize if needed
    "Burma": "Myanmar"                  # Map to the more common name if present
}

# 3. Apply Corrections
gdp_df['Country'] = gdp_df['Country'].replace(name_corrections)
demo_df['Country'] = demo_df['Country'].replace(name_corrections)

# 4. Merge Dataframes (Inner Join)
# We keep only countries that exist in ALL three datasets
merged_df = counts_df.merge(gdp_df, on='Country', how='inner')
merged_df = merged_df.merge(demo_df, on='Country', how='inner')

# 5. Save to CSV
output_filename = 'final_country_data.csv'
merged_df.to_csv(output_filename, index=False)

print(f"Successfully created '{output_filename}' with {len(merged_df)} countries.")
print(merged_df.head(10))

Successfully created 'final_country_data.csv' with 166 countries.
          Country  Frequency  Average_GDP_Last_10_Years_Billion_USD  \
0   United States     172622                               20264.70   
1          Russia     113772                                1458.35   
2  United Kingdom      62493                                3087.75   
3           China      61547                               14845.18   
4         Ukraine      61369                                  90.72   
5      Kyrgyzstan      53538                                   7.93   
6     New Zealand      49726                                 202.83   
7    South Africa      47001                                 354.77   
8           Syria      42274                                  15.89   
9         Germany      42221                                3608.70   

   Average_Democracy_Score  
0                     8.03  
1                     3.45  
2                     8.30  
3                     2.80  
4      

In [2]:
import pandas as pd
import glob

def process_military_data():
    # Get all military csv files
    files = glob.glob('*_military.csv')
    
    dfs = []
    
    # Standardize column names across different years
    col_map = {
        'name': 'Country',
        'COUNTRY': 'Country',
        'country': 'Country',
        '2020 ranking': 'Country',
        'power_index': 'PowerIndex',
        'POWER INDEX': 'PowerIndex',
        'pwr_index': 'PowerIndex',
        'Unnamed: 1': 'PowerIndex'
    }
    
    for file in files:
        try:
            df = pd.read_csv(file)
            
            # Normalize column names
            df = df.rename(columns=col_map)
            
            # Check for required columns
            if 'Country' in df.columns and 'PowerIndex' in df.columns:
                subset = df[['Country', 'PowerIndex']].copy()
                
                # Convert to numeric, handle errors
                subset['PowerIndex'] = pd.to_numeric(subset['PowerIndex'], errors='coerce')
                subset = subset.dropna(subset=['PowerIndex'])
                
                # Clean country names
                subset['Country'] = subset['Country'].astype(str).str.strip()
                
                # --- FIX 1: Combine "Turkey" and "Türkiye" ---
                subset['Country'] = subset['Country'].replace({
                    'Turkey': 'Türkiye',
                    'Turkiye': 'Türkiye'
                })
                
                dfs.append(subset)
                
        except Exception as e:
            print(f"Error processing {file}: {e}")
    
    if dfs:
        # Combine all data
        combined_df = pd.concat(dfs, ignore_index=True)
        
        # Calculate Average
        result = combined_df.groupby('Country')['PowerIndex'].mean().reset_index()
        
        # --- FIX 2: Round to 2 decimal places ---
        result['PowerIndex'] = result['PowerIndex'].round(2)
        
        # Save to CSV
        output_filename = 'country_average_power_index.csv'
        result.to_csv(output_filename, index=False)
        
        print(f"Successfully created '{output_filename}'")
        print(result[result['Country'] == 'Türkiye'])  # Verify the fix
        print(result.head())
    else:
        print("No data found.")

if __name__ == "__main__":
    process_military_data()

Successfully created 'country_average_power_index.csv'
     Country  PowerIndex
134  Türkiye        0.21
       Country  PowerIndex
0  Afghanistan        2.21
1      Albania        2.16
2      Algeria        0.43
3       Angola        0.93
4    Argentina        0.56


In [3]:
import pandas as pd

# 1. Load the Datasets
final_df = pd.read_csv('final_country_data.csv')
power_df = pd.read_csv('country_average_power_index.csv')

# 2. Impute Values for Hong Kong and Palestine
# Logic: Find the "weakest" score in the current list (Highest Value) and use that as a baseline.
weakest_score = power_df['PowerIndex'].max()
imputed_val = round(weakest_score * 1.1, 2)  # 10% higher (weaker) than the weakest ranked nation

# Create entries
imputed_data = pd.DataFrame([
    {'Country': 'Hong Kong', 'PowerIndex': imputed_val},
    {'Country': 'Palestine', 'PowerIndex': imputed_val}
])

# Add to the military dataframe
power_df = pd.concat([power_df, imputed_data], ignore_index=True)

# 3. Fix Naming Mismatches
# Mapping Military Dataset Names -> Final Dataset Names
name_map = {
    'Ivory Coast': "Côte d'Ivoire",
    'Democratic Republic of the Congo': 'Congo', # "Congo" in your text usually refers to DRC
    'Republic of the Congo': 'Congo',           # Merging both under "Congo"
    'Beliz': 'Belize'
}

power_df['Country'] = power_df['Country'].replace(name_map)

# Handle Duplicates: If both Congos map to "Congo", take the average score
power_df = power_df.groupby('Country')['PowerIndex'].mean().reset_index()

# 4. Merge Datasets
# Inner join to keep only countries that exist in ALL datasets (Freq, GDP, Demo, Military)
final_complete_df = final_df.merge(power_df, on='Country', how='inner')

# 5. Save Final Result
output_file = 'final_complete_dataset.csv'
final_complete_df.to_csv(output_file, index=False)

print(f"Success! Saved merged dataset with {len(final_complete_df)} countries.")
print(final_complete_df.head())
print("\nCheck Imputed Countries:")
print(final_complete_df[final_complete_df['Country'].isin(['Hong Kong', 'Palestine'])])

Success! Saved merged dataset with 145 countries.
          Country  Frequency  Average_GDP_Last_10_Years_Billion_USD  \
0   United States     172622                               20264.70   
1          Russia     113772                                1458.35   
2  United Kingdom      62493                                3087.75   
3           China      61547                               14845.18   
4         Ukraine      61369                                  90.72   

   Average_Democracy_Score  PowerIndex  
0                     8.03        0.07  
1                     3.45        0.07  
2                     8.30        0.17  
3                     2.80        0.07  
4                     5.86        0.37  

Check Imputed Countries:
       Country  Frequency  Average_GDP_Last_10_Years_Billion_USD  \
60   Palestine      15580                                  14.66   
144  Hong Kong       1074                                 324.57   

     Average_Democracy_Score  PowerIndex  
60 