In [3]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import os

In [4]:
# Read the project CSV straight from its raw GitHub URL
DATA_URL = 'https://raw.githubusercontent.com/lmuir21/350toinfinity/main/data/top15_countries_with_yoy.csv'
df = pd.read_csv(DATA_URL)

# Preview the first rows to confirm the load worked
df.head()

Unnamed: 0,country,year,GDP_per_capita,GDP_growth,Primary_enrollment,Secondary_enrollment,Tertiary_enrollment,Education_expenditure,Population,YoY_GDP_per_capita,YoY_Primary_enrollment,YoY_Secondary_enrollment,YoY_Tertiary_enrollment
0,BGD,2000,620.560473,5.293295,,48.719131,5.46036,2.12508,134544304.0,,,,
1,BGD,2001,641.288923,5.077288,,50.31414,6.4244,2.17193,136805810.0,3.34,,3.27,17.66
2,BGD,2002,655.672139,3.833124,,51.418098,6.13537,2.01715,138933658.0,2.24,,2.19,-4.5
3,BGD,2003,676.826252,4.739567,,51.25708,6.19689,2.06939,140970351.0,3.23,,-0.31,1.0
4,BGD,2004,702.656357,5.239533,,47.86414,5.72884,1.94014,142902856.0,3.82,,-6.62,-7.55


In [5]:
# Report the frame's dimensions as a (rows, columns) tuple
row_count, column_count = df.shape
print((row_count, column_count))

(345, 13)


In [None]:
# Standardize the three enrollment columns.
# Missing values are not handled here; a later cell imputes them and re-fits the scaler.
features = ['Primary_enrollment', 'Secondary_enrollment', 'Tertiary_enrollment']
x = df[features]

# Fit the scaler first, then transform, as two explicit steps
scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

In [None]:
# Count the missing (NaN) values in each enrollment column
features = ['Primary_enrollment', 'Secondary_enrollment', 'Tertiary_enrollment']

# Boolean mask of missing cells, summed column by column
missing_mask = df[features].isna()
nan_counts = missing_mask.sum()

print(nan_counts)

In [None]:
# Share of missing values per enrollment column, expressed as a percentage
missing_fraction = df[features].isna().mean()
nan_percentage = missing_fraction * 100
print(nan_percentage)

In [None]:
# Enrollment columns that feed the Education Progress Index (EPI)
features = ['Primary_enrollment', 'Secondary_enrollment', 'Tertiary_enrollment']

# Impute missing values with each column's overall mean so PCA receives a complete matrix
x = df[features].fillna(df[features].mean())

# Standardize so every enrollment level contributes on the same scale
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

# The EPI is the first principal component of the standardized enrollments.
# fit_transform returns an (n_rows, 1) array; take column 0 to get one value per row.
pca = PCA(n_components=1)
epi = pca.fit_transform(x_scaled)[:, 0]

# The sign of a principal component is arbitrary. Orient the index so that its
# loadings are net positive, i.e. higher enrollment maps to a higher EPI.
# Only `epi` is flipped; `pca.components_` keeps the sign chosen by scikit-learn.
if pca.components_[0].sum() < 0:
    epi = -epi

# Build the output frame as a new object (no assignment onto a slice of df),
# aligned to the rows that were fed into the scaler
df_clean = df.loc[x.index].assign(Education_Progress_Index=epi)

# Make sure the 'data/' folder exists before writing
os.makedirs('data', exist_ok=True)

# Save the dataframe; later cells reload it from this path
df_clean.to_csv('data/epi_top15.csv', index=False)

In [None]:
# Display the raw input frame for visual inspection
df

In [None]:
# Display the mean-imputed enrollment matrix that was passed to the scaler
x

In [None]:
# Load the cleaned file written by the EPI cell
df_clean = pd.read_csv('data/epi_top15.csv')

# Both growth measures compare a country's first and last observation, so each
# country's rows must be in chronological order before taking iloc[0] / iloc[-1]
df_sorted = df_clean.sort_values(['country', 'year'])


def _first_to_last_change(values):
    """Return the last minus the first non-missing value (NaN if nothing is observed)."""
    observed = values.dropna()
    if observed.empty:
        return np.nan
    return observed.iloc[-1] - observed.iloc[0]


def _gdp_cagr(country_rows):
    """Return the GDP-per-capita compound annual growth rate in percent.

    Uses the first and last years that actually have a GDP value and derives the
    number of compounding periods from those years instead of assuming a fixed span.
    Returns NaN when fewer than two observations exist or the start value is not positive.
    """
    observed = country_rows.dropna(subset=['GDP_per_capita'])
    if len(observed) < 2:
        return np.nan
    start_value = observed['GDP_per_capita'].iloc[0]
    end_value = observed['GDP_per_capita'].iloc[-1]
    n_years = observed['year'].iloc[-1] - observed['year'].iloc[0]
    if n_years <= 0 or start_value <= 0:
        return np.nan
    return ((end_value / start_value) ** (1 / n_years) - 1) * 100


# Total EPI change per country
epi_change = df_sorted.groupby('country')['Education_Progress_Index'].apply(_first_to_last_change)

# GDP per capita CAGR per country (only the columns the helper needs are passed in)
gdp_cagr = df_sorted.groupby('country')[['year', 'GDP_per_capita']].apply(_gdp_cagr)

# Combine into one row per country
growth_df = pd.DataFrame({
    'EPI_change': epi_change,
    'GDP_CAGR': gdp_cagr
}).reset_index()

# View the results
growth_df

In [None]:
# Set up your plot
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter each country's EPI change against its GDP growth
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(data=growth_df, x='GDP_CAGR', y='EPI_change', ax=ax)

# Dashed zero lines split the plane into four quadrants
for draw_zero_line in (ax.axhline, ax.axvline):
    draw_zero_line(0, color='black', linestyle='--')

# Title and axis labels
ax.set_title('GDP Growth vs. Education Progress')
ax.set_xlabel('GDP per Capita CAGR (%)')
ax.set_ylabel('Education Progress Index Change')

# Write the figure to disk, creating the output folder if needed
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/gdp_vs_epi_quadrants.png')

# Render the figure in the notebook
plt.show()

In [None]:
# Derive the quadrant labels here so this cell runs on a fresh kernel: the
# 'Quadrant' column of growth_df is only created by a later cell. The rules
# mirror classify_country, including its fall-through to the last label when a
# value is missing (NaN fails every comparison).
gdp_up = growth_df['GDP_CAGR'] > 0
gdp_flat_or_down = growth_df['GDP_CAGR'] <= 0
epi_up = growth_df['EPI_change'] > 0
epi_flat_or_down = growth_df['EPI_change'] <= 0
quadrant_labels = np.select(
    [gdp_up & epi_up, gdp_up & epi_flat_or_down, gdp_flat_or_down & epi_up],
    ['Smart + Rich', 'Rich but Not Smarter', 'Smarter but Poorer'],
    default='Losing on Both Fronts',
)
plot_df = growth_df.assign(Quadrant=quadrant_labels)

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=plot_df, x='GDP_CAGR', y='EPI_change', hue='Quadrant', palette='Set2', ax=ax)

# Dashed zero lines mark the quadrant boundaries
ax.axhline(0, color='black', linestyle='--')
ax.axvline(0, color='black', linestyle='--')

# Annotate each point with its country code. Iterating over rows avoids
# label-based `series[i]` lookups, which only work with a default RangeIndex.
# Rows without finite coordinates cannot be placed and are skipped.
for row in plot_df.dropna(subset=['GDP_CAGR', 'EPI_change']).itertuples(index=False):
    ax.text(x=row.GDP_CAGR + 0.1,
            y=row.EPI_change,
            s=row.country,
            fontdict=dict(color='black', size=8))

ax.set_title('GDP Growth vs. Education Progress (Colored by Quadrant)')
ax.set_xlabel('GDP per Capita CAGR (%)')
ax.set_ylabel('Education Progress Index Change')
ax.legend(loc='best')
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/gdp_vs_epi_quadrants_colored.png')
plt.show()

In [None]:
# Quadrant classifier for one country's growth figures
def classify_country(row):
    """Return the quadrant label for a row holding 'GDP_CAGR' and 'EPI_change'.

    A value counts as growth only when it is strictly positive. Any combination
    not matched by an explicit rule (including missing values, which fail every
    comparison) falls through to 'Losing on Both Fronts'.
    """
    gdp_growth = row['GDP_CAGR']
    epi_growth = row['EPI_change']

    if gdp_growth > 0:
        if epi_growth > 0:
            return 'Smart + Rich'
        if epi_growth <= 0:
            return 'Rich but Not Smarter'
    if gdp_growth <= 0 and epi_growth > 0:
        return 'Smarter but Poorer'
    return 'Losing on Both Fronts'

# Label every country by running the classifier row by row
quadrant_labels = growth_df.apply(classify_country, axis=1)
growth_df['Quadrant'] = quadrant_labels

# Show the frame with its new 'Quadrant' column
growth_df

In [None]:
# Number of countries that fall into each quadrant label
growth_df['Quadrant'].value_counts()

In [None]:
# Countries to chart individually
selected_countries = ['ETH', 'IND', 'CHN', 'PAK', 'EGY', 'COD']

# Enrollment column -> legend label, in plotting order
enrollment_labels = {
    'Primary_enrollment': 'Primary Enrollment',
    'Secondary_enrollment': 'Secondary Enrollment',
    'Tertiary_enrollment': 'Tertiary Enrollment',
}

# Two figures per country: GDP per capita, then the three enrollment rates
for country_code in selected_countries:

    # Rows belonging to this country
    country_df = df_clean[df_clean['country'] == country_code]
    years = country_df['year']

    # ---- Plot 1: GDP per Capita over Time ----
    fig, ax = plt.subplots(figsize=(10,5))
    ax.plot(years, country_df['GDP_per_capita'], marker='o')
    ax.set_title(f'{country_code} - GDP per Capita Over Time')
    ax.set_xlabel('Year')
    ax.set_ylabel('GDP per Capita (constant 2015 US$)')
    ax.grid(True)
    fig.tight_layout()
    os.makedirs('figures', exist_ok=True)
    fig.savefig(f'figures/{country_code}_gdp_per_capita.png')
    plt.show()

    # ---- Plot 2: Enrollment Trends over Time ----
    fig, ax = plt.subplots(figsize=(10,5))
    for column, legend_label in enrollment_labels.items():
        ax.plot(years, country_df[column], marker='o', label=legend_label)
    ax.set_title(f'{country_code} - Enrollment Rates Over Time')
    ax.set_xlabel('Year')
    ax.set_ylabel('Enrollment Rate (%)')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(f'figures/{country_code}_enrollment_trends.png')
    plt.show()


In [None]:
from numpy.polynomial.polynomial import Polynomial

# List of selected countries
selected_countries = ['ETH', 'IND', 'CHN', 'PAK', 'EGY', 'COD']

# Note: the later interpolated-trend cell writes to the same figure file names
# and will overwrite the files saved here.
for country_code in selected_countries:

    # Filter the data
    country_df = df_clean[df_clean['country'] == country_code]

    # --- Plot 1: GDP per Capita with Trend Line ---
    plt.figure(figsize=(10,5))
    plt.plot(country_df['year'], country_df['GDP_per_capita'], marker='o', label='GDP per Capita')

    # Best fit line for GDP. np.polyfit cannot handle NaN, so the line is fitted
    # on the observed points only and then drawn across all of the country's years.
    observed_gdp = country_df[['year', 'GDP_per_capita']].dropna()
    if len(observed_gdp) >= 2:
        z_gdp = np.polyfit(observed_gdp['year'], observed_gdp['GDP_per_capita'], 1)
        p_gdp = np.poly1d(z_gdp)
        plt.plot(country_df['year'], p_gdp(country_df['year']), "r--", label='Trend Line')
    else:
        print(f"Not enough data to fit trendline for GDP_per_capita in {country_code}")

    plt.title(f'{country_code} - GDP per Capita Over Time')
    plt.xlabel('Year')
    plt.ylabel('GDP per Capita (constant 2015 US$)')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    os.makedirs('figures', exist_ok=True)
    plt.savefig(f'figures/{country_code}_gdp_per_capita_trend.png')
    plt.show()

    # --- Plot 2: Enrollment Rates with Trend Lines ---
    plt.figure(figsize=(10,5))

    for col, color in zip(['Primary_enrollment', 'Secondary_enrollment', 'Tertiary_enrollment'], ['blue', 'green', 'orange']):
        plt.plot(country_df['year'], country_df[col], marker='o', label=col)

        # Best fit line for each enrollment type, again using observed points only;
        # a straight line needs at least two of them
        observed_enroll = country_df[['year', col]].dropna()
        if len(observed_enroll) >= 2:
            z_enroll = np.polyfit(observed_enroll['year'], observed_enroll[col], 1)
            p_enroll = np.poly1d(z_enroll)
            plt.plot(country_df['year'], p_enroll(country_df['year']), linestyle='--', color=color)
        else:
            print(f"Not enough data to fit trendline for {col} in {country_code}")

    plt.title(f'{country_code} - Enrollment Rates Over Time')
    plt.xlabel('Year')
    plt.ylabel('Enrollment Rate (%)')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'figures/{country_code}_enrollment_trends_trend.png')
    plt.show()


In [None]:
from numpy.polynomial.polynomial import Polynomial
import os
import numpy as np
import matplotlib.pyplot as plt

# List of selected countries
selected_countries = ['ETH', 'IND', 'CHN', 'PAK', 'EGY', 'COD']

for country_code in selected_countries:

    # Filter the data
    country_df = df_clean[df_clean['country'] == country_code]

    # --- Plot 1: GDP per Capita with Trend Line ---
    plt.figure(figsize=(10,5))
    plt.plot(country_df['year'], country_df['GDP_per_capita'], marker='o', label='GDP per Capita')

    # Best fit line for GDP on the linearly interpolated series.
    # Interpolation does not fill values before the first observation, so any
    # rows that are still missing are dropped before fitting (polyfit cannot handle NaN).
    temp_gdp = country_df[['year', 'GDP_per_capita']].copy()
    temp_gdp['GDP_per_capita'] = temp_gdp['GDP_per_capita'].interpolate(method='linear')  # Interpolate missing GDPs
    temp_gdp = temp_gdp.dropna()
    if len(temp_gdp) >= 2:
        z_gdp = np.polyfit(temp_gdp['year'], temp_gdp['GDP_per_capita'], 1)
        p_gdp = np.poly1d(z_gdp)
        plt.plot(temp_gdp['year'], p_gdp(temp_gdp['year']), "r--", label='Trend Line (Interpolated)')
    else:
        print(f"⚠️ Not enough data to fit trendline for GDP_per_capita in {country_code}")

    plt.title(f'{country_code} - GDP per Capita Over Time')
    plt.xlabel('Year')
    plt.ylabel('GDP per Capita (constant 2015 US$)')
    plt.legend(bbox_to_anchor=(1.15, 1), loc='upper left')
    plt.grid(True)
    plt.tight_layout()
    os.makedirs('figures', exist_ok=True)
    plt.savefig(f'figures/{country_code}_gdp_per_capita_trend.png')
    plt.show()

    # --- Plot 2: Enrollment Rates with Trend Lines ---
    plt.figure(figsize=(10,5))

    for col, color in zip(['Primary_enrollment', 'Secondary_enrollment', 'Tertiary_enrollment'], ['blue', 'green', 'orange']):
        plt.plot(country_df['year'], country_df[col], marker='o', label=col)

        # Interpolate missing values, then drop what interpolation could not fill
        # (e.g. years before the first observation) so the fit never sees NaN
        temp_enroll = country_df[['year', col]].copy()
        temp_enroll[col] = temp_enroll[col].interpolate(method='linear')
        temp_enroll = temp_enroll.dropna()

        # Only fit a trendline when at least two usable points remain
        if len(temp_enroll) >= 2:
            z_enroll = np.polyfit(temp_enroll['year'], temp_enroll[col], 1)
            p_enroll = np.poly1d(z_enroll)
            plt.plot(temp_enroll['year'], p_enroll(temp_enroll['year']), linestyle='--', color=color, label=f"{col} Trend (Interpolated)")
        else:
            print(f"⚠️ Not enough data to fit trendline for {col} in {country_code}")

    plt.title(f'{country_code} - Enrollment Rates Over Time')
    plt.xlabel('Year')
    plt.ylabel('Enrollment Rate (%)')
    plt.legend(bbox_to_anchor=(1.15, 1), loc='upper left')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'figures/{country_code}_enrollment_trends_trend.png')
    plt.show()


In [None]:
# --- Report how many missing enrollment values linear interpolation fills, per country ---
enrollment_columns = ['Primary_enrollment', 'Secondary_enrollment', 'Tertiary_enrollment']

for country_code in selected_countries:
    print(f"\n--- {country_code} ---")
    country_df = df_clean[df_clean['country'] == country_code]

    for col in enrollment_columns:
        # Filled points = missing values before interpolation minus those still missing after
        missing_before = country_df[col].isna().sum()
        missing_after = country_df[col].interpolate(method='linear').isna().sum()

        print(f"{col}: {missing_before - missing_after} points interpolated")


In [None]:
import pandas as pd

# Enrollment columns whose gaps are counted
enrollment_columns = ['Primary_enrollment', 'Secondary_enrollment', 'Tertiary_enrollment']


def _filled_by_interpolation(series):
    """Number of missing values in `series` that linear interpolation fills."""
    return series.isna().sum() - series.interpolate(method='linear').isna().sum()


# One record per (country, enrollment column) pair
interpolation_data = [
    {
        'Country': country_code,
        'Enrollment_Type': col,
        'Interpolated_Points': _filled_by_interpolation(
            df_clean.loc[df_clean['country'] == country_code, col]
        ),
    }
    for country_code in selected_countries
    for col in enrollment_columns
]

# Long-format table of the counts
interpolation_df = pd.DataFrame(interpolation_data)

# Reshape to one row per country and one column per enrollment type
pivot_table = interpolation_df.pivot(index='Country', columns='Enrollment_Type', values='Interpolated_Points')

# Display the pivot table
pivot_table


In [None]:
# Five countries with the largest EPI change and the five with the largest GDP CAGR
top_epi = growth_df.sort_values('EPI_change', ascending=False).head(5)
top_gdp = growth_df.sort_values('GDP_CAGR', ascending=False).head(5)

# Side-by-side horizontal bar charts, one per ranking
fig, axes = plt.subplots(1, 2, figsize=(15,6))

panels = [
    (top_epi, 'EPI_change', 'Top 5 Smartest Growth (Education Progress)'),
    (top_gdp, 'GDP_CAGR', 'Top 5 Richest Growth (GDP Growth)'),
]
for ax, (ranking, metric, panel_title) in zip(axes, panels):
    sns.barplot(x=metric, y='country', data=ranking, ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
fig.savefig('figures/top5_smart_vs_rich.png')
plt.show()

In [None]:
import plotly.express as px

# World map shaded by each country's EPI change
choropleth_options = {
    'locations': "country",        # column holding 3-letter ISO country codes
    'locationmode': "ISO-3",
    'color': "EPI_change",
    'color_continuous_scale': "Viridis",
    'title': "Education Progress Index (EPI) Change by Country",
}
fig = px.choropleth(growth_df, **choropleth_options)

# Save an interactive copy, then render it in the notebook
fig.write_html('figures/epi_world_map.html')
fig.show()

In [None]:
# Number of countries in each quadrant
quadrant_counts = growth_df['Quadrant'].value_counts()

# Pie chart of the quadrant shares
fig, ax = plt.subplots(figsize=(8,8))
quadrant_counts.plot.pie(autopct='%1.1f%%', startangle=140, ax=ax)
ax.set_title('Distribution of Countries by Growth Quadrant')
ax.set_ylabel('')  # hide the series-name label pandas would otherwise put on the y-axis
fig.savefig('figures/quadrant_pie_chart.png')
plt.show()

In [None]:
# Overall (all countries pooled) correlation of GDP per capita with each enrollment level
gdp_correlations = {
    level: df_clean[['GDP_per_capita', f'{level}_enrollment']].corr().iloc[0, 1]
    for level in ('Primary', 'Secondary', 'Tertiary')
}

# Keep the individual names available alongside the dictionary
correlation_primary = gdp_correlations['Primary']
correlation_secondary = gdp_correlations['Secondary']
correlation_tertiary = gdp_correlations['Tertiary']

for level, value in gdp_correlations.items():
    print(f"{level} Enrollment vs GDP Correlation: {value:.2f}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Enrollment levels to compare against GDP per capita
education_levels = ['Primary_enrollment', 'Secondary_enrollment', 'Tertiary_enrollment']

# One panel per level, sharing the y-axis
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)

for ax, level in zip(axes, education_levels):
    # Keep only the rows where both GDP and this enrollment level are present
    temp_df = df_clean[['GDP_per_capita', level]].dropna()

    # Correlation shown in the panel title
    corr = temp_df['GDP_per_capita'].corr(temp_df[level])

    # Scatter with a red regression line
    sns.regplot(
        data=temp_df,
        x='GDP_per_capita',
        y=level,
        ax=ax,
        scatter_kws={"alpha":0.5},
        line_kws={"color": "red"},
    )

    ax.set_title(f"{level} vs GDP per Capita\nCorrelation: {corr:.2f}")
    ax.set_xlabel('GDP per Capita (constant 2015 US$)')
    ax.set_ylabel('Enrollment Rate (%)')

fig.suptitle('Correlation between GDP and Education Levels', fontsize=16)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/gdp_vs_education_correlations.png')
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Step 1: Merge Quadrant info into the full country-year data.
# df_clean is loaded from the CSV and has no 'Quadrant' column of its own, so the
# per-country label is joined in from growth_df. The result is a new frame;
# df_clean itself is left untouched. Any stale 'Quadrant' column is dropped first
# so the merge cannot produce suffixed duplicates.
quadrant_lookup = growth_df[['country', 'Quadrant']]
df_quadrant = (
    df_clean
    .drop(columns='Quadrant', errors='ignore')
    .merge(quadrant_lookup, on='country', how='left')
)

# Step 2: Define which quadrants and features to plot
quadrants_to_compare = ['Smart + Rich', 'Rich but Not Smarter']
education_levels = ['Primary_enrollment', 'Secondary_enrollment', 'Tertiary_enrollment']

# Step 3: Set up the subplots (rows = education levels, columns = quadrants)
fig, axes = plt.subplots(len(education_levels), len(quadrants_to_compare), figsize=(15, 12), sharey=True)

# Step 4: Loop through each education level and quadrant
for i, level in enumerate(education_levels):
    for j, quad in enumerate(quadrants_to_compare):

        # Rows of this quadrant that have both GDP and the enrollment level
        temp_df = df_quadrant[df_quadrant['Quadrant'] == quad][['GDP_per_capita', level]].dropna()

        # A correlation and a regression line both need at least two points
        if len(temp_df) >= 2:
            # Calculate correlation
            corr = temp_df['GDP_per_capita'].corr(temp_df[level])

            # Scatterplot with trendline
            sns.regplot(
                x='GDP_per_capita',
                y=level,
                data=temp_df,
                ax=axes[i, j],
                line_kws={"color": "red"},
                scatter_kws={"alpha":0.6}
            )

            axes[i, j].set_title(f"{level}\n{quad}\nCorrelation: {corr:.2f}")
            axes[i, j].set_xlabel('GDP per Capita (constant 2015 US$)')
            axes[i, j].set_ylabel('Enrollment Rate (%)')
        else:
            # Not enough data: show a placeholder instead of an empty panel
            axes[i, j].text(0.5, 0.5, 'Not Enough Data', ha='center', va='center')
            axes[i, j].set_axis_off()

# Step 5: Final clean-up
plt.suptitle('Correlation between GDP and Education by Quadrant', fontsize=18)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/gdp_vs_education_by_quadrant.png')
plt.show()