In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import os
from matplotlib.patches import Patch
import seaborn as sns

---

# PLOTS: BUBBLE CHART

---

From the first dataset, we need to extract:
- % of the total population with post-mandatory studies in each neighborhood.
- % of immigrants with post-mandatory studies in each neighborhood.

In [None]:
# Build, for every (year, district, neighborhood), the share of residents with
# post-mandatory studies, both overall and among immigrants, and save it as CSV.

# Columns that identify one neighborhood in one reference year
GROUP_KEYS = ['Year_Reference', 'District', 'Neighborhood']


def _sum_by_neighborhood(frame, value_name):
    """Sum 'Value' per (year, district, neighborhood).

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to aggregate; must contain the GROUP_KEYS columns and 'Value'.
    value_name : str
        Name given to the aggregated 'Value' column in the result.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the GROUP_KEYS columns plus `value_name`.
    """
    return (
        frame.groupby(GROUP_KEYS)['Value']
        .sum()
        .reset_index()
        .rename(columns={'Value': value_name})
    )


# We read the dataset
df = pd.read_csv('generated_data/education_1997-2025.csv')

# Education levels that count as post-mandatory studies
postmandatory_studies = [
    'Upper secondary or post-secondary non-tertiary education',
    'Tertiary education',
]

# Birth places that count as immigrants (born outside Spain)
inmigrant_types = ['Rest of European Union', 'Rest of World']

# Total population in each neighborhood
total_pop = _sum_by_neighborhood(df, 'Total_Population')

# People with post-mandatory studies in each neighborhood
postmandatory = df[df['Education_Level'].isin(postmandatory_studies)]
postmandatory_total = _sum_by_neighborhood(postmandatory, 'Total_Postmandatory')

# Immigrants in each neighborhood
inmigrants = df[df['Birth_Place'].isin(inmigrant_types)]
total_inmigrants = _sum_by_neighborhood(inmigrants, 'Total_Inmigrants')

# Immigrants with post-mandatory studies in each neighborhood
postmandatory_inmigrants = _sum_by_neighborhood(
    inmigrants[inmigrants['Education_Level'].isin(postmandatory_studies)],
    'Postmandatory_Inmigrants',
)

# Combine everything in the same DataFrame.
# Every merge is a left join on total_pop so that a neighborhood is never
# dropped just because one of the filtered subsets has no rows for it
# (the first merge used to be an inner join, which silently removed them).
percentages = (
    total_pop
    .merge(postmandatory_total, on=GROUP_KEYS, how='left')
    .merge(postmandatory_inmigrants, on=GROUP_KEYS, how='left')
    .merge(total_inmigrants, on=GROUP_KEYS, how='left')
)

# A group that is missing from a filtered subset means "no such people": count 0
count_columns = ['Total_Postmandatory', 'Postmandatory_Inmigrants', 'Total_Inmigrants']
percentages[count_columns] = percentages[count_columns].fillna(0)

# Obtain percentages
percentages['Percentage_Postmandatory'] = (
    percentages['Total_Postmandatory'] / percentages['Total_Population'] * 100
)
percentages['Percentage_Postmandatory_Inmigrants'] = (
    percentages['Postmandatory_Inmigrants'] / percentages['Total_Inmigrants'] * 100
)

# Neighborhoods without immigrants give 0/0 = NaN; report them as 0 %
percentages['Percentage_Postmandatory_Inmigrants'] = (
    percentages['Percentage_Postmandatory_Inmigrants'].fillna(0)
)

# We save the dataset
percentages.to_csv('education_percentages_1997-2025.csv', index=False, encoding='utf-8')
percentages

FileNotFoundError: [Errno 2] No such file or directory: '../generated_data/education_1997-2025.csv'