In [32]:
import numpy as numpy
import pandas as pd

In [None]:
datasource = "owd-tertiary-education"  # Example datasource, can be changed
# datasource = "owd-government-expenditure-on-education-gdp"

match datasource:
    case "owd-tertiary-education":
        dataName = "share-of-the-population-with-completed-tertiary-education"
        saveName = "owd_education_tertiary_completion_rate"
        columnName = "Combined - percentage of 25-64 years adults with incomplete tertiary education"
        newColumnName = "TERTIARYCOMPLETIONRATE"
    case "owd-government-expenditure-on-education-gdp":
        dataName = "total-government-expenditure-on-education-gdp"
        saveName = "owd_education_government_expenditure_gdp"
        columnName = "Public spending on education as a share of GDP (historical and recent)"
        newColumnName = "EDUCATIONGDPSHARE"

        
        
# Read in the share-of-the-population-with-completed-tertiary-education dataset
df_raw = pd.read_csv(f'data/in/{dataName}.csv')



In [34]:
df_raw

Unnamed: 0,Entity,Code,Year,Public spending on education as a share of GDP (historical and recent)
0,France,FRA,1870,0.300000
1,France,FRA,1913,1.500000
2,France,FRA,1937,1.300000
3,France,FRA,1960,2.400000
4,France,FRA,1970,3.429740
...,...,...,...,...
296,United States,USA,2018,4.895020
297,United States,USA,2019,4.957470
298,United States,USA,2020,5.399980
299,United States,USA,2021,5.428300


In [35]:
regions = ['Africa','Asia', 'Europe', 'North America', 'Oceania', 'South America']
code = {"Africa": "AFR",
        "Asia": "ASI",
        "Europe": "EUR",
        "North America": "AMN",
        "Oceania": "OCE",
        "South America": "AMS"}

for region in regions:
    df_raw["Code"].loc[df_raw["Entity"]==region] = code[region]
    print(df_raw["Code"].loc[df_raw["Entity"]==region])
    



Series([], Name: Code, dtype: object)
Series([], Name: Code, dtype: object)
Series([], Name: Code, dtype: object)
Series([], Name: Code, dtype: object)
Series([], Name: Code, dtype: object)
Series([], Name: Code, dtype: object)


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_raw["Code"].loc[df_raw["Entity"]==region] = code[region]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_

In [36]:
df = df_raw[["Code", "Year", columnName]]
df = df.rename(columns={"Code": "COUNTRYCODE"})
df = df.rename(columns={"Year": "YEAR"})
df = df.rename(columns={columnName: newColumnName})
display(df)

Unnamed: 0,COUNTRYCODE,YEAR,EDUCATIONGDPSHARE
0,FRA,1870,0.300000
1,FRA,1913,1.500000
2,FRA,1937,1.300000
3,FRA,1960,2.400000
4,FRA,1970,3.429740
...,...,...,...
296,USA,2018,4.895020
297,USA,2019,4.957470
298,USA,2020,5.399980
299,USA,2021,5.428300


In [37]:

df.to_csv(f'data/out/{saveName}.csv', index=False)