In [1]:

import pandas as pd

In [2]:
file_path = "Education price index (EPI), elementary and secondary.csv"
df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Index categories,Base year,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1971,Canada,,Education price index (EPI),Base year 1986=100,Index,160,units,0,v1026884,1.1.1,29.82,,,,2
1,1971,Canada,,Education price index (EPI),Base year 1992=100,Index,160,units,0,v1026885,1.1.2,22.82,,,,2
2,1971,Canada,,Salaries and wages sub-index,Base year 1986=100,Index,160,units,0,v1026886,1.2.1,30.51,,,,2
3,1971,Canada,,Salaries and wages sub-index,Base year 1992=100,Index,160,units,0,v1026887,1.2.2,23.36,,,,2
4,1971,Canada,,Teachers' salaries sub-index,Base year 1986=100,Index,160,units,0,v1026888,1.3.1,30.91,,,,2


In [4]:
# Define the list of GEO values to keep
geo_list = [
    "Canada", "Quebec", "Ontario", "British Columbia", "Alberta", 
    "Manitoba", "New Brunswick", "Newfoundland and Labrador", 
    "Nova Scotia", "Saskatchewan", "Prince Edward Island"
]

# Filter the DataFrame to keep only these GEO values
df = df[df['GEO'].isin(geo_list)].reset_index(drop=True)

In [5]:
df

Unnamed: 0,REF_DATE,GEO,DGUID,Index categories,Base year,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1971,Canada,,Education price index (EPI),Base year 1986=100,Index,160,units,0,v1026884,1.1.1,29.82,,,,2
1,1971,Canada,,Education price index (EPI),Base year 1992=100,Index,160,units,0,v1026885,1.1.2,22.82,,,,2
2,1971,Canada,,Salaries and wages sub-index,Base year 1986=100,Index,160,units,0,v1026886,1.2.1,30.51,,,,2
3,1971,Canada,,Salaries and wages sub-index,Base year 1992=100,Index,160,units,0,v1026887,1.2.2,23.36,,,,2
4,1971,Canada,,Teachers' salaries sub-index,Base year 1986=100,Index,160,units,0,v1026888,1.3.1,30.91,,,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5803,2003,British Columbia,,Instructional supplies sub-index,Base year 1992=100,Index,160,units,0,v1027055,11.6.2,168.51,,,,2
5804,2003,British Columbia,,"School facilities, supplies and services sub-i...",Base year 1986=100,Index,160,units,0,v1027056,11.7.1,146.89,,,,2
5805,2003,British Columbia,,"School facilities, supplies and services sub-i...",Base year 1992=100,Index,160,units,0,v1027057,11.7.2,133.22,,,,2
5806,2003,British Columbia,,Fees and contractual services sub-index,Base year 1986=100,Index,160,units,0,v1027058,11.8.1,196.16,,,,2


In [6]:
# Keep only the required columns
df = df[['REF_DATE', 'GEO', 'Index categories', 'VALUE']]

In [7]:
df.head()

Unnamed: 0,REF_DATE,GEO,Index categories,VALUE
0,1971,Canada,Education price index (EPI),29.82
1,1971,Canada,Education price index (EPI),22.82
2,1971,Canada,Salaries and wages sub-index,30.51
3,1971,Canada,Salaries and wages sub-index,23.36
4,1971,Canada,Teachers' salaries sub-index,30.91


In [8]:
# Adding index column
df.insert(0, 'Index', range(1, len(df) + 1))

In [9]:
df.head()

Unnamed: 0,Index,REF_DATE,GEO,Index categories,VALUE
0,1,1971,Canada,Education price index (EPI),29.82
1,2,1971,Canada,Education price index (EPI),22.82
2,3,1971,Canada,Salaries and wages sub-index,30.51
3,4,1971,Canada,Salaries and wages sub-index,23.36
4,5,1971,Canada,Teachers' salaries sub-index,30.91


In [10]:
#pivoting the Graduation rate column
df_pivot = df.pivot(index=['REF_DATE', 'GEO', 'Index'], columns="Index categories", values='VALUE').reset_index()

In [12]:
df_pivot

Index categories,REF_DATE,GEO,Index,Education price index (EPI),Fees and contractual services sub-index,Instructional supplies sub-index,Non-salary sub-index,Non-teaching salaries sub-index,Salaries and wages sub-index,"School facilities, supplies and services sub-index",Teachers' salaries sub-index
0,1971,Alberta,145,30.13,,,,,,,
1,1971,Alberta,146,23.92,,,,,,,
2,1971,Alberta,147,,,,,,30.76,,
3,1971,Alberta,148,,,,,,24.53,,
4,1971,Alberta,149,,,,,,,,31.33
...,...,...,...,...,...,...,...,...,...,...,...
5803,2003,Saskatchewan,5772,,,168.51,,,,,
5804,2003,Saskatchewan,5773,,,,,,,151.88,
5805,2003,Saskatchewan,5774,,,,,,,136.51,
5806,2003,Saskatchewan,5775,,208.37,,,,,,


In [13]:

df_pivot.reset_index(drop=True, inplace=True)
df_pivot.columns

Index(['REF_DATE', 'GEO', 'Index', 'Education price index (EPI)',
       'Fees and contractual services sub-index',
       'Instructional supplies sub-index', 'Non-salary sub-index',
       'Non-teaching salaries sub-index', 'Salaries and wages sub-index',
       'School facilities, supplies and services sub-index',
       'Teachers' salaries sub-index'],
      dtype='object', name='Index categories')

In [14]:
grouped_df = df_pivot.groupby(['REF_DATE', 'GEO'], as_index=False).agg({
    "Index": "first",
    "Education price index (EPI)": "first",
    "Fees and contractual services sub-index": "first",
    "Instructional supplies sub-index": "first",
    "Non-salary sub-index": "first",
    "Non-teaching salaries sub-index": "first",
    "Salaries and wages sub-index": "first",
    "School facilities, supplies and services sub-index": "first",
    "Teachers' salaries sub-index": "first"  # No issue inside double quotes
})


In [15]:
grouped_df


Index categories,REF_DATE,GEO,Index,Education price index (EPI),Fees and contractual services sub-index,Instructional supplies sub-index,Non-salary sub-index,Non-teaching salaries sub-index,Salaries and wages sub-index,"School facilities, supplies and services sub-index",Teachers' salaries sub-index
0,1971,Alberta,145,30.13,29.86,28.19,27.24,27.59,30.76,23.58,31.33
1,1971,British Columbia,161,30.19,25.15,28.19,25.06,27.79,31.13,22.67,31.82
2,1971,Canada,1,29.82,28.81,28.19,26.93,27.65,30.51,23.51,30.91
3,1971,Manitoba,113,29.49,31.80,28.19,27.75,27.85,29.88,23.81,30.22
4,1971,New Brunswick,65,27.93,31.71,28.19,27.28,27.76,28.10,24.67,28.22
...,...,...,...,...,...,...,...,...,...,...,...
358,2003,Nova Scotia,5681,149.55,213.64,206.00,176.70,154.86,146.35,130.50,145.24
359,2003,Ontario,5729,162.95,212.21,206.00,198.38,148.67,155.17,172.16,155.94
360,2003,Prince Edward Island,5665,157.39,209.86,206.00,171.32,164.34,155.95,139.54,154.49
361,2003,Quebec,5713,165.60,211.06,206.00,199.46,153.07,157.36,156.89,157.82


In [16]:
#Dropping the Index column
grouped_df.drop(columns=['Index'], inplace=True)

In [17]:
grouped_df

Index categories,REF_DATE,GEO,Education price index (EPI),Fees and contractual services sub-index,Instructional supplies sub-index,Non-salary sub-index,Non-teaching salaries sub-index,Salaries and wages sub-index,"School facilities, supplies and services sub-index",Teachers' salaries sub-index
0,1971,Alberta,30.13,29.86,28.19,27.24,27.59,30.76,23.58,31.33
1,1971,British Columbia,30.19,25.15,28.19,25.06,27.79,31.13,22.67,31.82
2,1971,Canada,29.82,28.81,28.19,26.93,27.65,30.51,23.51,30.91
3,1971,Manitoba,29.49,31.80,28.19,27.75,27.85,29.88,23.81,30.22
4,1971,New Brunswick,27.93,31.71,28.19,27.28,27.76,28.10,24.67,28.22
...,...,...,...,...,...,...,...,...,...,...
358,2003,Nova Scotia,149.55,213.64,206.00,176.70,154.86,146.35,130.50,145.24
359,2003,Ontario,162.95,212.21,206.00,198.38,148.67,155.17,172.16,155.94
360,2003,Prince Edward Island,157.39,209.86,206.00,171.32,164.34,155.95,139.54,154.49
361,2003,Quebec,165.60,211.06,206.00,199.46,153.07,157.36,156.89,157.82


In [18]:

# Remove column index name if it exists
grouped_df.columns.name = None  

In [19]:
grouped_df

Unnamed: 0,REF_DATE,GEO,Education price index (EPI),Fees and contractual services sub-index,Instructional supplies sub-index,Non-salary sub-index,Non-teaching salaries sub-index,Salaries and wages sub-index,"School facilities, supplies and services sub-index",Teachers' salaries sub-index
0,1971,Alberta,30.13,29.86,28.19,27.24,27.59,30.76,23.58,31.33
1,1971,British Columbia,30.19,25.15,28.19,25.06,27.79,31.13,22.67,31.82
2,1971,Canada,29.82,28.81,28.19,26.93,27.65,30.51,23.51,30.91
3,1971,Manitoba,29.49,31.80,28.19,27.75,27.85,29.88,23.81,30.22
4,1971,New Brunswick,27.93,31.71,28.19,27.28,27.76,28.10,24.67,28.22
...,...,...,...,...,...,...,...,...,...,...
358,2003,Nova Scotia,149.55,213.64,206.00,176.70,154.86,146.35,130.50,145.24
359,2003,Ontario,162.95,212.21,206.00,198.38,148.67,155.17,172.16,155.94
360,2003,Prince Edward Island,157.39,209.86,206.00,171.32,164.34,155.95,139.54,154.49
361,2003,Quebec,165.60,211.06,206.00,199.46,153.07,157.36,156.89,157.82


In [20]:
grouped_df.to_csv("Prepared_EPI.csv", index=False)