In [1]:
import pandas as pd

In [2]:
file_path = "Elementary and secondary private schools, by type of expenditure.csv"
df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Type of expenditure,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1947/1948,Canada,,Total expenditures,Dollars,81,thousands,3,v1568007,1.1,13951.0,,,,0
1,1947/1948,Canada,,Total operating expenditures,Dollars,81,thousands,3,v1568013,1.7,13951.0,,,,0
2,1947/1948,Canada,,Subtotal instructional expenditures,Dollars,81,thousands,3,v1568010,1.4,3742.0,,,,0
3,1947/1948,Canada,,Teachers salaries,Dollars,81,thousands,3,v1568008,1.2,3742.0,,,,0
4,1947/1948,Canada,,Non-instructional salaries and wages,Dollars,81,thousands,3,v1568011,1.5,1965.0,,,,0


In [4]:
# Keep only the left year
df['REF_DATE'] = df['REF_DATE'].str[:4]

In [5]:
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Type of expenditure,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1947,Canada,,Total expenditures,Dollars,81,thousands,3,v1568007,1.1,13951.0,,,,0
1,1947,Canada,,Total operating expenditures,Dollars,81,thousands,3,v1568013,1.7,13951.0,,,,0
2,1947,Canada,,Subtotal instructional expenditures,Dollars,81,thousands,3,v1568010,1.4,3742.0,,,,0
3,1947,Canada,,Teachers salaries,Dollars,81,thousands,3,v1568008,1.2,3742.0,,,,0
4,1947,Canada,,Non-instructional salaries and wages,Dollars,81,thousands,3,v1568011,1.5,1965.0,,,,0


In [6]:
# Keep only the required columns
df = df[['REF_DATE', 'GEO', 'Type of expenditure', 'VALUE']]

In [7]:
df.head()

Unnamed: 0,REF_DATE,GEO,Type of expenditure,VALUE
0,1947,Canada,Total expenditures,13951.0
1,1947,Canada,Total operating expenditures,13951.0
2,1947,Canada,Subtotal instructional expenditures,3742.0
3,1947,Canada,Teachers salaries,3742.0
4,1947,Canada,Non-instructional salaries and wages,1965.0


In [8]:
# Filter the dataframe
filtered_df = df[df["Type of expenditure"].isin([
    "Total expenditures",
    "Total operating expenditures",
    "Teachers salaries",
    "Capita outlay and debt charges"
])]

In [9]:
filtered_df

Unnamed: 0,REF_DATE,GEO,Type of expenditure,VALUE
0,1947,Canada,Total expenditures,13951.0
1,1947,Canada,Total operating expenditures,13951.0
3,1947,Canada,Teachers salaries,3742.0
6,1947,Canada,Capita outlay and debt charges,0.0
7,1947,Atlantic Province,Total expenditures,744.0
...,...,...,...,...
5339,2002,Yukon,Capita outlay and debt charges,0.0
5340,2002,Northwest Territories,Total expenditures,303.0
5341,2002,Northwest Territories,Total operating expenditures,303.0
5343,2002,Northwest Territories,Teachers salaries,49.0


In [10]:
# Define the list of GEO values to keep
geo_list = [
    "Canada", "Quebec", "Ontario", "British Columbia", "Alberta", 
    "Manitoba", "New Brunswick", "Newfoundland and Labrador", 
    "Nova Scotia", "Saskatchewan", "Prince Edward Island"
]

# Filter the DataFrame to keep only these GEO values
filtered_df = filtered_df[filtered_df['GEO'].isin(geo_list)].reset_index(drop=True)

In [11]:
filtered_df

Unnamed: 0,REF_DATE,GEO,Type of expenditure,VALUE
0,1947,Canada,Total expenditures,13951.0
1,1947,Canada,Total operating expenditures,13951.0
2,1947,Canada,Teachers salaries,3742.0
3,1947,Canada,Capita outlay and debt charges,0.0
4,1947,Newfoundland and Labrador,Total expenditures,
...,...,...,...,...
2459,2002,Alberta,Capita outlay and debt charges,17373.0
2460,2002,British Columbia,Total expenditures,476518.0
2461,2002,British Columbia,Total operating expenditures,427825.0
2462,2002,British Columbia,Teachers salaries,245399.0


In [12]:
# Adding index column
filtered_df.insert(0, 'Index', range(1, len(filtered_df) + 1))

In [13]:
filtered_df

Unnamed: 0,Index,REF_DATE,GEO,Type of expenditure,VALUE
0,1,1947,Canada,Total expenditures,13951.0
1,2,1947,Canada,Total operating expenditures,13951.0
2,3,1947,Canada,Teachers salaries,3742.0
3,4,1947,Canada,Capita outlay and debt charges,0.0
4,5,1947,Newfoundland and Labrador,Total expenditures,
...,...,...,...,...,...
2459,2460,2002,Alberta,Capita outlay and debt charges,17373.0
2460,2461,2002,British Columbia,Total expenditures,476518.0
2461,2462,2002,British Columbia,Total operating expenditures,427825.0
2462,2463,2002,British Columbia,Teachers salaries,245399.0


In [14]:
#pivoting the Graduation rate column
df_pivot = filtered_df.pivot(index=['REF_DATE', 'GEO', 'Index'], columns="Type of expenditure", values='VALUE').reset_index()

In [15]:
df_pivot

Type of expenditure,REF_DATE,GEO,Index,Capita outlay and debt charges,Teachers salaries,Total expenditures,Total operating expenditures
0,1947,Alberta,37,,,528.0,
1,1947,Alberta,38,,,,528.0
2,1947,Alberta,39,,186.0,,
3,1947,Alberta,40,0.0,,,
4,1947,British Columbia,41,,,1265.0,
...,...,...,...,...,...,...,...
2459,2002,Quebec,2444,41789.0,,,
2460,2002,Saskatchewan,2453,,,36605.0,
2461,2002,Saskatchewan,2454,,,,35791.0
2462,2002,Saskatchewan,2455,,11148.0,,


In [16]:
df_pivot.reset_index(drop=True, inplace=True)
df_pivot.columns

Index(['REF_DATE', 'GEO', 'Index', 'Capita outlay and debt charges',
       'Teachers salaries', 'Total expenditures',
       'Total operating expenditures'],
      dtype='object', name='Type of expenditure')

In [17]:
## Group the data by REF_DATE and GEO without setting them as index
grouped_df = df_pivot.groupby(['REF_DATE', 'GEO'], as_index=False).agg({
    'Index': 'first',  
    'Capita outlay and debt charges': 'first',    
    'Teachers salaries': 'first',
    'Total expenditures': 'first',
    'Total operating expenditures': 'first'
})

In [18]:

grouped_df

Type of expenditure,REF_DATE,GEO,Index,Capita outlay and debt charges,Teachers salaries,Total expenditures,Total operating expenditures
0,1947,Alberta,37,0.0,186.0,528.0,528.0
1,1947,British Columbia,41,0.0,224.0,1265.0,1265.0
2,1947,Canada,1,0.0,3742.0,13951.0,13951.0
3,1947,Manitoba,29,0.0,106.0,556.0,556.0
4,1947,New Brunswick,17,,,,
...,...,...,...,...,...,...,...
611,2002,Nova Scotia,2433,,,,
612,2002,Ontario,2445,99215.0,452073.0,1023249.0,924033.0
613,2002,Prince Edward Island,2429,,,,
614,2002,Quebec,2441,41789.0,390213.0,821906.0,780117.0


In [19]:
#Dropping the Index column
grouped_df.drop(columns=['Index'], inplace=True)

In [20]:

# Remove column index name if it exists
grouped_df.columns.name = None  

In [21]:
grouped_df

Unnamed: 0,REF_DATE,GEO,Capita outlay and debt charges,Teachers salaries,Total expenditures,Total operating expenditures
0,1947,Alberta,0.0,186.0,528.0,528.0
1,1947,British Columbia,0.0,224.0,1265.0,1265.0
2,1947,Canada,0.0,3742.0,13951.0,13951.0
3,1947,Manitoba,0.0,106.0,556.0,556.0
4,1947,New Brunswick,,,,
...,...,...,...,...,...,...
611,2002,Nova Scotia,,,,
612,2002,Ontario,99215.0,452073.0,1023249.0,924033.0
613,2002,Prince Edward Island,,,,
614,2002,Quebec,41789.0,390213.0,821906.0,780117.0


In [22]:
grouped_df = grouped_df.interpolate(method='linear')

  grouped_df = grouped_df.interpolate(method='linear')


In [23]:
grouped_df = grouped_df.drop(columns=["Capita outlay and debt charges"])

In [24]:
grouped_df

Unnamed: 0,REF_DATE,GEO,Teachers salaries,Total expenditures,Total operating expenditures
0,1947,Alberta,186.00,528.00,528.00
1,1947,British Columbia,224.00,1265.00,1265.00
2,1947,Canada,3742.00,13951.00,13951.00
3,1947,Manitoba,106.00,556.00,556.00
4,1947,New Brunswick,387.25,1544.25,1544.25
...,...,...,...,...,...
611,2002,Nova Scotia,352476.25,792489.75,716783.25
612,2002,Ontario,452073.00,1023249.00,924033.00
613,2002,Prince Edward Island,421143.00,922577.50,852075.00
614,2002,Quebec,390213.00,821906.00,780117.00


In [25]:
grouped_df.to_csv("Prepared_Expenditures.csv", index=False)
