In [1]:
import pandas as pd

In [2]:
file_path = "Educators in public elementary and secondary schools.csv"
df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Work status,Age group,Sex,Statistics,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,2002/2003,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Number of educators,Number,223,units,0,v1073546428,1.1.1.1.1,350619.0,,,,0
1,2002/2003,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Proportion of educators by age group,Percent,239,units,0,v1074084353,1.1.1.1.2,100.0,,,,1
2,2002/2003,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Proportion of male and female educators,Percent,239,units,0,v1074084354,1.1.1.1.3,100.0,,,,1
3,2002/2003,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Proportion of full-time and part-time educators,Percent,239,units,0,v1074084355,1.1.1.1.4,100.0,,,,1
4,2002/2003,Canada,2016A000011124,"Total, work status","Total, age group",Male,Number of educators,Number,223,units,0,v1073546429,1.1.1.2.1,106041.0,,,,0


In [4]:
# Keep only the left year
df['REF_DATE'] = df['REF_DATE'].str[:4]

In [5]:
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Work status,Age group,Sex,Statistics,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,2002,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Number of educators,Number,223,units,0,v1073546428,1.1.1.1.1,350619.0,,,,0
1,2002,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Proportion of educators by age group,Percent,239,units,0,v1074084353,1.1.1.1.2,100.0,,,,1
2,2002,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Proportion of male and female educators,Percent,239,units,0,v1074084354,1.1.1.1.3,100.0,,,,1
3,2002,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Proportion of full-time and part-time educators,Percent,239,units,0,v1074084355,1.1.1.1.4,100.0,,,,1
4,2002,Canada,2016A000011124,"Total, work status","Total, age group",Male,Number of educators,Number,223,units,0,v1073546429,1.1.1.2.1,106041.0,,,,0


In [6]:
# Define the list of GEO values to keep
geo_list = [
    "Canada", "Quebec", "Ontario", "British Columbia", "Alberta", 
    "Manitoba", "New Brunswick", "Newfoundland and Labrador", 
    "Nova Scotia", "Saskatchewan", "Prince Edward Island"
]

# Filter the DataFrame to keep only these GEO values
df = df[df['GEO'].isin(geo_list)].reset_index(drop=True)

In [7]:
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Work status,Age group,Sex,Statistics,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,2002,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Number of educators,Number,223,units,0,v1073546428,1.1.1.1.1,350619.0,,,,0
1,2002,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Proportion of educators by age group,Percent,239,units,0,v1074084353,1.1.1.1.2,100.0,,,,1
2,2002,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Proportion of male and female educators,Percent,239,units,0,v1074084354,1.1.1.1.3,100.0,,,,1
3,2002,Canada,2016A000011124,"Total, work status","Total, age group","Total, sex",Proportion of full-time and part-time educators,Percent,239,units,0,v1074084355,1.1.1.1.4,100.0,,,,1
4,2002,Canada,2016A000011124,"Total, work status","Total, age group",Male,Number of educators,Number,223,units,0,v1073546429,1.1.1.2.1,106041.0,,,,0


In [8]:
# Keep only the required columns
df = df[['REF_DATE', 'GEO', 'Work status', 'VALUE']]

In [9]:
df.head()

Unnamed: 0,REF_DATE,GEO,Work status,VALUE
0,2002,Canada,"Total, work status",350619.0
1,2002,Canada,"Total, work status",100.0
2,2002,Canada,"Total, work status",100.0
3,2002,Canada,"Total, work status",100.0
4,2002,Canada,"Total, work status",106041.0


In [10]:
# Adding index column
df.insert(0, 'Index', range(1, len(df) + 1))

In [11]:
df

Unnamed: 0,Index,REF_DATE,GEO,Work status,VALUE
0,1,2002,Canada,"Total, work status",350619.0
1,2,2002,Canada,"Total, work status",100.0
2,3,2002,Canada,"Total, work status",100.0
3,4,2002,Canada,"Total, work status",100.0
4,5,2002,Canada,"Total, work status",106041.0
...,...,...,...,...,...
121963,121964,2022,British Columbia,Part-time educators,0.0
121964,121965,2022,British Columbia,Part-time educators,
121965,121966,2022,British Columbia,Part-time educators,
121966,121967,2022,British Columbia,Part-time educators,


In [12]:
#pivoting the Graduation rate column
df_pivot = df.pivot(index=['REF_DATE', 'GEO', 'Index'], columns="Work status", values='VALUE').reset_index()

In [13]:
df_pivot.reset_index(drop=True, inplace=True)
df_pivot.columns

Index(['REF_DATE', 'GEO', 'Index', 'Full-time educators',
       'Part-time educators', 'Total, work status'],
      dtype='object', name='Work status')

In [14]:
## Group the data by REF_DATE and GEO without setting them as index
grouped_df = df_pivot.groupby(['REF_DATE', 'GEO'], as_index=False).agg({
    'Index': 'first',  
    'Full-time educators': 'first',    
    'Part-time educators': 'first',
    'Total, work status': 'first'
})

In [15]:
grouped_df

Work status,REF_DATE,GEO,Index,Full-time educators,Part-time educators,"Total, work status"
0,2002,Alberta,4753,26970.0,7788.0,34761.0
1,2002,British Columbia,5281,28572.0,8577.0,37149.0
2,2002,Canada,1,283023.0,67599.0,350619.0
3,2002,Manitoba,3697,12042.0,2097.0,14139.0
4,2002,New Brunswick,2113,7050.0,366.0,7416.0
...,...,...,...,...,...,...
226,2022,Nova Scotia,117745,11247.0,,11250.0
227,2022,Ontario,119329,134481.0,21006.0,155487.0
228,2022,Prince Edward Island,117217,1689.0,99.0,1788.0
229,2022,Quebec,118801,80757.0,39882.0,120639.0


In [16]:
#Dropping the Index column
grouped_df.drop(columns=['Index'], inplace=True)

In [17]:

# Remove column index name if it exists
grouped_df.columns.name = None  

In [18]:
grouped_df


Unnamed: 0,REF_DATE,GEO,Full-time educators,Part-time educators,"Total, work status"
0,2002,Alberta,26970.0,7788.0,34761.0
1,2002,British Columbia,28572.0,8577.0,37149.0
2,2002,Canada,283023.0,67599.0,350619.0
3,2002,Manitoba,12042.0,2097.0,14139.0
4,2002,New Brunswick,7050.0,366.0,7416.0
...,...,...,...,...,...
226,2022,Nova Scotia,11247.0,,11250.0
227,2022,Ontario,134481.0,21006.0,155487.0
228,2022,Prince Edward Island,1689.0,99.0,1788.0
229,2022,Quebec,80757.0,39882.0,120639.0


In [19]:
grouped_df.to_csv("Prepared_Educators.csv", index=False)