In [1]:
import pandas as pd
import numpy as np

In [2]:
educators_df = pd.read_csv("Prepared_Educators.csv")
epi_df = pd.read_csv("Prepared_EPI.csv")
expenditures_df = pd.read_csv("Prepared_Expenditures.csv")
graduation_df = pd.read_csv("Prepared_GraduationRate.csv")
participation_df = pd.read_csv("Prepared_ParticipationRate.csv")

In [3]:
#investigating the size of each dataset to get the right order of left joins
print(educators_df.shape)
print(epi_df.shape)
print(expenditures_df.shape)
print(graduation_df.shape)
print(participation_df.shape)

(231, 5)
(363, 10)
(616, 5)
(60, 4)
(231, 5)


In [4]:

dataframes = [educators_df, epi_df, expenditures_df, graduation_df, participation_df]
    # Convert year into datetime format (assuming Jan 1st as default)
    # Ensure GEO is string format for consistency
for df in dataframes:
    # Convert year into datetime format (assuming Jan 1st as default)
    df["REF_DATE"] = pd.to_datetime(df["REF_DATE"].astype(str) + "-01-01")
    # Ensure GEO is string format for consistency
    df["GEO"] = df["GEO"].astype(str)

In [5]:
from functools import reduce

# Use reduce with an outer join to ensure no data is lost across time and geography
merged_df = reduce(lambda left, right: pd.merge(left, right, on=["REF_DATE", "GEO"], how="outer"), dataframes)

# Sort for readability and consistency
merged_df.sort_values(by=["REF_DATE", "GEO"], inplace=True)

# Convert REF_DATE to string format for display or export
merged_df["REF_DATE"] = merged_df["REF_DATE"].dt.strftime("%d-%m-%Y")

In [6]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 847 entries, 572 to 846
Data columns (total 21 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   REF_DATE                                            847 non-null    object 
 1   GEO                                                 847 non-null    object 
 2   Full-time educators                                 231 non-null    float64
 3   Part-time educators                                 210 non-null    float64
 4   Total, work status                                  231 non-null    float64
 5   Education price index (EPI)                         363 non-null    float64
 6   Fees and contractual services sub-index             363 non-null    float64
 7   Instructional supplies sub-index                    363 non-null    float64
 8   Non-salary sub-index                                363 non-null    float64
 9   No

In [7]:
merged_df.head()

Unnamed: 0,REF_DATE,GEO,Full-time educators,Part-time educators,"Total, work status",Education price index (EPI),Fees and contractual services sub-index,Instructional supplies sub-index,Non-salary sub-index,Non-teaching salaries sub-index,...,"School facilities, supplies and services sub-index",Teachers' salaries sub-index,Teachers salaries,Total expenditures,Total operating expenditures,Extended-time,On-time,College,Elementary and/or High School,University
572,01-01-1947,Alberta,,,,,,,,,...,,,186.0,528.0,528.0,,,,,
573,01-01-1947,British Columbia,,,,,,,,,...,,,224.0,1265.0,1265.0,,,,,
574,01-01-1947,Canada,,,,,,,,,...,,,3742.0,13951.0,13951.0,,,,,
575,01-01-1947,Manitoba,,,,,,,,,...,,,106.0,556.0,556.0,,,,,
576,01-01-1947,New Brunswick,,,,,,,,,...,,,387.25,1544.25,1544.25,,,,,


In [8]:
# Check overall missingness
missing_summary = merged_df.isnull().sum().sort_values(ascending=False)
missing_percent = (missing_summary / len(merged_df)) * 100

# Display columns with significant missing values
missing_report = pd.DataFrame({
    'Missing Count': missing_summary,
    'Missing %': missing_percent
}).query('`Missing %` > 0')

print(missing_report)


                                                    Missing Count  Missing %
On-time                                                       787  92.916175
Extended-time                                                 787  92.916175
Part-time educators                                           637  75.206612
University                                                    616  72.727273
Full-time educators                                           616  72.727273
Total, work status                                            616  72.727273
Elementary and/or High School                                 616  72.727273
College                                                       616  72.727273
School facilities, supplies and services sub-index            484  57.142857
Teachers' salaries sub-index                                  484  57.142857
Salaries and wages sub-index                                  484  57.142857
Non-teaching salaries sub-index                               484  57.142857

In [22]:
merged_df.to_csv("Final_Education_Dataset.csv")