In [8]:
import pandas as pd
import os

In [9]:
# Directory that is scanned for input CSV files ("." is the current working directory).
folder_path = '.'

In [10]:

# Collect the names of the CSV files to load from `folder_path`.
#  - "Prepared_GraduationRate.csv" is the file this notebook writes at the end
#    (relative to the working directory). Skip it so that a re-run does not
#    feed the notebook's own output back in as an input.
#  - os.listdir() returns names in arbitrary order; sorting makes the load
#    order reproducible between runs and machines.
csv_files = sorted(
    file
    for file in os.listdir(folder_path)
    if file.endswith(".csv") and file != "Prepared_GraduationRate.csv"
)

In [11]:

# Read every CSV listed in `csv_files` into its own DataFrame.
dfs = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    frame = pd.read_csv(file_path)
    frame['Source_File'] = file  # remember which file each row came from
    dfs.append(frame)

# Stack all files into one frame with a fresh 0..n-1 row index.
merged_df = pd.concat(dfs, ignore_index=True)

# The following cells all operate on `df`. Previously `df` was only the loop
# variable, so after the loop it held just the LAST file that was read and
# `merged_df` was never used. Bind `df` to the merged data instead.
df = merged_df

In [12]:
# Preview the loaded data with all of its original columns.
df

Unnamed: 0,REF_DATE,GEO,DGUID,Age group,Participation rate by type of institution attended,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS,Source_File
0,1995/1996,Canada,2021A000011124,18 to 24 years,Total participation rate,Percent,239,units,0,v99915216,1.1.1,42.0,A,,,0,"Participation rate in education, population ag..."
1,1995/1996,Canada,2021A000011124,18 to 24 years,Elementary and/or High School,Percent,239,units,0,v99915217,1.1.2,9.0,A,,,0,"Participation rate in education, population ag..."
2,1995/1996,Canada,2021A000011124,18 to 24 years,College,Percent,239,units,0,v99915218,1.1.3,15.0,A,,,0,"Participation rate in education, population ag..."
3,1995/1996,Canada,2021A000011124,18 to 24 years,University,Percent,239,units,0,v99915219,1.1.4,19.0,A,,,0,"Participation rate in education, population ag..."
4,1995/1996,Canada,2021A000011124,25 to 29 years,Total participation rate,Percent,239,units,0,v99915220,1.2.1,10.0,A,,,0,"Participation rate in education, population ag..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3442,2023/2024,Nunavut,2021A000262,25 to 29 years,University,Percent,239,units,0,v99915379,14.2.4,,x,,,0,"Participation rate in education, population ag..."
3443,2023/2024,Nunavut,2021A000262,30 to 34 years,Total participation rate,Percent,239,units,0,v99915380,14.3.1,,x,,,0,"Participation rate in education, population ag..."
3444,2023/2024,Nunavut,2021A000262,30 to 34 years,Elementary and/or High School,Percent,239,units,0,v99915381,14.3.2,,x,,,0,"Participation rate in education, population ag..."
3445,2023/2024,Nunavut,2021A000262,30 to 34 years,College,Percent,239,units,0,v99915382,14.3.3,,x,,,0,"Participation rate in education, population ag..."


In [13]:

# Narrow the frame to the period, the region, the institution type and the value.
required_columns = [
    'REF_DATE',
    'GEO',
    'Participation rate by type of institution attended',
    'VALUE',
]
df = df.loc[:, required_columns]

In [14]:
# Preview the frame now that only REF_DATE, GEO, the institution type and VALUE remain.
df

Unnamed: 0,REF_DATE,GEO,Participation rate by type of institution attended,VALUE
0,1995/1996,Canada,Total participation rate,42.0
1,1995/1996,Canada,Elementary and/or High School,9.0
2,1995/1996,Canada,College,15.0
3,1995/1996,Canada,University,19.0
4,1995/1996,Canada,Total participation rate,10.0
...,...,...,...,...
3442,2023/2024,Nunavut,University,
3443,2023/2024,Nunavut,Total participation rate,
3444,2023/2024,Nunavut,Elementary and/or High School,
3445,2023/2024,Nunavut,College,


In [15]:
# GEO values that are kept; rows for any other GEO value (for example the
# Nunavut rows visible in the raw data) are dropped by the filter below.
geo_list = [
    "Canada",
    "Quebec",
    "Ontario",
    "British Columbia",
    "Alberta",
    "Manitoba",
    "New Brunswick",
    "Newfoundland and Labrador",
    "Nova Scotia",
    "Saskatchewan",
    "Prince Edward Island",
]

# Select the matching rows and renumber them from 0.
in_geo_list = df['GEO'].isin(geo_list)
df = df[in_geo_list].reset_index(drop=True)

In [16]:
# Preview the rows that remain after the GEO filter.
df

Unnamed: 0,REF_DATE,GEO,Participation rate by type of institution attended,VALUE
0,1995/1996,Canada,Total participation rate,42.0
1,1995/1996,Canada,Elementary and/or High School,9.0
2,1995/1996,Canada,College,15.0
3,1995/1996,Canada,University,19.0
4,1995/1996,Canada,Total participation rate,10.0
...,...,...,...,...
2727,2023/2024,British Columbia,University,7.0
2728,2023/2024,British Columbia,Total participation rate,6.0
2729,2023/2024,British Columbia,Elementary and/or High School,
2730,2023/2024,British Columbia,College,2.0


In [17]:
# REF_DATE looks like "1995/1996"; keep just its first four characters (the starting year).
# The column stays a string column; it is not converted to a number.
start_year = df['REF_DATE'].str[:4]
df['REF_DATE'] = start_year

In [18]:
# Confirm that REF_DATE now shows only the four-character starting year.
df

Unnamed: 0,REF_DATE,GEO,Participation rate by type of institution attended,VALUE
0,1995,Canada,Total participation rate,42.0
1,1995,Canada,Elementary and/or High School,9.0
2,1995,Canada,College,15.0
3,1995,Canada,University,19.0
4,1995,Canada,Total participation rate,10.0
...,...,...,...,...
2727,2023,British Columbia,University,7.0
2728,2023,British Columbia,Total participation rate,6.0
2729,2023,British Columbia,Elementary and/or High School,
2730,2023,British Columbia,College,2.0


In [19]:
# Drop the "Total participation rate" rows so only the individual institution types remain.
is_total = df["Participation rate by type of institution attended"] == "Total participation rate"
df = df[~is_total]


In [20]:
# Preview the data without the total rows; the row labels keep their
# pre-filter values, so gaps in the numbering are expected.
df

Unnamed: 0,REF_DATE,GEO,Participation rate by type of institution attended,VALUE
1,1995,Canada,Elementary and/or High School,9.0
2,1995,Canada,College,15.0
3,1995,Canada,University,19.0
5,1995,Canada,Elementary and/or High School,1.0
6,1995,Canada,College,4.0
...,...,...,...,...
2726,2023,British Columbia,College,4.0
2727,2023,British Columbia,University,7.0
2729,2023,British Columbia,Elementary and/or High School,
2730,2023,British Columbia,College,2.0


In [21]:
# Add a running 1..n row number as the first column. The pivot in the next
# step puts it in its index, so every row keeps a distinct key there.
row_count = len(df)
df.insert(0, 'Index', range(1, row_count + 1))

In [22]:
# Spread the institution types into separate columns (one column per type),
# keyed by REF_DATE, GEO and the per-row Index, then turn the keys back into
# ordinary columns.
institution_col = "Participation rate by type of institution attended"
df_pivot = (
    df.pivot(index=['REF_DATE', 'GEO', 'Index'], columns=institution_col, values='VALUE')
    .reset_index()
)

In [23]:
# Preview the pivoted table: one row per (REF_DATE, GEO, Index), so each row
# fills at most one of the institution columns.
df_pivot

Participation rate by type of institution attended,REF_DATE,GEO,Index,College,Elementary and/or High School,University
0,1995,Alberta,82,,7.0,
1,1995,Alberta,83,11.0,,
2,1995,Alberta,84,,,15.0
3,1995,Alberta,85,,,
4,1995,Alberta,86,5.0,,
...,...,...,...,...,...,...
2034,2023,Saskatchewan,2017,3.0,,
2035,2023,Saskatchewan,2018,,,8.0
2036,2023,Saskatchewan,2019,,,
2037,2023,Saskatchewan,2020,1.0,,


In [24]:
# Renumber the rows from 0, then list the column labels of the pivoted table.
df_pivot = df_pivot.reset_index(drop=True)
df_pivot.columns

Index(['REF_DATE', 'GEO', 'Index', 'College', 'Elementary and/or High School',
       'University'],
      dtype='object', name='Participation rate by type of institution attended')

In [25]:
# Collapse the pivoted rows to one row per (REF_DATE, GEO), keeping both keys
# as regular columns. 'first' returns the first non-null value of each column
# within a group.
# NOTE(review): the raw data has several rows per region/year/institution type
# (one per 'Age group'). 'first' therefore keeps the earliest row's value and,
# when that value is missing, silently falls through to a later row's value.
# Confirm this is the intended selection.
value_columns = ['Index', 'College', 'Elementary and/or High School', 'University']
grouped_df = df_pivot.groupby(['REF_DATE', 'GEO'], as_index=False).agg(
    {column: 'first' for column in value_columns}
)

In [26]:

# Preview the aggregated table: one row per REF_DATE/GEO pair.
grouped_df

Participation rate by type of institution attended,REF_DATE,GEO,Index,College,Elementary and/or High School,University
0,1995,Alberta,82,11.0,7.0,15.0
1,1995,British Columbia,91,15.0,5.0,16.0
2,1995,Canada,1,15.0,9.0,19.0
3,1995,Manitoba,64,5.0,7.0,20.0
4,1995,New Brunswick,37,8.0,6.0,19.0
...,...,...,...,...,...,...
226,2023,Nova Scotia,1970,9.0,4.0,28.0
227,2023,Ontario,1995,13.0,5.0,36.0
228,2023,Prince Edward Island,1962,10.0,4.0,30.0
229,2023,Quebec,1986,23.0,4.0,24.0


In [27]:
# The helper Index column has served its purpose; remove it from the result.
grouped_df = grouped_df.drop(columns='Index')

In [28]:

# Clear the name that the pivot left on the column axis so it no longer
# appears as a header label.
grouped_df = grouped_df.rename_axis(columns=None)

In [29]:
# Final look at the table: REF_DATE, GEO and one column per institution type.
grouped_df

Unnamed: 0,REF_DATE,GEO,College,Elementary and/or High School,University
0,1995,Alberta,11.0,7.0,15.0
1,1995,British Columbia,15.0,5.0,16.0
2,1995,Canada,15.0,9.0,19.0
3,1995,Manitoba,5.0,7.0,20.0
4,1995,New Brunswick,8.0,6.0,19.0
...,...,...,...,...,...
226,2023,Nova Scotia,9.0,4.0,28.0
227,2023,Ontario,13.0,5.0,36.0
228,2023,Prince Edward Island,10.0,4.0,30.0
229,2023,Quebec,23.0,4.0,24.0


In [30]:
# Write the prepared table to the working directory, without the row index.
output_file = "Prepared_GraduationRate.csv"
grouped_df.to_csv(output_file, index=False)