In [1]:
import pandas as pd
import os

In [2]:
# Define the folder path where the CSV files are located
folder_path = "."

In [3]:
# Get a list of all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith(".csv")]

In [4]:
# Initialize an empty list to store DataFrames
dfs = []

# Read each CSV file and append it to the list
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    df['Source_File'] = file  # Optional: Track the file each row came from
    dfs.append(df)

# Concatenate all DataFrames
merged_df = pd.concat(dfs, ignore_index=True)

In [5]:
merged_df

Unnamed: 0,REF_DATE,GEO,DGUID,Graduation rate,Gender,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS,Source_File
0,2016/2017,Alberta,2021A000248,On-time,Total gender,Percent,239,units,0,v1391416755,10.1.1,78.0,,,,0,Alberta.csv
1,2017/2018,Alberta,2021A000248,On-time,Total gender,Percent,239,units,0,v1391416755,10.1.1,80.0,,,,0,Alberta.csv
2,2018/2019,Alberta,2021A000248,On-time,Total gender,Percent,239,units,0,v1391416755,10.1.1,80.0,,,,0,Alberta.csv
3,2019/2020,Alberta,2021A000248,On-time,Total gender,Percent,239,units,0,v1391416755,10.1.1,84.0,,,,0,Alberta.csv
4,2020/2021,Alberta,2021A000248,On-time,Total gender,Percent,239,units,0,v1391416755,10.1.1,84.0,,,,0,Alberta.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2017/2018,Saskatchewan,2021A000247,Extended-time,Female gender,Percent,239,units,0,v1391416754,9.2.3,86.0,,,,0,Saskatchewan.csv
464,2018/2019,Saskatchewan,2021A000247,Extended-time,Female gender,Percent,239,units,0,v1391416754,9.2.3,88.0,,,,0,Saskatchewan.csv
465,2019/2020,Saskatchewan,2021A000247,Extended-time,Female gender,Percent,239,units,0,v1391416754,9.2.3,90.0,,,,0,Saskatchewan.csv
466,2020/2021,Saskatchewan,2021A000247,Extended-time,Female gender,Percent,239,units,0,v1391416754,9.2.3,92.0,,,,0,Saskatchewan.csv


In [6]:
# Keep only the left year
merged_df['REF_DATE'] = merged_df['REF_DATE'].str[:4]

In [7]:
merged_df

Unnamed: 0,REF_DATE,GEO,DGUID,Graduation rate,Gender,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS,Source_File
0,2016,Alberta,2021A000248,On-time,Total gender,Percent,239,units,0,v1391416755,10.1.1,78.0,,,,0,Alberta.csv
1,2017,Alberta,2021A000248,On-time,Total gender,Percent,239,units,0,v1391416755,10.1.1,80.0,,,,0,Alberta.csv
2,2018,Alberta,2021A000248,On-time,Total gender,Percent,239,units,0,v1391416755,10.1.1,80.0,,,,0,Alberta.csv
3,2019,Alberta,2021A000248,On-time,Total gender,Percent,239,units,0,v1391416755,10.1.1,84.0,,,,0,Alberta.csv
4,2020,Alberta,2021A000248,On-time,Total gender,Percent,239,units,0,v1391416755,10.1.1,84.0,,,,0,Alberta.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2017,Saskatchewan,2021A000247,Extended-time,Female gender,Percent,239,units,0,v1391416754,9.2.3,86.0,,,,0,Saskatchewan.csv
464,2018,Saskatchewan,2021A000247,Extended-time,Female gender,Percent,239,units,0,v1391416754,9.2.3,88.0,,,,0,Saskatchewan.csv
465,2019,Saskatchewan,2021A000247,Extended-time,Female gender,Percent,239,units,0,v1391416754,9.2.3,90.0,,,,0,Saskatchewan.csv
466,2020,Saskatchewan,2021A000247,Extended-time,Female gender,Percent,239,units,0,v1391416754,9.2.3,92.0,,,,0,Saskatchewan.csv


In [8]:
# Keep only the required columns
merged_df = merged_df[['REF_DATE', 'GEO', 'Graduation rate', 'VALUE']]

In [9]:
merged_df

Unnamed: 0,REF_DATE,GEO,Graduation rate,VALUE
0,2016,Alberta,On-time,78.0
1,2017,Alberta,On-time,80.0
2,2018,Alberta,On-time,80.0
3,2019,Alberta,On-time,84.0
4,2020,Alberta,On-time,84.0
...,...,...,...,...
463,2017,Saskatchewan,Extended-time,86.0
464,2018,Saskatchewan,Extended-time,88.0
465,2019,Saskatchewan,Extended-time,90.0
466,2020,Saskatchewan,Extended-time,92.0


In [24]:
# Adding index column
merged_df.insert(0, 'Index', range(1, len(merged_df) + 1))

ValueError: cannot insert Index, already exists

In [25]:
merged_df

Unnamed: 0,Index,REF_DATE,GEO,Graduation rate,VALUE
0,1,2016,Alberta,On-time,78.0
1,2,2017,Alberta,On-time,80.0
2,3,2018,Alberta,On-time,80.0
3,4,2019,Alberta,On-time,84.0
4,5,2020,Alberta,On-time,84.0
...,...,...,...,...,...
463,464,2017,Saskatchewan,Extended-time,86.0
464,465,2018,Saskatchewan,Extended-time,88.0
465,466,2019,Saskatchewan,Extended-time,90.0
466,467,2020,Saskatchewan,Extended-time,92.0


In [26]:
#pivoting the Graduation rate column
df_pivot = merged_df.pivot(index=['REF_DATE', 'GEO', 'Index'], columns="Graduation rate", values='VALUE').reset_index()

In [27]:
df_pivot

Graduation rate,REF_DATE,GEO,Index,Extended-time,On-time
0,2016,Alberta,1,,78.0
1,2016,Alberta,7,,76.0
2,2016,Alberta,13,,81.0
3,2016,Alberta,19,84.0,
4,2016,Alberta,25,82.0,
...,...,...,...,...,...
463,2021,Saskatchewan,444,,82.0
464,2021,Saskatchewan,450,,84.0
465,2021,Saskatchewan,456,91.0,
466,2021,Saskatchewan,462,89.0,


In [28]:
df_pivot.reset_index(drop=True, inplace=True)
df_pivot.columns

Index(['REF_DATE', 'GEO', 'Index', 'Extended-time', 'On-time'], dtype='object', name='Graduation rate')

In [29]:
## Group the data by REF_DATE and GEO without setting them as index
grouped_df = df_pivot.groupby(['REF_DATE', 'GEO'], as_index=False).agg({
    'Index': 'first',  
    'Extended-time': 'first',    
    'On-time': 'first'  
})

In [30]:
grouped_df

Graduation rate,REF_DATE,GEO,Index,Extended-time,On-time
0,2016,Alberta,1,84.0,78.0
1,2016,British Columbia,37,88.0,81.0
2,2016,Canada,73,87.0,80.0
3,2016,Manitoba,109,87.0,82.0
4,2016,New Brunswick,145,85.0,85.0
...,...,...,...,...,...
67,2021,Nova Scotia,258,96.0,91.0
68,2021,Nunavut,294,,
69,2021,Ontario,330,94.0,87.0
70,2021,Quebec,402,86.0,76.0


In [None]:
#Dropping the Index column
grouped_df.drop(columns=['Index'], inplace=True)

In [37]:
# Remove column index name if it exists
grouped_df.columns.name = None  

In [39]:
grouped_df

Unnamed: 0,REF_DATE,GEO,Extended-time,On-time
0,2016,Alberta,84.0,78.0
1,2016,British Columbia,88.0,81.0
2,2016,Canada,87.0,80.0
3,2016,Manitoba,87.0,82.0
4,2016,New Brunswick,85.0,85.0
...,...,...,...,...
67,2021,Nova Scotia,96.0,91.0
68,2021,Nunavut,,
69,2021,Ontario,94.0,87.0
70,2021,Quebec,86.0,76.0


In [40]:
grouped_df["REF_DATE"] = pd.to_datetime(grouped_df["REF_DATE"])

In [41]:
grouped_df

Unnamed: 0,REF_DATE,GEO,Extended-time,On-time
0,2016-01-01,Alberta,84.0,78.0
1,2016-01-01,British Columbia,88.0,81.0
2,2016-01-01,Canada,87.0,80.0
3,2016-01-01,Manitoba,87.0,82.0
4,2016-01-01,New Brunswick,85.0,85.0
...,...,...,...,...
67,2021-01-01,Nova Scotia,96.0,91.0
68,2021-01-01,Nunavut,,
69,2021-01-01,Ontario,94.0,87.0
70,2021-01-01,Quebec,86.0,76.0


In [43]:
# Perform linear interpolation for numerical columns
grouped_df[['Extended-time', 'On-time']] = grouped_df[['Extended-time', 'On-time']].interpolate(method='linear')

In [44]:
grouped_df

Unnamed: 0,REF_DATE,GEO,Extended-time,On-time
0,2016-01-01,Alberta,84.0,78.0
1,2016-01-01,British Columbia,88.0,81.0
2,2016-01-01,Canada,87.0,80.0
3,2016-01-01,Manitoba,87.0,82.0
4,2016-01-01,New Brunswick,85.0,85.0
...,...,...,...,...
67,2021-01-01,Nova Scotia,96.0,91.0
68,2021-01-01,Nunavut,95.0,89.0
69,2021-01-01,Ontario,94.0,87.0
70,2021-01-01,Quebec,86.0,76.0


In [None]:
grouped_df.to_csv("Final_GraduationRate.csv", index=False)

: 