In [63]:
import pandas as pd

In [64]:
file_path = "HousingStarted_Raw.csv"
df = pd.read_csv(file_path)

In [65]:
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Housing estimates,Type of unit,Seasonal adjustment,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1948-01,Canada,2016A000011124,Housing starts,Total units,Unadjusted,Units,300,units,0,v730416,1.1.1.1,6678.0,,,,0
1,1948-01,Canada,2016A000011124,Housing under construction,Total units,Unadjusted,Units,300,units,0,v731381,1.2.1.1,37529.0,,,,0
2,1948-01,Canada,2016A000011124,Housing completions,Total units,Unadjusted,Units,300,units,0,v732318,1.3.1.1,11569.0,,,,0
3,1948-01,Prince Edward Island,2016A000211,Housing starts,Total units,Unadjusted,Units,300,units,0,v730419,4.1.1.1,0.0,,,,0
4,1948-01,Prince Edward Island,2016A000211,Housing under construction,Total units,Unadjusted,Units,300,units,0,v731384,4.2.1.1,173.0,,,,0


In [66]:
# List of columns to remove
columns_to_remove = [
    "DGUID", "Seasonal adjustment", "UOM", "UOM_ID", 
    "SCALAR_FACTOR", "SCALAR_ID", "VECTOR", "COORDINATE", "STATUS", 
    "SYMBOL", "TERMINATED", "DECIMALS"
]

# Drop the specified columns
df.drop(columns=columns_to_remove, inplace=True, errors='ignore')

In [67]:
df.head()

Unnamed: 0,REF_DATE,GEO,Housing estimates,Type of unit,VALUE
0,1948-01,Canada,Housing starts,Total units,6678.0
1,1948-01,Canada,Housing under construction,Total units,37529.0
2,1948-01,Canada,Housing completions,Total units,11569.0
3,1948-01,Prince Edward Island,Housing starts,Total units,0.0
4,1948-01,Prince Edward Island,Housing under construction,Total units,173.0


In [68]:
#Filtering only total units data
df_filtered = df[df["Type of unit"] == "Total units"].drop(columns=["Type of unit"])

In [69]:
df_filtered.head()

Unnamed: 0,REF_DATE,GEO,Housing estimates,VALUE
0,1948-01,Canada,Housing starts,6678.0
1,1948-01,Canada,Housing under construction,37529.0
2,1948-01,Canada,Housing completions,11569.0
3,1948-01,Prince Edward Island,Housing starts,0.0
4,1948-01,Prince Edward Island,Housing under construction,173.0


In [70]:
# Adding index column
df_filtered.insert(0, 'Index', range(1, len(df_filtered) + 1))

In [71]:
df_filtered.head()

Unnamed: 0,Index,REF_DATE,GEO,Housing estimates,VALUE
0,1,1948-01,Canada,Housing starts,6678.0
1,2,1948-01,Canada,Housing under construction,37529.0
2,3,1948-01,Canada,Housing completions,11569.0
3,4,1948-01,Prince Edward Island,Housing starts,0.0
4,5,1948-01,Prince Edward Island,Housing under construction,173.0


In [72]:
#pivoting the Housing estimates column
df_pivot = df_filtered.pivot(index=['REF_DATE', 'GEO', 'Index'], columns="Housing estimates", values='VALUE').reset_index()

In [73]:
df_pivot

Housing estimates,REF_DATE,GEO,Index,Housing completions,Housing starts,Housing under construction
0,1948-01,Alberta,25,,250.0,
1,1948-01,Alberta,26,,,1855.0
2,1948-01,Alberta,27,855.0,,
3,1948-01,British Columbia,28,,2121.0,
4,1948-01,British Columbia,29,,,6628.0
...,...,...,...,...,...,...
12255,2024-10,Nova Scotia,12253,,1437.0,
12256,2024-10,Ontario,12256,,16287.0,
12257,2024-10,Prince Edward Island,12252,,357.0,
12258,2024-10,Quebec,12255,,15814.0,


In [74]:
df_pivot.reset_index(drop=True, inplace=True)
df_pivot.columns

Index(['REF_DATE', 'GEO', 'Index', 'Housing completions', 'Housing starts',
       'Housing under construction'],
      dtype='object', name='Housing estimates')

In [75]:
## Group the data by REF_DATE and GEO without setting them as index
grouped_df = df_pivot.groupby(['REF_DATE', 'GEO'], as_index=False).agg({
    'Index': 'first',  
    'Housing completions': 'first',  
    'Housing starts': 'first',  
    'Housing under construction': 'first'  
})

In [76]:
grouped_df

Housing estimates,REF_DATE,GEO,Index,Housing completions,Housing starts,Housing under construction
0,1948-01,Alberta,25,855.0,250.0,1855.0
1,1948-01,British Columbia,28,2189.0,2121.0,6628.0
2,1948-01,Canada,1,11569.0,6678.0,37529.0
3,1948-01,Manitoba,19,655.0,125.0,1785.0
4,1948-01,New Brunswick,10,209.0,14.0,531.0
...,...,...,...,...,...,...
3731,2024-10,Nova Scotia,12253,,1437.0,
3732,2024-10,Ontario,12256,,16287.0,
3733,2024-10,Prince Edward Island,12252,,357.0,
3734,2024-10,Quebec,12255,,15814.0,


In [77]:
grouped_df["REF_DATE"] = pd.to_datetime(grouped_df["REF_DATE"])

In [78]:
grouped_df

Housing estimates,REF_DATE,GEO,Index,Housing completions,Housing starts,Housing under construction
0,1948-01-01,Alberta,25,855.0,250.0,1855.0
1,1948-01-01,British Columbia,28,2189.0,2121.0,6628.0
2,1948-01-01,Canada,1,11569.0,6678.0,37529.0
3,1948-01-01,Manitoba,19,655.0,125.0,1785.0
4,1948-01-01,New Brunswick,10,209.0,14.0,531.0
...,...,...,...,...,...,...
3731,2024-10-01,Nova Scotia,12253,,1437.0,
3732,2024-10-01,Ontario,12256,,16287.0,
3733,2024-10-01,Prince Edward Island,12252,,357.0,
3734,2024-10-01,Quebec,12255,,15814.0,


In [79]:
# Generate a complete monthly date range for each GEO
full_dates = pd.date_range(start=grouped_df['REF_DATE'].min(), end=grouped_df['REF_DATE'].max(), freq='MS')
all_geos = grouped_df['GEO'].unique()

In [80]:
# Create a new DataFrame with all month-wise GEO combinations
full_df = pd.MultiIndex.from_product([full_dates, all_geos], names=['REF_DATE', 'GEO']).to_frame(index=False)

In [81]:
# Merge with original data
grouped_df = full_df.merge(grouped_df, on=['REF_DATE', 'GEO'], how='left')

In [82]:
# Apply linear interpolation for missing values
grouped_df.interpolate(method='linear', inplace=True)

  grouped_df.interpolate(method='linear', inplace=True)


In [83]:
# Filter data to start from 1976
grouped_df = grouped_df[grouped_df['REF_DATE'] >= '1976-01-01'].reset_index(drop=True)

In [84]:
#Dropping the Index column
grouped_df.drop(columns=['Index'], inplace=True)

In [85]:
grouped_df

Unnamed: 0,REF_DATE,GEO,Housing completions,Housing starts,Housing under construction
0,1976-01-01,Alberta,3733.0,6860.0,19705.0
1,1976-01-01,British Columbia,5846.0,6804.0,21969.0
2,1976-01-01,Canada,41048.0,43525.0,175010.0
3,1976-01-01,Manitoba,1448.0,1809.0,5337.0
4,1976-01-01,New Brunswick,1112.0,466.0,3504.0
...,...,...,...,...,...
7613,2024-10-01,Quebec,359.0,15814.0,966.0
7614,2024-10-01,Saskatchewan,359.0,1184.0,966.0
7615,2024-10-01,Newfoundland and Labrador,359.0,473.0,966.0
7616,2024-10-01,Atlantic provinces,359.0,473.0,966.0


In [86]:
grouped_df.to_csv("HousingStarted_Manipulated.csv", index=False)