In [49]:
import pandas as pd

In [50]:
file_path = "HousingStarted_Raw.csv"
df = pd.read_csv(file_path)

In [51]:
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Housing estimates,Type of unit,Seasonal adjustment,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1948-01,Canada,2016A000011124,Housing starts,Total units,Unadjusted,Units,300,units,0,v730416,1.1.1.1,6678.0,,,,0
1,1948-01,Canada,2016A000011124,Housing under construction,Total units,Unadjusted,Units,300,units,0,v731381,1.2.1.1,37529.0,,,,0
2,1948-01,Canada,2016A000011124,Housing completions,Total units,Unadjusted,Units,300,units,0,v732318,1.3.1.1,11569.0,,,,0
3,1948-01,Prince Edward Island,2016A000211,Housing starts,Total units,Unadjusted,Units,300,units,0,v730419,4.1.1.1,0.0,,,,0
4,1948-01,Prince Edward Island,2016A000211,Housing under construction,Total units,Unadjusted,Units,300,units,0,v731384,4.2.1.1,173.0,,,,0


In [52]:
# List of columns to remove
columns_to_remove = [
    "DGUID", "Seasonal adjustment", "UOM", "UOM_ID", 
    "SCALAR_FACTOR", "SCALAR_ID", "VECTOR", "COORDINATE", "STATUS", 
    "SYMBOL", "TERMINATED", "DECIMALS"
]

# Drop the specified columns
df.drop(columns=columns_to_remove, inplace=True, errors='ignore')

In [53]:
df.head()

Unnamed: 0,REF_DATE,GEO,Housing estimates,Type of unit,VALUE
0,1948-01,Canada,Housing starts,Total units,6678.0
1,1948-01,Canada,Housing under construction,Total units,37529.0
2,1948-01,Canada,Housing completions,Total units,11569.0
3,1948-01,Prince Edward Island,Housing starts,Total units,0.0
4,1948-01,Prince Edward Island,Housing under construction,Total units,173.0


In [54]:
#Filtering only total units data
df_filtered = df[df["Type of unit"] == "Total units"].drop(columns=["Type of unit"])

In [55]:
df_filtered.head()

Unnamed: 0,REF_DATE,GEO,Housing estimates,VALUE
0,1948-01,Canada,Housing starts,6678.0
1,1948-01,Canada,Housing under construction,37529.0
2,1948-01,Canada,Housing completions,11569.0
3,1948-01,Prince Edward Island,Housing starts,0.0
4,1948-01,Prince Edward Island,Housing under construction,173.0


In [56]:
# Adding index column
df_filtered.insert(0, 'Index', range(1, len(df_filtered) + 1))

In [57]:
df_filtered.head()

Unnamed: 0,Index,REF_DATE,GEO,Housing estimates,VALUE
0,1,1948-01,Canada,Housing starts,6678.0
1,2,1948-01,Canada,Housing under construction,37529.0
2,3,1948-01,Canada,Housing completions,11569.0
3,4,1948-01,Prince Edward Island,Housing starts,0.0
4,5,1948-01,Prince Edward Island,Housing under construction,173.0


In [58]:
#pivoting the Housing estimates column
df_pivot = df_filtered.pivot(index=['REF_DATE', 'GEO', 'Index'], columns="Housing estimates", values='VALUE').reset_index()

In [59]:
df_pivot

Housing estimates,REF_DATE,GEO,Index,Housing completions,Housing starts,Housing under construction
0,1948-01,Alberta,25,,250.0,
1,1948-01,Alberta,26,,,1855.0
2,1948-01,Alberta,27,855.0,,
3,1948-01,British Columbia,28,,2121.0,
4,1948-01,British Columbia,29,,,6628.0
...,...,...,...,...,...,...
12255,2024-10,Nova Scotia,12253,,1437.0,
12256,2024-10,Ontario,12256,,16287.0,
12257,2024-10,Prince Edward Island,12252,,357.0,
12258,2024-10,Quebec,12255,,15814.0,


In [60]:
df_pivot.reset_index(drop=True, inplace=True)
df_pivot.columns

Index(['REF_DATE', 'GEO', 'Index', 'Housing completions', 'Housing starts',
       'Housing under construction'],
      dtype='object', name='Housing estimates')

In [65]:
grouped_df = df_pivot.groupby(['REF_DATE', 'GEO'], as_index=False).agg({
    'Index': 'first',  
    'Housing completions': 'first',  
    'Housing starts': 'first',  
    'Housing under construction': 'first'  
})

In [64]:
grouped_df

Housing estimates,REF_DATE,GEO,Index,Housing completions,Housing starts,Housing under construction
0,1948-01,Alberta,25,855.0,250.0,1855.0
1,1948-01,British Columbia,28,2189.0,2121.0,6628.0
2,1948-01,Canada,1,11569.0,6678.0,37529.0
3,1948-01,Manitoba,19,655.0,125.0,1785.0
4,1948-01,New Brunswick,10,209.0,14.0,531.0
...,...,...,...,...,...,...
3731,2024-10,Nova Scotia,12253,,1437.0,
3732,2024-10,Ontario,12256,,16287.0,
3733,2024-10,Prince Edward Island,12252,,357.0,
3734,2024-10,Quebec,12255,,15814.0,


In [None]:
# Create a complete range of months for each GEO location
all_dates = pd.date_range(start=df_pivot["REF_DATE"].min(), end=df_pivot["REF_DATE"].max(), freq='MS')  # MS = Month Start

In [None]:
# Create a MultiIndex with GEO and complete dates
geo_list = df_pivot["GEO"].unique()
multi_index = pd.MultiIndex.from_product([all_dates, geo_list], names=["REF_DATE", "GEO"])

In [None]:
# Reindex the DataFrame to include missing months
df_pivot = df_pivot.set_index(["REF_DATE", "GEO"]).reindex(multi_index).reset_index()

ValueError: cannot handle a non-unique multi-index!

In [None]:
df_pivot.to_csv("HousingStarted_Manipulated.csv", index=False)
