In [150]:
import pandas as pd

In [151]:
file_path = "Raw-housing-starts-csv.csv"
df = pd.read_csv(file_path)

In [152]:
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Housing estimates,Type of unit,Seasonal adjustment,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1948-01,Canada,2016A000011124,Housing starts,Total units,Unadjusted,Units,300,units,0,v730416,1.1.1.1,6678.0,,,,0
1,1948-01,Canada,2016A000011124,Housing under construction,Total units,Unadjusted,Units,300,units,0,v731381,1.2.1.1,37529.0,,,,0
2,1948-01,Canada,2016A000011124,Housing completions,Total units,Unadjusted,Units,300,units,0,v732318,1.3.1.1,11569.0,,,,0
3,1948-01,Prince Edward Island,2016A000211,Housing starts,Total units,Unadjusted,Units,300,units,0,v730419,4.1.1.1,0.0,,,,0
4,1948-01,Prince Edward Island,2016A000211,Housing under construction,Total units,Unadjusted,Units,300,units,0,v731384,4.2.1.1,173.0,,,,0


In [153]:
df.describe()

Unnamed: 0,UOM_ID,SCALAR_ID,VALUE,STATUS,SYMBOL,DECIMALS
count,56084.0,56084.0,56084.0,0.0,0.0,56084.0
mean,300.0,0.13651,5819.929402,,,0.045503
std,0.0,0.625221,17396.093413,,,0.208407
min,300.0,0.0,0.0,,,0.0
25%,300.0,0.0,108.0,,,0.0
50%,300.0,0.0,732.0,,,0.0
75%,300.0,0.0,3836.0,,,0.0
max,300.0,3.0,378042.0,,,1.0


In [154]:
# List of columns to remove
columns_to_remove = [
    "DGUID", "Type of unit", "Seasonal adjustment", "UOM", "UOM_ID", 
    "SCALAR_FACTOR", "SCALAR_ID", "VECTOR", "COORDINATE", "STATUS", 
    "SYMBOL", "TERMINATED", "DECIMALS"
]

# Drop the specified columns
df.drop(columns=columns_to_remove, inplace=True, errors='ignore')

In [155]:
df.head()

Unnamed: 0,REF_DATE,GEO,Housing estimates,VALUE
0,1948-01,Canada,Housing starts,6678.0
1,1948-01,Canada,Housing under construction,37529.0
2,1948-01,Canada,Housing completions,11569.0
3,1948-01,Prince Edward Island,Housing starts,0.0
4,1948-01,Prince Edward Island,Housing under construction,173.0


In [156]:
# Keep only specified values in the 'GEO' column
valid_geo_values = ["Canada", "Quebec", "Ontario", "British Columbia", "Alberta", "Manitoba", "New Brunswick", "Newfoundland and Labrador", "Nova Scotia", "Saskatchewan", "Prince Edward Island"]
df = df[df["GEO"].isin(valid_geo_values)]

In [157]:
df.head()

Unnamed: 0,REF_DATE,GEO,Housing estimates,VALUE
0,1948-01,Canada,Housing starts,6678.0
1,1948-01,Canada,Housing under construction,37529.0
2,1948-01,Canada,Housing completions,11569.0
3,1948-01,Prince Edward Island,Housing starts,0.0
4,1948-01,Prince Edward Island,Housing under construction,173.0


In [158]:
df.shape

(55732, 4)

In [159]:
# duplicate_dates = df['REF_DATE'].duplicated().sum()
# duplicate_dates

In [160]:
# Ensure REF_DATE is in datetime format
df['REF_DATE'] = pd.to_datetime(df['REF_DATE'])

In [161]:
# Filter out rows where REF_DATE is between 1948 and 1976
df = df[(df['REF_DATE'].dt.year > 1976)]

In [162]:
df.head()

Unnamed: 0,REF_DATE,GEO,Housing estimates,VALUE
17144,1977-01-01,Canada,Housing starts,35606.0
17145,1977-01-01,Canada,Housing starts,228.5
17146,1977-01-01,Canada,Housing starts,14213.0
17147,1977-01-01,Canada,Housing starts,110.3
17148,1977-01-01,Canada,Housing starts,21393.0


In [163]:
duplicates = df.duplicated(subset=['REF_DATE', 'GEO', 'Housing estimates'], keep=False)
if duplicates.any():
    print("Duplicate values detected.")

Duplicate values detected.


In [164]:
df = df.groupby(['REF_DATE', 'GEO', 'Housing estimates'], as_index=False)['VALUE'].sum()

In [165]:
df_pivot = df.pivot(index=['REF_DATE', 'GEO'], columns="Housing estimates", values='VALUE')

In [166]:
df_pivot = df_pivot.reset_index()

In [167]:
df_pivot.head()

Housing estimates,REF_DATE,GEO,Housing completions,Housing starts,Housing under construction
0,1977-01-01,Alberta,18211.0,11477.1,70186.0
1,1977-01-01,British Columbia,20198.0,16169.0,49689.0
2,1977-01-01,Canada,133803.0,93062.0,492630.0
3,1977-01-01,Manitoba,4214.0,5559.1,16727.0
4,1977-01-01,New Brunswick,3649.0,483.9,6120.0


In [168]:
df_pivot.columns

Index(['REF_DATE', 'GEO', 'Housing completions', 'Housing starts',
       'Housing under construction'],
      dtype='object', name='Housing estimates')

In [169]:
# df_pivot.to_csv("housing-manipulated.csv", index=False)

In [170]:
# Create a complete range of months for each GEO location
all_dates = pd.date_range(start=df_pivot["REF_DATE"].min(), end=df_pivot["REF_DATE"].max(), freq='MS')  # MS = Month Start

In [171]:
# Create a MultiIndex with GEO and complete dates
geo_list = df_pivot["GEO"].unique()
multi_index = pd.MultiIndex.from_product([all_dates, geo_list], names=["REF_DATE", "GEO"])

In [172]:
# Reindex the DataFrame to include missing months
df_pivot = df_pivot.set_index(["REF_DATE", "GEO"]).reindex(multi_index).reset_index()

In [173]:
# Interpolate missing values (linear method)
df_pivot.interpolate(method='linear', inplace=True)

  df_pivot.interpolate(method='linear', inplace=True)


In [174]:
df_pivot.head()

Housing estimates,REF_DATE,GEO,Housing completions,Housing starts,Housing under construction
0,1977-01-01,Alberta,18211.0,11477.1,70186.0
1,1977-01-01,British Columbia,20198.0,16169.0,49689.0
2,1977-01-01,Canada,133803.0,93062.0,492630.0
3,1977-01-01,Manitoba,4214.0,5559.1,16727.0
4,1977-01-01,New Brunswick,3649.0,483.9,6120.0


In [175]:
df_pivot.to_csv("HousingStarted_Manipulated.csv", index=False)