In [4]:
import pandas as pd 
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.decomposition import PCA
import datetime 

In [5]:
city_df = pd.read_csv("../Resources/cleaned_canada_pop_2011_2016.csv")
city_df.head()

Unnamed: 0,city_name,city_size_type,city_type,province,pop_2016,city_size,pop_density
0,Acton,"Small population centre (1,000 to 29,999)",Fringe,Ontario,9462.0,7.8,1213.2
1,Acton Vale,"Small population centre (1,000 to 29,999)",Population centre outside metropolitan area / ...,Quebec,5399.0,4.3,1254.2
2,Airdrie,"Medium population centre (30,000 to 99,999)",Secondary core,Alberta,61082.0,32.78,1863.2
3,Alençon,"Small population centre (1,000 to 29,999)",Population centre outside metropolitan area / ...,Quebec,1562.0,2.94,530.4
4,Alexandria,"Small population centre (1,000 to 29,999)",Population centre outside metropolitan area / ...,Ontario,2845.0,2.07,1375.1


In [6]:
inventory_df = pd.read_csv("../Resources/housing_inventory_clean_v2.csv")
inventory_df.head()

Unnamed: 0,REF_DATE,City,Province,Completed_units,Dwelling_Type,UNIT_VALUE
0,1988-06-01,Abbotsford-Mission,British Columbia,Absorptions,Single detached units,74
1,1988-06-01,Abbotsford-Mission,British Columbia,Absorptions,Semi-detached units,3
2,1988-06-01,Abbotsford-Mission,British Columbia,Unabsorbed inventory,Single detached units,64
3,1988-06-01,Abbotsford-Mission,British Columbia,Unabsorbed inventory,Semi-detached units,2
4,1988-06-01,Barrie,Ontario,Absorptions,Single detached units,84


In [7]:
housing_price_df = pd.read_csv("../Resources/price_index_clean.csv")
housing_price_df.head()

Unnamed: 0,REF_DATE,GEO,Housing_Type,PRICE_INDEX
0,1981-01-01,"St. John's, Newfoundland and Labrador",House only,37.5
1,1981-01-01,"St. John's, Newfoundland and Labrador",Land only,31.4
2,1981-01-01,"Saint John, Fredericton, and Moncton, New Brun...",House only,61.4
3,1981-01-01,"Saint John, Fredericton, and Moncton, New Brun...",Land only,38.0
4,1981-01-01,"Québec, Quebec",House only,34.9


In [8]:
housing_price_df["GEO"].astype("str")

0                    St. John's, Newfoundland and Labrador
1                    St. John's, Newfoundland and Labrador
2        Saint John, Fredericton, and Moncton, New Brun...
3        Saint John, Fredericton, and Moncton, New Brun...
4                                           Québec, Quebec
                               ...                        
28141                            Kelowna, British Columbia
28142                          Vancouver, British Columbia
28143                          Vancouver, British Columbia
28144                           Victoria, British Columbia
28145                           Victoria, British Columbia
Name: GEO, Length: 28146, dtype: object

In [9]:
df = housing_price_df.GEO.str.split(", ", expand=True,)
df.head()

Unnamed: 0,0,1,2,3
0,St. John's,Newfoundland and Labrador,,
1,St. John's,Newfoundland and Labrador,,
2,Saint John,Fredericton,and Moncton,New Brunswick
3,Saint John,Fredericton,and Moncton,New Brunswick
4,Québec,Quebec,,


In [10]:
len(df)

28146

In [11]:
df = df.drop(columns = [2,3])
df

Unnamed: 0,0,1
0,St. John's,Newfoundland and Labrador
1,St. John's,Newfoundland and Labrador
2,Saint John,Fredericton
3,Saint John,Fredericton
4,Québec,Quebec
...,...,...
28141,Kelowna,British Columbia
28142,Vancouver,British Columbia
28143,Vancouver,British Columbia
28144,Victoria,British Columbia


In [12]:
index_list = df.dropna().index
index_list

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            28134, 28135, 28136, 28137, 28140, 28141, 28142, 28143, 28144,
            28145],
           dtype='int64', length=20062)

In [13]:
df = df.loc[index_list,:].reset_index()
df.head()


Unnamed: 0,index,0,1
0,0,St. John's,Newfoundland and Labrador
1,1,St. John's,Newfoundland and Labrador
2,2,Saint John,Fredericton
3,3,Saint John,Fredericton
4,4,Québec,Quebec


In [14]:
housing_price_df = housing_price_df.loc[index_list,:].reset_index()
housing_price_df.head()

Unnamed: 0,index,REF_DATE,GEO,Housing_Type,PRICE_INDEX
0,0,1981-01-01,"St. John's, Newfoundland and Labrador",House only,37.5
1,1,1981-01-01,"St. John's, Newfoundland and Labrador",Land only,31.4
2,2,1981-01-01,"Saint John, Fredericton, and Moncton, New Brun...",House only,61.4
3,3,1981-01-01,"Saint John, Fredericton, and Moncton, New Brun...",Land only,38.0
4,4,1981-01-01,"Québec, Quebec",House only,34.9


In [15]:
housing_price_df = pd.concat([housing_price_df,df], axis = 1)
housing_price_df.head()

Unnamed: 0,index,REF_DATE,GEO,Housing_Type,PRICE_INDEX,index.1,0,1
0,0,1981-01-01,"St. John's, Newfoundland and Labrador",House only,37.5,0,St. John's,Newfoundland and Labrador
1,1,1981-01-01,"St. John's, Newfoundland and Labrador",Land only,31.4,1,St. John's,Newfoundland and Labrador
2,2,1981-01-01,"Saint John, Fredericton, and Moncton, New Brun...",House only,61.4,2,Saint John,Fredericton
3,3,1981-01-01,"Saint John, Fredericton, and Moncton, New Brun...",Land only,38.0,3,Saint John,Fredericton
4,4,1981-01-01,"Québec, Quebec",House only,34.9,4,Québec,Quebec


In [16]:
housing_price_df = housing_price_df.drop(columns=["GEO"])
housing_price_df.head()

Unnamed: 0,index,REF_DATE,Housing_Type,PRICE_INDEX,index.1,0,1
0,0,1981-01-01,House only,37.5,0,St. John's,Newfoundland and Labrador
1,1,1981-01-01,Land only,31.4,1,St. John's,Newfoundland and Labrador
2,2,1981-01-01,House only,61.4,2,Saint John,Fredericton
3,3,1981-01-01,Land only,38.0,3,Saint John,Fredericton
4,4,1981-01-01,House only,34.9,4,Québec,Quebec


In [17]:
housing_price_df = housing_price_df.rename(columns = {0: "City", 1: "Province"})
housing_price_df.head()

Unnamed: 0,index,REF_DATE,Housing_Type,PRICE_INDEX,index.1,City,Province
0,0,1981-01-01,House only,37.5,0,St. John's,Newfoundland and Labrador
1,1,1981-01-01,Land only,31.4,1,St. John's,Newfoundland and Labrador
2,2,1981-01-01,House only,61.4,2,Saint John,Fredericton
3,3,1981-01-01,Land only,38.0,3,Saint John,Fredericton
4,4,1981-01-01,House only,34.9,4,Québec,Quebec


In [18]:
housing_price_df["Year"] = pd.DatetimeIndex(housing_price_df['REF_DATE']).year
housing_price_df["Month"] = pd.DatetimeIndex(housing_price_df['REF_DATE']).month
housing_price_df = housing_price_df.drop(columns = ["index"])
housing_price_df.head()

Unnamed: 0,REF_DATE,Housing_Type,PRICE_INDEX,City,Province,Year,Month
0,1981-01-01,House only,37.5,St. John's,Newfoundland and Labrador,1981,1
1,1981-01-01,Land only,31.4,St. John's,Newfoundland and Labrador,1981,1
2,1981-01-01,House only,61.4,Saint John,Fredericton,1981,1
3,1981-01-01,Land only,38.0,Saint John,Fredericton,1981,1
4,1981-01-01,House only,34.9,Québec,Quebec,1981,1


In [19]:
inventory_df.head()

Unnamed: 0,REF_DATE,City,Province,Completed_units,Dwelling_Type,UNIT_VALUE
0,1988-06-01,Abbotsford-Mission,British Columbia,Absorptions,Single detached units,74
1,1988-06-01,Abbotsford-Mission,British Columbia,Absorptions,Semi-detached units,3
2,1988-06-01,Abbotsford-Mission,British Columbia,Unabsorbed inventory,Single detached units,64
3,1988-06-01,Abbotsford-Mission,British Columbia,Unabsorbed inventory,Semi-detached units,2
4,1988-06-01,Barrie,Ontario,Absorptions,Single detached units,84


In [20]:
inventory_df["Year"] = pd.DatetimeIndex(inventory_df['REF_DATE']).year
inventory_df["Month"] = pd.DatetimeIndex(inventory_df['REF_DATE']).month
inventory_df.head()

Unnamed: 0,REF_DATE,City,Province,Completed_units,Dwelling_Type,UNIT_VALUE,Year,Month
0,1988-06-01,Abbotsford-Mission,British Columbia,Absorptions,Single detached units,74,1988,6
1,1988-06-01,Abbotsford-Mission,British Columbia,Absorptions,Semi-detached units,3,1988,6
2,1988-06-01,Abbotsford-Mission,British Columbia,Unabsorbed inventory,Single detached units,64,1988,6
3,1988-06-01,Abbotsford-Mission,British Columbia,Unabsorbed inventory,Semi-detached units,2,1988,6
4,1988-06-01,Barrie,Ontario,Absorptions,Single detached units,84,1988,6


In [21]:
inventory_df_filtered = inventory_df.loc[(inventory_df["Year"] >2011)]
inventory_df_filtered.head()

Unnamed: 0,REF_DATE,City,Province,Completed_units,Dwelling_Type,UNIT_VALUE,Year,Month
21140,2020-01-01,Abbotsford-Mission,British Columbia,Absorptions,Single detached units,39,2020,1
21141,2020-01-01,Abbotsford-Mission,British Columbia,Absorptions,Semi-detached units,0,2020,1
21142,2020-01-01,Abbotsford-Mission,British Columbia,Unabsorbed inventory,Single detached units,80,2020,1
21143,2020-01-01,Abbotsford-Mission,British Columbia,Unabsorbed inventory,Semi-detached units,32,2020,1
21144,2020-01-01,Barrie,Ontario,Absorptions,Single detached units,137,2020,1


In [22]:
housing_price_df_filtered = housing_price_df.loc[(housing_price_df["Year"] >2011)]
housing_price_df_filtered.head()

Unnamed: 0,REF_DATE,Housing_Type,PRICE_INDEX,City,Province,Year,Month
9664,2020-01-01,House only,51.1,St. John's,Newfoundland and Labrador,2020,1
9665,2020-01-01,Land only,45.3,St. John's,Newfoundland and Labrador,2020,1
9666,2020-01-01,House only,89.7,Charlottetown,Prince Edward Island,2020,1
9667,2020-01-01,Land only,68.9,Charlottetown,Prince Edward Island,2020,1
9668,2020-01-01,House only,65.7,Halifax,Nova Scotia,2020,1


In [23]:
# Merging the Various databases 
combined_df_1 = pd.merge(housing_price_df_filtered, city_df,  how='inner', left_on=['City','Province'], right_on = ['city_name','province'])
combined_df_1.head()

Unnamed: 0,REF_DATE,Housing_Type,PRICE_INDEX,City,Province,Year,Month,city_name,city_size_type,city_type,province,pop_2016,city_size,pop_density
0,2020-01-01,House only,51.1,St. John's,Newfoundland and Labrador,2020,1,St. John's,"Large urban population centre (100,000 or more)",Core,Newfoundland and Labrador,178427.0,166.0,1074.9
1,2020-01-01,Land only,45.3,St. John's,Newfoundland and Labrador,2020,1,St. John's,"Large urban population centre (100,000 or more)",Core,Newfoundland and Labrador,178427.0,166.0,1074.9
2,2020-02-01,House only,51.1,St. John's,Newfoundland and Labrador,2020,2,St. John's,"Large urban population centre (100,000 or more)",Core,Newfoundland and Labrador,178427.0,166.0,1074.9
3,2020-02-01,Land only,45.3,St. John's,Newfoundland and Labrador,2020,2,St. John's,"Large urban population centre (100,000 or more)",Core,Newfoundland and Labrador,178427.0,166.0,1074.9
4,2020-03-01,House only,51.1,St. John's,Newfoundland and Labrador,2020,3,St. John's,"Large urban population centre (100,000 or more)",Core,Newfoundland and Labrador,178427.0,166.0,1074.9


In [24]:
# Merging the Various databases 
combined_df_2 = pd.merge(combined_df_1, inventory_df,  how='inner', left_on=['City','Province','Year', 'Month'], right_on = ['City','Province', 'Year', 'Month'])
combined_df_2.head()

Unnamed: 0,REF_DATE_x,Housing_Type,PRICE_INDEX,City,Province,Year,Month,city_name,city_size_type,city_type,province,pop_2016,city_size,pop_density,REF_DATE_y,Completed_units,Dwelling_Type,UNIT_VALUE
0,2020-01-01,House only,51.1,St. John's,Newfoundland and Labrador,2020,1,St. John's,"Large urban population centre (100,000 or more)",Core,Newfoundland and Labrador,178427.0,166.0,1074.9,2020-01-01,Absorptions,Single detached units,72
1,2020-01-01,House only,51.1,St. John's,Newfoundland and Labrador,2020,1,St. John's,"Large urban population centre (100,000 or more)",Core,Newfoundland and Labrador,178427.0,166.0,1074.9,2020-01-01,Absorptions,Semi-detached units,4
2,2020-01-01,House only,51.1,St. John's,Newfoundland and Labrador,2020,1,St. John's,"Large urban population centre (100,000 or more)",Core,Newfoundland and Labrador,178427.0,166.0,1074.9,2020-01-01,Unabsorbed inventory,Single detached units,23
3,2020-01-01,House only,51.1,St. John's,Newfoundland and Labrador,2020,1,St. John's,"Large urban population centre (100,000 or more)",Core,Newfoundland and Labrador,178427.0,166.0,1074.9,2020-01-01,Unabsorbed inventory,Semi-detached units,1
4,2020-01-01,House only,51.1,St. John's,Newfoundland and Labrador,2020,1,St. John's,"Large urban population centre (100,000 or more)",Core,Newfoundland and Labrador,178427.0,166.0,1074.9,2020-01-02,Absorptions,Single detached units,56


In [25]:
combined_df_refined = combined_df_2.drop(columns = ["REF_DATE_x" , "City" , "Province" , "city_name" , "Year" , "Month" , "province"])
combined_df_refined

Unnamed: 0,Housing_Type,PRICE_INDEX,city_size_type,city_type,pop_2016,city_size,pop_density,REF_DATE_y,Completed_units,Dwelling_Type,UNIT_VALUE
0,House only,51.1,"Large urban population centre (100,000 or more)",Core,178427.0,166.00,1074.9,2020-01-01,Absorptions,Single detached units,72
1,House only,51.1,"Large urban population centre (100,000 or more)",Core,178427.0,166.00,1074.9,2020-01-01,Absorptions,Semi-detached units,4
2,House only,51.1,"Large urban population centre (100,000 or more)",Core,178427.0,166.00,1074.9,2020-01-01,Unabsorbed inventory,Single detached units,23
3,House only,51.1,"Large urban population centre (100,000 or more)",Core,178427.0,166.00,1074.9,2020-01-01,Unabsorbed inventory,Semi-detached units,1
4,House only,51.1,"Large urban population centre (100,000 or more)",Core,178427.0,166.00,1074.9,2020-01-02,Absorptions,Single detached units,56
...,...,...,...,...,...,...,...,...,...,...,...
587155,Land only,101.5,"Large urban population centre (100,000 or more)",Core,151957.0,136.44,1113.8,2020-11-18,Unabsorbed inventory,Semi-detached units,27
587156,Land only,101.5,"Large urban population centre (100,000 or more)",Core,151957.0,136.44,1113.8,2020-11-19,Absorptions,Single detached units,36
587157,Land only,101.5,"Large urban population centre (100,000 or more)",Core,151957.0,136.44,1113.8,2020-11-19,Absorptions,Semi-detached units,8
587158,Land only,101.5,"Large urban population centre (100,000 or more)",Core,151957.0,136.44,1113.8,2020-11-19,Unabsorbed inventory,Single detached units,99


In [26]:
combined_df_refined=combined_df_refined.set_index('REF_DATE_y')
combined_df_refined

Unnamed: 0_level_0,Housing_Type,PRICE_INDEX,city_size_type,city_type,pop_2016,city_size,pop_density,Completed_units,Dwelling_Type,UNIT_VALUE
REF_DATE_y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-01-01,House only,51.1,"Large urban population centre (100,000 or more)",Core,178427.0,166.00,1074.9,Absorptions,Single detached units,72
2020-01-01,House only,51.1,"Large urban population centre (100,000 or more)",Core,178427.0,166.00,1074.9,Absorptions,Semi-detached units,4
2020-01-01,House only,51.1,"Large urban population centre (100,000 or more)",Core,178427.0,166.00,1074.9,Unabsorbed inventory,Single detached units,23
2020-01-01,House only,51.1,"Large urban population centre (100,000 or more)",Core,178427.0,166.00,1074.9,Unabsorbed inventory,Semi-detached units,1
2020-01-02,House only,51.1,"Large urban population centre (100,000 or more)",Core,178427.0,166.00,1074.9,Absorptions,Single detached units,56
...,...,...,...,...,...,...,...,...,...,...
2020-11-18,Land only,101.5,"Large urban population centre (100,000 or more)",Core,151957.0,136.44,1113.8,Unabsorbed inventory,Semi-detached units,27
2020-11-19,Land only,101.5,"Large urban population centre (100,000 or more)",Core,151957.0,136.44,1113.8,Absorptions,Single detached units,36
2020-11-19,Land only,101.5,"Large urban population centre (100,000 or more)",Core,151957.0,136.44,1113.8,Absorptions,Semi-detached units,8
2020-11-19,Land only,101.5,"Large urban population centre (100,000 or more)",Core,151957.0,136.44,1113.8,Unabsorbed inventory,Single detached units,99


In [27]:
# Find duplicated entries 
combined_df_refined = combined_df_refined.drop_duplicates()
print(f"Duplication entries: {combined_df_refined.duplicated().sum()}")


Duplication entries: 0


In [28]:
# Generate our categorical variable list
cat = combined_df_refined.dtypes[combined_df_refined.dtypes == "object"].index.tolist()

In [29]:
# Begining encoding with OneHotEncoder
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(combined_df_refined[cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,Housing_Type_House only,Housing_Type_Land only,"city_size_type_Large urban population centre (100,000 or more)",city_type_Core,Completed_units_Absorptions,Completed_units_Unabsorbed inventory,Dwelling_Type_Semi-detached units,Dwelling_Type_Single detached units
0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
2,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
3,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
4,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0


In [30]:
# Merge one-hot encoded features and drop the originals
final_df = pd.concat([combined_df_refined,encode_df], axis = 1)
# pd.merge(combined_df_refined,encode_df,left_on=combined_df_refined.index, right_on=encode_df.index)
# final_df = final_df.drop(cat,1)
final_df.head()

KeyboardInterrupt: 

In [None]:
# Standarize data with StandardScaler 
data_scaled = StandardScaler().fit_transform(final_df)


In [None]:
# Initialize PCA model
pca = PCA(n_components=3)

In [None]:
# Get two principal components for the iris data.
data_pca = pca.fit_transform(data_scaled)

In [None]:
# Transform PCA data to a DataFrame 
df_data_pca = pd.DataFrame(
    data=data_pca, columns=["PC1", "PC2", "PC3"])
df_data_pca.head()

In [None]:
# Fetch the explained variance 
pca.explained_variance_ratio_

In [None]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
	km = KMeans(n_clusters=i, random_state=0)
	km.fit(df_data_pca)
	inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [None]:
# Initialize the K-means model
model = KMeans(n_clusters=6, random_state=0)

# Fit the model
model.fit(df_data_pca)

# Predict clusters
predictions = model.predict(df_data_pca)

# Add the predicted class columns
df_data_pca["class"] = model.labels_
df_data_pca.head()

In [None]:
# Plotting the clusters with three features 
fig = px.scatter_3d(df_data_pca, x= "PC1", y= "PC2", z="PC3", color= "class", symbol = "class", size = "class", width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()