In [1]:
# Install PyMongo if necesary on computer 
#pip install pymongo

# if receiving error message install "dnspython"
#!pip install dnspython

In [2]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AffinityPropagation, AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, kneighbors_graph
from config import db_user, db_password
import pymongo
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff

In [3]:
# Check version of Pymongo
import pymongo
print("version:",pymongo.version)

version: 3.12.0


# Importing Dataframe from MongoDB using Pymongo

In [4]:
# Setup Pymongo Connection
conn = f"mongodb+srv://{db_user}:{db_password}@cluster0.f7wzt.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
client = pymongo.MongoClient(conn)
# Create data base
db = client.sleeper_cities
# assign Mongo collection to a variable 
housing_mongo = db.housing 

#### Note: Create a new collection/table for each CSV file

In [5]:
# Pull the collection/table from MongoDB.  
response = housing_mongo.find()
# Transforming it into a Dataframe
housing_pd_df = pd.DataFrame(list(response))
#Show the dataframe 
housing_pd_df.head()

Unnamed: 0,_id,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count
0,60f637a113e1abd8d69aa6ab,2021-06-01,"tampa-st. petersburg-clearwater, fl",349700.0,4086,32.0,4828,360,1320,10347,549116.7527,14433
1,60f637a113e1abd8d69aa6a3,2021-06-01,"atlanta-sandy springs-roswell, ga",395000.0,15594,32.5,16952,1048,3808,29827,550611.109,45421
2,60f637a113e1abd8d69aa6b1,2021-06-01,"portland-vancouver-hillsboro, or-wa",557000.0,3323,29.5,4192,2580,1384,6857,729463.8197,10180
3,60f637a113e1abd8d69aa6ce,2021-06-01,"salt lake city, ut",569950.0,853,11.0,1520,68,268,2620,814076.3695,3473
4,60f637a113e1abd8d69aa703,2021-06-01,"fayetteville-springdale-rogers, ar-mo",327000.0,693,37.0,688,36,156,2285,500708.5935,2978


### Note: Imported dataframe has "_id" column from MongoDB indexing. Needs to be removed

# Cleaning Data

In [6]:
housing_pd_df.dtypes

_id                       object
date                      object
metro_area/city           object
median_listing_price     float64
active_listing_count       int64
median_days_on_market    float64
new_listing_count          int64
price_increased_count      int64
price_reduced_count        int64
pending_listing_count      int64
average_listing_price    float64
total_listing_count        int64
dtype: object

In [7]:
# Convert Date to a datetime column
housing_pd_df["date"] = pd.to_datetime(housing_pd_df["date"])
housing_pd_df.dtypes

_id                              object
date                     datetime64[ns]
metro_area/city                  object
median_listing_price            float64
active_listing_count              int64
median_days_on_market           float64
new_listing_count                 int64
price_increased_count             int64
price_reduced_count               int64
pending_listing_count             int64
average_listing_price           float64
total_listing_count               int64
dtype: object

In [8]:
# Display new DataFrame
housing_pd_df.head()

Unnamed: 0,_id,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count
0,60f637a113e1abd8d69aa6ab,2021-06-01,"tampa-st. petersburg-clearwater, fl",349700.0,4086,32.0,4828,360,1320,10347,549116.7527,14433
1,60f637a113e1abd8d69aa6a3,2021-06-01,"atlanta-sandy springs-roswell, ga",395000.0,15594,32.5,16952,1048,3808,29827,550611.109,45421
2,60f637a113e1abd8d69aa6b1,2021-06-01,"portland-vancouver-hillsboro, or-wa",557000.0,3323,29.5,4192,2580,1384,6857,729463.8197,10180
3,60f637a113e1abd8d69aa6ce,2021-06-01,"salt lake city, ut",569950.0,853,11.0,1520,68,268,2620,814076.3695,3473
4,60f637a113e1abd8d69aa703,2021-06-01,"fayetteville-springdale-rogers, ar-mo",327000.0,693,37.0,688,36,156,2285,500708.5935,2978


In [9]:
# Extract the year from the 'date' column.
housing_pd_df["year"] = pd.DatetimeIndex(housing_pd_df['date']).year
housing_pd_df.head()


Unnamed: 0,_id,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count,year
0,60f637a113e1abd8d69aa6ab,2021-06-01,"tampa-st. petersburg-clearwater, fl",349700.0,4086,32.0,4828,360,1320,10347,549116.7527,14433,2021
1,60f637a113e1abd8d69aa6a3,2021-06-01,"atlanta-sandy springs-roswell, ga",395000.0,15594,32.5,16952,1048,3808,29827,550611.109,45421,2021
2,60f637a113e1abd8d69aa6b1,2021-06-01,"portland-vancouver-hillsboro, or-wa",557000.0,3323,29.5,4192,2580,1384,6857,729463.8197,10180,2021
3,60f637a113e1abd8d69aa6ce,2021-06-01,"salt lake city, ut",569950.0,853,11.0,1520,68,268,2620,814076.3695,3473,2021
4,60f637a113e1abd8d69aa703,2021-06-01,"fayetteville-springdale-rogers, ar-mo",327000.0,693,37.0,688,36,156,2285,500708.5935,2978,2021


In [10]:
# Filter to show only data from 2019
housing_df_filtered = housing_pd_df[housing_pd_df['year'] == 2019]
print(housing_df_filtered.shape)
housing_df_filtered.head()

(11004, 13)


Unnamed: 0,_id,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count,year
16500,60f637a113e1abd8d69ae72b,2019-12-01,"portland-vancouver-hillsboro, or-wa",469450.0,5702,70.5,1496,1020,2296,4213,595581.0413,9915,2019
16501,60f637a113e1abd8d69ae71a,2019-12-01,"philadelphia-camden-wilmington, pa-nj-de-md",288450.0,15076,75.0,3368,276,3060,1741,422417.5192,16817,2019
16502,60f637a113e1abd8d69ae75e,2019-12-01,"akron, oh",150950.0,1704,61.5,476,8,320,869,218599.6856,2573,2019
16503,60f637a113e1abd8d69ae762,2019-12-01,"madison, wi",348700.0,1278,87.5,248,12,144,719,404568.6108,1997,2019
16504,60f637a113e1abd8d69ae76a,2019-12-01,"des moines-west des moines, ia",257700.0,3322,75.5,676,164,1172,1361,297043.9775,4683,2019


In [11]:
# Group by metro_area/city - index split separately 
grouped_housing_df = housing_df_filtered.groupby(['metro_area/city'],axis=0,as_index=False).mean()
grouped_housing_df.head(20)

Unnamed: 0,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count,year
0,"aberdeen, sd",182846.428567,161.75,81.333333,43.333333,0.0,36.0,0.333333,224213.7298,162.083333,2019.0
1,"aberdeen, wa",243286.190475,336.0,71.375,110.0,4.333333,65.0,125.666667,281836.496283,461.666667,2019.0
2,"abilene, tx",214853.273808,561.166667,66.416667,195.0,3.333333,149.0,282.0,247640.66745,843.166667,2019.0
3,"ada, ok",155062.500008,151.666667,67.5,44.333333,0.0,20.666667,53.666667,192856.850308,205.333333,2019.0
4,"adrian, mi",169820.2381,428.416667,64.416667,123.333333,2.0,83.666667,31.25,221564.868992,459.666667,2019.0
5,"akron, oh",165456.904758,1701.5,54.166667,798.666667,7.666667,563.333333,1187.583333,244281.999525,2889.083333,2019.0
6,"alamogordo, nm",185736.904767,410.666667,99.833333,90.666667,0.0,48.333333,104.166667,220033.831333,514.833333,2019.0
7,"albany, ga",120405.726192,545.666667,91.5,126.666667,2.0,109.0,17.583333,150925.848925,563.25,2019.0
8,"albany, or",349433.928575,450.583333,50.375,232.333333,8.0,143.0,231.916667,448459.134575,682.5,2019.0
9,"albany-schenectady-troy, ny",297317.10715,3856.0,83.0,1118.666667,31.666667,887.333333,2326.666667,342436.266492,6182.666667,2019.0


In [12]:
# Check types
grouped_housing_df.dtypes

metro_area/city           object
median_listing_price     float64
active_listing_count     float64
median_days_on_market    float64
new_listing_count        float64
price_increased_count    float64
price_reduced_count      float64
pending_listing_count    float64
average_listing_price    float64
total_listing_count      float64
year                     float64
dtype: object

In [13]:
# create new df with DF cities  
metro_name_df = pd.DataFrame(grouped_housing_df['metro_area/city'])
print(metro_name_df.shape)
metro_name_df

(917, 1)


Unnamed: 0,metro_area/city
0,"aberdeen, sd"
1,"aberdeen, wa"
2,"abilene, tx"
3,"ada, ok"
4,"adrian, mi"
...,...
912,"youngstown-warren-boardman, oh-pa"
913,"yuba city, ca"
914,"yuma, az"
915,"zanesville, oh"


In [14]:
# create new DF with columns needed for ML - Drop: (year, price red, price inc, pending listing)
ml_housing_df= grouped_housing_df[['median_listing_price','active_listing_count','median_days_on_market','new_listing_count','average_listing_price','total_listing_count']]
print(ml_housing_df.shape)
ml_housing_df.head()

(917, 6)


Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count
0,182846.428567,161.75,81.333333,43.333333,224213.7298,162.083333
1,243286.190475,336.0,71.375,110.0,281836.496283,461.666667
2,214853.273808,561.166667,66.416667,195.0,247640.66745,843.166667
3,155062.500008,151.666667,67.5,44.333333,192856.850308,205.333333
4,169820.2381,428.416667,64.416667,123.333333,221564.868992,459.666667


## import income & Population data

In [26]:
# Import population and income data
# assign Mongo collection to a variable 
income_pop_mongo = db.income_pop

In [27]:
# Pull the collection/table from MongoDB.  
response = income_pop_mongo.find()
# Transforming it into a Dataframe
income_pop_pd_df = pd.DataFrame(list(response))
#Show the dataframe 
income_pop_pd_df.head()

Unnamed: 0.1,_id,Unnamed: 0,metro_area/city,Per Capita Income,Pop
0,60fa1d37ff5df8808eb93440,1,"aberdeen, sd",30802,28257
1,60fa1d37ff5df8808eb93441,2,"aberdeen, wa",23090,16756
2,60fa1d37ff5df8808eb93442,3,"abilene, tx",24529,123420
3,60fa1d37ff5df8808eb93443,4,"ada, ok",23538,17235
4,60fa1d37ff5df8808eb93444,5,"adrian, mi",20799,20600


### Note: Imported dataframe has "_id" column from MongoDB indexing. Needs to be removed

In [28]:
# Clean income_pop_pd_df to then merge it with ml_housing_df
cleaned_income_pop_pd_df = income_pop_pd_df[['Per Capita Income','Pop']]
cleaned_income_pop_pd_df.head()

Unnamed: 0,Per Capita Income,Pop
0,30802,28257
1,23090,16756
2,24529,123420
3,23538,17235
4,20799,20600


In [29]:
# change the header
cleaned_income_pop_pd_df = cleaned_income_pop_pd_df.rename(columns={'Per Capita Income':'income','Pop':'pop'})
cleaned_income_pop_pd_df.head()

Unnamed: 0,income,pop
0,30802,28257
1,23090,16756
2,24529,123420
3,23538,17235
4,20799,20600


In [30]:
# Merge 'ml_housing_df' & 'cleaned_income_pop_pd_df'
ml_housing_income_pop_df = ml_housing_df.join(cleaned_income_pop_pd_df, how='left')
print(ml_housing_income_pop_df.shape)
ml_housing_income_pop_df.head()

(917, 8)


Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,income,pop
0,182846.428567,161.75,81.333333,43.333333,224213.7298,162.083333,30802,28257
1,243286.190475,336.0,71.375,110.0,281836.496283,461.666667,23090,16756
2,214853.273808,561.166667,66.416667,195.0,247640.66745,843.166667,24529,123420
3,155062.500008,151.666667,67.5,44.333333,192856.850308,205.333333,23538,17235
4,169820.2381,428.416667,64.416667,123.333333,221564.868992,459.666667,20799,20600


# Prepare data for Machine Learning

In [31]:
# Standardize the data with StandardScaler().
X_scaled = StandardScaler().fit_transform(ml_housing_income_pop_df)
X_scaled[:5]

array([[-0.41134894, -0.26119984, -0.00626211, -0.26879905, -0.39071823,
        -0.27383375,  0.63668812, -0.25943154],
       [-0.01127082, -0.21946674, -0.4072039 , -0.22459811, -0.20049605,
        -0.22012706, -0.47037499, -0.3315778 ],
       [-0.19948115, -0.16553905, -0.60683601, -0.16824192, -0.31338208,
        -0.15173508, -0.26380551,  0.33753002],
       [-0.59526332, -0.26361481, -0.56321891, -0.26813604, -0.49423243,
        -0.26608027, -0.40606427, -0.32857301],
       [-0.49757518, -0.19733284, -0.68735988, -0.21575793, -0.39946256,
        -0.22048561, -0.79924967, -0.30746422]])

In [32]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
pca.fit(X_scaled)
print(pca.explained_variance_ratio_)

X_pca = pca.transform(X_scaled)

[0.51048887 0.24226372 0.12547681]


In [33]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(X_pca, columns=['PC 1','PC 2', 'PC 3'])
pcs_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.499891,0.025396,-0.336139
1,-0.551533,0.103539,-0.358122
2,-0.186804,0.364752,-0.6414
3,-0.784764,0.575732,-0.614407
4,-0.738936,0.646516,-0.590835
5,0.474245,0.879584,-1.022092
6,-0.74404,0.334229,0.574511
7,-0.89769,1.032337,0.36288
8,0.286068,-0.874189,-1.239458
9,0.938669,0.205143,0.313412


In [34]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

## KMeans

In [50]:
# Initialize the K-Means model.
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
prediction = model.predict(pcs_df)
print(prediction)

[0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 2
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 0 0 2 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 2 2 0 2 0 0 0 2 2
 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 2 2 2 0 0 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 2 0 0 0 0 0 0 2
 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 2 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 0 0 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0
 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 0 0 2 0 0 0 0 0 0 0 2 2 0
 0 0 0 2 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 2 2 0 2 0 2 2 0 0 0 0 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0
 0 0 0 0 0 2 1 2 0 0 0 0 

In [51]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the ml_housing_df and pcs_df DataFrames on the same columns.
clustered_k_df = ml_housing_income_pop_df.join(pcs_df, how='inner')

#  Add a new column, "metro_area/city" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_k_df = clustered_k_df.join(metro_name_df, how='inner')

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# Add the predicted class columns
clustered_k_df["Class"] = model.labels_
clustered_k_df.head(10)

# Print the shape of the clustered_df
print(clustered_k_df.shape)
clustered_k_df.head(10)

(917, 13)


Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,income,pop,PC 1,PC 2,PC 3,metro_area/city,Class
0,182846.428567,161.75,81.333333,43.333333,224213.7298,162.083333,30802,28257,-0.499891,0.025396,-0.336139,"aberdeen, sd",0
1,243286.190475,336.0,71.375,110.0,281836.496283,461.666667,23090,16756,-0.551533,0.103539,-0.358122,"aberdeen, wa",0
2,214853.273808,561.166667,66.416667,195.0,247640.66745,843.166667,24529,123420,-0.186804,0.364752,-0.6414,"abilene, tx",0
3,155062.500008,151.666667,67.5,44.333333,192856.850308,205.333333,23538,17235,-0.784764,0.575732,-0.614407,"ada, ok",0
4,169820.2381,428.416667,64.416667,123.333333,221564.868992,459.666667,20799,20600,-0.738936,0.646516,-0.590835,"adrian, mi",0
5,165456.904758,1701.5,54.166667,798.666667,244281.999525,2889.083333,24688,197597,0.474245,0.879584,-1.022092,"akron, oh",0
6,185736.904767,410.666667,99.833333,90.666667,220033.831333,514.833333,24999,31980,-0.74404,0.334229,0.574511,"alamogordo, nm",0
7,120405.726192,545.666667,91.5,126.666667,150925.848925,563.25,20313,72130,-0.89769,1.032337,0.36288,"albany, ga",0
8,349433.928575,450.583333,50.375,232.333333,448459.134575,682.5,29664,55338,0.286068,-0.874189,-1.239458,"albany, or",0
9,297317.10715,3856.0,83.0,1118.666667,342436.266492,6182.666667,26440,70295,0.938669,0.205143,0.313412,"albany-schenectady-troy, ny",0


In [52]:
# Find Austin metro area 
austin_group_k = clustered_k_df.loc[clustered_k_df['metro_area/city'].str.contains("austin", case=False)]
austin_group_k

Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,income,pop,PC 1,PC 2,PC 3,metro_area/city,Class
52,155880.35715,124.083333,60.125,42.0,185615.22225,177.083333,27965,25233,-0.565727,0.374262,-1.054393,"austin, mn",0
53,359709.845233,7644.25,56.708333,3252.666667,552108.749992,10896.666667,38904,556140,4.516994,0.150833,-0.777391,"austin-round rock, tx",2


In [55]:
# Metro_area/city with austin group
metro_area_city_k = clustered_k_df.loc[clustered_k_df['Class'] == 2]
print(metro_area_city_k.shape)
metro_area_city_k.head(10)

(113, 13)


Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,income,pop,PC 1,PC 2,PC 3,metro_area/city,Class
13,268651.24405,2470.083333,54.958333,1104.333333,350132.412267,4352.666667,30403,560513,2.158623,0.49614,-1.161531,"albuquerque, nm",2
26,317977.976192,1724.5,69.125,552.0,348989.5822,1724.5,41415,288000,1.403945,-0.78645,-1.027985,"anchorage, ak",2
29,386657.14285,1023.833333,59.166667,454.666667,455150.216492,1434.833333,42674,119980,1.144005,-1.552363,-1.331948,"ann arbor, mi",2
36,394037.797617,2613.0,92.875,664.666667,544391.427908,3521.583333,33176,92870,1.010333,-1.139074,0.444292,"asheville, nc",2
53,359709.845233,7644.25,56.708333,3252.666667,552108.749992,10896.666667,38904,556140,4.516994,0.150833,-0.777391,"austin-round rock, tx",2
56,323465.178567,9550.916667,55.833333,3853.0,417196.827275,10743.333333,43368,249434,4.121117,0.189771,-0.838788,"baltimore-columbia-towson, md",2
60,581648.166667,3366.416667,88.875,833.666667,949985.609233,3924.666667,42596,44477,2.047079,-3.10971,0.323721,"barnstable town, ma",2
77,440240.178567,664.75,58.333333,263.666667,563961.364575,908.916667,30791,92314,0.664665,-1.459364,-0.88712,"bellingham, wa",2
79,462127.398808,1228.833333,83.333333,371.666667,634468.756192,1824.083333,32143,66421,0.761153,-1.728006,0.101416,"bend-redmond, or",2
98,361145.410708,2530.333333,41.333333,1366.0,447201.69965,2598.833333,34636,228959,1.718167,-0.630193,-1.629237,"boise city, id",2


### Kmeans Plots

In [56]:
# Creating a 3D-Scatter with the PCA data and the clusters
# Plot
fig = px.scatter_3d(clustered_k_df, x="PC 1", y="PC 2", z="PC 3", color="Class", symbol="Class",size="median_listing_price", size_max=25, width=800, hover_name="metro_area/city", hover_data=["average_listing_price"])
fig.update_layout(legend=dict(x=0,y=1))
fig.show()


In [57]:
# 2D scatter plot
clustered_k_df.hvplot.scatter(
    x="PC 1",
    y="PC 2",
    hover_cols=["metro_area/city"],
    by="Class",
)

## Affinity Propagation

In [42]:
# Initialize the Affinity Propagation model.
af = AffinityPropagation()

# Fit the model
clustering = af.fit(pcs_df)





In [43]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the ml_housing_df and pcs_df DataFrames on the same columns.
clustered_af_df = ml_housing_df.join(pcs_df, how='inner')

#  Add a new column, "metro_area/city" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_af_df = clustered_af_df.join(metro_name_df, how='inner')

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# Add the predicted class columns
clustered_af_df["Class"] = clustering.labels_
clustered_af_df.head(10)

# Print the shape of the clustered_df
print(clustered_af_df.shape)
clustered_af_df.head(10)

(917, 11)


Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,PC 1,PC 2,PC 3,metro_area/city,Class
0,182846.428567,161.75,81.333333,43.333333,224213.7298,162.083333,-0.499891,0.025396,-0.336139,"aberdeen, sd",9
1,243286.190475,336.0,71.375,110.0,281836.496283,461.666667,-0.551533,0.103539,-0.358122,"aberdeen, wa",63
2,214853.273808,561.166667,66.416667,195.0,247640.66745,843.166667,-0.186804,0.364752,-0.6414,"abilene, tx",63
3,155062.500008,151.666667,67.5,44.333333,192856.850308,205.333333,-0.784764,0.575732,-0.614407,"ada, ok",21
4,169820.2381,428.416667,64.416667,123.333333,221564.868992,459.666667,-0.738936,0.646516,-0.590835,"adrian, mi",21
5,165456.904758,1701.5,54.166667,798.666667,244281.999525,2889.083333,0.474245,0.879584,-1.022092,"akron, oh",53
6,185736.904767,410.666667,99.833333,90.666667,220033.831333,514.833333,-0.74404,0.334229,0.574511,"alamogordo, nm",58
7,120405.726192,545.666667,91.5,126.666667,150925.848925,563.25,-0.89769,1.032337,0.36288,"albany, ga",20
8,349433.928575,450.583333,50.375,232.333333,448459.134575,682.5,0.286068,-0.874189,-1.239458,"albany, or",0
9,297317.10715,3856.0,83.0,1118.666667,342436.266492,6182.666667,0.938669,0.205143,0.313412,"albany-schenectady-troy, ny",10


In [60]:
# Find Austin metro area 
austin_group_af = clustered_af_df.loc[clustered_af_df['metro_area/city'].str.contains("austin", case=False)]
austin_group_af

Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,PC 1,PC 2,PC 3,metro_area/city,Class
52,155880.35715,124.083333,60.125,42.0,185615.22225,177.083333,-0.600098,-0.614852,-0.905114,"austin, mn",31
53,359709.845233,7644.25,56.708333,3252.666667,552108.749992,10896.666667,3.193447,-0.187577,-0.547523,"austin-round rock, tx",2


In [69]:
# Metro_area/city with austin group
metro_area_city_af = clustered_af_df.loc[clustered_af_df['Class'] == 2]
print(metro_area_city_af.shape)
metro_area_city_af.head(10)

(10, 11)


Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,PC 1,PC 2,PC 3,metro_area/city,Class
53,359709.845233,7644.25,56.708333,3252.666667,552108.749992,10896.666667,3.193447,-0.187577,-0.547523,"austin-round rock, tx",2
56,323465.178567,9550.916667,55.833333,3853.0,417196.827275,10743.333333,3.455175,-0.815181,-0.516196,"baltimore-columbia-towson, md",2
154,341191.74405,8189.833333,57.833333,3566.333333,456165.700833,12507.5,3.403963,-0.606298,-0.454894,"charlotte-concord-gastonia, nc-sc",2
400,313403.470233,9012.833333,70.0,3027.0,441528.248517,14288.166667,3.355671,-0.76013,0.074652,"jacksonville, fl",2
458,318463.125,9260.666667,48.875,3995.333333,469506.162525,10418.75,3.513496,-0.734708,-0.789587,"las vegas-henderson-paradise, nv",2
546,358345.476192,10025.416667,46.583333,5232.333333,469037.62565,15911.5,4.65079,-0.990455,-0.733105,"minneapolis-st. paul-bloomington, mn-wi",2
622,314673.095242,9731.333333,62.958333,3798.333333,447268.243358,15485.333333,3.87876,-0.951137,-0.134389,"orlando-kissimmee-sanford, fl",2
731,296762.654758,10324.083333,60.333333,3754.0,390859.308233,15648.5,3.888241,-1.189479,-0.22112,"san antonio-new braunfels, tx",2
793,223574.303575,8485.5,67.958333,3322.0,305442.5193,13668.5,3.068721,-1.439279,-0.004727,"st. louis, mo-il",2
864,297883.541675,9877.333333,59.333333,3316.666667,380071.02835,10380.25,3.175837,-0.925655,-0.388003,"virginia beach-norfolk-newport news, va-nc",2


### Affinity Propagation Plots

In [90]:
# Creating a 3D-Scatter with the PCA data and the clusters
# Plot
fig = px.scatter_3d(clustered_af_df, x="PC 1", y="PC 2", z="PC 3", color="Class", symbol="Class",size="median_listing_price", size_max=25, width=800, hover_name="metro_area/city", hover_data=["average_listing_price"])
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [57]:
# 2D scatter plot
#plt.scatter(clustered_af_df[:,0], [:,1], c=clustering.labels_, cmap='rainbow', alpha=0.7, edgecolors='b')
clustered_af_df.hvplot.scatter(
    x="PC 1",
    y="PC 2",
    hover_cols=["metro_area/city"],
    by="Class",
)

## Hierarchical Agglomerative Clustering

In [106]:
# Create the dendrogram
fig = ff.create_dendrogram(pcs_df, color_threshold=13)
fig.update_layout(width=800, height=1000)
fig.show()

In [107]:
# # Initialize the Hierarchical Agglomerative Clustering model.
agg = AgglomerativeClustering(n_clusters=4)

# fit the model
hac = agg.fit(pcs_df)

In [108]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the ml_housing_df and pcs_df DataFrames on the same columns.
clustered_hac_df = ml_housing_df.join(pcs_df, how='inner')

#  Add a new column, "metro_area/city" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_hac_df = clustered_hac_df.join(metro_name_df, how='inner')

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# Add the predicted class columns
clustered_hac_df["Class"] = hac.labels_
clustered_hac_df.head(10)

# Print the shape of the clustered_df
print(clustered_hac_df.shape)
clustered_hac_df.head(10)

(917, 11)


Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,PC 1,PC 2,PC 3,metro_area/city,Class
0,182846.428567,161.75,81.333333,43.333333,224213.7298,162.083333,-0.636693,-0.35568,-0.071281,"aberdeen, sd",0
1,243286.190475,336.0,71.375,110.0,281836.496283,461.666667,-0.348494,-0.02751,-0.470171,"aberdeen, wa",0
2,214853.273808,561.166667,66.416667,195.0,247640.66745,843.166667,-0.305553,-0.276655,-0.642662,"abilene, tx",0
3,155062.500008,151.666667,67.5,44.333333,192856.850308,205.333333,-0.630645,-0.584442,-0.610437,"ada, ok",0
4,169820.2381,428.416667,64.416667,123.333333,221564.868992,459.666667,-0.47518,-0.501927,-0.718885,"adrian, mi",0
5,165456.904758,1701.5,54.166667,798.666667,244281.999525,2889.083333,0.224175,-0.752096,-1.013899,"akron, oh",0
6,185736.904767,410.666667,99.833333,90.666667,220033.831333,514.833333,-0.659593,-0.332557,0.673585,"alamogordo, nm",0
7,120405.726192,545.666667,91.5,126.666667,150925.848925,563.25,-0.75332,-0.802304,0.368167,"albany, ga",0
8,349433.928575,450.583333,50.375,232.333333,448459.134575,682.5,0.182498,0.702249,-1.309289,"albany, or",0
9,297317.10715,3856.0,83.0,1118.666667,342436.266492,6182.666667,1.07568,-0.169139,0.216294,"albany-schenectady-troy, ny",0


In [109]:
# Find Austin metro area 
austin_group_hac = clustered_hac_df.loc[clustered_hac_df['metro_area/city'].str.contains("austin", case=False)]
austin_group_hac

Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,PC 1,PC 2,PC 3,metro_area/city,Class
52,155880.35715,124.083333,60.125,42.0,185615.22225,177.083333,-0.600098,-0.614852,-0.905114,"austin, mn",0
53,359709.845233,7644.25,56.708333,3252.666667,552108.749992,10896.666667,3.193447,-0.187577,-0.547523,"austin-round rock, tx",2


In [110]:
# Metro_area/city with austin group
metro_area_city_hac = clustered_hac_df.loc[clustered_hac_df['Class'] == 2]
print(metro_area_city_hac.shape)
metro_area_city_hac.head(10)

(39, 11)


Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,PC 1,PC 2,PC 3,metro_area/city,Class
53,359709.845233,7644.25,56.708333,3252.666667,552108.749992,10896.666667,3.193447,-0.187577,-0.547523,"austin-round rock, tx",2
56,323465.178567,9550.916667,55.833333,3853.0,417196.827275,10743.333333,3.455175,-0.815181,-0.516196,"baltimore-columbia-towson, md",2
102,583291.880958,9825.083333,51.208333,5195.333333,886703.35875,12665.0,5.036594,1.042629,-0.648754,"boston-cambridge-newton, ma-nh",2
136,303677.232142,9477.416667,86.375,2283.333333,509095.152358,12299.75,2.910567,-0.453781,0.689622,"cape coral-fort myers, fl",2
153,420351.696425,4679.416667,79.291667,1372.0,701691.670067,6553.333333,1.852433,1.041996,0.113938,"charleston-north charleston, sc",2
154,341191.74405,8189.833333,57.833333,3566.333333,456165.700833,12507.5,3.403963,-0.606298,-0.454894,"charlotte-concord-gastonia, nc-sc",2
161,270155.732142,5471.5,54.166667,2539.0,354225.385358,9727.25,2.242559,-0.757089,-0.749236,"cincinnati, oh-ky-in",2
169,195869.404767,5874.916667,63.166667,2358.666667,285278.250925,9608.916667,1.975614,-1.19351,-0.367755,"cleveland-elyria, oh",2
206,347887.76785,21799.166667,54.25,9386.0,487881.060392,32960.916667,9.170418,-2.804937,0.372778,"dallas-fort worth-arlington, tx",2
220,289412.428575,6567.166667,76.0,1907.0,409018.535517,7973.75,1.948142,-0.394966,0.092375,"deltona-daytona beach-ormond beach, fl",2


### Hierarchical Agglomerative Clustering Plots

In [111]:
# Creating a 3D-Scatter with the PCA data and the clusters
# Plot
fig = px.scatter_3d(clustered_hac_df, x="PC 1", y="PC 2", z="PC 3", color="Class", symbol="Class",size="median_listing_price", size_max=25, width=800, hover_name="metro_area/city", hover_data=["average_listing_price"])
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [113]:
# 2D scatter plot
clustered_hac_df.hvplot.scatter(
    x="PC 1",
    y="PC 2",
    hover_cols=["metro_area/city"],
    by="Class",
)