In [1]:
# Install PyMongo if necesary on computer 
#pip install pymongo

# if receiving error message install "dnspython"
#!pip install dnspython

In [2]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from config import db_user, db_password
import pymongo

In [3]:
# Check version of Pymongo
import pymongo
print("version:",pymongo.version)

version: 3.12.0


# Importing Dataframe from MongoDB using Pymongo

In [7]:
# Setup Pymongo Connection
conn = f"mongodb+srv://{db_user}:{db_password}@cluster0.f7wzt.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
client = pymongo.MongoClient(conn)
# Create data base
db = client.sleeper_cities
# assign Mongo collection to a variable 
housing_mongo = db.housing 

#### Note: Create a new collection/table for each CSV file

In [12]:
# Pull the collection/table from MongoDB.  
response = housing_mongo.find()
# Transforming it into a Dataframe
housing_pd_df = pd.DataFrame(list(response))
#Show the dataframe 
housing_pd_df.head()

Unnamed: 0,_id,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count
0,60f637a113e1abd8d69aa69b,2021-06-01,"new york-newark-jersey city, ny-nj-pa",617500.0,61846,55.0,26280,1020,9540,32230,1260094.0,94076
1,60f637a113e1abd8d69aa69c,2021-06-01,"los angeles-long beach-anaheim, ca",1024500.0,13258,44.0,10096,476,1928,15494,2507311.0,28752
2,60f637a113e1abd8d69aa69d,2021-06-01,"chicago-naperville-elgin, il-in-wi",354950.0,19089,33.0,17088,444,5712,25296,538092.2,44385
3,60f637a113e1abd8d69aa69e,2021-06-01,"dallas-fort worth-arlington, tx",387000.0,6850,29.0,9008,704,1924,14504,595209.7,21354
4,60f637a113e1abd8d69aa69f,2021-06-01,"houston-the woodlands-sugar land, tx",365995.0,13440,36.0,11368,1748,3636,17155,556535.0,30595


### Note: Imported dataframe has "_id" column from MongoDB indexing. Needs to be removed

# Cleaning Data

In [14]:
housing_pd_df.dtypes

_id                       object
date                      object
metro_area/city           object
median_listing_price     float64
active_listing_count       int64
median_days_on_market    float64
new_listing_count          int64
price_increased_count      int64
price_reduced_count        int64
pending_listing_count      int64
average_listing_price    float64
total_listing_count        int64
dtype: object

In [16]:
# Convert Date to a datetime column
housing_pd_df["date"] = pd.to_datetime(housing_pd_df["date"])
housing_pd_df.dtypes

_id                              object
date                     datetime64[ns]
metro_area/city                  object
median_listing_price            float64
active_listing_count              int64
median_days_on_market           float64
new_listing_count                 int64
price_increased_count             int64
price_reduced_count               int64
pending_listing_count             int64
average_listing_price           float64
total_listing_count               int64
dtype: object

In [17]:
# Display new DataFrame
housing_pd_df.head()

Unnamed: 0,_id,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count
0,60f637a113e1abd8d69aa69b,2021-06-01,"new york-newark-jersey city, ny-nj-pa",617500.0,61846,55.0,26280,1020,9540,32230,1260094.0,94076
1,60f637a113e1abd8d69aa69c,2021-06-01,"los angeles-long beach-anaheim, ca",1024500.0,13258,44.0,10096,476,1928,15494,2507311.0,28752
2,60f637a113e1abd8d69aa69d,2021-06-01,"chicago-naperville-elgin, il-in-wi",354950.0,19089,33.0,17088,444,5712,25296,538092.2,44385
3,60f637a113e1abd8d69aa69e,2021-06-01,"dallas-fort worth-arlington, tx",387000.0,6850,29.0,9008,704,1924,14504,595209.7,21354
4,60f637a113e1abd8d69aa69f,2021-06-01,"houston-the woodlands-sugar land, tx",365995.0,13440,36.0,11368,1748,3636,17155,556535.0,30595


In [19]:
# Extract the year from the 'date' column.
housing_pd_df["year"] = pd.DatetimeIndex(housing_pd_df['date']).year
housing_pd_df.head()


Unnamed: 0,_id,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count,year
0,60f637a113e1abd8d69aa69b,2021-06-01,"new york-newark-jersey city, ny-nj-pa",617500.0,61846,55.0,26280,1020,9540,32230,1260094.0,94076,2021
1,60f637a113e1abd8d69aa69c,2021-06-01,"los angeles-long beach-anaheim, ca",1024500.0,13258,44.0,10096,476,1928,15494,2507311.0,28752,2021
2,60f637a113e1abd8d69aa69d,2021-06-01,"chicago-naperville-elgin, il-in-wi",354950.0,19089,33.0,17088,444,5712,25296,538092.2,44385,2021
3,60f637a113e1abd8d69aa69e,2021-06-01,"dallas-fort worth-arlington, tx",387000.0,6850,29.0,9008,704,1924,14504,595209.7,21354,2021
4,60f637a113e1abd8d69aa69f,2021-06-01,"houston-the woodlands-sugar land, tx",365995.0,13440,36.0,11368,1748,3636,17155,556535.0,30595,2021


In [20]:
# Filter to show only data from 2019
housing_df_filtered = housing_pd_df[housing_pd_df['year'] == 2019]
print(housing_df_filtered.shape)
housing_df_filtered.head()

(11004, 13)


Unnamed: 0,_id,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count,year
16506,60f637a113e1abd8d69ae715,2019-12-01,"new york-newark-jersey city, ny-nj-pa",549999.5,65994,83.5,10272,428,7132,16934,1110110.0,82928,2019
16507,60f637a113e1abd8d69ae716,2019-12-01,"los angeles-long beach-anaheim, ca",877500.0,17466,75.5,4632,300,1972,11449,2112489.0,28915,2019
16508,60f637a113e1abd8d69ae717,2019-12-01,"chicago-naperville-elgin, il-in-wi",299250.0,31563,68.5,7588,240,5440,12497,438986.0,44060,2019
16509,60f637a113e1abd8d69ae718,2019-12-01,"dallas-fort worth-arlington, tx",335500.0,19493,64.5,5780,680,5240,9232,468830.1,28725,2019
16510,60f637a113e1abd8d69ae719,2019-12-01,"houston-the woodlands-sugar land, tx",299994.0,26614,69.0,7032,836,5988,9069,437528.9,35683,2019


In [21]:
# Group by metro_area/city - index split separately 
grouped_housing_df = housing_df_filtered.groupby(['metro_area/city'],axis=0,as_index=False).mean()
grouped_housing_df.head(20)

Unnamed: 0,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count,year
0,"aberdeen, sd",182846.428567,161.75,81.333333,43.333333,0.0,36.0,0.333333,224213.7298,162.083333,2019.0
1,"aberdeen, wa",243286.190475,336.0,71.375,110.0,4.333333,65.0,125.666667,281836.496283,461.666667,2019.0
2,"abilene, tx",214853.273808,561.166667,66.416667,195.0,3.333333,149.0,282.0,247640.66745,843.166667,2019.0
3,"ada, ok",155062.500008,151.666667,67.5,44.333333,0.0,20.666667,53.666667,192856.850308,205.333333,2019.0
4,"adrian, mi",169820.2381,428.416667,64.416667,123.333333,2.0,83.666667,31.25,221564.868992,459.666667,2019.0
5,"akron, oh",165456.904758,1701.5,54.166667,798.666667,7.666667,563.333333,1187.583333,244281.999525,2889.083333,2019.0
6,"alamogordo, nm",185736.904767,410.666667,99.833333,90.666667,0.0,48.333333,104.166667,220033.831333,514.833333,2019.0
7,"albany, ga",120405.726192,545.666667,91.5,126.666667,2.0,109.0,17.583333,150925.848925,563.25,2019.0
8,"albany, or",349433.928575,450.583333,50.375,232.333333,8.0,143.0,231.916667,448459.134575,682.5,2019.0
9,"albany-schenectady-troy, ny",297317.10715,3856.0,83.0,1118.666667,31.666667,887.333333,2326.666667,342436.266492,6182.666667,2019.0


In [22]:
# Check types
grouped_housing_df.dtypes

metro_area/city           object
median_listing_price     float64
active_listing_count     float64
median_days_on_market    float64
new_listing_count        float64
price_increased_count    float64
price_reduced_count      float64
pending_listing_count    float64
average_listing_price    float64
total_listing_count      float64
year                     float64
dtype: object

In [23]:
# create new df with DF cities  
metro_name_df = pd.DataFrame(grouped_housing_df['metro_area/city'])
print(metro_name_df.shape)
metro_name_df

(917, 1)


Unnamed: 0,metro_area/city
0,"aberdeen, sd"
1,"aberdeen, wa"
2,"abilene, tx"
3,"ada, ok"
4,"adrian, mi"
...,...
912,"youngstown-warren-boardman, oh-pa"
913,"yuba city, ca"
914,"yuma, az"
915,"zanesville, oh"


In [24]:
# create new DF with columns needed for ML - Drop: (year, price red, price inc, pending listing)
ml_housing_df= grouped_housing_df[['median_listing_price','active_listing_count','median_days_on_market','new_listing_count','average_listing_price','total_listing_count']]
print(ml_housing_df.shape)
ml_housing_df.head()

(917, 6)


Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count
0,182846.428567,161.75,81.333333,43.333333,224213.7298,162.083333
1,243286.190475,336.0,71.375,110.0,281836.496283,461.666667
2,214853.273808,561.166667,66.416667,195.0,247640.66745,843.166667
3,155062.500008,151.666667,67.5,44.333333,192856.850308,205.333333
4,169820.2381,428.416667,64.416667,123.333333,221564.868992,459.666667


# Prepare data for Machine Learning

In [25]:
# Standardize the data with StandardScaler().
X_scaled = StandardScaler().fit_transform(ml_housing_df)
X_scaled[:5]

array([[-0.41134894, -0.26119984, -0.00626211, -0.26879905, -0.39071823,
        -0.27383375],
       [-0.01127082, -0.21946674, -0.4072039 , -0.22459811, -0.20049605,
        -0.22012706],
       [-0.19948115, -0.16553905, -0.60683601, -0.16824192, -0.31338208,
        -0.15173508],
       [-0.59526332, -0.26361481, -0.56321891, -0.26813604, -0.49423243,
        -0.26608027],
       [-0.49757518, -0.19733284, -0.68735988, -0.21575793, -0.39946256,
        -0.22048561]])

In [26]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
pca.fit(X_scaled)
print(pca.explained_variance_ratio_)

X_pca = pca.transform(X_scaled)

[0.53605678 0.2879205  0.1598929 ]


In [27]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(X_pca, columns=['PC 1','PC 2', 'PC 3'])
pcs_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.636693,-0.35568,-0.071281
1,-0.348494,-0.02751,-0.470171
2,-0.305553,-0.276655,-0.642662
3,-0.630645,-0.584442,-0.610437
4,-0.47518,-0.501927,-0.718885
5,0.224175,-0.752096,-1.013899
6,-0.659593,-0.332557,0.673585
7,-0.75332,-0.802304,0.368167
8,0.182498,0.702249,-1.309289
9,1.07568,-0.169139,0.216294


In [28]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

### Nearest Neighbors

In [29]:
# Initialize the NeasrestNeighbors model.
neigh = NearestNeighbors(n_neighbors=5)

# Fit the model
neigh.fit(pcs_df)

# Predict clusters
NearestNeighbors(n_neighbors=5)
A = neigh.kneighbors_graph(pcs_df)
A.toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [24]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = ml_housing_df.join(pcs_df, how='inner')

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_df = clustered_df.join(metro_name_df, how='inner')

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# Add the predicted class columns
clustered_df["Class"] = neigh.labels_
clustered_df.head(10)

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

AttributeError: 'NearestNeighbors' object has no attribute 'labels_'

## KMeans

In [30]:
# Initialize the K-Means model.
model = KMeans(n_clusters=5, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
prediction = model.predict(pcs_df)
print(prediction)

[0 0 0 0 0 0 3 3 0 0 3 0 3 0 0 0 3 0 0 3 0 3 0 3 0 3 0 0 0 0 0 0 0 3 3 3 0
 0 0 0 3 0 0 0 0 4 0 0 0 0 0 0 0 1 3 0 1 3 3 0 2 3 0 3 0 3 0 0 0 3 0 0 0 3
 0 3 0 0 0 0 3 3 3 3 0 3 0 0 0 0 0 0 0 0 3 3 3 3 0 0 3 0 1 2 0 0 3 0 3 2 0
 0 3 2 3 3 0 3 3 3 0 0 0 3 0 3 0 0 0 0 3 3 3 0 3 0 1 0 3 0 0 0 0 0 0 0 0 0
 0 0 0 3 3 0 1 0 0 0 4 0 0 1 3 3 3 0 0 3 0 0 0 3 0 0 3 0 0 0 0 0 0 0 3 0 0
 0 0 0 0 3 3 0 0 0 0 3 0 0 3 0 0 0 0 0 3 3 1 0 3 0 3 0 0 0 0 0 3 0 0 3 0 3
 1 3 0 1 0 0 0 0 3 0 3 3 0 3 0 3 0 0 0 0 0 3 0 2 0 2 0 3 0 3 0 3 0 3 0 3 3
 0 0 0 0 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 3 3 0 0 3 3
 0 3 0 0 0 0 0 0 0 0 0 3 0 0 0 0 3 3 0 2 3 0 0 0 3 2 3 0 3 0 0 0 0 3 3 0 0
 0 0 0 0 0 3 0 0 0 3 0 3 0 3 0 2 0 0 0 0 3 0 0 0 0 3 2 0 0 0 0 0 0 0 0 3 0
 0 0 0 0 0 3 3 1 3 0 0 3 0 0 3 0 0 0 3 1 0 0 0 3 0 0 0 3 0 2 1 3 0 0 3 0 0
 0 0 0 3 0 3 0 0 3 0 2 0 3 0 1 2 0 0 0 0 0 0 3 2 3 0 0 0 0 3 3 0 0 0 0 0 0
 3 0 0 0 0 0 0 0 0 0 0 0 0 3 1 3 3 0 0 0 0 0 0 3 0 0 0 3 0 0 3 0 3 0 0 3 0
 3 0 0 0 0 0 1 0 0 3 0 3 

In [31]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_k_df = ml_housing_df.join(pcs_df, how='inner')

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_k_df = clustered_k_df.join(metro_name_df, how='inner')

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# Add the predicted class columns
clustered_k_df["Class"] = model.labels_
clustered_k_df.head(10)

# Print the shape of the clustered_df
print(clustered_k_df.shape)
clustered_k_df.head(10)

(917, 11)


Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,PC 1,PC 2,PC 3,metro_area/city,Class
0,182846.428567,161.75,81.333333,43.333333,224213.7298,162.083333,-0.636693,-0.35568,-0.071281,"aberdeen, sd",0
1,243286.190475,336.0,71.375,110.0,281836.496283,461.666667,-0.348494,-0.02751,-0.470171,"aberdeen, wa",0
2,214853.273808,561.166667,66.416667,195.0,247640.66745,843.166667,-0.305553,-0.276655,-0.642662,"abilene, tx",0
3,155062.500008,151.666667,67.5,44.333333,192856.850308,205.333333,-0.630645,-0.584442,-0.610437,"ada, ok",0
4,169820.2381,428.416667,64.416667,123.333333,221564.868992,459.666667,-0.47518,-0.501927,-0.718885,"adrian, mi",0
5,165456.904758,1701.5,54.166667,798.666667,244281.999525,2889.083333,0.224175,-0.752096,-1.013899,"akron, oh",0
6,185736.904767,410.666667,99.833333,90.666667,220033.831333,514.833333,-0.659593,-0.332557,0.673585,"alamogordo, nm",3
7,120405.726192,545.666667,91.5,126.666667,150925.848925,563.25,-0.75332,-0.802304,0.368167,"albany, ga",3
8,349433.928575,450.583333,50.375,232.333333,448459.134575,682.5,0.182498,0.702249,-1.309289,"albany, or",0
9,297317.10715,3856.0,83.0,1118.666667,342436.266492,6182.666667,1.07568,-0.169139,0.216294,"albany-schenectady-troy, ny",0


In [32]:
# Find Austin metro area 
clustered_k_df

Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count,PC 1,PC 2,PC 3,metro_area/city,Class
0,182846.428567,161.750000,81.333333,43.333333,224213.729800,162.083333,-0.636693,-0.355680,-0.071281,"aberdeen, sd",0
1,243286.190475,336.000000,71.375000,110.000000,281836.496283,461.666667,-0.348494,-0.027510,-0.470171,"aberdeen, wa",0
2,214853.273808,561.166667,66.416667,195.000000,247640.667450,843.166667,-0.305553,-0.276655,-0.642662,"abilene, tx",0
3,155062.500008,151.666667,67.500000,44.333333,192856.850308,205.333333,-0.630645,-0.584442,-0.610437,"ada, ok",0
4,169820.238100,428.416667,64.416667,123.333333,221564.868992,459.666667,-0.475180,-0.501927,-0.718885,"adrian, mi",0
...,...,...,...,...,...,...,...,...,...,...,...
912,124043.452367,1675.833333,79.166667,504.000000,172364.189867,2453.333333,-0.202872,-0.957687,-0.037352,"youngstown-warren-boardman, oh-pa",0
913,333594.375000,489.083333,51.666667,237.333333,483306.059425,826.500000,0.196987,0.705338,-1.241015,"yuba city, ca",0
914,233504.761917,609.083333,68.625000,204.333333,259280.280767,781.000000,-0.271543,-0.165954,-0.561348,"yuma, az",0
915,180653.571433,271.916667,77.375000,84.666667,230812.605642,394.083333,-0.561780,-0.382290,-0.216827,"zanesville, oh",0


# Visualization

### Kmeans

In [33]:
# Initialize the K-Means model.
neigh = NearestNeighbors(n_neighbors=5)

# Fit the model
neigh.fit(pcs_df)

# Predict clusters
NearestNeighbors(n_neighbors=5)
A = neigh.kneighbors_graph(pcs_df)
A.toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [34]:
# Creating a 3D-Scatter with the PCA data and the clusters
# Plot
fig = px.scatter_3d(clustered_k_df, x="PC 1", y="PC 2", z="PC 3", color="Class", symbol="Class", width=800, hover_name="metro_area/city", hover_data=["average_listing_price"])
fig.update_layout(legend=dict(x=0,y=1))
fig.show()


In [35]:
# 2D scatter plot
clustered_k_df.hvplot.scatter(
    x="PC 1",
    y="PC 2",
    hover_cols=["metro_area/city"],
    by="Class",
)