In [2]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AffinityPropagation, AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, kneighbors_graph
from config import db_user, db_password
import pymongo
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import boto3
import os


## AWS Connection

In [3]:
# Location of the cleaned Realtor housing CSV in S3.
bucket = "sleeper-cities"
file_name = "housing_data_realtor_cleaned.csv"

# Create the S3 client using boto3's default configuration.
s3 = boto3.client("s3")

# Fetch the object (key) from the bucket and parse its body as CSV.
obj = s3.get_object(Bucket=bucket, Key=file_name)
housing_df = pd.read_csv(obj["Body"])

NoCredentialsError: Unable to locate credentials

In [3]:
# Preview the first five rows of the S3 copy of the housing data.
housing_df.head(n=5)

Unnamed: 0,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count
0,2021-06-01,"new york-newark-jersey city, ny-nj-pa",617500.0,61846,55.0,26280,1020,9540,32230,1260094.0,94076
1,2021-06-01,"los angeles-long beach-anaheim, ca",1024500.0,13258,44.0,10096,476,1928,15494,2507311.0,28752
2,2021-06-01,"chicago-naperville-elgin, il-in-wi",354950.0,19089,33.0,17088,444,5712,25296,538092.2,44385
3,2021-06-01,"dallas-fort worth-arlington, tx",387000.0,6850,29.0,9008,704,1924,14504,595209.7,21354
4,2021-06-01,"houston-the woodlands-sugar land, tx",365995.0,13440,36.0,11368,1748,3636,17155,556535.0,30595


# Importing a DataFrame from MongoDB using PyMongo

In [None]:
# Set up the PyMongo connection.
# The credentials are passed to MongoClient as keyword arguments instead of
# being interpolated into the URI: reserved characters such as "@", ":", "/"
# or "%" in a username/password must be percent-escaped inside a URI, and an
# unescaped value produces an invalid connection string. db_user/db_password
# are therefore expected to be the raw (unescaped) values.
conn = "mongodb+srv://cluster0.f7wzt.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
client = pymongo.MongoClient(conn, username=db_user, password=db_password)

# Handle to the "sleeper_cities" database.
db = client.sleeper_cities

# Handle to the "housing" collection inside that database.
housing_mongo = db.housing

#### Note: Create a new collection/table for each CSV file

In [None]:
# Pull every document from the housing collection. The {"_id": 0} projection
# excludes MongoDB's per-document "_id" field so it never becomes a DataFrame
# column (it is not a housing feature).
response = housing_mongo.find({}, {"_id": 0})

# Materialize the cursor and turn the documents into a DataFrame.
housing_pd_df = pd.DataFrame(list(response))

# Show the first rows of the DataFrame.
housing_pd_df.head()

### Note: Documents imported from MongoDB carry an "_id" index field, which must be removed before analysis.

# Cleaning Data

In [None]:
# Inspect the column dtypes of the housing data pulled from MongoDB.
housing_pd_df.dtypes

In [None]:
# Parse the "date" column into datetime values and store it back in place.
parsed_dates = pd.to_datetime(housing_pd_df["date"])
housing_pd_df["date"] = parsed_dates

# Display the dtypes again to confirm the conversion.
housing_pd_df.dtypes

In [None]:
# Preview the first five rows after the date conversion.
housing_pd_df.head(n=5)

In [None]:
# Derive a "year" column from the datetime "date" column.
housing_pd_df["year"] = housing_pd_df["date"].dt.year
housing_pd_df.head(n=5)


In [None]:
# Keep only the rows whose year is 2019.
is_2019 = housing_pd_df["year"].eq(2019)
housing_df_filtered = housing_pd_df.loc[is_2019]

print(housing_df_filtered.shape)
housing_df_filtered.head(n=5)

In [None]:
# Average every numeric measure per metro area/city.
# as_index=False keeps "metro_area/city" as a regular column, so the result
# has a default integer index.
# numeric_only=True restricts the mean to numeric columns: the frame also
# carries non-numeric columns (e.g. the datetime "date"), which newer pandas
# versions no longer drop silently. The redundant, deprecated `axis=0`
# argument is omitted.
grouped_housing_df = (
    housing_df_filtered
    .groupby("metro_area/city", as_index=False)
    .mean(numeric_only=True)
)
grouped_housing_df.head(20)

In [None]:
# Check the column dtypes of the per-metro averages.
grouped_housing_df.dtypes

In [None]:
# Keep the metro area/city names in their own single-column DataFrame
# (same index as grouped_housing_df).
metro_name_df = grouped_housing_df["metro_area/city"].to_frame()

print(metro_name_df.shape)
metro_name_df

In [None]:
# Feature columns kept for machine learning; year, price-increased,
# price-reduced and pending-listing columns are left out.
ml_feature_columns = [
    'median_listing_price',
    'active_listing_count',
    'median_days_on_market',
    'new_listing_count',
    'average_listing_price',
    'total_listing_count',
]
ml_housing_df = grouped_housing_df.loc[:, ml_feature_columns]

print(ml_housing_df.shape)
ml_housing_df.head(n=5)

## Import Income & Population Data

In [None]:
# Handle to the "income_pop" collection, which holds the population and
# income data.
income_pop_mongo = db["income_pop"]

In [None]:
# Pull every document from the income/population collection. The {"_id": 0}
# projection excludes MongoDB's per-document "_id" field so it never becomes
# a DataFrame column.
response = income_pop_mongo.find({}, {"_id": 0})

# Materialize the cursor and turn the documents into a DataFrame.
income_pop_pd_df = pd.DataFrame(list(response))

# Show the first rows of the DataFrame.
income_pop_pd_df.head()

### Note: Documents imported from MongoDB carry an "_id" index field, which must be removed before analysis.

In [None]:
# Keep only the two columns that will be merged into ml_housing_df.
income_pop_columns = ['Per Capita Income', 'Pop']
cleaned_income_pop_pd_df = income_pop_pd_df.loc[:, income_pop_columns]
cleaned_income_pop_pd_df.head(n=5)

In [None]:
# Give the income/population columns short, lowercase names.
column_aliases = {'Per Capita Income': 'income', 'Pop': 'pop'}
cleaned_income_pop_pd_df = cleaned_income_pop_pd_df.rename(columns=column_aliases)
cleaned_income_pop_pd_df.head(n=5)

In [None]:
# Left-merge the income/population columns onto the housing features,
# matching rows on the index of both frames.
# NOTE(review): rows are paired by index position only, not by metro name —
# confirm the income_pop collection is stored in the same order as the
# grouped metro areas, otherwise incomes/populations land on the wrong city.
ml_housing_income_pop_df = ml_housing_df.merge(
    cleaned_income_pop_pd_df,
    how='left',
    left_index=True,
    right_index=True,
)

print(ml_housing_income_pop_df.shape)
ml_housing_income_pop_df.head(n=5)

# Prepare data for Machine Learning

In [None]:
# Standardize every feature with StandardScaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(ml_housing_income_pop_df)

# Peek at the first five scaled rows.
X_scaled[:5]

In [None]:
# Fit a three-component PCA on the scaled features and project them.
pca = PCA(n_components=3)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)

# Share of the total variance explained by each retained component.
print(pca.explained_variance_ratio_)

In [None]:
# Wrap the three principal components in a DataFrame with labelled columns.
pc_labels = [f"PC {number}" for number in (1, 2, 3)]
pcs_df = pd.DataFrame(data=X_pca, columns=pc_labels)
pcs_df.head(n=10)

In [None]:
# Elbow curve: fit K-Means for k = 1..10 and record each model's inertia
# to help choose the number of clusters.
k = list(range(1, 11))
inertia = []

for i in k:
    km = KMeans(n_clusters=i, random_state=0).fit(pcs_df)
    inertia.append(km.inertia_)

# Collect the results and plot inertia against k.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

## KMeans

In [None]:
# Build and fit a 3-cluster K-Means model on the principal components
# (fit() returns the fitted estimator itself).
model = KMeans(n_clusters=3, random_state=0).fit(pcs_df)

# Assign every row to its nearest cluster centre and show the labels.
prediction = model.predict(pcs_df)
print(prediction)

In [None]:
# Build one DataFrame holding the model features, the principal components,
# the metro names and the K-Means cluster labels.

# Attach the principal components to the features the model was trained on
# (rows are matched on the shared index).
clustered_k_df = ml_housing_income_pop_df.join(pcs_df, how='inner')

# Attach the "metro_area/city" name of every row.
clustered_k_df = clustered_k_df.join(metro_name_df, how='inner')

# Add the cluster label assigned by the fitted K-Means model.
clustered_k_df["Class"] = model.labels_

# Show the shape and the first rows of the clustered DataFrame.
print(clustered_k_df.shape)
clustered_k_df.head(10)

In [None]:
# Locate the Austin metro area (case-insensitive match on the metro name).
is_austin_k = clustered_k_df['metro_area/city'].str.contains("austin", case=False)
austin_group_k = clustered_k_df.loc[is_austin_k]
austin_group_k

In [None]:
# Metro areas that share a K-Means cluster with Austin.
# The cluster id is read from austin_group_k instead of being hard-coded:
# cluster numbering is arbitrary and can change when the data or the model
# changes. If no Austin row was found, the result is simply empty.
austin_classes_k = austin_group_k['Class'].unique()
metro_area_city_k = clustered_k_df.loc[clustered_k_df['Class'].isin(austin_classes_k)]

print(metro_area_city_k.shape)
metro_area_city_k.head(10)

### Kmeans Plots

In [None]:
# 3D scatter of the principal components, coloured and shaped by K-Means
# cluster and sized by median listing price.
scatter_3d_options = dict(
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    size="median_listing_price",
    size_max=25,
    width=800,
    hover_name="metro_area/city",
    hover_data=["average_listing_price"],
)
fig = px.scatter_3d(clustered_k_df, **scatter_3d_options)

# Pin the legend to the top-left corner of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()


In [None]:
# 2D scatter of the first two principal components, grouped by K-Means class.
scatter_2d_options = {
    "x": "PC 1",
    "y": "PC 2",
    "hover_cols": ["metro_area/city"],
    "by": "Class",
}
clustered_k_df.hvplot.scatter(**scatter_2d_options)

## Affinity Propagation

In [None]:
# Initialize the Affinity Propagation model.
# random_state is fixed (as in the K-Means cells) so the clustering, and the
# cluster numbering used by later cells, is reproducible across runs.
af = AffinityPropagation(random_state=0)

# Fit the model on the principal components.
clustering = af.fit(pcs_df)

In [None]:
# Build one DataFrame holding the model features, the principal components,
# the metro names and the Affinity Propagation cluster labels.

# Attach the principal components to the features the PCA was computed from.
# ml_housing_income_pop_df (housing + income + population) is used, matching
# the K-Means cell, so the income and population columns are kept.
clustered_af_df = ml_housing_income_pop_df.join(pcs_df, how='inner')

# Attach the "metro_area/city" name of every row.
clustered_af_df = clustered_af_df.join(metro_name_df, how='inner')

# Add the cluster label assigned by the fitted Affinity Propagation model.
clustered_af_df["Class"] = clustering.labels_

# Show the shape and the first rows of the clustered DataFrame.
print(clustered_af_df.shape)
clustered_af_df.head(10)

In [None]:
# Locate the Austin metro area (case-insensitive match on the metro name).
is_austin_af = clustered_af_df['metro_area/city'].str.contains("austin", case=False)
austin_group_af = clustered_af_df.loc[is_austin_af]
austin_group_af

In [None]:
# Metro areas that share an Affinity Propagation cluster with Austin.
# The cluster id is read from austin_group_af instead of being hard-coded:
# cluster numbering is arbitrary and differs between algorithms. If no Austin
# row was found, the result is simply empty.
austin_classes_af = austin_group_af['Class'].unique()
metro_area_city_af = clustered_af_df.loc[clustered_af_df['Class'].isin(austin_classes_af)]

print(metro_area_city_af.shape)
metro_area_city_af.head(10)

### Affinity Propagation Plots

In [None]:
# 3D scatter of the principal components, coloured and shaped by Affinity
# Propagation cluster and sized by median listing price.
scatter_3d_options = dict(
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    size="median_listing_price",
    size_max=25,
    width=800,
    hover_name="metro_area/city",
    hover_data=["average_listing_price"],
)
fig = px.scatter_3d(clustered_af_df, **scatter_3d_options)

# Pin the legend to the top-left corner of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [None]:
# 2D scatter of the first two principal components, grouped by Affinity
# Propagation class.
scatter_2d_options = {
    "x": "PC 1",
    "y": "PC 2",
    "hover_cols": ["metro_area/city"],
    "by": "Class",
}
clustered_af_df.hvplot.scatter(**scatter_2d_options)

## Hierarchical Agglomerative Clustering

In [None]:
# Draw a dendrogram of the principal components; 13 is the colour threshold
# passed to plotly's create_dendrogram.
dendrogram_color_threshold = 13
fig = ff.create_dendrogram(pcs_df, color_threshold=dendrogram_color_threshold)

# Size the figure and render it.
fig.update_layout(height=1000, width=800)
fig.show()

In [None]:
# Hierarchical agglomerative clustering with four clusters.
agg = AgglomerativeClustering(n_clusters=4)

# Fit on the principal components; fit() returns the fitted estimator.
hac = agg.fit(X=pcs_df)

In [None]:
# Build one DataFrame holding the model features, the principal components,
# the metro names and the hierarchical-clustering labels.

# Attach the principal components to the features the PCA was computed from.
# ml_housing_income_pop_df (housing + income + population) is used, matching
# the K-Means cell, so the income and population columns are kept.
clustered_hac_df = ml_housing_income_pop_df.join(pcs_df, how='inner')

# Attach the "metro_area/city" name of every row.
clustered_hac_df = clustered_hac_df.join(metro_name_df, how='inner')

# Add the cluster label assigned by the fitted agglomerative model.
clustered_hac_df["Class"] = hac.labels_

# Show the shape and the first rows of the clustered DataFrame.
print(clustered_hac_df.shape)
clustered_hac_df.head(10)

In [None]:
# Locate the Austin metro area (case-insensitive match on the metro name).
is_austin_hac = clustered_hac_df['metro_area/city'].str.contains("austin", case=False)
austin_group_hac = clustered_hac_df.loc[is_austin_hac]
austin_group_hac

In [None]:
# Metro areas that share a hierarchical cluster with Austin.
# The cluster id is read from austin_group_hac instead of being hard-coded:
# cluster numbering is arbitrary and differs between algorithms. If no Austin
# row was found, the result is simply empty.
austin_classes_hac = austin_group_hac['Class'].unique()
metro_area_city_hac = clustered_hac_df.loc[clustered_hac_df['Class'].isin(austin_classes_hac)]

print(metro_area_city_hac.shape)
metro_area_city_hac.head(10)

### Hierarchical Agglomerative Clustering Plots

In [None]:
# 3D scatter of the principal components, coloured and shaped by
# hierarchical cluster and sized by median listing price.
scatter_3d_options = dict(
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    size="median_listing_price",
    size_max=25,
    width=800,
    hover_name="metro_area/city",
    hover_data=["average_listing_price"],
)
fig = px.scatter_3d(clustered_hac_df, **scatter_3d_options)

# Pin the legend to the top-left corner of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [None]:
# 2D scatter of the first two principal components, grouped by hierarchical
# cluster.
scatter_2d_options = {
    "x": "PC 1",
    "y": "PC 2",
    "hover_cols": ["metro_area/city"],
    "by": "Class",
}
clustered_hac_df.hvplot.scatter(**scatter_2d_options)