In [None]:
# Install PyMongo if necessary on this computer (uncomment and run once)
# %pip install pymongo

# If the connection raises an error, install "dnspython" as well
# (presumably needed for the mongodb+srv:// URI used in this notebook — confirm)
# %pip install dnspython

In [None]:
# Initial imports
from urllib.parse import quote_plus

import hvplot.pandas
import pandas as pd
import plotly.express as px
import pymongo
from path import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from config import db_user, db_password

In [None]:
# Check the installed PyMongo version.
# pymongo is already imported in the imports cell at the top of the notebook,
# so the second `import pymongo` that used to live here was redundant.
print("version:", pymongo.version)

# Importing a DataFrame from MongoDB using PyMongo

In [None]:
# Set up the PyMongo connection.
# The credentials are percent-escaped before being placed in the URI: characters
# such as "@", ":" or "/" in a username/password would otherwise corrupt it.
# NOTE(review): assumes config.py stores the raw (unescaped) credentials — confirm.
conn = (
    f"mongodb+srv://{quote_plus(db_user)}:{quote_plus(db_password)}"
    "@cluster0.f7wzt.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
)
client = pymongo.MongoClient(conn)
# Select the `sleeper_cities` database on the cluster
db = client.sleeper_cities
# Assign the `housing` Mongo collection to a variable
housing_mongo = db.housing

#### Note: Create a new collection/table for each CSV file

In [None]:
# Query the housing collection; find() is called with no filter, so the
# returned cursor is not restricted to any subset of documents.
response = housing_mongo.find()
# Drain the cursor into a list of documents and build a DataFrame from it
housing_panda_df = pd.DataFrame(list(response))
# Preview the first rows of the imported DataFrame
housing_panda_df.head()

### Note: The imported DataFrame contains the `_id` column that MongoDB adds to each document. It needs to be removed before analysis.

# Cleaning Data

In [None]:
# Read the cleaned housing listings (housing_data_cleaned.csv) into a DataFrame.
file_path = "Resources/housing_data_cleaned.csv"
housing_df = pd.read_csv(file_path)

# Report (rows, columns), then preview the first five rows.
print(housing_df.shape)
housing_df.head()

In [None]:
# Inspect the column dtypes as loaded from the CSV
housing_df.dtypes

In [None]:
# Convert the "date" column to a datetime column (overwrites the column in place)
housing_df["date"] = pd.to_datetime(housing_df["date"])
# Re-check the dtypes to confirm the conversion
housing_df.dtypes

In [None]:
# Preview the DataFrame after the "date" column conversion
housing_df.head()

In [None]:
# Derive a "year" column from the (already datetime-typed) "date" column
# via the .dt accessor.
housing_df["year"] = housing_df["date"].dt.year
housing_df.head()


In [None]:
# Keep only the rows recorded in 2019.
is_2019 = housing_df["year"].eq(2019)
housing_df_filtered = housing_df.loc[is_2019]

# Report the filtered shape, then preview it.
print(housing_df_filtered.shape)
housing_df_filtered.head()

In [None]:
# Average the 2019 listing metrics per metro area / city.
# - as_index=False keeps 'metro_area/city' as a regular column instead of the index.
# - numeric_only=True restricts the mean to numeric columns: pandas >= 2.0 no
#   longer silently drops non-numeric columns and raises TypeError instead.
#   As a consequence the datetime "date" column is not part of the result.
# - the redundant axis=0 argument to groupby() is deprecated in recent pandas
#   and has been removed.
grouped_housing_df = (
    housing_df_filtered
    .groupby(['metro_area/city'], as_index=False)
    .mean(numeric_only=True)
)
grouped_housing_df.head(20)

In [None]:
# Check types
# Inspect the dtypes of the per-metro averages
grouped_housing_df.dtypes

In [None]:
# create new df with DF cities  
metro_name_df = pd.DataFrame(grouped_housing_df['metro_area/city'])
print(metro_name_df.shape)
metro_name_df

In [None]:
# Build the feature table for ML: only the listing metrics below are kept
# (year, price reductions, price increases and pending listings are left out).
ml_housing_df = grouped_housing_df[
    [
        'median_listing_price',
        'active_listing_count',
        'median_days_on_market',
        'new_listing_count',
        'average_listing_price',
        'total_listing_count',
    ]
]
print(ml_housing_df.shape)
ml_housing_df.head()

# Prepare data for Machine Learning

In [None]:
# Standardize every feature column with StandardScaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(ml_housing_df)

# Peek at the first five scaled rows.
X_scaled[:5]

In [None]:
# Reduce the scaled features to three principal components.
pca = PCA(n_components=3).fit(X_scaled)

# Share of the total variance captured by each component.
print(pca.explained_variance_ratio_)

# Project the scaled data onto the fitted components.
X_pca = pca.transform(X_scaled)

In [None]:
# Wrap the projected data in a DataFrame with one column per principal component.
pcs_df = pd.DataFrame(
    data=X_pca,
    columns=['PC 1', 'PC 2', 'PC 3'],
)
pcs_df.head(10)

In [None]:
# Elbow curve: fit K-Means for K = 1..10 and record the inertia of each fit
# to help pick a value for K.
k = list(range(1, 11))
inertia = []

for i in k:
    # Fixed random_state keeps the centroid initialisation repeatable.
    km = KMeans(n_clusters=i, random_state=0).fit(pcs_df)
    inertia.append(km.inertia_)

# Plot inertia against K.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

### Nearest Neighbors

In [None]:
# Initialize the NearestNeighbors model (5 neighbours per query).
# NearestNeighbors is imported in the imports cell at the top of the notebook.
neigh = NearestNeighbors(n_neighbors=5)

# Fit the model on the principal components
neigh.fit(pcs_df)

# Build the k-neighbours graph for the fitted points and show it as a dense
# array. This model only searches for neighbours; it does not predict clusters.
# (A stray, discarded `NearestNeighbors(n_neighbors=5)` expression that used to
# sit here has been removed.)
A = neigh.kneighbors_graph(pcs_df)
A.toarray()

In [None]:
# Build a DataFrame that combines the housing features, their principal
# components and the metro area / city names (all three share the same index).
clustered_df = ml_housing_df.join(pcs_df, how='inner')
clustered_df = clustered_df.join(metro_name_df, how='inner')

# NearestNeighbors is a neighbour-search model, not a clusterer: it has no
# `labels_` attribute, so the former `clustered_df["Class"] = neigh.labels_`
# raised AttributeError. Record each metro's nearest neighbours instead.
# kneighbors() without an argument queries the fitted points themselves and
# does not count a point as its own neighbour.
neighbor_dist, neighbor_idx = neigh.kneighbors()
metro_names = metro_name_df['metro_area/city'].to_numpy()
clustered_df["nearest_metros"] = [list(metro_names[row]) for row in neighbor_idx]

# Print the shape of clustered_df and preview it
print(clustered_df.shape)
clustered_df.head(10)

### KMeans

In [None]:
# Create the K-Means model with 5 clusters and fit it on the principal components.
model = KMeans(n_clusters=5, random_state=0).fit(pcs_df)

# Assign every row to a cluster and show the resulting labels.
prediction = model.predict(pcs_df)
print(prediction)

In [None]:
# Assemble the K-Means result table: housing features + principal components
# + metro area / city names, plus a "Class" column holding the cluster label
# the fitted K-Means model assigned to each row.
clustered_k_df = (
    ml_housing_df
    .join(pcs_df, how='inner')
    .join(metro_name_df, how='inner')
    .assign(Class=model.labels_)
)

# Report the shape, then preview the first ten rows.
print(clustered_k_df.shape)
clustered_k_df.head(10)

In [None]:
# Find the Austin metro area: show only the rows whose name mentions "austin"
# (case-insensitive) instead of displaying the whole DataFrame.
# NOTE(review): assumes the Austin row's 'metro_area/city' text contains
# "austin" — confirm against the data.
clustered_k_df[
    clustered_k_df["metro_area/city"].str.contains("austin", case=False, na=False)
]

# Visualization

### KMeans

In [None]:
# Initialize the NearestNeighbors model (5 neighbours per query).
# Note: despite the old "K-Means" comment, this cell re-creates and re-fits
# the NearestNeighbors model; it does not touch the K-Means `model`.
neigh = NearestNeighbors(n_neighbors=5)

# Fit the model on the principal components
neigh.fit(pcs_df)

# Build the k-neighbours graph for the fitted points and show it as a dense
# array (a stray, discarded `NearestNeighbors(n_neighbors=5)` expression was
# removed from here).
A = neigh.kneighbors_graph(pcs_df)
A.toarray()

In [None]:
# 3D scatter of the three principal components, coloured and marked by the
# K-Means cluster ("Class"); hovering shows the metro name and average listing price.
fig = px.scatter_3d(
    clustered_k_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    width=800,
    hover_name="metro_area/city",
    hover_data=["average_listing_price"],
)
# Pin the legend to the top-left corner of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()


In [None]:
# 2D scatter of the first two principal components, one series per cluster
# ("Class"), with the metro area / city name available on hover.
clustered_k_df.hvplot.scatter(
    x="PC 1",
    y="PC 2",
    by="Class",
    hover_cols=["metro_area/city"],
)