In [47]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [48]:
# Read the cleaned housing listings CSV (housing_data_cleaned.csv) into a
# DataFrame, report its (rows, columns) shape, and preview the first five rows.
file_path = "Resources/housing_data_cleaned.csv"
housing_df = pd.read_csv(filepath_or_buffer=file_path)
print(housing_df.shape)
housing_df.head()

(55020, 11)


Unnamed: 0,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count
0,2021-06-01,"new york-newark-jersey city, ny-nj-pa",617500.0,61846,55.0,26280,1020,9540,32230,1260094.0,94076
1,2021-06-01,"los angeles-long beach-anaheim, ca",1024500.0,13258,44.0,10096,476,1928,15494,2507311.0,28752
2,2021-06-01,"chicago-naperville-elgin, il-in-wi",354950.0,19089,33.0,17088,444,5712,25296,538092.2,44385
3,2021-06-01,"dallas-fort worth-arlington, tx",387000.0,6850,29.0,9008,704,1924,14504,595209.7,21354
4,2021-06-01,"houston-the woodlands-sugar land, tx",365995.0,13440,36.0,11368,1748,3636,17155,556535.0,30595


In [49]:
# Inspect the column dtypes right after loading; "date" is still a plain
# object (string) column at this point and is converted in the next cell.
housing_df.dtypes

date                      object
metro_area/city           object
median_listing_price     float64
active_listing_count       int64
median_days_on_market    float64
new_listing_count          int64
price_increased_count      int64
price_reduced_count        int64
pending_listing_count      int64
average_listing_price    float64
total_listing_count        int64
dtype: object

In [50]:
# Parse the string "date" column into pandas datetimes (overwriting the
# column on housing_df), then re-check the dtypes to confirm the conversion.
housing_df["date"] = housing_df["date"].pipe(pd.to_datetime)
housing_df.dtypes

date                     datetime64[ns]
metro_area/city                  object
median_listing_price            float64
active_listing_count              int64
median_days_on_market           float64
new_listing_count                 int64
price_increased_count             int64
price_reduced_count               int64
pending_listing_count             int64
average_listing_price           float64
total_listing_count               int64
dtype: object

In [51]:
# Preview the first rows of housing_df after the "date" column was converted
# to datetime.
housing_df.head()

Unnamed: 0,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count
0,2021-06-01,"new york-newark-jersey city, ny-nj-pa",617500.0,61846,55.0,26280,1020,9540,32230,1260094.0,94076
1,2021-06-01,"los angeles-long beach-anaheim, ca",1024500.0,13258,44.0,10096,476,1928,15494,2507311.0,28752
2,2021-06-01,"chicago-naperville-elgin, il-in-wi",354950.0,19089,33.0,17088,444,5712,25296,538092.2,44385
3,2021-06-01,"dallas-fort worth-arlington, tx",387000.0,6850,29.0,9008,704,1924,14504,595209.7,21354
4,2021-06-01,"houston-the woodlands-sugar land, tx",365995.0,13440,36.0,11368,1748,3636,17155,556535.0,30595


In [52]:
# Add a "year" column holding the calendar year of each row's "date".
# The `.dt` accessor is the idiomatic way to read datetime components from a
# column: it returns a Series aligned on housing_df's index, so the assignment
# does not depend on row position the way a detached DatetimeIndex does.
# Requires "date" to already be datetime64 (converted in the previous step).
housing_df["year"] = housing_df["date"].dt.year
housing_df.head()


Unnamed: 0,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count,year
0,2021-06-01,"new york-newark-jersey city, ny-nj-pa",617500.0,61846,55.0,26280,1020,9540,32230,1260094.0,94076,2021
1,2021-06-01,"los angeles-long beach-anaheim, ca",1024500.0,13258,44.0,10096,476,1928,15494,2507311.0,28752,2021
2,2021-06-01,"chicago-naperville-elgin, il-in-wi",354950.0,19089,33.0,17088,444,5712,25296,538092.2,44385,2021
3,2021-06-01,"dallas-fort worth-arlington, tx",387000.0,6850,29.0,9008,704,1924,14504,595209.7,21354,2021
4,2021-06-01,"houston-the woodlands-sugar land, tx",365995.0,13440,36.0,11368,1748,3636,17155,556535.0,30595,2021


In [53]:
# Keep only the rows whose "year" is 2019, then report the shape of the
# subset and preview its first rows.
housing_df_filtered = housing_df.loc[housing_df["year"].eq(2019)]
print(housing_df_filtered.shape)
housing_df_filtered.head()

(11004, 12)


Unnamed: 0,date,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count,year
16506,2019-12-01,"new york-newark-jersey city, ny-nj-pa",549999.5,65994,83.5,10272,428,7132,16934,1110110.0,82928,2019
16507,2019-12-01,"los angeles-long beach-anaheim, ca",877500.0,17466,75.5,4632,300,1972,11449,2112489.0,28915,2019
16508,2019-12-01,"chicago-naperville-elgin, il-in-wi",299250.0,31563,68.5,7588,240,5440,12497,438986.0,44060,2019
16509,2019-12-01,"dallas-fort worth-arlington, tx",335500.0,19493,64.5,5780,680,5240,9232,468830.1,28725,2019
16510,2019-12-01,"houston-the woodlands-sugar land, tx",299994.0,26614,69.0,7032,836,5988,9069,437528.9,35683,2019


In [54]:
# Average every numeric listing metric per metro area over the 2019 rows.
# - as_index=False keeps "metro_area/city" as a regular column rather than
#   moving it into the index.
# - numeric_only=True makes the exclusion of the datetime "date" column
#   explicit, so the result has the same columns regardless of the pandas
#   version's default for non-numeric columns.
# - The `axis=0` argument was dropped: it is the default and the keyword is
#   deprecated in recent pandas releases.
grouped_housing_df = housing_df_filtered.groupby(
    "metro_area/city", as_index=False
).mean(numeric_only=True)
grouped_housing_df.head(20)

Unnamed: 0,metro_area/city,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,pending_listing_count,average_listing_price,total_listing_count,year
0,"aberdeen, sd",182846.428567,161.75,81.333333,43.333333,0.0,36.0,0.333333,224213.7298,162.083333,2019.0
1,"aberdeen, wa",243286.190475,336.0,71.375,110.0,4.333333,65.0,125.666667,281836.496283,461.666667,2019.0
2,"abilene, tx",214853.273808,561.166667,66.416667,195.0,3.333333,149.0,282.0,247640.66745,843.166667,2019.0
3,"ada, ok",155062.500008,151.666667,67.5,44.333333,0.0,20.666667,53.666667,192856.850308,205.333333,2019.0
4,"adrian, mi",169820.2381,428.416667,64.416667,123.333333,2.0,83.666667,31.25,221564.868992,459.666667,2019.0
5,"akron, oh",165456.904758,1701.5,54.166667,798.666667,7.666667,563.333333,1187.583333,244281.999525,2889.083333,2019.0
6,"alamogordo, nm",185736.904767,410.666667,99.833333,90.666667,0.0,48.333333,104.166667,220033.831333,514.833333,2019.0
7,"albany, ga",120405.726192,545.666667,91.5,126.666667,2.0,109.0,17.583333,150925.848925,563.25,2019.0
8,"albany, or",349433.928575,450.583333,50.375,232.333333,8.0,143.0,231.916667,448459.134575,682.5,2019.0
9,"albany-schenectady-troy, ny",297317.10715,3856.0,83.0,1118.666667,31.666667,887.333333,2326.666667,342436.266492,6182.666667,2019.0


In [55]:
# Check the dtypes of the per-metro averages; the count columns become float64
# because they now hold means rather than raw integer counts.
grouped_housing_df.dtypes

metro_area/city           object
median_listing_price     float64
active_listing_count     float64
median_days_on_market    float64
new_listing_count        float64
price_increased_count    float64
price_reduced_count      float64
pending_listing_count    float64
average_listing_price    float64
total_listing_count      float64
year                     float64
dtype: object

In [57]:
# Build a one-column DataFrame containing just the metro area / city names
# from the grouped data, print its shape, and display it.
metro_name_df = grouped_housing_df["metro_area/city"].to_frame()
print(metro_name_df.shape)
metro_name_df

(917, 1)


Unnamed: 0,metro_area/city
0,"aberdeen, sd"
1,"aberdeen, wa"
2,"abilene, tx"
3,"ada, ok"
4,"adrian, mi"
...,...
912,"youngstown-warren-boardman, oh-pa"
913,"yuba city, ca"
914,"yuma, az"
915,"zanesville, oh"


In [59]:
# Select the six numeric feature columns used for the ML step. The metro name,
# year, price-increased / price-reduced counts and pending-listing count are
# intentionally left out.
ml_housing_df = grouped_housing_df.loc[
    :,
    [
        "median_listing_price",
        "active_listing_count",
        "median_days_on_market",
        "new_listing_count",
        "average_listing_price",
        "total_listing_count",
    ],
]
print(ml_housing_df.shape)
ml_housing_df.head()

(917, 6)


Unnamed: 0,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,average_listing_price,total_listing_count
0,182846.428567,161.75,81.333333,43.333333,224213.7298,162.083333
1,243286.190475,336.0,71.375,110.0,281836.496283,461.666667
2,214853.273808,561.166667,66.416667,195.0,247640.66745,843.166667
3,155062.500008,151.666667,67.5,44.333333,192856.850308,205.333333
4,169820.2381,428.416667,64.416667,123.333333,221564.868992,459.666667
