In [1]:
# Dependencies and Setup
import os
import pandas as pd
import datetime
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import requests
import json

# Google developer API key
from secret import gkey

In [2]:
# Read the Zillow Observed Rent Index CSV from the Resources directory.
# The file is in wide format: one row per region, one column per month
# (see the preview displayed further down).
csv_path = "Resources/Zillow Observed Rent Index, smoothed, seasonally adjusted.csv"
rent_df = pd.read_csv(csv_path)

In [3]:
# Read the Zillow House Value Index CSV from the Resources directory.
# NOTE: "seaonally" is spelled this way in the path; presumably it matches the
# file name on disk, so it is left untouched -- verify before "fixing" it.
csv_path = "Resources/Zillow House Value Index, smoothed, seaonally adjusted.csv"
house_df = pd.read_csv(csv_path)

## rent data cleaning 

In [4]:
# Preview the first five rows of the raw (wide-format) rent table
rent_df.head(n=5)

Unnamed: 0,RegionID,RegionName,SizeRank,2014-01,2014-02,2014-03,2014-04,2014-05,2014-06,2014-07,...,2020-04,2020-05,2020-06,2020-07,2020-08,2020-09,2020-10,2020-11,2020-12,2021-01
0,102001,United States,0,1373.0,1380.0,1388.0,1395.0,1402.0,1409,1417,...,1744,1745,1745,1746,1746,1747,1747,1747,1747.0,1748.0
1,394913,"New York, NY",1,2379.0,2391.0,2403.0,2415.0,2426.0,2438,2450,...,2756,2734,2713,2691,2669,2646,2624,2600,2576.0,2552.0
2,753899,"Los Angeles-Long Beach-Anaheim, CA",2,1831.0,1848.0,1865.0,1882.0,1899.0,1916,1932,...,2566,2565,2564,2563,2561,2559,2558,2556,2554.0,2553.0
3,394463,"Chicago, IL",3,1511.0,1517.0,1522.0,1528.0,1533.0,1539,1544,...,1766,1762,1758,1754,1750,1746,1742,1737,1732.0,1728.0
4,394514,"Dallas-Fort Worth, TX",4,1199.0,1207.0,1214.0,1221.0,1229.0,1236,1243,...,1566,1568,1571,1573,1575,1577,1580,1582,1585.0,1587.0


In [5]:
# Look up latitude/longitude for every region through the Google Geocoding API
# so the coordinates can be used to build maps in the later JS layer.
# One HTTP request is made per row, so this cell is slow and consumes API quota.
geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json'
lat_col = []
lng_col = []
for city in rent_df['RegionName']:
    # Pass the address through `params` so requests URL-encodes spaces, commas
    # and any reserved characters instead of splicing raw text into the URL.
    response = requests.get(
        geocode_url,
        params={'address': city, 'key': gkey},
        timeout=10,  # never hang the notebook on a stalled connection
    )
    response.raise_for_status()
    geo_data = response.json()

    # The API answers HTTP 200 even for failures (bad key, quota, no match);
    # fail with a message that names the region and the reported status
    # instead of an opaque IndexError on results[0].
    if not geo_data.get("results"):
        raise RuntimeError(
            "Geocoding failed for {0!r}: status={1}, message={2}".format(
                city, geo_data.get("status"), geo_data.get("error_message")
            )
        )

    location = geo_data["results"][0]["geometry"]["location"]
    lat_col.append(location["lat"])
    lng_col.append(location["lng"])

# adding columns to dataframe (lists are in the same order as the rows)
rent_df['Lat'] = lat_col
rent_df['Lng'] = lng_col
rent_df

Unnamed: 0,RegionID,RegionName,SizeRank,2014-01,2014-02,2014-03,2014-04,2014-05,2014-06,2014-07,...,2020-06,2020-07,2020-08,2020-09,2020-10,2020-11,2020-12,2021-01,Lat,Lng
0,102001,United States,0,1373.0,1380.0,1388.0,1395.0,1402.0,1409,1417,...,1745,1746,1746,1747,1747,1747,1747.0,1748.0,37.090240,-95.712891
1,394913,"New York, NY",1,2379.0,2391.0,2403.0,2415.0,2426.0,2438,2450,...,2713,2691,2669,2646,2624,2600,2576.0,2552.0,40.712775,-74.005973
2,753899,"Los Angeles-Long Beach-Anaheim, CA",2,1831.0,1848.0,1865.0,1882.0,1899.0,1916,1932,...,2564,2563,2561,2559,2558,2556,2554.0,2553.0,34.052217,-118.243662
3,394463,"Chicago, IL",3,1511.0,1517.0,1522.0,1528.0,1533.0,1539,1544,...,1758,1754,1750,1746,1742,1737,1732.0,1728.0,41.878114,-87.629798
4,394514,"Dallas-Fort Worth, TX",4,1199.0,1207.0,1214.0,1221.0,1229.0,1236,1243,...,1571,1573,1575,1577,1580,1582,1585.0,1587.0,32.707875,-96.920913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,394995,"Port St. Lucie, FL",119,1179.0,1189.0,1198.0,1208.0,1217.0,1227,1236,...,1742,1753,1763,1773,1784,1794,1804.0,1815.0,27.273049,-80.358226
102,394602,"Fort Collins, CO",159,1168.0,1184.0,1200.0,1215.0,1230.0,1245,1260,...,1595,1598,1601,1604,1608,1611,1614.0,1618.0,40.585260,-105.084423
103,394405,"Boulder, CO",162,1494.0,1507.0,1520.0,1533.0,1545.0,1558,1571,...,1997,2000,2003,2007,2010,2013,2017.0,2020.0,40.014986,-105.270546
104,394645,"Greeley, CO",177,1167.0,1179.0,1191.0,1203.0,1215.0,1227,1239,...,1651,1656,1660,1664,1668,1673,1677.0,1681.0,40.423314,-104.709132


In [6]:
# Reshape from wide (one column per month) to long (one row per region/month):
# the identifier columns are kept, every remaining column name becomes a 'Date'
# value and its cell becomes 'Price'.
rent_df = rent_df.melt(
    id_vars=['RegionID', 'RegionName', 'SizeRank', 'Lat', 'Lng'],
    var_name='Date',
    value_name='Price',
)

In [7]:
# Display the long-format rent table produced by the melt above
rent_df

Unnamed: 0,RegionID,RegionName,SizeRank,Lat,Lng,Date,Price
0,102001,United States,0,37.090240,-95.712891,2014-01,1373.0
1,394913,"New York, NY",1,40.712775,-74.005973,2014-01,2379.0
2,753899,"Los Angeles-Long Beach-Anaheim, CA",2,34.052217,-118.243662,2014-01,1831.0
3,394463,"Chicago, IL",3,41.878114,-87.629798,2014-01,1511.0
4,394514,"Dallas-Fort Worth, TX",4,32.707875,-96.920913,2014-01,1199.0
...,...,...,...,...,...,...,...
9005,394995,"Port St. Lucie, FL",119,27.273049,-80.358226,2021-01,1815.0
9006,394602,"Fort Collins, CO",159,40.585260,-105.084423,2021-01,1618.0
9007,394405,"Boulder, CO",162,40.014986,-105.270546,2021-01,2020.0
9008,394645,"Greeley, CO",177,40.423314,-104.709132,2021-01,1681.0


In [8]:
# Count missing values per column
rent_df.isna().sum()

RegionID       0
RegionName     0
SizeRank       0
Lat            0
Lng            0
Date           0
Price         17
dtype: int64

In [9]:
# Remove every row that contains a missing value;
# including the nulls might lead to inaccurate results.
rent_df = rent_df.dropna(axis=0, how='any')

In [10]:
# Confirm that no missing values remain after the drop
rent_df.isna().sum()

RegionID      0
RegionName    0
SizeRank      0
Lat           0
Lng           0
Date          0
Price         0
dtype: int64

In [11]:
# Inspect column dtypes before converting any of them
rent_df.dtypes

RegionID        int64
RegionName     object
SizeRank        int64
Lat           float64
Lng           float64
Date           object
Price         float64
dtype: object

In [12]:
# Convert Price from float to integer for further analysis.
# This only works because rows with a missing Price were dropped above
# (NaN cannot be cast to int); any fractional part is discarded, not rounded.
rent_df['Price'] = rent_df['Price'].astype(int)

In [13]:
# Split Date ("YYYY-MM") into Year and Month columns (both remain strings)
rent_df[['Year','Month']] = rent_df.Date.str.split("-",expand=True)
# Split RegionName ("City, ST") into City and State columns.
# Rows without a comma (e.g. "United States") get a missing State.
rent_df[['City','State']] = rent_df.RegionName.str.split(",",expand=True)
# Splitting on "," leaves the blank that followed it (" NY"); strip it so the
# exported State codes compare equal to plain two-letter abbreviations.
rent_df['State'] = rent_df['State'].str.strip()
rent_df

Unnamed: 0,RegionID,RegionName,SizeRank,Lat,Lng,Date,Price,Year,Month,City,State
0,102001,United States,0,37.090240,-95.712891,2014-01,1373,2014,01,United States,
1,394913,"New York, NY",1,40.712775,-74.005973,2014-01,2379,2014,01,New York,NY
2,753899,"Los Angeles-Long Beach-Anaheim, CA",2,34.052217,-118.243662,2014-01,1831,2014,01,Los Angeles-Long Beach-Anaheim,CA
3,394463,"Chicago, IL",3,41.878114,-87.629798,2014-01,1511,2014,01,Chicago,IL
4,394514,"Dallas-Fort Worth, TX",4,32.707875,-96.920913,2014-01,1199,2014,01,Dallas-Fort Worth,TX
...,...,...,...,...,...,...,...,...,...,...,...
9005,394995,"Port St. Lucie, FL",119,27.273049,-80.358226,2021-01,1815,2021,01,Port St. Lucie,FL
9006,394602,"Fort Collins, CO",159,40.585260,-105.084423,2021-01,1618,2021,01,Fort Collins,CO
9007,394405,"Boulder, CO",162,40.014986,-105.270546,2021-01,2020,2021,01,Boulder,CO
9008,394645,"Greeley, CO",177,40.423314,-104.709132,2021-01,1681,2021,01,Greeley,CO


In [14]:
# Keep only the columns needed downstream, in their final order
rent_df = rent_df.loc[
    :, ['RegionID', 'State', 'City', 'Lat', 'Lng', 'Year', 'Month', 'Price']
]
rent_df.head()


Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
0,102001,,United States,37.09024,-95.712891,2014,1,1373
1,394913,NY,New York,40.712775,-74.005973,2014,1,2379
2,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,2014,1,1831
3,394463,IL,Chicago,41.878114,-87.629798,2014,1,1511
4,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,2014,1,1199


In [15]:
# Per-year data frames: Year is still a string at this point, so the
# comparison value is quoted.
ry14_df = rent_df[rent_df["Year"].eq("2014")]
ry14_df

Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
0,102001,,United States,37.090240,-95.712891,2014,01,1373
1,394913,NY,New York,40.712775,-74.005973,2014,01,2379
2,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,2014,01,1831
3,394463,IL,Chicago,41.878114,-87.629798,2014,01,1511
4,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,2014,01,1199
...,...,...,...,...,...,...,...,...
1267,394995,FL,Port St. Lucie,27.273049,-80.358226,2014,12,1279
1268,394602,CO,Fort Collins,40.585260,-105.084423,2014,12,1329
1269,394405,CO,Boulder,40.014986,-105.270546,2014,12,1634
1270,394645,CO,Greeley,40.423314,-104.709132,2014,12,1301


In [16]:
# Rent rows for 2015 (Year is compared as a string)
ry15_df = rent_df[rent_df["Year"].eq("2015")]
ry15_df

Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
1272,102001,,United States,37.090240,-95.712891,2015,01,1458
1273,394913,NY,New York,40.712775,-74.005973,2015,01,2518
1274,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,2015,01,2024
1275,394463,IL,Chicago,41.878114,-87.629798,2015,01,1573
1276,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,2015,01,1285
...,...,...,...,...,...,...,...,...
2539,394995,FL,Port St. Lucie,27.273049,-80.358226,2015,12,1358
2540,394602,CO,Fort Collins,40.585260,-105.084423,2015,12,1417
2541,394405,CO,Boulder,40.014986,-105.270546,2015,12,1764
2542,394645,CO,Greeley,40.423314,-104.709132,2015,12,1393


In [17]:
# Rent rows for 2016 (Year is compared as a string)
ry16_df = rent_df[rent_df["Year"].eq("2016")]
ry16_df

Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
2544,102001,,United States,37.090240,-95.712891,2016,01,1525
2545,394913,NY,New York,40.712775,-74.005973,2016,01,2613
2546,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,2016,01,2168
2547,394463,IL,Chicago,41.878114,-87.629798,2016,01,1629
2548,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,2016,01,1352
...,...,...,...,...,...,...,...,...
3811,394995,FL,Port St. Lucie,27.273049,-80.358226,2016,12,1441
3812,394602,CO,Fort Collins,40.585260,-105.084423,2016,12,1454
3813,394405,CO,Boulder,40.014986,-105.270546,2016,12,1809
3814,394645,CO,Greeley,40.423314,-104.709132,2016,12,1441


In [18]:
# Rent rows for 2017 (Year is compared as a string)
ry17_df = rent_df[rent_df["Year"].eq("2017")]
ry17_df

Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
3816,102001,,United States,37.090240,-95.712891,2017,01,1579
3817,394913,NY,New York,40.712775,-74.005973,2017,01,2663
3818,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,2017,01,2295
3819,394463,IL,Chicago,41.878114,-87.629798,2017,01,1667
3820,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,2017,01,1418
...,...,...,...,...,...,...,...,...
5083,394995,FL,Port St. Lucie,27.273049,-80.358226,2017,12,1523
5084,394602,CO,Fort Collins,40.585260,-105.084423,2017,12,1488
5085,394405,CO,Boulder,40.014986,-105.270546,2017,12,1866
5086,394645,CO,Greeley,40.423314,-104.709132,2017,12,1514


In [19]:
# Rent rows for 2018 (Year is compared as a string)
ry18_df = rent_df[rent_df["Year"].eq("2018")]
ry18_df

Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
5088,102001,,United States,37.090240,-95.712891,2018,01,1629
5089,394913,NY,New York,40.712775,-74.005973,2018,01,2700
5090,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,2018,01,2400
5091,394463,IL,Chicago,41.878114,-87.629798,2018,01,1697
5092,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,2018,01,1468
...,...,...,...,...,...,...,...,...
6355,394995,FL,Port St. Lucie,27.273049,-80.358226,2018,12,1606
6356,394602,CO,Fort Collins,40.585260,-105.084423,2018,12,1540
6357,394405,CO,Boulder,40.014986,-105.270546,2018,12,1932
6358,394645,CO,Greeley,40.423314,-104.709132,2018,12,1585


In [20]:
# Rent rows for 2019 (Year is compared as a string)
ry19_df = rent_df[rent_df["Year"].eq("2019")]
ry19_df

Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
6360,102001,,United States,37.090240,-95.712891,2019,01,1690
6361,394913,NY,New York,40.712775,-74.005973,2019,01,2755
6362,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,2019,01,2500
6363,394463,IL,Chicago,41.878114,-87.629798,2019,01,1736
6364,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,2019,01,1518
...,...,...,...,...,...,...,...,...
7627,394995,FL,Port St. Lucie,27.273049,-80.358226,2019,12,1682
7628,394602,CO,Fort Collins,40.585260,-105.084423,2019,12,1579
7629,394405,CO,Boulder,40.014986,-105.270546,2019,12,1984
7630,394645,CO,Greeley,40.423314,-104.709132,2019,12,1630


In [21]:
# Rent rows for 2020 (Year is compared as a string)
ry20_df = rent_df[rent_df["Year"].eq("2020")]
ry20_df

Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
7632,102001,,United States,37.090240,-95.712891,2020,01,1742
7633,394913,NY,New York,40.712775,-74.005973,2020,01,2799
7634,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,2020,01,2568
7635,394463,IL,Chicago,41.878114,-87.629798,2020,01,1774
7636,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,2020,01,1561
...,...,...,...,...,...,...,...,...
8899,394995,FL,Port St. Lucie,27.273049,-80.358226,2020,12,1804
8900,394602,CO,Fort Collins,40.585260,-105.084423,2020,12,1614
8901,394405,CO,Boulder,40.014986,-105.270546,2020,12,2017
8902,394645,CO,Greeley,40.423314,-104.709132,2020,12,1677


In [22]:
# Rent rows for 2021 (Year is compared as a string)
ry21_df = rent_df[rent_df["Year"].eq("2021")]
ry21_df

Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
8904,102001,,United States,37.090240,-95.712891,2021,01,1748
8905,394913,NY,New York,40.712775,-74.005973,2021,01,2552
8906,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,2021,01,2553
8907,394463,IL,Chicago,41.878114,-87.629798,2021,01,1728
8908,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,2021,01,1587
...,...,...,...,...,...,...,...,...
9005,394995,FL,Port St. Lucie,27.273049,-80.358226,2021,01,1815
9006,394602,CO,Fort Collins,40.585260,-105.084423,2021,01,1618
9007,394405,CO,Boulder,40.014986,-105.270546,2021,01,2020
9008,394645,CO,Greeley,40.423314,-104.709132,2021,01,1681


## housing data cleaning

In [27]:
# Display the raw (wide-format) house-value table; nothing is read here,
# house_df was already loaded from CSV in an earlier cell.
house_df

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,1996-01-31,1996-02-29,1996-03-31,1996-04-30,1996-05-31,...,2020-04-30,2020-05-31,2020-06-30,2020-07-31,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31,2021-01-31
0,102001,0,United States,Country,,106811.0,106838.0,106888.0,107014.0,107156.0,...,250069.0,251258.0,252506.0,254019.0,255964.0,258235.0,260729.0,263427.0,266222.0,269039.0
1,394913,1,"New York, NY",Msa,NY,187591.0,187153.0,186875.0,186342.0,186025.0,...,483751.0,485191.0,486992.0,489417.0,492642.0,496858.0,501651.0,506833.0,511880.0,516732.0
2,753899,2,"Los Angeles-Long Beach-Anaheim, CA",Msa,CA,186111.0,186371.0,186390.0,186499.0,186473.0,...,698120.0,699783.0,701018.0,705448.0,712728.0,720482.0,727461.0,734522.0,741757.0,748532.0
3,394463,3,"Chicago, IL",Msa,IL,163099.0,162799.0,162404.0,161955.0,161353.0,...,245195.0,245715.0,246350.0,247476.0,248977.0,250996.0,253227.0,255446.0,257610.0,259459.0
4,394514,4,"Dallas-Fort Worth, TX",Msa,TX,111373.0,111437.0,111596.0,111914.0,112257.0,...,256755.0,257898.0,259226.0,260751.0,262469.0,264207.0,266483.0,268773.0,271372.0,273348.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908,394767,929,"Lamesa, TX",Msa,TX,,,,,,...,74888.0,75130.0,75441.0,75725.0,76039.0,76385.0,76733.0,77319.0,78015.0,78762.0
909,753874,930,"Craig, CO",Msa,CO,61884.0,62113.0,62354.0,62824.0,63158.0,...,191181.0,191259.0,191465.0,191633.0,191837.0,192061.0,192424.0,193341.0,194323.0,195374.0
910,394968,931,"Pecos, TX",Msa,TX,,,,,,...,134245.0,134927.0,135804.0,136325.0,136118.0,134957.0,133241.0,131749.0,130777.0,129578.0
911,395188,932,"Vernon, TX",Msa,TX,,,,,,...,68440.0,68423.0,68364.0,68360.0,68365.0,68420.0,68363.0,68378.0,68339.0,68432.0


In [28]:
# Look up latitude/longitude for every region through the Google Geocoding API
# so the coordinates can be used to build maps in the later JS layer.
# One HTTP request is made per row, so this cell is slow and consumes API quota.
geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json'
lat_col = []
lng_col = []
for city in house_df['RegionName']:
    # Pass the address through `params` so requests URL-encodes spaces, commas
    # and any reserved characters instead of splicing raw text into the URL.
    response = requests.get(
        geocode_url,
        params={'address': city, 'key': gkey},
        timeout=10,  # never hang the notebook on a stalled connection
    )
    response.raise_for_status()
    geo_data = response.json()

    # The API answers HTTP 200 even for failures (bad key, quota, no match);
    # fail with a message that names the region and the reported status
    # instead of an opaque IndexError on results[0].
    if not geo_data.get("results"):
        raise RuntimeError(
            "Geocoding failed for {0!r}: status={1}, message={2}".format(
                city, geo_data.get("status"), geo_data.get("error_message")
            )
        )

    location = geo_data["results"][0]["geometry"]["location"]
    lat_col.append(location["lat"])
    lng_col.append(location["lng"])

In [30]:
# Attach the geocoded coordinates as two new columns; the lists were built
# in row order, so they line up positionally with house_df.
house_df = house_df.assign(Lat=lat_col, Lng=lng_col)
house_df

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,1996-01-31,1996-02-29,1996-03-31,1996-04-30,1996-05-31,...,2020-06-30,2020-07-31,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31,2021-01-31,Lat,Lng
0,102001,0,United States,Country,,106811.0,106838.0,106888.0,107014.0,107156.0,...,252506.0,254019.0,255964.0,258235.0,260729.0,263427.0,266222.0,269039.0,37.090240,-95.712891
1,394913,1,"New York, NY",Msa,NY,187591.0,187153.0,186875.0,186342.0,186025.0,...,486992.0,489417.0,492642.0,496858.0,501651.0,506833.0,511880.0,516732.0,40.712775,-74.005973
2,753899,2,"Los Angeles-Long Beach-Anaheim, CA",Msa,CA,186111.0,186371.0,186390.0,186499.0,186473.0,...,701018.0,705448.0,712728.0,720482.0,727461.0,734522.0,741757.0,748532.0,34.052217,-118.243662
3,394463,3,"Chicago, IL",Msa,IL,163099.0,162799.0,162404.0,161955.0,161353.0,...,246350.0,247476.0,248977.0,250996.0,253227.0,255446.0,257610.0,259459.0,41.878114,-87.629798
4,394514,4,"Dallas-Fort Worth, TX",Msa,TX,111373.0,111437.0,111596.0,111914.0,112257.0,...,259226.0,260751.0,262469.0,264207.0,266483.0,268773.0,271372.0,273348.0,32.707875,-96.920913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908,394767,929,"Lamesa, TX",Msa,TX,,,,,,...,75441.0,75725.0,76039.0,76385.0,76733.0,77319.0,78015.0,78762.0,32.737600,-101.950992
909,753874,930,"Craig, CO",Msa,CO,61884.0,62113.0,62354.0,62824.0,63158.0,...,191465.0,191633.0,191837.0,192061.0,192424.0,193341.0,194323.0,195374.0,40.515249,-107.546454
910,394968,931,"Pecos, TX",Msa,TX,,,,,,...,135804.0,136325.0,136118.0,134957.0,133241.0,131749.0,130777.0,129578.0,31.422912,-103.493229
911,395188,932,"Vernon, TX",Msa,TX,,,,,,...,68364.0,68360.0,68365.0,68420.0,68363.0,68378.0,68339.0,68432.0,34.154531,-99.265080


In [31]:
# Reshape from wide (one column per month-end date) to long (one row per
# region/date): identifier columns are kept, every remaining column name
# becomes a 'Date' value and its cell becomes 'Price'.
house_df = house_df.melt(
    id_vars=['RegionID', 'RegionName', 'SizeRank', 'RegionType', 'StateName', 'Lat', 'Lng'],
    var_name='Date',
    value_name='Price',
)

In [32]:
# Fill missing StateName with the placeholder "None" so the United States rows
# (which have no state) are not accidentally dropped by the later dropna().
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated and silently does
# nothing under pandas Copy-on-Write, and warnings are suppressed here.
house_df['StateName'] = house_df['StateName'].fillna('None')

In [33]:
# Display the long-format house table produced by the melt above
house_df

Unnamed: 0,RegionID,RegionName,SizeRank,RegionType,StateName,Lat,Lng,Date,Price
0,102001,United States,0,Country,,37.090240,-95.712891,1996-01-31,106811.0
1,394913,"New York, NY",1,Msa,NY,40.712775,-74.005973,1996-01-31,187591.0
2,753899,"Los Angeles-Long Beach-Anaheim, CA",2,Msa,CA,34.052217,-118.243662,1996-01-31,186111.0
3,394463,"Chicago, IL",3,Msa,IL,41.878114,-87.629798,1996-01-31,163099.0
4,394514,"Dallas-Fort Worth, TX",4,Msa,TX,32.707875,-96.920913,1996-01-31,111373.0
...,...,...,...,...,...,...,...,...,...
274808,394767,"Lamesa, TX",929,Msa,TX,32.737600,-101.950992,2021-01-31,78762.0
274809,753874,"Craig, CO",930,Msa,CO,40.515249,-107.546454,2021-01-31,195374.0
274810,394968,"Pecos, TX",931,Msa,TX,31.422912,-103.493229,2021-01-31,129578.0
274811,395188,"Vernon, TX",932,Msa,TX,34.154531,-99.265080,2021-01-31,68432.0


In [34]:
# Count missing values per column
house_df.isna().sum()

RegionID          0
RegionName        0
SizeRank          0
RegionType        0
StateName         0
Lat               0
Lng               0
Date              0
Price         51625
dtype: int64

In [35]:
# Drop only the observations that have no Price. Restricting the check to
# Price means a row is never discarded because an unrelated descriptive column
# (e.g. StateName for the country-level row) happens to be empty.
house_df = house_df.dropna(subset=['Price'])

In [36]:
# Confirm that no missing values remain after the drop
house_df.isna().sum()

RegionID      0
RegionName    0
SizeRank      0
RegionType    0
StateName     0
Lat           0
Lng           0
Date          0
Price         0
dtype: int64

In [37]:
# Inspect column dtypes before converting any of them
house_df.dtypes

RegionID        int64
RegionName     object
SizeRank        int64
RegionType     object
StateName      object
Lat           float64
Lng           float64
Date           object
Price         float64
dtype: object

In [38]:
# Cast Price to integer for further analysis. The column is already float64
# (see the dtypes listing above), so a single cast is sufficient.
house_df['Price'] = house_df['Price'].astype(int)

In [39]:
# Display house_df to check the result of the Price conversion
house_df

Unnamed: 0,RegionID,RegionName,SizeRank,RegionType,StateName,Lat,Lng,Date,Price
0,102001,United States,0,Country,,37.090240,-95.712891,1996-01-31,106811
1,394913,"New York, NY",1,Msa,NY,40.712775,-74.005973,1996-01-31,187591
2,753899,"Los Angeles-Long Beach-Anaheim, CA",2,Msa,CA,34.052217,-118.243662,1996-01-31,186111
3,394463,"Chicago, IL",3,Msa,IL,41.878114,-87.629798,1996-01-31,163099
4,394514,"Dallas-Fort Worth, TX",4,Msa,TX,32.707875,-96.920913,1996-01-31,111373
...,...,...,...,...,...,...,...,...,...
274808,394767,"Lamesa, TX",929,Msa,TX,32.737600,-101.950992,2021-01-31,78762
274809,753874,"Craig, CO",930,Msa,CO,40.515249,-107.546454,2021-01-31,195374
274810,394968,"Pecos, TX",931,Msa,TX,31.422912,-103.493229,2021-01-31,129578
274811,395188,"Vernon, TX",932,Msa,TX,34.154531,-99.265080,2021-01-31,68432


In [40]:
# Split Date ("YYYY-MM-DD") into Year, Month and day-of-month ("date") columns
# (all three remain strings)
house_df[['Year','Month',"date"]] = house_df.Date.str.split("-",expand=True)
# Split RegionName ("City, ST") into City and State columns.
# Rows without a comma (e.g. "United States") get a missing State.
house_df[['City','State']] = house_df.RegionName.str.split(",",expand=True)
# Splitting on "," leaves the blank that followed it (" NY"); strip it so the
# exported State codes compare equal to plain two-letter abbreviations.
house_df['State'] = house_df['State'].str.strip()
house_df

Unnamed: 0,RegionID,RegionName,SizeRank,RegionType,StateName,Lat,Lng,Date,Price,Year,Month,date,City,State
0,102001,United States,0,Country,,37.090240,-95.712891,1996-01-31,106811,1996,01,31,United States,
1,394913,"New York, NY",1,Msa,NY,40.712775,-74.005973,1996-01-31,187591,1996,01,31,New York,NY
2,753899,"Los Angeles-Long Beach-Anaheim, CA",2,Msa,CA,34.052217,-118.243662,1996-01-31,186111,1996,01,31,Los Angeles-Long Beach-Anaheim,CA
3,394463,"Chicago, IL",3,Msa,IL,41.878114,-87.629798,1996-01-31,163099,1996,01,31,Chicago,IL
4,394514,"Dallas-Fort Worth, TX",4,Msa,TX,32.707875,-96.920913,1996-01-31,111373,1996,01,31,Dallas-Fort Worth,TX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274808,394767,"Lamesa, TX",929,Msa,TX,32.737600,-101.950992,2021-01-31,78762,2021,01,31,Lamesa,TX
274809,753874,"Craig, CO",930,Msa,CO,40.515249,-107.546454,2021-01-31,195374,2021,01,31,Craig,CO
274810,394968,"Pecos, TX",931,Msa,TX,31.422912,-103.493229,2021-01-31,129578,2021,01,31,Pecos,TX
274811,395188,"Vernon, TX",932,Msa,TX,34.154531,-99.265080,2021-01-31,68432,2021,01,31,Vernon,TX


In [41]:
# Keep only the columns needed downstream, in the same order as the rent table
house_df = house_df.loc[
    :, ['RegionID', 'State', 'City', 'Lat', 'Lng', 'Year', 'Month', 'Price']
]
house_df

Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
0,102001,,United States,37.090240,-95.712891,1996,01,106811
1,394913,NY,New York,40.712775,-74.005973,1996,01,187591
2,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,1996,01,186111
3,394463,IL,Chicago,41.878114,-87.629798,1996,01,163099
4,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,1996,01,111373
...,...,...,...,...,...,...,...,...
274808,394767,TX,Lamesa,32.737600,-101.950992,2021,01,78762
274809,753874,CO,Craig,40.515249,-107.546454,2021,01,195374
274810,394968,TX,Pecos,31.422912,-103.493229,2021,01,129578
274811,395188,TX,Vernon,34.154531,-99.265080,2021,01,68432


In [42]:
# Build a subset of the house data that starts in 2014, the first year present
# in the rent data. Year is converted to int first so it can be compared
# numerically.
house_df['Year'] = house_df['Year'].astype(int)
from_2014 = house_df['Year'].ge(2014)
house_filtered = house_df[from_2014]
house_filtered.head()

Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
197208,102001,,United States,37.09024,-95.712891,2014,1,181578
197209,394913,NY,New York,40.712775,-74.005973,2014,1,393733
197210,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,2014,1,515143
197211,394463,IL,Chicago,41.878114,-87.629798,2014,1,197262
197212,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,2014,1,161947


In [43]:
# Display the full (unfiltered) house table once more before exporting
house_df

Unnamed: 0,RegionID,State,City,Lat,Lng,Year,Month,Price
0,102001,,United States,37.090240,-95.712891,1996,01,106811
1,394913,NY,New York,40.712775,-74.005973,1996,01,187591
2,753899,CA,Los Angeles-Long Beach-Anaheim,34.052217,-118.243662,1996,01,186111
3,394463,IL,Chicago,41.878114,-87.629798,1996,01,163099
4,394514,TX,Dallas-Fort Worth,32.707875,-96.920913,1996,01,111373
...,...,...,...,...,...,...,...,...
274808,394767,TX,Lamesa,32.737600,-101.950992,2021,01,78762
274809,753874,CO,Craig,40.515249,-107.546454,2021,01,195374
274810,394968,TX,Pecos,31.422912,-103.493229,2021,01,129578
274811,395188,TX,Vernon,34.154531,-99.265080,2021,01,68432


In [44]:
# Save the cleaned tables as CSV files under static/data.
# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs('static/data', exist_ok=True)
rent_df.to_csv('static/data/rent_cleaned.csv', index=False, header=True)
house_df.to_csv('static/data/house_cleaned.csv', index=False, header=True)
house_filtered.to_csv('static/data/house_filtered.csv', index=False, header=True)