In [1]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib import rcParams
import scipy.stats as sts
import os
from collections import Counter
import requests
import json

# API Keys
from api_keys import gkey

## Looking for the house price data in us cities

In [2]:
#importing housing sale data (downloaded from kaggle)
price_path=os.path.join('Resources', 'Sale_Prices_City.csv')
housing_price=pd.read_csv(price_path)

In [3]:
#looking for housing_price data
housing_price.head()

Unnamed: 0.1,Unnamed: 0,RegionID,RegionName,StateName,SizeRank,2008-03,2008-04,2008-05,2008-06,2008-07,...,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01,2020-02,2020-03
0,0,6181,New York,New York,1,,,,,,...,563200.0,570500.0,572800.0,569900.0,560800.0,571500.0,575100.0,571700.0,568300.0,573600.0
1,1,12447,Los Angeles,California,2,507600.0,489600.0,463000.0,453100.0,438100.0,...,706800.0,711800.0,717300.0,714100.0,711900.0,718400.0,727100.0,738200.0,760200.0,
2,2,39051,Houston,Texas,3,138400.0,135500.0,132200.0,131000.0,133400.0,...,209700.0,207400.0,207600.0,207000.0,211400.0,211500.0,217700.0,219200.0,223800.0,
3,3,17426,Chicago,Illinois,4,325100.0,314800.0,286900.0,274600.0,268500.0,...,271500.0,266500.0,264900.0,265000.0,264100.0,264300.0,270000.0,281400.0,302900.0,309200.0
4,4,6915,San Antonio,Texas,5,130900.0,131300.0,131200.0,131500.0,131600.0,...,197100.0,198700.0,200200.0,200800.0,203400.0,203800.0,205400.0,205400.0,208300.0,


In [4]:
#columns of housing price dataframe
housing_price.columns


Index(['Unnamed: 0', 'RegionID', 'RegionName', 'StateName', 'SizeRank',
       '2008-03', '2008-04', '2008-05', '2008-06', '2008-07',
       ...
       '2019-06', '2019-07', '2019-08', '2019-09', '2019-10', '2019-11',
       '2019-12', '2020-01', '2020-02', '2020-03'],
      dtype='object', length=150)

In [5]:
#housing price is for each month, we need to sum and average to ger price per year
#to get the average housing data 
for i in range(2009, 2020):
    housing_price[f'{i}']=((housing_price[[f'{i}-01',f'{i}-02',f'{i}-03',f'{i}-01',
                                      f'{i}-05',f'{i}-06',f'{i}-07',f'{i}-08',
                                     f'{i}-09',f'{i}-10',f'{i}-11',f'{i}-12']].sum(axis=1))/12)

In [7]:
#only selecting the columns that are needed for our project
housing=(housing_price[['RegionName', 'StateName', 'SizeRank','2015',
                       '2016','2017','2018','2019']])

In [8]:
housing.head()

Unnamed: 0,RegionName,StateName,SizeRank,2015,2016,2017,2018,2019
0,New York,New York,1,517833.333333,534275.0,542425.0,555416.666667,565016.666667
1,Los Angeles,California,2,491658.333333,530025.0,567958.333333,637291.666667,704691.666667
2,Houston,Texas,3,171058.333333,181208.333333,190675.0,196850.0,208958.333333
3,Chicago,Illinois,4,236341.666667,218975.0,228258.333333,245083.333333,265641.666667
4,San Antonio,Texas,5,164600.0,171308.333333,177666.666667,188108.333333,197250.0


In [9]:
#shape of housing dataset
print(f'No of rowsare {housing.shape[0]} and number of columns are {housing.shape[1]}')

No of rowsare 3728 and number of columns are 8


In [10]:
#info of the dataframe
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3728 entries, 0 to 3727
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   RegionName  3728 non-null   object 
 1   StateName   3728 non-null   object 
 2   SizeRank    3728 non-null   int64  
 3   2015        3728 non-null   float64
 4   2016        3728 non-null   float64
 5   2017        3728 non-null   float64
 6   2018        3728 non-null   float64
 7   2019        3728 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 233.1+ KB


In [11]:
#total number of unique city
housing.value_counts('RegionName')

RegionName
Springfield       9
Franklin          7
Monroe            7
Farmington        6
Salem             6
                 ..
Hanford           1
Hannibal          1
Happy Valley      1
Harbor Springs    1
Zionsville        1
Length: 3158, dtype: int64

In [12]:
#totoal unique city
housing['RegionName'].nunique()

3158

In [13]:
#there are around 600 city with duplicate value
housing=housing.drop_duplicates(subset=['RegionName'])
#housing dataframe shape
housing.shape

(3158, 8)

In [14]:
#total unique city after droping duplicates
housing['RegionName'].nunique()

3158

In [15]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3158 entries, 0 to 3727
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   RegionName  3158 non-null   object 
 1   StateName   3158 non-null   object 
 2   SizeRank    3158 non-null   int64  
 3   2015        3158 non-null   float64
 4   2016        3158 non-null   float64
 5   2017        3158 non-null   float64
 6   2018        3158 non-null   float64
 7   2019        3158 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 222.0+ KB


In [16]:
#saving the clean housing data to output folder
housing.to_csv('output_data/housing_price.csv', index=False)