In [None]:
# Download the "Residential Building" and "Real Property Sales" extracts from the King County Assessor website:
# http://info.kingcounty.gov/assessor/DataDownload/default.aspx
# Put "EXTR_ResBldg.csv" and "EXTR_RPSale.csv" in the same folder as this notebook.
# This notebook writes two new data files:
# 1) "Renovated.csv" - residential buildings that have been renovated (parcels with a repeated ParcelId are excluded)
# 2) "MultiSale.csv" - sales with a price above zero for renovated residential buildings that were sold more than once

In [None]:
import pandas as pd

# I. Load the residential-building extract.
# Major and Minor are read as strings rather than being parsed as numbers.
residential_dtype = dict.fromkeys(["Major", "Minor"], str)
residential_df = pd.read_csv(
    "EXTR_ResBldg.csv",
    dtype=residential_dtype,
)

In [71]:
# Preview the first 8 rows to sanity-check the load.
residential_df.head(8)

Unnamed: 0,Major,Minor,BldgNbr,NbrLivingUnits,Address,BuildingNumber,Fraction,DirectionPrefix,StreetName,StreetType,...,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Obsolescence,PcntNetCondition,Condition,AddnlCost
0,34570,100,1,1,2101 NE 23RD ST 98056,2101,,NE,23RD,ST,...,0,0,0,1981,0,0,0,0,3,0
1,34570,140,1,1,2207 NE 23RD ST 98056,2207,,NE,23RD,ST,...,0,1,0,1981,0,0,0,0,3,0
2,34570,350,1,1,2320 BLAINE AVE NE 98056,2320,,,BLAINE,AVE,...,0,0,0,1981,2003,0,0,0,3,3500
3,34800,90,1,1,523 S 17TH ST 98055,523,,S,17TH,ST,...,1,0,1,1967,0,0,0,0,3,0
4,34800,95,1,1,517 S 17TH ST 98055,517,,S,17TH,ST,...,1,0,1,1967,0,0,0,0,3,0
5,34800,100,1,1,1716 MORRIS AVE S 98055,1716,,,MORRIS,AVE,...,1,0,1,1967,0,0,0,0,4,0
6,34800,105,1,1,1722 MORRIS AVE S 98055,1722,,,MORRIS,AVE,...,1,0,1,1969,0,0,0,0,3,0
7,34940,140,1,1,2243 SCANDIA AVE 98022,2243,,,SCANDIA,AVE,...,0,0,0,1980,0,0,0,0,4,0


In [None]:
# Show the first record on its own so that every column is listed
# (the wide head() preview elides the middle columns with "...").
residential_df.iloc[0]

In [None]:
# Share of missing values per column, in percent, largest first.
# Columns without any missing value are left out of the table.
na = (
    residential_df.isnull().sum()
    .div(len(residential_df))
    .mul(100)
    .loc[lambda pct: pct != 0]
    .sort_values(ascending=False)
)
na_df = pd.DataFrame({'Missing Ratio': na})
na_df

In [40]:
# Most frequent YrRenovated values. Printed explicitly because only the last
# expression of a cell is displayed automatically.
print(residential_df.YrRenovated.value_counts().head())
# Number of buildings with a non-zero YrRenovated. Counting with the same
# "!= 0" predicate that the renovated filter uses avoids relying on 0 being
# the most frequent value (which slicing value_counts()[1:] silently assumed).
(residential_df.YrRenovated != 0).sum()

23150

In [59]:
# Restrict to buildings that have been renovated, i.e. YrRenovated is not 0.
# .copy() gives an independent frame so later column assignments do not touch residential_df.
is_renovated = residential_df.YrRenovated.ne(0)
renovated_df = residential_df.loc[is_renovated].copy()
renovated_df.shape

(23150, 50)

In [None]:
# Add a ParcelId column (Major followed by Minor) and check its uniqueness.
# NOTE(review): plain concatenation is only collision-free if Major and Minor are
# fixed-width in the CSV - confirm. Any change to this key format must be applied
# identically where sale_df builds its ParcelId, since the two are matched later.
renovated_df["ParcelId"] = renovated_df["Major"] + renovated_df["Minor"]
# Report the counts explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed, so the check would otherwise be invisible.
print(len(renovated_df), "rows,", len(renovated_df.ParcelId.unique()), "unique ParcelIds")
# Ascending sort puts any ParcelId that occurs more than once at the end of the listing.
renovated_df.ParcelId.value_counts().sort_values()

In [62]:
# Discard every row whose ParcelId occurs more than once.
# keep=False flags all occurrences, so no copy of a repeated ParcelId survives.
repeated_parcel = renovated_df.duplicated("ParcelId", keep=False)
renovated_df = renovated_df[~repeated_parcel]
renovated_df.shape

(22945, 51)

In [104]:
# Export renovated_df to "Renovated.csv"; index = False leaves the row index out of the file
renovated_df.to_csv("Renovated.csv", index = False)

In [76]:
# II. Load the real-property sales extract.
# ExciseTaxNbr, Major and Minor are read as strings rather than being parsed as numbers.
sale_dtype = dict.fromkeys(["ExciseTaxNbr", "Major", "Minor"], str)
sale_df = pd.read_csv(
    "EXTR_RPSale.csv",
    dtype=sale_dtype,
)
sale_df.shape

(1957142, 24)

In [67]:
# Preview the first rows of the sales data to sanity-check the load.
sale_df.head()

Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,...,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning
0,714942,284150,10,04/06/1983,91500,198304110267,100.0,57.0,284150.0,P,...,2,6,2,N,N,N,,1,8,
1,1729614,172204,9157,12/21/1999,0,19991229001498,,,,,...,3,11,15,N,N,N,N,10,8,31 45
2,1729614,172204,9005,12/21/1999,0,19991229001498,,,,,...,3,11,15,N,N,N,N,10,8,31 45
3,2254430,192304,9020,12/05/2006,0,20061207002200,,,,,...,3,11,15,N,N,N,N,18,2,18 45
4,685277,885730,120,08/11/1982,0,198208170380,86.0,75.0,885730.0,P,...,3,2,15,N,N,N,,1,3,11


In [None]:
# Show the second record (position 1) on its own so that every column is listed
# (the wide head() preview elides the middle columns with "...").
sale_df.iloc[1]

In [69]:
# Share of missing values per column of the sales data, in percent, largest first.
# Columns without any missing value are left out of the table.
na = (
    sale_df.isnull().sum()
    .div(len(sale_df))
    .mul(100)
    .loc[lambda pct: pct != 0]
    .sort_values(ascending=False)
)
na_df = pd.DataFrame({'Missing Ratio': na})
na_df

Unnamed: 0,Missing Ratio


In [95]:
# Build the ParcelId key (Major followed by Minor), the same way as for the residential data
sale_df["ParcelId"] = sale_df.Major + sale_df.Minor
# Retain only sales with a price above zero
positive_price = sale_df["SalePrice"].gt(0)
saleAboveZero_df = sale_df.loc[positive_price].copy()
saleAboveZero_df.shape

(1277528, 25)

In [80]:
# Retain only the sales whose ParcelId is present in renovated_df
renovatedFlag = saleAboveZero_df["ParcelId"].isin(renovated_df["ParcelId"])
saleAndRenovated_df = saleAboveZero_df.loc[renovatedFlag].copy()
saleAndRenovated_df.shape

(42946, 25)

In [112]:
# Retain the sales of parcels that have at least two sale rows.
# keep=False marks every occurrence of a repeated ParcelId, not just the later ones.
multiFlag = saleAndRenovated_df["ParcelId"].duplicated(keep=False)
multiSale_df = saleAndRenovated_df.loc[multiFlag].copy()
multiSale_df.shape

(35654, 25)

In [114]:
# Export multiSale_df to "MultiSale.csv"; index = False leaves the row index out of the file
multiSale_df.to_csv("MultiSale.csv", index = False)