# Clean Parcel Data
Step required to make the Probabilistic Housing Unit Allocation work.

County Parcel Data needs to be combined with Tax Accessor Data, Census Block, Census Place and PUMA data.

Initial Parcel Data provides Parcel ID and zoning infomration information but does not identify the Census Geography.
The Census Geogrpahy will help to check to make sure that all buildings within the parcel boundaries and the Census Place Boundary are matched. If there are parcels in the Census Place boundary that should have buildings but do not this information may help to prioritize efforts to update the Building Inventory.    

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import pysal as ps
import math as math
import numpy as np
import geopandas as gpd
import pandas as pd
import shapely
import descartes
import folium as fm # folium has more dynamic maps - but requires internet connection
import os # For saving output to path

  from .sqlite import head_to_sql, start_sql


In [2]:
# Display versions being used - important information for replication
import sys
print("Python Version     ", sys.version)
print("pysal version:     ", ps.__version__)
print("numpy version:     ", np.__version__)
print("geopandas version: ", gpd.__version__)
print("pandas version:    ", pd.__version__)
print("shapely version:   ", shapely.__version__)
# print("descartes version:   ", descartes.__version__)  1.1.0
print("folium version:    ", fm.__version__)
# print("os version:    ", os.__version__)

Python Version      3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)]
pysal version:      2.0.0
numpy version:      1.16.4
geopandas version:  0.5.0
pandas version:     0.24.2
shapely version:    1.6.4.post1
folium version:     0.9.1


In [3]:
# Store Program Name for output files to have the same name
programname = "IN-CORE_1bv2_Joplin_CleanParcelData_2019-07-11"
# Make directory to save output
if not os.path.exists(programname):
    os.mkdir(programname)

## Read in Parcel Data
Jasper County MO (FIPS 29097) provided Parcel Data for the entire county. 
At the time of this program Newton County MO (FIPS 29145) was not provided. 

In [4]:
parceldata_shp = '../../SourceData/joplin_footprints/confuence_joplin_datasets/jcmo_shapes/parcels.shp'
parceldata_gdf = gpd.read_file(parceldata_shp)
parceldata_gdf.head()

Unnamed: 0,PIN,Graphic_Ac,Legal_Ac,Address,Notes,Zoning,Own_Name,Own_Addres,SHAPE_Leng,SHAPE_Area,geometry
0,13401800000014001,23.6552,23.6552,0 COUNTY RD 100,,,"BRINSON, FRANKLIN L & CHRYSTAL G",5113 E RICHLAND AVE ;CHILLICOTHE;IL;61523,4202.319148,1030421.0,"POLYGON ((2865172.3909375 355416.8737500012, 2..."
1,20300810003013001,0.9522,0.9522,4575 E 13TH ST,,,MACC GROUP,5293 E BALTIC LN ;JOPLIN;MO;64801,819.860123,41478.23,"POLYGON ((2803423.38625 331477.0365625024, 280..."
2,16502110001010001,0.4072,0.42,0 COUNTY RD,,,"SELFRIDGE, STEPHEN W",26164 FIR RD ;JOPLIN;MO;64801,711.23501,17737.05,"POLYGON ((2779427.922812503 355518.2256250047,..."
3,17101200000029002,3.0026,3.0303,29146 HUNTER RD,,,"WALD, DAVID L",29146 HUNTER RD ;CARL JUNCTION;MO;64834,1453.963248,130792.7,"POLYGON ((2764652.6171875 364145.4900000021, 2..."
4,11703500000012000,26.855,26.78,677 JEWEL RD,,,MSJR CORPORATION,7437 LAWRENCE COUNTY AVE ;LA RUSSELL;MO;64848,4591.19516,1169805.0,"POLYGON ((2915297.200625 371798.0109375, 29152..."


In [5]:
parceldata_gdf.crs

{'proj': 'tmerc',
 'lat_0': 36.16666666666666,
 'lon_0': -94.5,
 'k': 0.9999411764705882,
 'x_0': 849999.9999999999,
 'y_0': 0,
 'datum': 'NAD83',
 'units': 'us-ft',
 'no_defs': True}

In [6]:
# Get error in to_crs that can be fixed with the following command
parceldata_gdf = parceldata_gdf[parceldata_gdf.geometry.notnull()]
# Parcel Data projected to have lat Lon for building rep point
latlong_crs = {'init':'epsg:4326'}
parceldata_gdf = parceldata_gdf.to_crs(latlong_crs)
parceldata_gdf.head()

Unnamed: 0,PIN,Graphic_Ac,Legal_Ac,Address,Notes,Zoning,Own_Name,Own_Addres,SHAPE_Leng,SHAPE_Area,geometry
0,13401800000014001,23.6552,23.6552,0 COUNTY RD 100,,,"BRINSON, FRANKLIN L & CHRYSTAL G",5113 E RICHLAND AVE ;CHILLICOTHE;IL;61523,4202.319148,1030421.0,POLYGON ((-94.23765986801661 37.14264461623644...
1,20300810003013001,0.9522,0.9522,4575 E 13TH ST,,,MACC GROUP,5293 E BALTIC LN ;JOPLIN;MO;64801,819.860123,41478.23,POLYGON ((-94.44955759890132 37.07717068019272...
2,16502110001010001,0.4072,0.42,0 COUNTY RD,,,"SELFRIDGE, STEPHEN W",26164 FIR RD ;JOPLIN;MO;64801,711.23501,17737.05,POLYGON ((-94.53184037806825 37.14320902194232...
3,17101200000029002,3.0026,3.0303,29146 HUNTER RD,,,"WALD, DAVID L",29146 HUNTER RD ;CARL JUNCTION;MO;64834,1453.963248,130792.7,POLYGON ((-94.58255904726742 37.16688013485778...
4,11703500000012000,26.855,26.78,677 JEWEL RD,,,MSJR CORPORATION,7437 LAWRENCE COUNTY AVE ;LA RUSSELL;MO;64848,4591.19516,1169805.0,POLYGON ((-94.06543302536771 37.18713022533776...


In [7]:
# Add Representative Point
parceldata_gdf.loc[parceldata_gdf.index, 'rppnt4326'] = parceldata_gdf['geometry'].representative_point()
parceldata_gdf['rppnt4326'].label = "Representative Point EPSG 4326 (WKT)"
parceldata_gdf['rppnt4326'].notes = "Internal Point within parcel poly EPSG 4326"

# Add Column that Duplicates Polygon Geometry - allows for swithcing between point and polygon geometries for spatial join
parceldata_gdf.loc[parceldata_gdf.index, 'prcl4326'] = parceldata_gdf['geometry']
parceldata_gdf['prcl4326'].label = "Parcel Polygon EPSG 4326 (WKT)"
parceldata_gdf['prcl4326'].notes = "Polygon Shape Points for Parcel Polygon EPSG 4326"

## Does the Parcel Data have a Unique ID

In [8]:
# Count the number of Unique Values
parceldata_gdf[['PIN']].describe()

Unnamed: 0,PIN
count,57302
unique,57235
top,ISLAND
freq,14


In [9]:
# Are there any missing values for the unique id?
parceldata_gdf.loc[parceldata_gdf['PIN'].isnull()]

Unnamed: 0,PIN,Graphic_Ac,Legal_Ac,Address,Notes,Zoning,Own_Name,Own_Addres,SHAPE_Leng,SHAPE_Area,geometry,rppnt4326,prcl4326
50277,,0.0,0.0,,,,,,6820.817115,2290892.0,POLYGON ((-94.42260311421195 37.08057721265357...,POINT (-94.41848334555092 37.08130172885706),POLYGON ((-94.42260311421195 37.08057721265357...
50278,,0.0,0.0,,,,,,1819.195223,74365.22,POLYGON ((-94.42242013382311 37.08106111848689...,POINT (-94.42213046784147 37.0818570045905),POLYGON ((-94.42242013382311 37.08106111848689...


In [10]:
# List duplicates for the Unique ID
pd.crosstab(index=parceldata_gdf.duplicated(subset=['PIN']), columns="count", margins=True, margins_name="Total")
#parceldata_gdf.loc[parceldata_gdf.duplicated(subset=['PIN'])]

col_0,count,Total
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
False,57236,57236
True,68,68
Total,57304,57304


It appears that PIN is a mix of numbers and strings. With 57,302 values and 57,235 unique values. There are 68 duplicates and 2 missing values.

In [11]:
# Collapse Parcels By Parcel ID
parceldata_gdf_parcelcount = parceldata_gdf[['PIN']]
parceldata_gdf_parcelcount['parcel_count'] = 1
parceldata_gdf_parcelcount_sum = parceldata_gdf_parcelcount.groupby(['PIN']).sum()
parceldata_gdf_parcelcount_sum['parcel_count'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


count    57235.000000
mean         1.001171
std          0.084112
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         14.000000
Name: parcel_count, dtype: float64

In [12]:
parceldata_gdf_parcelcount_sum.loc[parceldata_gdf_parcelcount_sum['parcel_count']>1]

Unnamed: 0_level_0,parcel_count
PIN,Unnamed: 1_level_1
07601400000001000,2
07601400000001001,2
07802800000004005,2
08501520011014000,2
08803400000009008,3
08902900000011023,2
08903140009019000,2
09703500000054000,2
13402000000020000,2
14200330025010000,2


In [13]:
parceldata_gdf[['PIN','geometry']].loc[parceldata_gdf['PIN']== '14401700000023000']

Unnamed: 0,PIN,geometry
42758,14401700000023000,"POLYGON ((-94.34279600734241 37.1547468787871,..."
56967,14401700000023000,(POLYGON ((-94.3375649304737 37.14519332473801...
56968,14401700000023000,POLYGON ((-94.34336092343381 37.15221214624182...


### For the duplicate Parcels Errors in Geometry
It looks like the duplicate parcels do not have the same geographies. Parcel PIN 14401700000023000 has 3 polygons, and one of the polygons is a multipolygon.
#### Use Dissolve to combine Polygons by PIN

In [14]:
parceldata_gdf_dissolve = parceldata_gdf[['PIN','SHAPE_Area']]
parceldata_gdf_dissolve = parceldata_gdf.dissolve(by=parceldata_gdf['PIN'])
parceldata_gdf_dissolve.loc[parceldata_gdf_dissolve['PIN']== '14401700000023000']

Unnamed: 0_level_0,geometry,PIN,Graphic_Ac,Legal_Ac,Address,Notes,Zoning,Own_Name,Own_Addres,SHAPE_Leng,SHAPE_Area,rppnt4326,prcl4326
PIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
14401700000023000,(POLYGON ((-94.34279600734241 37.1547468787871...,14401700000023000,209.741,209.741,"15823 15879,15895,15889,15915 FIR RD",,,"BLANKENSHIP, PHILLIP W",2012 S QUEST LN ;CARTHAGE;MO;64836,11372.485293,3940629.0,POINT (-94.34191422085354 37.15072198711439),"POLYGON ((-94.34279600734241 37.1547468787871,..."


In [15]:
pd.crosstab(parceldata_gdf_dissolve['Zoning'], 'count',
            margins=True, margins_name="Total")

col_0,count,Total
Zoning,Unnamed: 1_level_1,Unnamed: 2_level_1
C1,338,338
C1-PD,50,50
C2,303,303
C2-PD,28,28
C3,432,432
C3-PD,26,26
CO,164,164
CO-PD,39,39
M1,7,7
M1-PD,77,77


In [16]:
pd.crosstab(parceldata_gdf['Zoning'], 'count',
            margins=True, margins_name="Total")

col_0,count,Total
Zoning,Unnamed: 1_level_1,Unnamed: 2_level_1
C1,339,339
C1-PD,50,50
C2,303,303
C2-PD,29,29
C3,433,433
C3-PD,26,26
CO,164,164
CO-PD,39,39
M1,7,7
M1-PD,77,77


In [17]:
# List duplicates for the Unique ID
pd.crosstab(index=parceldata_gdf_dissolve.duplicated(subset=['PIN']), columns="count", margins=True, margins_name="Total")

col_0,count,Total
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
False,57235,57235
Total,57235,57235


### It looks like the dissolve solves the duplicate parcel problem

In [18]:
# Save Work at this point - can not have multiple columns with WKT information when saving
wkt_columns = ['rppnt4326','prcl4326']
# remove wkt columns from list of cols
cols = [col for col in parceldata_gdf_dissolve if col not in wkt_columns]
savefile = sys.path[0]+"/"+programname+"/"+programname+".shp"
parceldata_gdf_dissolve[cols].to_file(savefile)

## Clean Parcel Tax Data
The parcel Shape File has the location of parcels with a Parcel ID.

### Residential and Commercial 2010 Tax Details 
Harvey Cuttler and the CSU Econ team collected data on Parcel Values.

This data should merge with the Parcel ID.

In [19]:
parceldatavalues = '../../SourceData/joplin_footprints/confuence_joplin_datasets/Copy of 2010- 2012 Data Request Res  CommClass.xlsx'

In [20]:
parceldatavalues_2010res_df = pd.read_excel(parceldatavalues, sheet_name='2010 Res')
parceldatavalues_2010res_df.head()

Unnamed: 0,#,TAXYR,PARID,OWN1,ADRNO,ADRDIR,ADRSTR,CITYNAME,STATECODE,ZIP1,ADRNO.1,ADRDIR.1,ADRSTR.1,CLASS,YRBLT,CARD,ACRES,APRBLDG,APRLAND,APRTOT
0,1,2010,1401700000002000,"NEIDIGH, JOSEPH A & LISA L",5269.0,,THORN,JASPER,MO,64755.0,5296.0,,THORN,R,1970,1,158.22,77070.0,63870.0,140940
1,2,2010,1401700000003000,"RILEY, DONALD W & ROSA",1564.0,N,ARLENE,JOPLIN,MO,64801.0,5699.0,,THORN,R,1960,1,317.32,93500.0,87380.0,180880
2,3,2010,1401800000001000,"POTTS, LOREN A JR & GEORGIA H",20912.0,,COUNTY RD 70,JASPER,MO,64755.0,20912.0,,COUNTY RD 70,R,1955,1,313.5,55610.0,79370.0,134980
3,4,2010,1401800000002000,"BRUMMETT, BRANDON & KERI WHITTLE",20214.0,,COUNTY RD 70,JASPER,MO,64755.0,20214.0,,COUNTY RD 70,R,1955,1,151.0,55920.0,53230.0,109150
4,5,2010,1401800000002001,"REED, JEFFERY A",16032.0,,COUNTY RD 30,GOLDEN CITY,MO,64748.0,6445.0,,THORN,R,2005,1,52.73,113400.0,21290.0,134690


In [21]:
parceldatavalues_2010comm_df = pd.read_excel(parceldatavalues, sheet_name='2010 Comm')
parceldatavalues_2010comm_df.head()

Unnamed: 0,#,TAXYR,PARID,OWN1,ADRNO,ADRDIR,ADRSTR,CITYNAME,STATECODE,ZIP1,...,ADRDIR.1,ADRSTR.1,ACRES,APRBLDG,APRLAND,APRTOT,CARD,YRBLT,CLASS,STRUCTURE
0,1,2010,1401700000004000,PLEASANT VIEW CEMETERY ASSN,,,,GOLDEN CITY,MO,64748.0,...,,THORN,2.48,10890.0,14880,25770,1,1900,E,620.0
1,2,2010,2501500000002000,"LEAMING, LEO DON & CATHERINE J",20654.0,,COUNTY RD 100,JASPER,MO,64755.0,...,,COUNTY RD 100,326.06,82940.0,84990,167930,2,2006,A,354.0
2,3,2010,2702600000001000,"RUSH, ROCKY H",8378.0,,SUMAC,JASPER,MO,64755.0,...,,SUMAC,140.32,112130.0,64840,176970,2,2006,C,342.0
3,4,2010,3501500000006000,"DODSON, DAVID L & LOIS",15515.0,,THORN,JASPER,MO,64755.0,...,,THORN,4.16,102950.0,9400,112350,2,1987,C,342.0
4,5,2010,3601330001004000,"JONES, CARL J & BESSIE",,,PO BOX 236,JASPER,MO,64755.0,...,,4TH,1.12,24550.0,6480,31030,1,1980,C,336.0


In [22]:
residential_count = parceldatavalues_2010res_df['PARID'].count()
commericial_count =parceldatavalues_2010comm_df['PARID'].count()
expected_append_count = residential_count + commericial_count
print("The original data had ",residential_count," residential observations and ",commericial_count," commerical observations")
print("The combined data will have ",expected_append_count)

The original data had  40850  residential observations and  4608  commerical observations
The combined data will have  45458


In [23]:
cols2010res = [col for col in parceldatavalues_2010res_df]
cols2010comm = [col for col in parceldatavalues_2010comm_df]
# Before appending recognize that the two files have different columns
list(set(cols2010comm) - set(cols2010res))

['STRUCTURE']

In [24]:
# Add variable that identifies the source sheet from the Excel Sheet
parceldatavalues_2010res_df['taxsheet'] = '2010 Res'
parceldatavalues_2010comm_df['taxsheet'] = '2010 Comm'

In [25]:
# Append Commericial and Residential Data
parceldatavalues_2010_df = parceldatavalues_2010comm_df.append(parceldatavalues_2010res_df, sort=True)
parceldatavalues_2010_df['PARID'].count()

45458

In [26]:
cols2010 = [col for col in parceldatavalues_2010_df]
cols2010

['#',
 'ACRES',
 'ADRDIR',
 'ADRDIR.1',
 'ADRNO',
 'ADRNO.1',
 'ADRSTR',
 'ADRSTR.1',
 'APRBLDG',
 'APRLAND',
 'APRTOT',
 'CARD',
 'CITYNAME',
 'CLASS',
 'OWN1',
 'PARID',
 'STATECODE',
 'STRUCTURE',
 'TAXYR',
 'YRBLT',
 'ZIP1',
 'taxsheet']

### Check Append Worked as expected
Compare values for Year Built and Structure. Structure was only on the Commercial Tax Sheet.

In [27]:
parceldatavalues_2010res_df['YRBLT'].describe()

count    40850.000000
mean      1956.480245
std         40.457549
min          0.000000
25%       1925.000000
50%       1960.000000
75%       1991.000000
max       2010.000000
Name: YRBLT, dtype: float64

In [28]:
parceldatavalues_2010comm_df['YRBLT'].describe()

count    4608.000000
mean     1960.659505
std        93.207121
min         0.000000
25%      1940.000000
50%      1975.000000
75%      1992.000000
max      2010.000000
Name: YRBLT, dtype: float64

In [29]:
parceldatavalues_2010_df['YRBLT'].describe()

count    45458.000000
mean      1956.903889
std         48.507236
min          0.000000
25%       1925.000000
50%       1962.000000
75%       1992.000000
max       2010.000000
Name: YRBLT, dtype: float64

In [30]:
parceldatavalues_2010comm_df['STRUCTURE'].describe()

count    4604.000000
mean      365.170721
std        86.250923
min       101.000000
25%       335.000000
50%       345.000000
75%       396.000000
max       721.000000
Name: STRUCTURE, dtype: float64

In [31]:
parceldatavalues_2010_df['STRUCTURE'].describe()

count    4604.000000
mean      365.170721
std        86.250923
min       101.000000
25%       335.000000
50%       345.000000
75%       396.000000
max       721.000000
Name: STRUCTURE, dtype: float64

In [32]:
pd.crosstab(parceldatavalues_2010_df['CLASS'], parceldatavalues_2010_df['taxsheet'],
            margins=True, margins_name="Total")

taxsheet,2010 Comm,2010 Res,Total
CLASS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,7,7,14
C,2927,70,2997
E,242,182,424
R,366,30578,30944
Total,3542,30837,34379


## Does the Parcel Tax Data have a Unique ID

In [33]:
# Count the number of Unique Values
parceldatavalues_2010_df[['PARID']].describe()

Unnamed: 0,PARID
count,45458.0
mean,1.647011e+16
std,3786165000000000.0
min,1401700000000000.0
25%,1.450151e+16
50%,1.670253e+16
75%,1.920103e+16
max,2.36014e+16


In [34]:
# Count the number of Unique Values
parceldatavalues_2010_df[['PARID']].nunique()

PARID    42146
dtype: int64

In [35]:
# Are there any missing values for the unique id?
parceldatavalues_2010_df.loc[parceldatavalues_2010_df['PARID'].isnull()]

Unnamed: 0,#,ACRES,ADRDIR,ADRDIR.1,ADRNO,ADRNO.1,ADRSTR,ADRSTR.1,APRBLDG,APRLAND,...,CITYNAME,CLASS,OWN1,PARID,STATECODE,STRUCTURE,TAXYR,YRBLT,ZIP1,taxsheet


In [36]:
# List duplicates for the Unique ID
pd.crosstab(index=parceldatavalues_2010_df.duplicated(subset=['PARID']), columns="count", margins=True, margins_name="Total")
#parceldata_gdf.loc[parceldata_gdf.duplicated(subset=['PIN'])]

col_0,count,Total
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
False,42146,42146
True,3312,3312
Total,45458,45458


It appears that PARID is only numeric. With 45,458 values and 42,146 unique values. There are 3,312 duplicates and no missing values.

It is possible that for many of the duplicate values the individual appraisal values are the same. 

Let's collapse the data by PARID and see if the sum and count of the parcel values produces the same values.

In [37]:
# Collapse Parcels By Parcel ID
parceldatavalues_2010_df_parcelcount = parceldatavalues_2010_df[['PARID','APRBLDG','APRLAND','APRTOT']]
parceldatavalues_2010_df_parcelcount['parcel_count'] = 1
parceldatavalues_2010_df_parcelcount_sum = parceldatavalues_2010_df_parcelcount.groupby(['PARID']).sum()
parceldatavalues_2010_df_parcelcount_sum.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,APRBLDG,APRLAND,APRTOT,parcel_count
PARID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1401700000002000,77070.0,63870.0,140940,1
1401700000003000,93500.0,87380.0,180880,1
1401700000004000,10890.0,14880.0,25770,1
1401800000001000,55610.0,79370.0,134980,1
1401800000002000,55920.0,53230.0,109150,1


In [38]:
# Group by moves the unique id to the index - make a new variable for merge
parceldatavalues_2010_df_parcelcount_sum['parid'] = parceldatavalues_2010_df_parcelcount_sum.index
parceldatavalues_2010_df_parcelcount_sum.head()

Unnamed: 0_level_0,APRBLDG,APRLAND,APRTOT,parcel_count,parid
PARID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1401700000002000,77070.0,63870.0,140940,1,1401700000002000
1401700000003000,93500.0,87380.0,180880,1,1401700000003000
1401700000004000,10890.0,14880.0,25770,1,1401700000004000
1401800000001000,55610.0,79370.0,134980,1,1401800000001000
1401800000002000,55920.0,53230.0,109150,1,1401800000002000


In [39]:
# Count the number of Unique Values
parceldatavalues_2010_df_parcelcount_sum[['parid']].nunique()

parid    42146
dtype: int64

In [40]:
parceldatavalues_2010_df_parcelcount_sum[['parid']].describe()

Unnamed: 0,parid
count,42146.0
mean,1.643548e+16
std,3782676000000000.0
min,1401700000000000.0
25%,1.450151e+16
50%,1.660244e+16
75%,1.920103e+16
max,2.36014e+16


### Compare parcels with multiple counts to see if sum of appraised values is accurate
The question is - do duplicate parcels have the same appraised values and therefore the appraised values should not be summed or if the appraised values need to be summed to provide accurate total value.

40,056 out of 42,146 parcels have 1 parcel id in the tax sheets. But 2,090 parcels have counts greater than 1.
Four parcels are the the tax sheets 22 times.

From exploring the data it appears that the duplicates should simply use the single value. In the cases explored the duplicate values are identical appraisal values across observations and across tax sheets.

In [41]:
pd.crosstab(index=parceldatavalues_2010_df_parcelcount_sum['parcel_count'], columns="count", margins=True, margins_name="Total")

col_0,count,Total
parcel_count,Unnamed: 1_level_1,Unnamed: 2_level_1
1,40056,40056
2,1629,1629
3,236,236
4,104,104
5,37,37
6,24,24
7,9,9
8,7,7
9,8,8
10,7,7


In [42]:
parceldatavalues_2010_df_parcelcount_sum.loc[parceldatavalues_2010_df_parcelcount_sum['parcel_count']>10]

Unnamed: 0_level_0,APRBLDG,APRLAND,APRTOT,parcel_count,parid
PARID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9803300000001000,4872340.0,3888500.0,8760840,22,9803300000001000
14200410007001000,15579820.0,579700.0,16159520,17,14200410007001000
14200930001004000,20232520.0,963200.0,21195720,14,14200930001004000
14200930001015000,21092400.0,1560000.0,22652400,15,14200930001015000
14501500000012000,10659000.0,1407600.0,12066600,12,14501500000012000
14501520005040000,3569280.0,2205060.0,5774340,11,14501520005040000
15401820002001000,27019460.0,643450.0,27662910,17,15401820002001000
15401840020003000,3546560.0,426240.0,3972800,16,15401840020003000
15601300000009000,9572000.0,22429600.0,32001600,16,15601300000009000
15903000000027000,12199800.0,669600.0,12869400,12,15903000000027000


In [43]:
parceldatavalues_2010_df[['PARID','ADRSTR','APRBLDG','APRLAND','APRTOT','OWN1']].loc[parceldatavalues_2010_df['PARID']==
                                                                                     20300720012001000]

Unnamed: 0,PARID,ADRSTR,APRBLDG,APRLAND,APRTOT,OWN1
38570,20300720012001000,BRAMER,583360.0,29700.0,613060,GUS ENTERPRISES
38571,20300720012001000,BRAMER,583360.0,29700.0,613060,GUS ENTERPRISES
38572,20300720012001000,BRAMER,583360.0,29700.0,613060,GUS ENTERPRISES
38573,20300720012001000,BRAMER,583360.0,29700.0,613060,GUS ENTERPRISES
38574,20300720012001000,BRAMER,583360.0,29700.0,613060,GUS ENTERPRISES
38575,20300720012001000,BRAMER,583360.0,29700.0,613060,GUS ENTERPRISES
38576,20300720012001000,BRAMER,583360.0,29700.0,613060,GUS ENTERPRISES
38577,20300720012001000,BRAMER,583360.0,29700.0,613060,GUS ENTERPRISES
38578,20300720012001000,BRAMER,583360.0,29700.0,613060,GUS ENTERPRISES
38579,20300720012001000,BRAMER,583360.0,29700.0,613060,GUS ENTERPRISES


In [44]:
parceldatavalues_2010_df[['PARID','ADRSTR','APRBLDG','APRLAND','APRTOT','OWN1']].loc[parceldatavalues_2010_df['PARID']==
                                                                                     19601310012016000]

Unnamed: 0,PARID,ADRSTR,APRBLDG,APRLAND,APRTOT,OWN1
35036,19601310012016000,PO BOX 973,3066760.0,92970.0,3159730,"BIRD, CHARLES R TR"
35037,19601310012016000,PO BOX 973,3066760.0,92970.0,3159730,"BIRD, CHARLES R TR"
35038,19601310012016000,PO BOX 973,3066760.0,92970.0,3159730,"BIRD, CHARLES R TR"
35039,19601310012016000,PO BOX 973,3066760.0,92970.0,3159730,"BIRD, CHARLES R TR"
35040,19601310012016000,PO BOX 973,3066760.0,92970.0,3159730,"BIRD, CHARLES R TR"
35041,19601310012016000,PO BOX 973,3066760.0,92970.0,3159730,"BIRD, CHARLES R TR"
35042,19601310012016000,PO BOX 973,3066760.0,92970.0,3159730,"BIRD, CHARLES R TR"
35043,19601310012016000,PO BOX 973,3066760.0,92970.0,3159730,"BIRD, CHARLES R TR"
35044,19601310012016000,PO BOX 973,3066760.0,92970.0,3159730,"BIRD, CHARLES R TR"
35045,19601310012016000,PO BOX 973,3066760.0,92970.0,3159730,"BIRD, CHARLES R TR"


In [45]:
parceldatavalues_2010_df[['PARID','ADRSTR','APRBLDG','APRLAND','APRTOT','OWN1']].loc[parceldatavalues_2010_df['PARID']==
                                                                                     15903030003002006]

Unnamed: 0,PARID,ADRSTR,APRBLDG,APRLAND,APRTOT,OWN1
1628,15903030003002006,GENEVA,1240140.0,199160.0,1439300,"BOX, DAVID JR & CARRIE"
1629,15903030003002006,GENEVA,1240140.0,199160.0,1439300,"BOX, DAVID JR & CARRIE"
1630,15903030003002006,GENEVA,1240140.0,199160.0,1439300,"BOX, DAVID JR & CARRIE"
1631,15903030003002006,GENEVA,1240140.0,199160.0,1439300,"BOX, DAVID JR & CARRIE"
1632,15903030003002006,GENEVA,1240140.0,199160.0,1439300,"BOX, DAVID JR & CARRIE"
1633,15903030003002006,GENEVA,1240140.0,199160.0,1439300,"BOX, DAVID JR & CARRIE"
1634,15903030003002006,GENEVA,1240140.0,199160.0,1439300,"BOX, DAVID JR & CARRIE"
1635,15903030003002006,GENEVA,1240140.0,199160.0,1439300,"BOX, DAVID JR & CARRIE"
1636,15903030003002006,GENEVA,1240140.0,199160.0,1439300,"BOX, DAVID JR & CARRIE"
1637,15903030003002006,GENEVA,1240140.0,199160.0,1439300,"BOX, DAVID JR & CARRIE"


In [46]:
# Collapse Parcels By Parcel ID
parceldatavalues_2010_df_parceldata = parceldatavalues_2010_df[['PARID','CLASS','taxsheet']]
#if blank values are NaN first replace to ''
parceldatavalues_2010_df_parceldata = parceldatavalues_2010_df_parceldata.fillna('')
parceldatavalues_2010_df_parcelcount_unique = parceldatavalues_2010_df_parceldata.groupby(['PARID']).agg(lambda x: ''.join(x.unique()))
parceldatavalues_2010_df_parcelcount_unique.head()

Unnamed: 0_level_0,CLASS,taxsheet
PARID,Unnamed: 1_level_1,Unnamed: 2_level_1
1401700000002000,R,2010 Res
1401700000003000,R,2010 Res
1401700000004000,E,2010 Comm
1401800000001000,R,2010 Res
1401800000002000,R,2010 Res


In [47]:
pd.crosstab(parceldatavalues_2010_df_parcelcount_unique['CLASS'], parceldatavalues_2010_df_parcelcount_unique['taxsheet'],
            margins=True, margins_name="Total")

taxsheet,2010 Comm,2010 Comm2010 Res,2010 Res,Total
CLASS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,756,6,9532,10294
A,4,0,6,10
AR,0,3,1,4
C,2069,20,50,2139
CR,8,177,5,190
E,209,18,111,338
EC,1,1,0,2
ER,0,1,2,3
R,124,45,28989,29158
RC,5,3,0,8


In [48]:
parceldatavalues_2010_df_parcelcount_unique.loc[parceldatavalues_2010_df_parcelcount_unique['CLASS']=="CR"]

Unnamed: 0_level_0,CLASS,taxsheet
PARID,Unnamed: 1_level_1,Unnamed: 2_level_1
2702600000001000,CR,2010 Comm2010 Res
3501500000006000,CR,2010 Comm2010 Res
3602300000019000,CR,2010 Comm2010 Res
3602420006008001,CR,2010 Comm2010 Res
3602420007001000,CR,2010 Comm2010 Res
3602420024002000,CR,2010 Comm2010 Res
3703500000009000,CR,2010 Comm2010 Res
7702600000003004,CR,2010 Comm2010 Res
7703500000027000,CR,2010 Comm2010 Res
7703640001001000,CR,2010 Comm2010 Res


In [49]:
parceldatavalues_2010_df[['PARID','ADRSTR','APRBLDG','APRLAND','APRTOT','OWN1','taxsheet','CLASS']].loc[parceldatavalues_2010_df['PARID']==
                                                                                     23300830017007000]

Unnamed: 0,PARID,ADRSTR,APRBLDG,APRLAND,APRTOT,OWN1,taxsheet,CLASS
4524,23300830017007000,CLARENCE,49590.0,3610.0,53200,"BLACK, BENJAMIN F & RUTH M",2010 Comm,C
40452,23300830017007000,CLARENCE,49590.0,3610.0,53200,"BLACK, BENJAMIN F & RUTH M",2010 Res,R


In [50]:
parceldatavalues_2010_df[['PARID','ADRSTR','APRBLDG','APRLAND','APRTOT','OWN1','taxsheet','CLASS']].loc[parceldatavalues_2010_df['PARID']==
                                                                                     20401800000032000]

Unnamed: 0,PARID,ADRSTR,APRBLDG,APRLAND,APRTOT,OWN1,taxsheet,CLASS
4315,20401800000032000,PO BOX 25025,442480.0,316100.0,758580,STORAGE EQUITIES INC PS PARTNERS IV,2010 Comm,C
39417,20401800000032000,PO BOX 25025,442480.0,316100.0,758580,STORAGE EQUITIES INC PS PARTNERS IV,2010 Res,R


In [51]:
# Group by moves the unique id to the index - make a new variable for merge
parceldatavalues_2010_df_parcelcount_unique['parid'] = parceldatavalues_2010_df_parcelcount_unique.index
parceldatavalues_2010_df_parcelcount_unique['classv2'] = parceldatavalues_2010_df_parcelcount_unique['CLASS']
parceldatavalues_2010_df_parcelcount_unique['taxsheetv2'] = parceldatavalues_2010_df_parcelcount_unique['taxsheet']
parceldatavalues_2010_df_parcelcount_unique.head()

Unnamed: 0_level_0,CLASS,taxsheet,parid,classv2,taxsheetv2
PARID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1401700000002000,R,2010 Res,1401700000002000,R,2010 Res
1401700000003000,R,2010 Res,1401700000003000,R,2010 Res
1401700000004000,E,2010 Comm,1401700000004000,E,2010 Comm
1401800000001000,R,2010 Res,1401800000001000,R,2010 Res
1401800000002000,R,2010 Res,1401800000002000,R,2010 Res


### One last check - Generate Min Max Appraisal Values 

In [52]:
# Collapse Parcels By Parcel ID
parceldatavalues_2010_df_parcelcount = parceldatavalues_2010_df[['PARID','APRBLDG','APRLAND','APRTOT']]
parceldatavalues_2010_df_parcelcount['parcel_count'] = 1
parceldatavalues_2010_df_parcelcount_min = parceldatavalues_2010_df_parcelcount.groupby(['PARID']).min()
parceldatavalues_2010_df_parcelcount_max = parceldatavalues_2010_df_parcelcount.groupby(['PARID']).max()
parceldatavalues_2010_df_parcelcount_minmax = pd.merge(parceldatavalues_2010_df_parcelcount_min, 
                                                       parceldatavalues_2010_df_parcelcount_max,
                                                       left_on='PARID', right_on='PARID', how='left')
parceldatavalues_2010_df_parcelcount_minmax['diff_aprbldg'] = parceldatavalues_2010_df_parcelcount_minmax['APRBLDG_x'] - parceldatavalues_2010_df_parcelcount_minmax['APRBLDG_y']
parceldatavalues_2010_df_parcelcount_minmax['diff_aprbldg'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


count    42141.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: diff_aprbldg, dtype: float64

### Based on the exploration duplicate parcels ids should be dropped

In [53]:
parceldatavalues_2010_df_nodup = parceldatavalues_2010_df.drop_duplicates(subset=['PARID'])
parceldatavalues_2010_df_nodup['PARID'].describe()

count    4.214600e+04
mean     1.643548e+16
std      3.782676e+15
min      1.401700e+15
25%      1.450151e+16
50%      1.660244e+16
75%      1.920103e+16
max      2.360140e+16
Name: PARID, dtype: float64

In [54]:
parceldatavalues_2010_df_nodup['PARID'].nunique()

42146

### To keep track of parcel data merge in Tax Sheet and Parcel Count Information

In [55]:
parceldatavalues_2010_df_nodupv2 = pd.merge(parceldatavalues_2010_df_nodup, 
                                    parceldatavalues_2010_df_parcelcount_sum[['parid','parcel_count']], 
                                    left_on='PARID', right_on='parid', how='left')
parceldatavalues_2010_df_nodupv2 = pd.merge(parceldatavalues_2010_df_nodupv2, 
                                    parceldatavalues_2010_df_parcelcount_unique[['parid','classv2','taxsheetv2']], 
                                    left_on='PARID', right_on='parid', how='left')
# Drop duplicate columns for parid
dropcols = ['parid_x','parid_y']
cols = [col for col in parceldatavalues_2010_df_nodupv2 if col not in dropcols]
parceldatavalues_2010_df_nodupv2 = parceldatavalues_2010_df_nodupv2[cols]
parceldatavalues_2010_df_nodupv2.head()

Unnamed: 0,#,ACRES,ADRDIR,ADRDIR.1,ADRNO,ADRNO.1,ADRSTR,ADRSTR.1,APRBLDG,APRLAND,...,PARID,STATECODE,STRUCTURE,TAXYR,YRBLT,ZIP1,taxsheet,parcel_count,classv2,taxsheetv2
0,1,2.48,,,,5503.0,,THORN,10890.0,14880.0,...,1401700000004000,MO,620.0,2010,1900,64748.0,2010 Comm,1,E,2010 Comm
1,2,326.06,,,20654.0,20654.0,COUNTY RD 100,COUNTY RD 100,82940.0,84990.0,...,2501500000002000,MO,354.0,2010,2006,64755.0,2010 Comm,2,AR,2010 Comm2010 Res
2,3,140.32,,,8378.0,8378.0,SUMAC,SUMAC,112130.0,64840.0,...,2702600000001000,MO,342.0,2010,2006,64755.0,2010 Comm,2,CR,2010 Comm2010 Res
3,4,4.16,,,15515.0,15515.0,THORN,THORN,102950.0,9400.0,...,3501500000006000,MO,342.0,2010,1987,64755.0,2010 Comm,2,CR,2010 Comm2010 Res
4,5,1.12,,,,,PO BOX 236,4TH,24550.0,6480.0,...,3601330001004000,MO,336.0,2010,1980,64755.0,2010 Comm,3,C,2010 Comm


In [56]:
pd.crosstab(parceldatavalues_2010_df_nodupv2['classv2'], parceldatavalues_2010_df_nodupv2['taxsheetv2'],
            margins=True, margins_name="Total")

taxsheetv2,2010 Comm,2010 Comm2010 Res,2010 Res,Total
classv2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,756,6,9532,10294
A,4,0,6,10
AR,0,3,1,4
C,2069,20,50,2139
CR,8,177,5,190
E,209,18,111,338
EC,1,1,0,2
ER,0,1,2,3
R,124,45,28989,29158
RC,5,3,0,8


## Add Cleaned up Parcel Tax Data to Parcel GeoDataFrame

In [57]:
parceldata_gdf_dissolve['parid'] = parceldata_gdf_dissolve.index
parceldata_gdf_dissolve.head()

Unnamed: 0_level_0,geometry,PIN,Graphic_Ac,Legal_Ac,Address,Notes,Zoning,Own_Name,Own_Addres,SHAPE_Leng,SHAPE_Area,rppnt4326,prcl4326,parid
PIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1401700000001000,POLYGON ((-94.13699017271023 37.35055489878356...,1401700000001000,159.589,160.0,0 THORN & COUNTY RD 50,,,"SEELA, RAYMOND E & FLORENCE A",19483 COUNTY RD 50 ;JASPER;MO;64755,13159.744243,6951680.0,POINT (-94.13949023525831 37.34331162232674),POLYGON ((-94.13699017271023 37.35055489878356...,1401700000001000
1401700000002000,POLYGON ((-94.14188721000529 37.33973595930397...,1401700000002000,161.077,158.22,5296 THORN RD,,,"NEIDIGH, JOSEPH A & LISA L",5269 THORN RD ;JASPER;MO;64755,13194.656023,7016507.0,POINT (-94.14408989691671 37.34343354130833),POLYGON ((-94.14188721000529 37.33973595930397...,1401700000002000
1401700000003000,POLYGON ((-94.14613735909428 37.35070183976294...,1401700000003000,323.007,317.32,5699 THORN RD,,,"WILSON, SAM L & KRISTI L",1005 DALTON RD ;CEDAR VALE;KS;67024,15873.444758,14070200.0,POINT (-94.15103837923394 37.34357968919036),POLYGON ((-94.14613735909428 37.35070183976294...,1401700000003000
1401700000004000,POLYGON ((-94.14660779875197 37.33704609240082...,1401700000004000,2.47968,2.48,5503 THORN RD,,,PLEASANT VIEW CEMETERY ASSN,0 ;GOLDEN CITY;MO;64748,1333.926784,108015.0,POINT (-94.14644183959757 37.33666245802258),POLYGON ((-94.14660779875197 37.33704609240082...,1401700000004000
1401800000001000,POLYGON ((-94.15538782377411 37.35084969271262...,1401800000001000,313.296,313.5,20912 COUNTY RD 70,,,"POTTS, LOREN A JR & GEORGIA H",20912 COUNTY RD 70 ;JASPER;MO;64755,15641.42532,13647260.0,POINT (-94.16443045230865 37.34744841332255),POLYGON ((-94.15538782377411 37.35084969271262...,1401800000001000


In [58]:
parceldata_gdf_dissolve['parid'].describe()

count                 57235
unique                57235
top       07121100000006000
freq                      1
Name: parid, dtype: object

In [59]:
# Make Merge ID a string
parceldatavalues_2010_df_nodupv2['parid'] = parceldatavalues_2010_df_nodupv2['PARID'] .apply(lambda x : str((x)))
parceldatavalues_2010_df_nodupv2['parid'].describe()

count                 42146
unique                42146
top       14200420001012000
freq                      1
Name: parid, dtype: object

In [60]:
parceldatavalues_2010_df_nodupv2.head()

Unnamed: 0,#,ACRES,ADRDIR,ADRDIR.1,ADRNO,ADRNO.1,ADRSTR,ADRSTR.1,APRBLDG,APRLAND,...,STATECODE,STRUCTURE,TAXYR,YRBLT,ZIP1,taxsheet,parcel_count,classv2,taxsheetv2,parid
0,1,2.48,,,,5503.0,,THORN,10890.0,14880.0,...,MO,620.0,2010,1900,64748.0,2010 Comm,1,E,2010 Comm,1401700000004000
1,2,326.06,,,20654.0,20654.0,COUNTY RD 100,COUNTY RD 100,82940.0,84990.0,...,MO,354.0,2010,2006,64755.0,2010 Comm,2,AR,2010 Comm2010 Res,2501500000002000
2,3,140.32,,,8378.0,8378.0,SUMAC,SUMAC,112130.0,64840.0,...,MO,342.0,2010,2006,64755.0,2010 Comm,2,CR,2010 Comm2010 Res,2702600000001000
3,4,4.16,,,15515.0,15515.0,THORN,THORN,102950.0,9400.0,...,MO,342.0,2010,1987,64755.0,2010 Comm,2,CR,2010 Comm2010 Res,3501500000006000
4,5,1.12,,,,,PO BOX 236,4TH,24550.0,6480.0,...,MO,336.0,2010,1980,64755.0,2010 Comm,3,C,2010 Comm,3601330001004000


In [61]:
# Merge Tax data with Parcel Shape Data
parceldata_gdf_dissolve_2010values = pd.merge(parceldata_gdf_dissolve, parceldatavalues_2010_df_nodupv2, 
                                              left_on='parid', right_on='parid', how='left')
parceldata_gdf_dissolve_2010values.head()

Unnamed: 0,geometry,PIN,Graphic_Ac,Legal_Ac,Address,Notes,Zoning,Own_Name,Own_Addres,SHAPE_Leng,...,PARID,STATECODE,STRUCTURE,TAXYR,YRBLT,ZIP1,taxsheet,parcel_count,classv2,taxsheetv2
0,POLYGON ((-94.13699017271023 37.35055489878356...,1401700000001000,159.589,160.0,0 THORN & COUNTY RD 50,,,"SEELA, RAYMOND E & FLORENCE A",19483 COUNTY RD 50 ;JASPER;MO;64755,13159.744243,...,,,,,,,,,,
1,POLYGON ((-94.14188721000529 37.33973595930397...,1401700000002000,161.077,158.22,5296 THORN RD,,,"NEIDIGH, JOSEPH A & LISA L",5269 THORN RD ;JASPER;MO;64755,13194.656023,...,,,,,,,,,,
2,POLYGON ((-94.14613735909428 37.35070183976294...,1401700000003000,323.007,317.32,5699 THORN RD,,,"WILSON, SAM L & KRISTI L",1005 DALTON RD ;CEDAR VALE;KS;67024,15873.444758,...,,,,,,,,,,
3,POLYGON ((-94.14660779875197 37.33704609240082...,1401700000004000,2.47968,2.48,5503 THORN RD,,,PLEASANT VIEW CEMETERY ASSN,0 ;GOLDEN CITY;MO;64748,1333.926784,...,,,,,,,,,,
4,POLYGON ((-94.15538782377411 37.35084969271262...,1401800000001000,313.296,313.5,20912 COUNTY RD 70,,,"POTTS, LOREN A JR & GEORGIA H",20912 COUNTY RD 70 ;JASPER;MO;64755,15641.42532,...,,,,,,,,,,


In [62]:
parceldata_gdf_dissolve_2010values['parid'].describe()

count                 57235
unique                57235
top       07121100000006000
freq                      1
Name: parid, dtype: object

In [63]:
pd.crosstab(parceldata_gdf_dissolve_2010values['classv2'], parceldata_gdf_dissolve_2010values['taxsheetv2'],margins=True, margins_name="Total")

taxsheetv2,2010 Comm,2010 Comm2010 Res,2010 Res,Total
classv2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,700,5,9406,10111
A,4,0,3,7
AR,0,2,0,2
C,1886,17,44,1947
CR,7,156,4,167
E,177,17,101,295
EC,1,1,0,2
ER,0,0,1,1
R,119,39,25138,25296
RC,5,3,0,8


In [64]:
pd.crosstab(parceldata_gdf_dissolve_2010values['Zoning'], parceldata_gdf_dissolve_2010values['classv2'],margins=True, margins_name="Total")

classv2,Unnamed: 1_level_0,A,C,CR,E,EC,ER,R,RC,Total
Zoning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C1,90,0,91,1,8,0,0,51,0,241
C1-PD,11,0,25,0,1,0,0,0,0,37
C2,64,0,126,2,9,0,0,17,0,218
C2-PD,6,0,10,0,0,0,0,1,0,17
C3,132,0,199,5,6,0,0,23,2,367
C3-PD,8,0,7,0,0,0,0,3,0,18
CO,39,0,19,1,2,0,0,40,0,101
CO-PD,7,0,23,0,0,0,0,0,0,30
M1,2,0,1,0,0,0,0,0,0,3
M1-PD,23,0,0,0,0,0,0,19,0,42


## Read in Census Block Data
Census Blocks provide an estimate of how many residiential address points (housing units) should be located in each block.

In [65]:
source_program = 'IN-CORE_1av2_Joplin_CleanBlockData_2019-07-10'
census_blocks_csv = source_program+"/"+source_program+"EPSG4269.csv"
census_blocks_df = pd.read_csv(census_blocks_csv)
census_blocks_df.head()

Unnamed: 0.1,Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,BLOCKCE,BLOCKID10,PARTFLG,HOUSING10,POP10,geometry,...,blk104269,blockid,apcount,pop10,gqpop10,popdiff,PLCGEOID10,PLCNAME10,PUMGEOID10,PUMNAME10
0,0,29,97,12100,1047,290970121001047,N,2,4,"POLYGON ((-94.13775 37.32550000000001, -94.138...",...,"POLYGON ((-94.13775 37.32550000000001, -94.138...",290970100000000.0,2.0,4.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
1,1,29,97,12100,1050,290970121001050,N,3,7,"POLYGON ((-94.137637 37.328675, -94.119315 37....",...,"POLYGON ((-94.137637 37.328675, -94.119315 37....",290970100000000.0,3.0,7.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
2,2,29,97,12100,1094,290970121001094,N,4,13,"POLYGON ((-94.214761 37.293836, -94.230751 37....",...,"POLYGON ((-94.214761 37.293836, -94.230751 37....",290970100000000.0,4.0,13.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
3,3,29,97,12100,1093,290970121001093,N,5,5,"POLYGON ((-94.197294 37.27723599999999, -94.19...",...,"POLYGON ((-94.197294 37.27723599999999, -94.19...",290970100000000.0,5.0,5.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
4,4,29,97,12100,1130,290970121001130,N,4,9,"POLYGON ((-94.151792 37.276275, -94.1519139999...",...,"POLYGON ((-94.151792 37.276275, -94.1519139999...",290970100000000.0,4.0,9.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA


In [66]:
census_blocks_gdf = gpd.GeoDataFrame(census_blocks_df)
census_blocks_gdf.head()

Unnamed: 0.1,Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,BLOCKCE,BLOCKID10,PARTFLG,HOUSING10,POP10,geometry,...,blk104269,blockid,apcount,pop10,gqpop10,popdiff,PLCGEOID10,PLCNAME10,PUMGEOID10,PUMNAME10
0,0,29,97,12100,1047,290970121001047,N,2,4,"POLYGON ((-94.13775 37.32550000000001, -94.138...",...,"POLYGON ((-94.13775 37.32550000000001, -94.138...",290970100000000.0,2.0,4.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
1,1,29,97,12100,1050,290970121001050,N,3,7,"POLYGON ((-94.137637 37.328675, -94.119315 37....",...,"POLYGON ((-94.137637 37.328675, -94.119315 37....",290970100000000.0,3.0,7.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
2,2,29,97,12100,1094,290970121001094,N,4,13,"POLYGON ((-94.214761 37.293836, -94.230751 37....",...,"POLYGON ((-94.214761 37.293836, -94.230751 37....",290970100000000.0,4.0,13.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
3,3,29,97,12100,1093,290970121001093,N,5,5,"POLYGON ((-94.197294 37.27723599999999, -94.19...",...,"POLYGON ((-94.197294 37.27723599999999, -94.19...",290970100000000.0,5.0,5.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
4,4,29,97,12100,1130,290970121001130,N,4,9,"POLYGON ((-94.151792 37.276275, -94.1519139999...",...,"POLYGON ((-94.151792 37.276275, -94.1519139999...",290970100000000.0,4.0,9.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA


In [67]:
census_blocks_gdf['geometry'].geom_type.describe()

count     0
unique    0
dtype: int64

In [68]:
# Use shapely.wkt loads to convert WKT to GeoSeries
from shapely.wkt import loads

census_blocks_gdf['geometry'] = census_blocks_gdf['geometry'].apply(lambda x: loads(x))
census_blocks_gdf['geometry'].geom_type.describe()

count        9621
unique          2
top       Polygon
freq         9615
dtype: object

In [69]:
census_blocks_gdf = census_blocks_gdf.set_geometry(census_blocks_gdf['geometry'])
census_blocks_gdf.crs = {'init':'epsg:4269'}
census_blocks_gdf.head()

Unnamed: 0.1,Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,BLOCKCE,BLOCKID10,PARTFLG,HOUSING10,POP10,geometry,...,blk104269,blockid,apcount,pop10,gqpop10,popdiff,PLCGEOID10,PLCNAME10,PUMGEOID10,PUMNAME10
0,0,29,97,12100,1047,290970121001047,N,2,4,"POLYGON ((-94.13775 37.32550000000001, -94.138...",...,"POLYGON ((-94.13775 37.32550000000001, -94.138...",290970100000000.0,2.0,4.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
1,1,29,97,12100,1050,290970121001050,N,3,7,"POLYGON ((-94.137637 37.328675, -94.119315 37....",...,"POLYGON ((-94.137637 37.328675, -94.119315 37....",290970100000000.0,3.0,7.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
2,2,29,97,12100,1094,290970121001094,N,4,13,"POLYGON ((-94.214761 37.293836, -94.230751 37....",...,"POLYGON ((-94.214761 37.293836, -94.230751 37....",290970100000000.0,4.0,13.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
3,3,29,97,12100,1093,290970121001093,N,5,5,"POLYGON ((-94.197294 37.27723599999999, -94.19...",...,"POLYGON ((-94.197294 37.27723599999999, -94.19...",290970100000000.0,5.0,5.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA
4,4,29,97,12100,1130,290970121001130,N,4,9,"POLYGON ((-94.151792 37.276275, -94.1519139999...",...,"POLYGON ((-94.151792 37.276275, -94.1519139999...",290970100000000.0,4.0,9.0,0.0,0.0,,,2902800,Jasper & Newton Counties PUMA


In [70]:
# Convert Census Block CRS to Buildings CRS
census_blocks_gdf = census_blocks_gdf.to_crs({'init': 'epsg:4326'})
census_blocks_gdf.crs

{'init': 'epsg:4326'}

### Select Blocks inside Parcel Boundary

In [71]:
census_blocks_gdf['BLOCKID10'].describe()

count    9.621000e+03
mean     2.911818e+14
std      2.383822e+11
min      2.909701e+14
25%      2.909701e+14
50%      2.909701e+14
75%      2.914502e+14
max      2.914502e+14
Name: BLOCKID10, dtype: float64

In [72]:
# Find the bounds of the Census Block File
# Add Small Buffer for blocks on the edges
buffer = 0.001
minx = parceldata_gdf_dissolve_2010values.bounds.minx.min() + buffer
miny = parceldata_gdf_dissolve_2010values.bounds.miny.min() + buffer
maxx = parceldata_gdf_dissolve_2010values.bounds.maxx.max() + buffer
maxy = parceldata_gdf_dissolve_2010values.bounds.maxy.max() + buffer
parceldata_gdf_dissolve_2010values_bounds = [minx, miny, maxx, maxy]
parceldata_gdf_dissolve_2010values_bounds

[-94.617280153866, 37.04921672450786, -94.05142215228076, 37.36517424596878]

In [73]:
# Select Blocks within Bounds of Study Area
# build the r-tree index - for blocks
sindex_census_blocks_gdf = census_blocks_gdf.sindex
possible_matches_index = list(sindex_census_blocks_gdf.intersection(parceldata_gdf_dissolve_2010values_bounds))
census_blocks_parcels_gdf = census_blocks_gdf.iloc[possible_matches_index]
census_blocks_parcels_gdf['BLOCKID10'].describe()

count    5.615000e+03
mean     2.909906e+14
std      9.711969e+10
min      2.909701e+14
25%      2.909701e+14
50%      2.909701e+14
75%      2.909701e+14
max      2.914502e+14
Name: BLOCKID10, dtype: float64

### Spatial JoinBlock Information to Parcels

In [74]:
# Confirm Count of Unique ID in layer to which data will be added
parceldata_gdf_dissolve_2010values['parid'].describe()

count                 57235
unique                57235
top       07121100000006000
freq                      1
Name: parid, dtype: object

In [75]:
# build the r-tree index - Using Representative Point
parceldata_gdf_dissolve_2010values.loc[parceldata_gdf_dissolve_2010values.index,'geometry'] = parceldata_gdf_dissolve_2010values['rppnt4326']
sindex_parceldata_gdf_dissolve_2010values = parceldata_gdf_dissolve_2010values.sindex
sindex_parceldata_gdf_dissolve_2010values

<geopandas.sindex.SpatialIndex at 0x24d5362fa20>

In [76]:
# find the points that intersect with each subpolygon and add ID to Point
for index, block in census_blocks_parcels_gdf.iterrows():
    if index%100==0:
        print(index)
        
    # find approximate matches with r-tree, then precise matches from those approximate ones
    possible_matches_index = list(sindex_parceldata_gdf_dissolve_2010values.intersection(block['geometry'].bounds))
    possible_matches = parceldata_gdf_dissolve_2010values.iloc[possible_matches_index]
    precise_matches = possible_matches[possible_matches.intersects(block['geometry'])]
    parceldata_gdf_dissolve_2010values.loc[precise_matches.index,'BLOCKID10'] = block['BLOCKID10']
    parceldata_gdf_dissolve_2010values.loc[precise_matches.index,'STATEFP10'] = block['STATEFP10']
    parceldata_gdf_dissolve_2010values.loc[precise_matches.index,'COUNTYFP10'] = block['COUNTYFP10']
    parceldata_gdf_dissolve_2010values.loc[precise_matches.index,'TRACTCE10'] = block['TRACTCE10']
    parceldata_gdf_dissolve_2010values.loc[precise_matches.index,'PUMGEOID10'] = block['PUMGEOID10']
    parceldata_gdf_dissolve_2010values.loc[precise_matches.index,'PUMNAME10'] = block['PUMNAME10']
    parceldata_gdf_dissolve_2010values.loc[precise_matches.index,'PLCGEOID10'] = block['PLCGEOID10']
    parceldata_gdf_dissolve_2010values.loc[precise_matches.index,'PLCNAME10'] = block['PLCNAME10']

3800
0
600
2400
2800
3200
4400
1700
4500
400
3100
800
4600
1800
200
4300
100
900
700
1200
5200
1000
1900
3700
5000
500
3600
3300
1300
3900
5300
4000
2000
4700
2200
2900
2700
1600
1500
2100
3500
4800
2600
4100
300
5100
4200
3000
1100
4900
1400
3400
2500
7900
9500
2300
9200


In [77]:
# Confirm Count of Unique ID in layer to which data will be added
parceldata_gdf_dissolve_2010values['parid'].describe()

count                 57235
unique                57235
top       07121100000006000
freq                      1
Name: parid, dtype: object

In [78]:
# Switch Block Geography back to polygons
parceldata_gdf_dissolve_2010values.loc[parceldata_gdf_dissolve_2010values.index,'geometry'] = parceldata_gdf_dissolve_2010values['prcl4326']
parceldata_gdf_dissolve_2010values.head()

Unnamed: 0,geometry,PIN,Graphic_Ac,Legal_Ac,Address,Notes,Zoning,Own_Name,Own_Addres,SHAPE_Leng,...,classv2,taxsheetv2,BLOCKID10,STATEFP10,COUNTYFP10,TRACTCE10,PUMGEOID10,PUMNAME10,PLCGEOID10,PLCNAME10
0,POLYGON ((-94.13699017271023 37.35055489878356...,1401700000001000,159.589,160.0,0 THORN & COUNTY RD 50,,,"SEELA, RAYMOND E & FLORENCE A",19483 COUNTY RD 50 ;JASPER;MO;64755,13159.744243,...,,,290970100000000.0,29.0,97.0,12100.0,2902800.0,Jasper & Newton Counties PUMA,,
1,POLYGON ((-94.14188721000529 37.33973595930397...,1401700000002000,161.077,158.22,5296 THORN RD,,,"NEIDIGH, JOSEPH A & LISA L",5269 THORN RD ;JASPER;MO;64755,13194.656023,...,,,290970100000000.0,29.0,97.0,12100.0,2902800.0,Jasper & Newton Counties PUMA,,
2,POLYGON ((-94.14613735909428 37.35070183976294...,1401700000003000,323.007,317.32,5699 THORN RD,,,"WILSON, SAM L & KRISTI L",1005 DALTON RD ;CEDAR VALE;KS;67024,15873.444758,...,,,290970100000000.0,29.0,97.0,12100.0,2902800.0,Jasper & Newton Counties PUMA,,
3,POLYGON ((-94.14660779875197 37.33704609240082...,1401700000004000,2.47968,2.48,5503 THORN RD,,,PLEASANT VIEW CEMETERY ASSN,0 ;GOLDEN CITY;MO;64748,1333.926784,...,,,290970100000000.0,29.0,97.0,12100.0,2902800.0,Jasper & Newton Counties PUMA,,
4,POLYGON ((-94.15538782377411 37.35084969271262...,1401800000001000,313.296,313.5,20912 COUNTY RD 70,,,"POTTS, LOREN A JR & GEORGIA H",20912 COUNTY RD 70 ;JASPER;MO;64755,15641.42532,...,,,290970100000000.0,29.0,97.0,12100.0,2902800.0,Jasper & Newton Counties PUMA,,


In [79]:
# Look at One Place plot the intersections and the city
place_gdf_map = fm.Map(location=[(miny+maxy)/2,(minx+maxx)/2], zoom_start=10)
joplin_blocks_gdf = census_blocks_gdf[census_blocks_gdf['PLCNAME10']=='Joplin']
joplin_parcels_gdf = parceldata_gdf_dissolve_2010values[parceldata_gdf_dissolve_2010values['PLCNAME10']=='Joplin']
blockstyle_function = lambda x: {'color':'green','fillColor': 'transparent' }

fm.GeoJson(joplin_blocks_gdf['geometry'],name='Census Blocks',style_function=blockstyle_function).add_to(place_gdf_map)
fm.GeoJson(joplin_parcels_gdf['geometry'],name='Parcels').add_to(place_gdf_map)
fm.LayerControl().add_to(place_gdf_map)
place_gdf_map.save(programname+'/'+programname+'joplin_parcels_blocks.html')
# Error Displaying Map display(neosho_place_gdf_map)

## How many parcels do not have blockids?

In [80]:
parceldata_gdf_dissolve_2010values.loc[parceldata_gdf_dissolve_2010values['BLOCKID10'].isnull()]

Unnamed: 0,geometry,PIN,Graphic_Ac,Legal_Ac,Address,Notes,Zoning,Own_Name,Own_Addres,SHAPE_Leng,...,classv2,taxsheetv2,BLOCKID10,STATEFP10,COUNTYFP10,TRACTCE10,PUMGEOID10,PUMNAME10,PLCGEOID10,PLCNAME10
1326,"POLYGON ((-94.60737629192255 37.3640126364568,...",5421800000001000,15.4264,0.0,,EMPIRE DISTRICT ELECTRIC,,EMPIRE DISTRICT ELECTRIC,,13972.212999,...,,,,,,,,,,
34979,POLYGON ((-94.61762151360274 37.14196682824367...,17502200000001001,6.11827,7.3,5700 STATE LINE AVE,,,"SCHROEDER, RALPH F TR",3609 E 20TH ST ;JOPLIN;MO;64801,4507.231749,...,R,2010 Res,,,,,,,,
34980,POLYGON ((-94.61776285811932 37.13833241481854...,17502240001001000,3.91112,2.37,0,,,"SCHROEDER, RALPH F TR",5700 STATE LINE RD ;JOPLIN;MO;64804,5425.919356,...,,,,,,,,,,


Parcels without Block IDS appear to be sliver polygons along the edge of the county line. Probably due to a mismatch between parcel polygons and block polygons.

## How many blocks do not have parcels?

In [81]:
# Collapse Blocks By Place Name and Count Blocks 
parceldata_gdf_dissolve_2010values_parcelcount = parceldata_gdf_dissolve_2010values[['BLOCKID10']]
parceldata_gdf_dissolve_2010values_parcelcount['bldgcount_parcel_sum'] = 1
parceldata_gdf_dissolve_2010values_parcelcount_sum = parceldata_gdf_dissolve_2010values_parcelcount.groupby(['BLOCKID10']).sum()
parceldata_gdf_dissolve_2010values_parcelcount_sum['bldgcount_parcel_sum'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


count    4509.000000
mean       12.692837
std        17.904977
min         1.000000
25%         5.000000
50%         9.000000
75%        15.000000
max       448.000000
Name: parcel_count, dtype: float64

In [82]:
# Add Block Count to Place Data
census_blocks_gdf_checkparcelcount = pd.merge(census_blocks_gdf, parceldata_gdf_dissolve_2010values_parcelcount_sum,
                                  left_on='BLOCKID10', right_on='BLOCKID10', how='left')
census_blocks_gdf_checkparcelcount.loc[(census_blocks_gdf_checkparcelcount['parcel_count'].isnull()) & 
                                      (census_blocks_gdf_checkparcelcount['COUNTYFP10'] == 97) &
                                      (census_blocks_gdf_checkparcelcount['HOUSING10'] > 0) & 
                                      (census_blocks_gdf_checkparcelcount['PLCNAME10'] == 'Joplin')].sort_values(by=['HOUSING10'])

Unnamed: 0.1,Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,BLOCKCE,BLOCKID10,PARTFLG,HOUSING10,POP10,geometry,...,blockid,apcount,pop10,gqpop10,popdiff,PLCGEOID10,PLCNAME10,PUMGEOID10,PUMNAME10,parcel_count
356,356,29,97,10200,2009,290970102002009,N,1,3,"POLYGON ((-94.50909899999999 37.113606, -94.50...",...,290970100000000.0,1.0,3.0,0.0,0.0,2937592.0,Joplin,2902800,Jasper & Newton Counties PUMA,
665,665,29,97,11900,3034,290970119003034,N,1,2,"POLYGON ((-94.41245499999999 37.068743, -94.41...",...,290970100000000.0,1.0,2.0,0.0,0.0,2937592.0,Joplin,2902800,Jasper & Newton Counties PUMA,
1718,1718,29,97,10200,2032,290970102002032,N,1,2,"POLYGON ((-94.508617 37.113221, -94.508611 37....",...,290970100000000.0,1.0,2.0,0.0,0.0,2937592.0,Joplin,2902800,Jasper & Newton Counties PUMA,
2701,2701,29,97,10400,3046,290970104003046,N,1,2,"POLYGON ((-94.44092499999999 37.055165, -94.44...",...,290970100000000.0,1.0,2.0,0.0,0.0,2937592.0,Joplin,2902800,Jasper & Newton Counties PUMA,
3957,3957,29,97,10900,3002,290970109003002,N,1,3,"POLYGON ((-94.54062599999999 37.084461, -94.54...",...,290970100000000.0,1.0,3.0,0.0,0.0,2937592.0,Joplin,2902800,Jasper & Newton Counties PUMA,
4014,4014,29,97,10400,3050,290970104003050,N,1,2,"POLYGON ((-94.44053699999999 37.05513, -94.439...",...,290970100000000.0,1.0,2.0,0.0,0.0,2937592.0,Joplin,2902800,Jasper & Newton Counties PUMA,
1404,1404,29,97,11000,5053,290970110005053,N,2,2,"POLYGON ((-94.53138899999999 37.077306, -94.52...",...,290970100000000.0,2.0,2.0,0.0,0.0,2937592.0,Joplin,2902800,Jasper & Newton Counties PUMA,
2365,2365,29,97,11900,1017,290970119001017,N,2,4,"POLYGON ((-94.432965 37.098214, -94.4322339999...",...,290970100000000.0,2.0,4.0,0.0,0.0,2937592.0,Joplin,2902800,Jasper & Newton Counties PUMA,
3476,3476,29,97,11900,1018,290970119001018,N,2,5,"POLYGON ((-94.433638 37.097731, -94.433593 37....",...,290970100000000.0,2.0,5.0,0.0,0.0,2937592.0,Joplin,2902800,Jasper & Newton Counties PUMA,
4476,4476,29,97,11000,5057,290970110005057,N,2,4,"POLYGON ((-94.52174199999999 37.077092, -94.52...",...,290970100000000.0,2.0,4.0,0.0,0.0,2937592.0,Joplin,2902800,Jasper & Newton Counties PUMA,


#### Block 290970104001006
The block is much smaller than the parcel. However when the buildings are assigned to the blocks and the parcel indidividually this block will have parcel data.

#### Block 290970108003036
Also a block that is smaller than the parcel. The Parcel is in 2 different blocks. The parcel does have 13 buildings, 6 in block 290970108003036 and 6 in block 290970108003024. The parcel has the address 605 W 25TH ST, which is in the path of the tornado. The zoning is R3 which would fit with the housing unit count of 18 (6 buildings with 3 units each)


#### Block 290970119001014
This is an interesting block. The parcels are long and most of the parcel is in a different block. But the street side of the parcel is in block 290970119001014. The neighboring block 290970119001013 will be assigned the parcels. In this case the block boundaries and parcel boundaries do not match well. The block layout follows the roads but the parcels do not follow the roads.


In [83]:
cols = [col for col in parceldata_gdf_dissolve_2010values]
cols

['geometry',
 'PIN',
 'Graphic_Ac',
 'Legal_Ac',
 'Address',
 'Notes',
 'Zoning',
 'Own_Name',
 'Own_Addres',
 'SHAPE_Leng',
 'SHAPE_Area',
 'rppnt4326',
 'prcl4326',
 'parid',
 '#',
 'ACRES',
 'ADRDIR',
 'ADRDIR.1',
 'ADRNO',
 'ADRNO.1',
 'ADRSTR',
 'ADRSTR.1',
 'APRBLDG',
 'APRLAND',
 'APRTOT',
 'CARD',
 'CITYNAME',
 'CLASS',
 'OWN1',
 'PARID',
 'STATECODE',
 'STRUCTURE',
 'TAXYR',
 'YRBLT',
 'ZIP1',
 'taxsheet',
 'parcel_count',
 'classv2',
 'taxsheetv2',
 'BLOCKID10',
 'STATEFP10',
 'COUNTYFP10',
 'TRACTCE10',
 'PUMGEOID10',
 'PUMNAME10',
 'PLCGEOID10',
 'PLCNAME10']

In [84]:
# Move Primary Key Column to first Column
cols = ['parid']  + [col for col in parceldata_gdf_dissolve_2010values if col != 'parid']
parceldata_gdf_dissolve_2010values = parceldata_gdf_dissolve_2010values[cols]

# remove redudant columns
redudant_columns = ['PIN_x','#','PIN_y','ADRDIR.1','ADRNO.1','ADRSTR.1','PIN','PARID']
cols = [col for col in parceldata_gdf_dissolve_2010values if col not in redudant_columns]
parceldata_gdf_dissolve_2010values = parceldata_gdf_dissolve_2010values[cols]

cols = [col for col in parceldata_gdf_dissolve_2010values]
cols

['parid',
 'geometry',
 'PIN',
 'Graphic_Ac',
 'Legal_Ac',
 'Address',
 'Notes',
 'Zoning',
 'Own_Name',
 'Own_Addres',
 'SHAPE_Leng',
 'SHAPE_Area',
 'rppnt4326',
 'prcl4326',
 'ACRES',
 'ADRDIR',
 'ADRNO',
 'ADRSTR',
 'APRBLDG',
 'APRLAND',
 'APRTOT',
 'CARD',
 'CITYNAME',
 'CLASS',
 'OWN1',
 'PARID',
 'STATECODE',
 'STRUCTURE',
 'TAXYR',
 'YRBLT',
 'ZIP1',
 'taxsheet',
 'parcel_count',
 'classv2',
 'taxsheetv2',
 'BLOCKID10',
 'STATEFP10',
 'COUNTYFP10',
 'TRACTCE10',
 'PUMGEOID10',
 'PUMNAME10',
 'PLCGEOID10',
 'PLCNAME10']

In [86]:
# Make parid a string by adding letter the front
parceldata_gdf_dissolve_2010values['parid'] = parceldata_gdf_dissolve_2010values['parid'].apply(lambda x : "P"+str(x))
parceldata_gdf_dissolve_2010values['parid'].head()

0    P01401700000001000
1    P01401700000002000
2    P01401700000003000
3    P01401700000004000
4    P01401800000001000
Name: parid, dtype: object

In [87]:
# Save Work at this point as CSV
savefile = sys.path[0]+"/"+programname+"/"+programname+"_EPSG4326.csv"
parceldata_gdf_dissolve_2010values.to_csv(savefile)