## Test file for observing County and Places U.S. Census Bureau Building Permit Survey data

#### Used to explore table structure and formatting

In [1]:
import pandas as pd
import numpy as np

In [2]:
from geo import stco

### County Data

In [3]:
# Uses the test file path for annual County data
# See https://www.census.gov/construction/bps/sample.html for more information 
path_co = 'https://www.census.gov/construction/bps/sample/co2002a.txt'

In [4]:
# Read in the sample file, header rows are split into top two rows
# separator needs to be fixed! r'\,|\t'
df = pd.read_table(path_co,header=0,sep=',',index_col=False,skipinitialspace=True,engine='python')
df.head()

Unnamed: 0,Survey,FIPS,FIPS.1,Region,Division,County,Unnamed: 6,1-unit,Unnamed: 8,Unnamed: 9,...,1-unit rep,Unnamed: 20,Unnamed: 21,2-units rep,Unnamed: 23,Unnamed: 24,3-4 units rep,Unnamed: 26,Unnamed: 27,5+units rep
0,Date,State,County,Code,Code,Name,Bldgs,Units,Value,Bldgs,...,Units,Value,Bldgs,Units,Value,Bldgs,Units,Value,Bldgs,Units
1,2002,01,001,3,6,Autauga County,276,276,13856975,0,...,276,13856975,0,0,0,0,0,0,0,0
2,2002,01,003,3,6,Baldwin County,2009,2009,269671818,8,...,1964,263888785,8,16,1181904,2,8,613450,10,173
3,2002,01,005,3,6,Barbour County,21,21,1600783,0,...,20,1515783,0,0,0,0,0,0,0,0
4,2002,01,007,3,6,Bibb County,2,2,140000,0,...,2,140000,0,0,0,0,0,0,0,0


In [5]:
# Because header rows are split, some columns are unnamed
df.columns[0:]

Index(['Survey', 'FIPS', 'FIPS.1', 'Region', 'Division', 'County',
       'Unnamed: 6', '1-unit', 'Unnamed: 8', 'Unnamed: 9', '2-units',
       'Unnamed: 11', 'Unnamed: 12', '3-4 units', 'Unnamed: 14', 'Unnamed: 15',
       '5+ units', 'Unnamed: 17', 'Unnamed: 18', '1-unit rep', 'Unnamed: 20',
       'Unnamed: 21', '2-units rep', 'Unnamed: 23', 'Unnamed: 24',
       '3-4 units rep', 'Unnamed: 26', 'Unnamed: 27', '5+units rep'],
      dtype='object')

In [6]:
# fix the split column header name, clean up the table
df.columns = df.columns + ' ' + df.iloc[0,:]
df = df.iloc[1:].reset_index(drop=True)

In [7]:
df.head()

Unnamed: 0,Survey Date,FIPS State,FIPS.1 County,Region Code,Division Code,County Name,Unnamed: 6 Bldgs,1-unit Units,Unnamed: 8 Value,Unnamed: 9 Bldgs,...,1-unit rep Units,Unnamed: 20 Value,Unnamed: 21 Bldgs,2-units rep Units,Unnamed: 23 Value,Unnamed: 24 Bldgs,3-4 units rep Units,Unnamed: 26 Value,Unnamed: 27 Bldgs,5+units rep Units
0,2002,1,1,3,6,Autauga County,276,276,13856975,0,...,276,13856975,0,0,0,0,0,0,0,0
1,2002,1,3,3,6,Baldwin County,2009,2009,269671818,8,...,1964,263888785,8,16,1181904,2,8,613450,10,173
2,2002,1,5,3,6,Barbour County,21,21,1600783,0,...,20,1515783,0,0,0,0,0,0,0,0
3,2002,1,7,3,6,Bibb County,2,2,140000,0,...,2,140000,0,0,0,0,0,0,0,0
4,2002,1,9,3,6,Blount County,45,45,6029991,0,...,45,6029991,0,0,0,1,3,121500,0,0


In [8]:
df.columns

Index(['Survey Date', 'FIPS State', 'FIPS.1 County', 'Region Code',
       'Division Code', 'County Name', 'Unnamed: 6 Bldgs', '1-unit Units',
       'Unnamed: 8 Value', 'Unnamed: 9 Bldgs', '2-units Units',
       'Unnamed: 11 Value', 'Unnamed: 12 Bldgs', '3-4 units Units',
       'Unnamed: 14 Value', 'Unnamed: 15 Bldgs', '5+ units Units',
       'Unnamed: 17 Value', 'Unnamed: 18 Bldgs', '1-unit rep Units',
       'Unnamed: 20 Value', 'Unnamed: 21 Bldgs', '2-units rep Units',
       'Unnamed: 23 Value', 'Unnamed: 24 Bldgs', '3-4 units rep Units',
       'Unnamed: 26 Value', 'Unnamed: 27 Bldgs', '5+units rep Units'],
      dtype='object')

In [9]:
dfdf.rename(columns={'FIPS.1 County':'FIPS County'})

Unnamed: 0,Survey Date,FIPS State,FIPS County,Region Code,Division Code,County Name,Unnamed: 6 Bldgs,1-unit Units,Unnamed: 8 Value,Unnamed: 9 Bldgs,...,1-unit rep Units,Unnamed: 20 Value,Unnamed: 21 Bldgs,2-units rep Units,Unnamed: 23 Value,Unnamed: 24 Bldgs,3-4 units rep Units,Unnamed: 26 Value,Unnamed: 27 Bldgs,5+units rep Units
0,2002,01,001,3,6,Autauga County,276,276,13856975,0,...,276,13856975,0,0,0,0,0,0,0,0
1,2002,01,003,3,6,Baldwin County,2009,2009,269671818,8,...,1964,263888785,8,16,1181904,2,8,613450,10,173
2,2002,01,005,3,6,Barbour County,21,21,1600783,0,...,20,1515783,0,0,0,0,0,0,0,0
3,2002,01,007,3,6,Bibb County,2,2,140000,0,...,2,140000,0,0,0,0,0,0,0,0
4,2002,01,009,3,6,Blount County,45,45,6029991,0,...,45,6029991,0,0,0,1,3,121500,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3008,2002,56,037,4,8,Sweetwater County,48,48,7394084,0,...,37,5950530,0,0,0,0,0,0,0,0
3009,2002,56,039,4,8,Teton County,131,131,62976579,3,...,131,62976579,3,6,389718,4,16,4442400,1,44
3010,2002,56,041,4,8,Uinta County,58,58,5082750,0,...,32,2860617,0,0,0,0,0,0,0,0
3011,2002,56,043,4,8,Washakie County,3,3,113000,0,...,3,113000,0,0,0,0,0,0,0,0


In [18]:
df.dtypes

Survey Date            object
FIPS State             object
FIPS.1 County          object
Region Code            object
Division Code          object
County Name            object
Unnamed: 6 Bldgs       object
1-unit Units           object
Unnamed: 8 Value       object
Unnamed: 9 Bldgs       object
2-units Units          object
Unnamed: 11 Value      object
Unnamed: 12 Bldgs      object
3-4 units Units        object
Unnamed: 14 Value      object
Unnamed: 15 Bldgs      object
5+ units Units         object
Unnamed: 17 Value      object
Unnamed: 18 Bldgs      object
1-unit rep Units       object
Unnamed: 20 Value      object
Unnamed: 21 Bldgs      object
2-units rep Units      object
Unnamed: 23 Value      object
Unnamed: 24 Bldgs      object
3-4 units rep Units    object
Unnamed: 26 Value      object
Unnamed: 27 Bldgs      object
5+units rep Units      object
dtype: object

In [17]:
df['FIPS STCO'] = df['FIPS State'] + df['FIPS County']

KeyError: 'FIPS County'

In [10]:
# rename 'FIPS.1 County' to 'FIPS County'
# fix "unnamed: " & rename columns so they are identified for 1-unit buildings, 1-unit units, 1-unit value, etc
# do this for both imputed and reported ('rep') data columns

### Places Data

In [11]:
# Uses the test file path for annual Places data
# See https://www.census.gov/construction/bps/sample.html for more information
path_pl = 'https://www.census.gov/construction/bps/sample/so2006a.txt'

In [12]:
# Read in the sample file, header rows are split into top two rows
# separator needs to be fixed! r'\,|\t'
dff = pd.read_table(path_pl,header=0,sep=',',index_col=False,skipinitialspace=True,engine='python')
dff.head()

Unnamed: 0,Survey,State,6-Digit,County,Census Place,FIPS Place,FIPS MCD,Pop,CSA,CBSA,...,1-unit rep,Unnamed: 31,Unnamed: 32,2-units rep,Unnamed: 34,Unnamed: 35,3-4 units rep,Unnamed: 37,Unnamed: 38,5+ units rep
0,Date,Code,ID,Code,Code,Code,Code,,Code,Code,...,Units,Value,Bldgs,Units,Value,Bldgs,Units,Value,Bldgs,Units
1,2006,01,001000,067,0005,00124,90009,2987.0,222,20020,...,1,202000,0,0,0,0,0,0,0,0
2,2006,01,002000,073,0010,00460,91404,4965.0,142,13820,...,6,742940,0,0,0,0,0,0,0,0
3,2006,01,004000,065,0020,00676,93015,521.0,999,46220,...,0,0,0,0,0,0,0,0,0,0
4,2006,01,005000,117,0025,00820,92205,22619.0,142,13820,...,268,40442142,0,0,0,0,0,0,0,0


In [13]:
# Because header rows are split, some columns are unnamed
dff.columns[0:]

Index(['Survey', 'State', '6-Digit', 'County', 'Census Place', 'FIPS Place',
       'FIPS MCD', 'Pop', 'CSA', 'CBSA', 'Footnote', 'Central', 'Zip',
       'Region', 'Division', 'Number of', 'Place', 'Unnamed: 17', '1-unit',
       'Unnamed: 19', 'Unnamed: 20', '2-units', 'Unnamed: 22', 'Unnamed: 23',
       '3-4 units', 'Unnamed: 25', 'Unnamed: 26', '5+ units', 'Unnamed: 28',
       'Unnamed: 29', '1-unit rep', 'Unnamed: 31', 'Unnamed: 32',
       '2-units rep', 'Unnamed: 34', 'Unnamed: 35', '3-4 units rep',
       'Unnamed: 37', 'Unnamed: 38', '5+ units rep'],
      dtype='object')

In [14]:
# fix the split column header name, clean up the table
dff.columns = dff.columns + ' ' + dff.iloc[0,:]
dff = dff.iloc[1:].reset_index(drop=True)

In [15]:
dff.head()

Unnamed: 0,Survey Date,State Code,6-Digit ID,County Code,Census Place Code,FIPS Place Code,FIPS MCD Code,NaN,CSA Code,CBSA Code,...,1-unit rep Units,Unnamed: 31 Value,Unnamed: 32 Bldgs,2-units rep Units,Unnamed: 34 Value,Unnamed: 35 Bldgs,3-4 units rep Units,Unnamed: 37 Value,Unnamed: 38 Bldgs,5+ units rep Units
0,2006,1,1000,67,5,124,90009,2987.0,222,20020,...,1,202000,0,0,0,0,0,0,0,0
1,2006,1,2000,73,10,460,91404,4965.0,142,13820,...,6,742940,0,0,0,0,0,0,0,0
2,2006,1,4000,65,20,676,93015,521.0,999,46220,...,0,0,0,0,0,0,0,0,0,0
3,2006,1,5000,117,25,820,92205,22619.0,142,13820,...,268,40442142,0,0,0,0,0,0,0,0
4,2006,1,6000,95,30,988,91458,17247.0,999,99999,...,54,8931000,11,22,1260000,0,0,0,9,72


In [16]:
dff.columns

Index([         'Survey Date',           'State Code',           '6-Digit ID',
                'County Code',    'Census Place Code',      'FIPS Place Code',
              'FIPS MCD Code',                    nan,             'CSA Code',
                  'CBSA Code',        'Footnote Code',         'Central City',
                   'Zip Code',          'Region Code',        'Division Code',
       'Number of Months Rep',           'Place Name',    'Unnamed: 17 Bldgs',
               '1-unit Units',    'Unnamed: 19 Value',    'Unnamed: 20 Bldgs',
              '2-units Units',    'Unnamed: 22 Value',    'Unnamed: 23 Bldgs',
            '3-4 units Units',    'Unnamed: 25 Value',    'Unnamed: 26 Bldgs',
             '5+ units Units',    'Unnamed: 28 Value',    'Unnamed: 29 Bldgs',
           '1-unit rep Units',    'Unnamed: 31 Value',    'Unnamed: 32 Bldgs',
          '2-units rep Units',    'Unnamed: 34 Value',    'Unnamed: 35 Bldgs',
        '3-4 units rep Units',    'Unnamed: 37 Value

In [None]:
# fix NaN for "Pop" column
# fix "unnamed: " & rename columns so they are identified for 1-unit buildings, 1-unit units, 1-unit value, etc
# do this for both imputed and reported ('rep') data columns