In [1]:
import pandas as pd
import numpy as np

In [2]:
# Uses the test file path for annual places data
# See https://www.census.gov/construction/bps/sample.html for more information
path = 'https://www.census.gov/construction/bps/sample/so2006a.txt'

In [3]:
# Read in the sample file, header rows are split into top two rows
# separator needs to be fixed! r'\,|\t'
df = pd.read_table(path,header='infer',sep=r',',index_col=False,skipinitialspace=True,engine='python')
df.head()

Unnamed: 0,Survey,State,6-Digit,County,Census Place,FIPS Place,FIPS MCD,Pop,CSA,CBSA,...,1-unit rep,Unnamed: 31,Unnamed: 32,2-units rep,Unnamed: 34,Unnamed: 35,3-4 units rep,Unnamed: 37,Unnamed: 38,5+ units rep
0,Date,Code,ID,Code,Code,Code,Code,,Code,Code,...,Units,Value,Bldgs,Units,Value,Bldgs,Units,Value,Bldgs,Units
1,2006,01,001000,067,0005,00124,90009,2987.0,222,20020,...,1,202000,0,0,0,0,0,0,0,0
2,2006,01,002000,073,0010,00460,91404,4965.0,142,13820,...,6,742940,0,0,0,0,0,0,0,0
3,2006,01,004000,065,0020,00676,93015,521.0,999,46220,...,0,0,0,0,0,0,0,0,0,0
4,2006,01,005000,117,0025,00820,92205,22619.0,142,13820,...,268,40442142,0,0,0,0,0,0,0,0


In [4]:
# Because header rows are split, some columns are unnamed
df.columns[0:]

Index(['Survey', 'State', '6-Digit', 'County', 'Census Place', 'FIPS Place',
       'FIPS MCD', 'Pop', 'CSA', 'CBSA', 'Footnote', 'Central', 'Zip',
       'Region', 'Division', 'Number of', 'Place', 'Unnamed: 17', '1-unit',
       'Unnamed: 19', 'Unnamed: 20', '2-units', 'Unnamed: 22', 'Unnamed: 23',
       '3-4 units', 'Unnamed: 25', 'Unnamed: 26', '5+ units', 'Unnamed: 28',
       'Unnamed: 29', '1-unit rep', 'Unnamed: 31', 'Unnamed: 32',
       '2-units rep', 'Unnamed: 34', 'Unnamed: 35', '3-4 units rep',
       'Unnamed: 37', 'Unnamed: 38', '5+ units rep'],
      dtype='object')

In [5]:
# fix the split column header name, clean up the table
df.columns = df.columns + ' ' + df.iloc[0,:]
df = df.iloc[1:].reset_index(drop=True)

In [6]:
df.head()

Unnamed: 0,Survey Date,State Code,6-Digit ID,County Code,Census Place Code,FIPS Place Code,FIPS MCD Code,NaN,CSA Code,CBSA Code,...,1-unit rep Units,Unnamed: 31 Value,Unnamed: 32 Bldgs,2-units rep Units,Unnamed: 34 Value,Unnamed: 35 Bldgs,3-4 units rep Units,Unnamed: 37 Value,Unnamed: 38 Bldgs,5+ units rep Units
0,2006,1,1000,67,5,124,90009,2987.0,222,20020,...,1,202000,0,0,0,0,0,0,0,0
1,2006,1,2000,73,10,460,91404,4965.0,142,13820,...,6,742940,0,0,0,0,0,0,0,0
2,2006,1,4000,65,20,676,93015,521.0,999,46220,...,0,0,0,0,0,0,0,0,0,0
3,2006,1,5000,117,25,820,92205,22619.0,142,13820,...,268,40442142,0,0,0,0,0,0,0,0
4,2006,1,6000,95,30,988,91458,17247.0,999,99999,...,54,8931000,11,22,1260000,0,0,0,9,72


In [7]:
df.columns

Index([         'Survey Date',           'State Code',           '6-Digit ID',
                'County Code',    'Census Place Code',      'FIPS Place Code',
              'FIPS MCD Code',                    nan,             'CSA Code',
                  'CBSA Code',        'Footnote Code',         'Central City',
                   'Zip Code',          'Region Code',        'Division Code',
       'Number of Months Rep',           'Place Name',    'Unnamed: 17 Bldgs',
               '1-unit Units',    'Unnamed: 19 Value',    'Unnamed: 20 Bldgs',
              '2-units Units',    'Unnamed: 22 Value',    'Unnamed: 23 Bldgs',
            '3-4 units Units',    'Unnamed: 25 Value',    'Unnamed: 26 Bldgs',
             '5+ units Units',    'Unnamed: 28 Value',    'Unnamed: 29 Bldgs',
           '1-unit rep Units',    'Unnamed: 31 Value',    'Unnamed: 32 Bldgs',
          '2-units rep Units',    'Unnamed: 34 Value',    'Unnamed: 35 Bldgs',
        '3-4 units rep Units',    'Unnamed: 37 Value

In [None]:
# rename columns so they are sorted for 1-unit buildings, units, value, 2 units, buildings, values, etc.
# fix NaN for "Pop" column
# do this for both reported and imputed data