In [1]:
import pandas as pd
import requests
import os
import secrets
import sys
sys.path.append('./../cbp')
import utils

In [6]:
# READ 2006 DATA FILE 
cbp06 = pd.read_csv('./cbp06co.txt')

In [7]:
# Filter for Texas
tx06 = cbp06[cbp06.fipstate == 48]
tx06.head()

Unnamed: 0,fipstate,fipscty,naics,empflag,emp,qp1,ap,est,n1_4,n5_9,...,n100_249,n250_499,n500_999,n1000,n1000_1,n1000_2,n1000_3,n1000_4,censtate,cencty
1791801,48,1,------,,11591,79324,344842,935,510,205,...,7,3,2,0,0,0,0,0,74,1
1791802,48,1,11----,A,0,0,0,2,0,2,...,0,0,0,0,0,0,0,0,74,1
1791803,48,1,113///,A,0,0,0,2,0,2,...,0,0,0,0,0,0,0,0,74,1
1791804,48,1,1131//,A,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,74,1
1791805,48,1,11311/,A,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,74,1


In [10]:
tx06.reset_index(inplace=True)

In [17]:
tx06.loc[99998]

index       1891799
fipstate         48
fipscty         347
naics        5313//
empflag         NaN
emp              25
qp1             104
ap              420
est               6
n1_4              4
n5_9              0
n10_19            2
n20_49            0
n50_99            0
n100_249          0
n250_499          0
n500_999          0
n1000             0
n1000_1           0
n1000_2           0
n1000_3           0
n1000_4           0
censtate         74
cencty          347
Name: 99998, dtype: object

In [4]:
# Clean up NAICS codes
tx06.naics = tx06.naics.str.replace('-', '')
tx06.naics = tx06.naics.str.replace('/', '')
tx06.naics = tx06.apply(lambda row: '00' if row.naics == '' else row.naics, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [5]:
# Clean up County FIPS codes
def county_fips(row):
    
    fips_str = str(row['fipscty'])
    if len(fips_str) == 1:
        fips = '00' + fips_str
    elif len(fips_str) == 2:
        fips = '0' + fips_str
    else:
        fips = fips_str
        
    return fips

tx06['county'] = tx06.apply(county_fips, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
tx06.head()

Unnamed: 0,fipstate,fipscty,naics,empflag,emp,qp1,ap,est,n1_4,n5_9,...,n250_499,n500_999,n1000,n1000_1,n1000_2,n1000_3,n1000_4,censtate,cencty,county
1791801,48,1,0,,11591,79324,344842,935,510,205,...,3,2,0,0,0,0,0,74,1,1
1791802,48,1,11,A,0,0,0,2,0,2,...,0,0,0,0,0,0,0,74,1,1
1791803,48,1,113,A,0,0,0,2,0,2,...,0,0,0,0,0,0,0,74,1,1
1791804,48,1,1131,A,0,0,0,1,0,1,...,0,0,0,0,0,0,0,74,1,1
1791805,48,1,11311,A,0,0,0,1,0,1,...,0,0,0,0,0,0,0,74,1,1


In [7]:
# Reduce to relevant columns

tx06 = tx06.loc[:, ['emp', 'est', 'naics', 'fipstate', 'county']]

In [8]:
tx06.head()

Unnamed: 0,emp,est,naics,fipstate,county
1791801,11591,935,0,48,1
1791802,0,2,11,48,1
1791803,0,2,113,48,1
1791804,0,1,1131,48,1
1791805,0,1,11311,48,1


In [9]:
naics2007 = utils.update_naics(years='2002-2007', old_series=tx06.naics)
naics2012 = utils.update_naics(years='2007-2012', old_series=naics2007)

In [10]:
tx06['naics2012'] = naics2012

In [11]:
tx06.head()

Unnamed: 0,emp,est,naics,fipstate,county,naics2012
1791801,11591,935,0,48,1,0
1791802,0,2,11,48,1,11
1791803,0,2,113,48,1,113
1791804,0,1,1131,48,1,1131
1791805,0,1,11311,48,1,11311


In [18]:
tx06.fipstate = tx06.fipstate.astype('str')

In [24]:
tx06.dtypes

EMP           int64
ESTAB         int64
old_naics    object
state        object
county       object
NAICS2012    object
dtype: object

In [20]:
tx06.columns = ['EMP', 'ESTAB', 'old_naics', 'state', 'county', 'NAICS2012']

In [30]:
tx06.NAICS2012.replace(to_replace='31', value='31-33', inplace=True)
tx06.NAICS2012.replace(to_replace='44', value='44-45', inplace=True)
tx06.NAICS2012.replace(to_replace='48', value='48-49', inplace=True)

In [31]:
tx06.to_csv('./texas_cbp_2006.csv', index=False)

In [8]:
tx06_clean = pd.read_csv('./texas_cbp_2006.csv')

In [19]:
tx06_clean = tx06_clean.merge(tx06[['empflag']], left_index=True, right_index=True)

In [20]:
tx06_clean

Unnamed: 0,EMP,ESTAB,old_naics,state,county,NAICS2012,empflag
0,11591,935,0,48,1,00,
1,0,2,11,48,1,11,A
2,0,2,113,48,1,113,A
3,0,1,1131,48,1,1131,A
4,0,1,11311,48,1,11311,A
5,0,1,113110,48,1,113110,A
6,0,1,1133,48,1,1133,A
7,0,1,11331,48,1,11331,A
8,0,1,113310,48,1,113310,A
9,526,24,21,48,1,21,


In [21]:
emp_imputation = {'A': 10,
                  'B': 60,
                  'C': 175,
                  'E': 375,
                  'F': 750,
                  'G': 1750,
                  'H': 3750,
                  'I': 7500,
                  'J': 17500,
                  'K': 37500,
                  'L': 75000,
                  'M': 110000}

In [22]:
tx06_clean.dtypes

EMP           int64
ESTAB         int64
old_naics     int64
state         int64
county        int64
NAICS2012    object
empflag      object
dtype: object

In [None]:
tx06_clean['EMP'] = tx06_clean.apply(
    lambda x: emp_imputation[x.empflag] if pd.notnull(x.empflag) else x.EMP,
    axis=1)

In [37]:
tx06_clean.drop('empflag', axis=1, inplace=True)

In [38]:
tx06_clean.to_csv('./texas_cbp_2006.csv')