In [493]:
import pandas as pd
import urllib.request
import csv
import zipfile
import subprocess
import numpy as np
import os
import io
from bs4 import BeautifulSoup
from pandas.tseries.offsets import DateOffset
import re


### Download raw files

You can use this section to get the monthly zip. It can be run fairly harmlessly as it shouldnt overwrite old versions you have

##### Function definition

In [404]:
def get_raw_marts(save_path='../raw/retail_sales_services/marts/staging/'
                  ,file_name='MARTS-mf.zip'):
    urllib.request.urlretrieve(
        'https://www.census.gov/econ/currentdata/datasets/MARTS-mf.zip'
        ,save_path+file_name
    )

In [565]:
def open_save_and_parse_mrts(mrts_path='../raw/retail_sales_services/marts/staging/'
                            ,file_name='MARTS-mf.zip'
                            ,save_path='../raw/retail_sales_services/marts/releases/'):
    zf = zipfile.ZipFile(mrts_path+file_name)
    output=str(subprocess.check_output('unzip -l '+mrts_path+file_name
                                ,shell=True),encoding='utf-8')
    
    file_context = []
    for row in output.split('\n')[3:-3]:
        file_context.append(
            pd.to_datetime(
                row.strip().split('  ')[1].strip()
            )
        )
    max_date = str(np.max(file_context))[:10].replace('-','_')
    
    save_path = save_path+'{}'.format(max_date)
    
    try:
        os.mkdir(save_path)
    except:
        ''
        
    sections = []
    with zf.open('README') as f:
        for row in f:
            if ' Section' in str(row):
                sections.append(str(row,'utf-8').replace(' Section','').strip())        
                
    path=save_path
    with zf.open('MARTS-mf.csv') as f:
        martsreader = csv.reader(io.TextIOWrapper(f,'utf8'), delimiter=','
                                ,quotechar='"',quoting=csv.QUOTE_MINIMAL)

        for line in martsreader:
            if len(line)==1:
                strip_value = line[0].strip()
            else:
                strip_value=''
            if strip_value in sections:
                name=strip_value.lower().replace(' ','_')
                try:
                    csvfile.close()
                except:
                    ''
                csvfile=open(path+name+'.csv','w')
                martswriter = csv.writer(csvfile,delimiter=',',quotechar='"'
                                        ,quoting=csv.QUOTE_MINIMAL)
            else:
                if len(line)>0:
                    martswriter.writerow(line)

    csvfile.close()                

##### Run Code

In [430]:
# with zf.open('README') as f:
#     readmereader = csv.reader(io.TextIOWrapper(f,'utf8'), delimiter='|')

In [431]:
# print(str(
#     io.BytesIO(
#         zf.open('README').read()
#     ).read()
# ,'utf-8'))

In [432]:
get_raw_marts()

In [433]:
open_save_and_parse_mrts()

In [47]:
save_path = '../raw/retail_sales_services/marts/releases/{}'.format(max_date)

## Get release history

In [440]:
def save_census_release_page(save_path='../raw/retail_sales_services/marts/releases/raw_release_history/meta/'
                  ,file_name='historical_marts.html'):
    urllib.request.urlretrieve(
        'https://www2.census.gov/retail/releases/historical/marts/'
        ,save_path+file_name
    )

In [561]:
def parse_census_release_page(save_path='../raw/retail_sales_services/marts/releases/raw_release_history/meta/'
                  ,file_name='historical_marts.html'):
    soup = BeautifulSoup(open(save_path+file_name), "html5lib")
    table_array=[]
    start_append=False
    for row in soup.find_all('table')[0].find_all('tr'):
        if start_append:
            current_row=[]
            for cell in row.find_all('td')[1:-1]:
                current_row.append(cell.text.strip())
            table_array.append(current_row)
        elif 'Parent Directory' in row.text:
            start_append=True
    all_files_df = pd.DataFrame(table_array[:-1],columns=['file_name','date_added','size'])
    
    all_files_df.loc[:,('file_type')]=all_files_df.loc[:,('file_name')].str.split('.').str[1]
    
    all_files_df.loc[:,('file_name_date')]=all_files_df.loc[:,('file_name')]\
    .apply(lambda x: re.sub('[^0-9]','', x))\
    .apply(lambda x: '20'+x[:2]+'-'+x[-2:] if (len(x)==4 and x[:2]<'40')
          else ( '19'+x[:2]+'-'+x[-2:] if len(x)==4 else ''))    
    
    return all_files_df

In [452]:
def retreve_save_release(file_name
                         ,save_path='../raw/retail_sales_services/marts/releases/raw_release_history/files/'):
    urllib.request.urlretrieve(
        'https://www2.census.gov/retail/releases/historical/marts/{}'.format(file_name)
        ,save_path+'{}'.format(file_name)
    )

In [1140]:
def parse_text_sic(text_file_path):
    
    table_dict={'1A':{0:[],1:[]},'1B':{0:[],1:[]}}
    start_append=False
    indices = [0,15,50,64,73]

    position=-1
    annual_fix=False
    current_table=''
    with open(text_file_path, encoding="latin-1") as f:
        for row in f:
            
            if '12 month total' in row:
                annual_fix=True
            if row.strip()[:5]=='TABLE':
                start_append=False
                if '1A' in row and not annual_fix:
                    current_table='1A'
                    position=-1
                    sales_as_of=pd.to_datetime(row.strip().split('--')[-1])
                elif '1B' in row and not annual_fix:
                    current_table='1B'
                    position=-1
                else:
                    break
                
            if len(row.strip())>0 and row.strip()[0] in ['2','1'] and len(row[0].strip())==0 and not annual_fix:
                position=position+1
                table_dict[current_table][position]=[]
                start_append=False
            


            if 'Retail trade, total' in row:                
                if annual_fix:
                    start_append=False
                    annual_fix=False
                else:
                    start_append=True
            
            if start_append:
#                print(row)
                table_dict[current_table][position].append(row)

            if row.strip()[:3]=='594':
                start_append=False
#    return table_dict
    split_position_dict={'1A':{},'1B':{}}
    for key in table_dict:
        for sub_key in table_dict[key]:
            split_position_dict[key][sub_key]=\
                max([x.rfind('...')+3 for x in table_dict[key][sub_key] if len(x)>0])
    
    for key in table_dict:
        for sub_key in table_dict[key]:
            fix_array=[]
            for row in table_dict[key][sub_key]:
                left=row[:split_position_dict[key][sub_key]]
                right=row[split_position_dict[key][sub_key]:]
                if len(left.strip())>0 or len(right.strip())>0:
                    fix_array.append([
                        left]+\
                        [x.strip() for x in right.split(' ') if len(x.strip())>0]
                    )
            table_dict[key][sub_key]=fix_array
            
    split_first_array_dict={'1A':{},'1B':{}}
    for key in table_dict:
        for sub_key in table_dict[key]:
            left=[x[0] for x in table_dict[key][sub_key]]
            split_first_array_dict[key][sub_key]=\
                max([len(re.search(re.compile("(?s:.*)[0-9]"), x).group()) for x in left
                    if re.search(re.compile("(?s:.*)[0-9]"), x)])


    for key in table_dict:
        for sub_key in table_dict[key]:
            fix_array=[]            
            for row in table_dict[key][sub_key]:
                left=row[0][:split_first_array_dict[key][sub_key]]
                right=row[0][split_first_array_dict[key][sub_key]:]
                temp =[left,right]+row[1:]
                temp = [x.strip() for x in temp]
                temp[1]=temp[1].replace('u"\u2026"','').rstrip('.')
                temp= temp[:2]+[x.replace(',','') for x in temp[2:]]
                
                fix_array.append(temp)
            
            table_dict[key][sub_key]=copy.deepcopy(fix_array)

    df_1A_0=pd.DataFrame(table_dict['1A'][0],columns=[
        'sic','description'
        ,sales_as_of
        ,sales_as_of-DateOffset(months=1)
        ,sales_as_of-DateOffset(months=2)]).set_index(['sic','description']).stack().reset_index()    
    df_1A_0.columns=['sic','description','as_of_date','value']
    df_1A_0.loc[:,('is_sa')]=1

    df_1A_1=pd.DataFrame(table_dict['1A'][1],columns=[
        'sic','description'
        ,sales_as_of-DateOffset(years=1)
        ,sales_as_of-DateOffset(months=1)-DateOffset(years=1)
        ]).set_index(['sic','description']).stack().reset_index()
    df_1A_1.columns=['sic','description','as_of_date','value']
    df_1A_1.loc[:,('is_sa')]=1

    df_1B_0=pd.DataFrame(table_dict['1B'][0],columns=[
        'sic','description'
        ,sales_as_of
        ,sales_as_of-DateOffset(months=1)
        ,sales_as_of-DateOffset(months=2)]).set_index(['sic','description']).stack().reset_index()
    df_1B_0.columns=['sic','description','as_of_date','value']
    df_1B_0.loc[:,('is_sa')]=0

    df_1B_1=pd.DataFrame(table_dict['1B'][1],columns=[
        'sic','description'
        ,sales_as_of-DateOffset(years=1)
        ,sales_as_of-DateOffset(months=1)-DateOffset(years=1)
    ]).set_index(['sic','description']).stack().reset_index()
    df_1B_1.columns=['sic','description','as_of_date','value']
    df_1B_1.loc[:,('is_sa')]=1        
    
    all_output=pd.concat([df_1A_0,df_1A_1,df_1B_0,df_1B_1])
    all_output=all_output.query('value!="(NA)"')
#    all_output.loc[:,('value')]=all_output.loc[:,('value')].apply(lambda x: float(x) if x!='(*)' else float('NaN')).copy()
    all_output.loc[:,('release_as_of_date')]=sales_as_of
    return all_output

In [444]:
save_census_release_page()

In [506]:
file_names=parse_census_release_page()

In [545]:
all_text_files = file_names.query('file_type=="txt"').sort_values('file_name_date')#\
#['file_name'].tolist()

In [1108]:
# i=0
# for name in all_text_files:
#     print(name)
#     time.sleep(2)    
#     retreve_save_release(name)

In [1047]:
sic_second_format=\
    all_text_files.query('file_name_date>="1999-01" and file_name_date<="2001-04"')

In [1284]:
sic_second_format=\
    all_text_files.query('file_name_date>="1994-01" and file_name_date<="2001-04"')
i=0
for name in sic_second_format['file_name']:
#    print(name)
    save_path_string='../raw/retail_sales_services/marts/releases/raw_release_history/files/'
    current_df = parse_text_sic(save_path_string+name)
    
    if i==0:
        final_all_release=current_df.copy()
        i=i+1
    else:
        final_all_release=pd.concat([final_all_release,current_df.copy()])


  unique_elements = set(islice(arg, check_count))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [1285]:
final_all_release.to_csv('../cleaned/retail_sales_services/marts/sic_history.csv',index=False)

In [1277]:
def parse_text_naics(text_file_path):
    
    table_dict={'1A':[],'1B':[]}
    start_append=False

#    annual_fix=False
    current_table=''
    with open(text_file_path, encoding="latin-1") as f:
        for row in f:
            

            if row.strip()[:5] in ['TABLE','TTABL'] or '1B.  ESTIMATED MONTHLY SALES FOR RETAIL AND FOOD SERVICES' in row:
                start_append=False
                if '1A' in row:
                    current_table='1A'
                    sales_as_of=pd.to_datetime(row.strip().split('--')[-1])
                elif '1B' in row:
                    current_table='1B'
                else:
                    break            


            if 'Retail & food services, total' in row:
                start_append=True
            
            if start_append:
                table_dict[current_table].append(row)

            if row.strip()[:3]=='722':
                start_append=False

    split_position_dict={'1A':{},'1B':{}}
    for key in table_dict:
        split_position_dict[key]=\
            max([x.rfind('...')+3 for x in table_dict[key] if len(x)>0])
    
    
    for key in table_dict:
        fix_array=[]
        for row in table_dict[key]:
            left=row[:split_position_dict[key]]
            right=row[split_position_dict[key]:]
            if len(left.strip())>0 or len(right.strip())>0:
                fix_array.append([
                        re.sub(r'\([0-9]\)', '', left)]+\
                        [x.strip() for x in right.split(' ') if len(x.strip())>0]
                    )
            table_dict[key]=fix_array

    split_first_array_dict={'1A':{},'1B':{}}
    for key in table_dict:
        left=[x[0] for x in table_dict[key]]
        split_first_array_dict[key]=\
                max([len(re.search(re.compile("(?s:.*)[0-9]"), x).group()) for x in left
                    if re.search(re.compile("(?s:.*)[0-9]"), x)])

    for key in table_dict:
        fix_array=[]            
        for row in table_dict[key]:
            left=row[0][:split_first_array_dict[key]]
            right=row[0][split_first_array_dict[key]:]
            temp =[left,right]+row[1:]
            temp = [x.strip() for x in temp]
            temp[1]=temp[1].replace('u"\u2026"','').rstrip('.')
            temp= temp[:2]+[x.replace(',','') for x in temp[2:]]
                
            fix_array.append(temp)
            
        table_dict[key]=fix_array
        
    df_1A=pd.DataFrame(table_dict['1A'],columns=[
        'naics','description'
        ,sales_as_of
        ,sales_as_of-DateOffset(months=1)
        ,sales_as_of-DateOffset(months=2)        
        ,sales_as_of-DateOffset(years=1)
        ,sales_as_of-DateOffset(years=1)-DateOffset(months=1)        
    ]).set_index(['naics','description']).stack().reset_index()    
    df_1A.columns=['naics','description','as_of_date','value']
    df_1A.loc[:,('is_sa')]=1

    df_1B=pd.DataFrame(table_dict['1B'],columns=[
        'naics','description'
        ,'year'
        ,'pct_change_from_prior'
        ,sales_as_of
        ,sales_as_of-DateOffset(months=1)
        ,sales_as_of-DateOffset(months=2)        
        ,sales_as_of-DateOffset(years=1)
        ,sales_as_of-DateOffset(years=1)-DateOffset(months=1)        
    ]).set_index(['naics','description']).drop(['year','pct_change_from_prior'],axis=1)\
    .stack().reset_index()    
    
    df_1B.columns=['naics','description','as_of_date','value']
    df_1B.loc[:,('is_sa')]=0        
    
    all_output=pd.concat([df_1A,df_1B])
    all_output=all_output.query('value!="(NA)"')
#    all_output.loc[:,('value')]=all_output.loc[:,('value')].apply(lambda x: float(x) if x!='(*)' else float('NaN')).copy()
    all_output.loc[:,('release_as_of_date')]=sales_as_of
    return all_output

In [1278]:
naics_format=\
    all_text_files.query('file_name_date>"2001-04"')
i=0
for name in naics_format['file_name']:
    print(name)
    save_path_string='../raw/retail_sales_services/marts/releases/raw_release_history/files/'
    current_df = parse_text_naics(save_path_string+name)
    
    if i==0:
        final_all_release=current_df.copy()
        i=i+1
    else:
        final_all_release=pd.concat([final_all_release,current_df.copy()])


rs0105.txt
rs0106.txt
rs0107.txt
rs0108.txt
rs0109.txt
rs0110.txt


  unique_elements = set(islice(arg, check_count))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


rs0111.txt
rs0112.txt
rs0201.txt
rs0202.txt
rs0203.txt
rs0204.txt
rs0205.txt
rs0206.txt
rs0207.txt
rs0208.txt
rs0209.txt
rs0210.txt
rs0211.txt
rs0212.txt
rs0301.txt
rs0302.txt
rs0303.txt
rs0304.txt
rs0305.txt
rs0306.txt
rs0307.txt
rs0308.txt
rs0309.txt
rs0310.txt
rs0311.txt
rs0312.txt
rs0401.txt
rs0402.txt
rs0403.txt
rs0404.txt
rs0405.txt
rs0406.txt
rs0407.txt
rs0408.txt
rs0409.txt
rs0410.txt
rs0411.txt
rs0412.txt
rs0501.txt
rs0502.txt
rs0503.txt
rs0504.txt
rs0505.txt
rs0506.txt
rs0507.txt
rs0508.txt
rs0509.txt
rs0510.txt
rs0511.txt
rs0512.txt
rs0601.txt
rs0602.txt
rs0603.txt
rs0604.txt
rs0605.txt
rs0606.txt
rs0607.txt
rs0608.txt
rs0609.txt
rs0610.txt
rs0611.txt
rs0612.txt
rs0701.txt
rs0702.txt
rs0703.txt
rs0704.txt
rs0705.txt
rs0706.txt
rs0707.txt
rs0708.txt
rs0709.txt
rs0710.txt
rs0711.txt
rs0712.txt
rs0801.txt
rs0802.txt
rs0803.txt
rs0804.txt
rs0805.txt
rs0806.txt
rs0807.txt
rs0808.txt
rs0809.txt
rs0810.txt
rs0811.txt
rs0812.txt
rs0901.txt
rs0902.txt
rs0903.txt
rs0904.txt
rs0905.txt

'GAFO'

In [1282]:
final_all_release.to_csv('../cleaned/retail_sales_services/marts/naics_history.csv',index=False)

In [1281]:
final_all_release

Unnamed: 0,naics,description,as_of_date,value,is_sa,release_as_of_date
0,,"Retail & food services, total",2001-05-01,291306,1,2001-05-01
1,,"Retail & food services, total",2001-04-01,291090,1,2001-05-01
2,,"Retail & food services, total",2001-03-01,287102,1,2001-05-01
3,,"Retail & food services, total",2000-05-01,280547,1,2001-05-01
4,,"Retail & food services, total",2000-04-01,279961,1,2001-05-01
...,...,...,...,...,...,...
185,722,Food services & drinking places,2012-12-01,45671,0,2012-12-01
186,722,Food services & drinking places,2012-11-01,42964,0,2012-12-01
187,722,Food services & drinking places,2012-10-01,44077,0,2012-12-01
188,722,Food services & drinking places,2011-12-01,43010,0,2012-12-01
