In [9]:
import os
from subprocess import check_call
from glob import glob
from collections import OrderedDict

import numpy as np
import pandas as pd

In [10]:
files = []
dirs = sorted(glob('data/????-????'))

regions = ['ENC', 'WNC']

all_files = []
for r in regions:
    for d in dirs:
        files_here = sorted(
            glob(f'{d}/{r}*.zip') +     glob(f'{d}/{r}*.ZIP')
        )
        all_files.extend(files_here)
all_files        

['data/1985-1987/ENCEN85.zip',
 'data/1985-1987/ENCEN86.zip',
 'data/1985-1987/ENCEN87.zip',
 'data/1988-1990/ENCEN88.zip',
 'data/1988-1990/ENCEN89.zip',
 'data/1988-1990/ENCEN90.zip',
 'data/1991-1993/ENCEN91.zip',
 'data/1991-1993/ENCEN92.zip',
 'data/1991-1993/ENCEN93.zip',
 'data/1994-1996/ENCEN94.zip',
 'data/1994-1996/ENCEN95.zip',
 'data/1994-1996/ENCEN96.zip',
 'data/1997-1999/ENCEN97.zip',
 'data/1997-1999/ENCEN98.zip',
 'data/1997-1999/ENCEN99.zip',
 'data/2000-2002/ENCEN00.zip',
 'data/2000-2002/ENCEN01.zip',
 'data/2000-2002/ENCEN02.zip',
 'data/2003-2005/ENCEN03.zip',
 'data/2003-2005/ENCEN04.zip',
 'data/2003-2005/ENCEN05.zip',
 'data/2006-2008/ENCEN06.ZIP',
 'data/2006-2008/ENCEN07.ZIP',
 'data/2006-2008/ENCEN08.zip',
 'data/2009-2011/ENCEN09.ZIP',
 'data/2009-2011/ENCEN10.zip',
 'data/2009-2011/ENCEN11.zip',
 'data/2012-2015/ENCEN12.zip',
 'data/2012-2015/ENCEN13.zip',
 'data/2012-2015/ENCEN14.zip',
 'data/2012-2015/ENCEN15.zip',
 'data/1985-1987/WNCEN85.zip',
 'data/1

In [11]:
def readline(line):
    """parse a single line of fertilizer data file"""
    
    column_defs = OrderedDict([
        ('Fertilizer Year', dict(nchar=2, numeric=True, scale_factor=1)), 
        ('Extra county data', dict(nchar=1, numeric=False)),
        ('State abbr', dict(nchar=2, numeric=False)),
        ('County FIPS code', dict(nchar=3, numeric=True, scale_factor=1)),
        ('Reporting period', dict(nchar=2, numeric=True, scale_factor=1)), 
        ('Quantity (tons)', dict(nchar=9, numeric=True, scale_factor=0.01)),
        ('Fertilizer code', dict(nchar=3, numeric=True, scale_factor=1)),
        ('Container', dict(nchar=1, numeric=True, scale_factor=1)), 
        ('Use', dict(nchar=1, numeric=True, scale_factor=1)), 
        ('Grade: N', dict(nchar=3, numeric=True, scale_factor=0.1)), 
        ('Grade: P', dict(nchar=3, numeric=True, scale_factor=0.1)),
        ('Grade: K', dict(nchar=3, numeric=True, scale_factor=0.1)),
    ])    
    
    ischar = ['Extra county data', 'State abbr']
    
    ndx0 = 0
    data = {}
    for name, info in column_defs.items():
        n = info['nchar']
        value = line[ndx0:ndx0+n]
        if info['numeric']:
            try:
                value = int(value) * info['scale_factor']        
            except:
                value = np.nan       
        data[name] = value
        ndx0 += n
    return data


def file_to_df(file_in):
    """convert dumbass data format to dataframe"""
    with open(file_in, 'r') as fid:
        lines = fid.readlines()
    lines = [l.strip() for l in lines]

    df_lines = []        
    for line in lines:
        df_lines.append(readline(line))
        
    return pd.DataFrame(df_lines)


def filter_df(df):
    """filter dataframe based on specified values; return dataframe including only matching rows"""
    
    filters = {
        'Fertilizer code': [16, 20, 24, 25, 27, 29, 31, 
                            50, 64, 73, 77, 207, 263, 265, 267, 
                            443, 463, 613, 629, 649, 652, 
                            661, 663, 665, 667, 702, 714, 720, 
                            726, 728, 732, 734, 736, 744, 754,   
                            770, 774, 780, 782, 783,],
    }
    
    sel = np.ones(len(df)).astype(np.bool)
    for key, values in filters.items():
        sel = sel & df[key].isin(values)
    return df.loc[sel]

In [12]:
diro = 'data/csv_output'
os.makedirs(diro, exist_ok=True)

dfs = []
for this_file in all_files:
    file_in = os.path.basename(this_file).replace('.zip', '.R')
    file_out = f'{diro}/{file_in}'.replace('.R', '.csv.gz')
    
    print(f'converting: {file_in} --> {file_out}')
    if not os.path.exists(file_in):
        check_call(['unzip', this_file])
    
    df = file_to_df(file_in)
    df = filter_df(df)    
    df.to_csv(file_out, compression='gzip')
    os.remove(file_in)
    
    dfs.append(df)
    
file_out = f'{diro}/{"-".join(regions)}-alldata.csv.gz'
df = pd.concat(dfs)
df.to_csv(file_out, compression='gzip')
df

converting: ENCEN85.R --> data/csv_output/ENCEN85.csv.gz
converting: ENCEN86.R --> data/csv_output/ENCEN86.csv.gz
converting: ENCEN87.R --> data/csv_output/ENCEN87.csv.gz
converting: ENCEN88.R --> data/csv_output/ENCEN88.csv.gz
converting: ENCEN89.R --> data/csv_output/ENCEN89.csv.gz
converting: ENCEN90.R --> data/csv_output/ENCEN90.csv.gz
converting: ENCEN91.R --> data/csv_output/ENCEN91.csv.gz
converting: ENCEN92.R --> data/csv_output/ENCEN92.csv.gz
converting: ENCEN93.R --> data/csv_output/ENCEN93.csv.gz
converting: ENCEN94.R --> data/csv_output/ENCEN94.csv.gz
converting: ENCEN95.R --> data/csv_output/ENCEN95.csv.gz
converting: ENCEN96.R --> data/csv_output/ENCEN96.csv.gz
converting: ENCEN97.R --> data/csv_output/ENCEN97.csv.gz
converting: ENCEN98.R --> data/csv_output/ENCEN98.csv.gz
converting: ENCEN99.R --> data/csv_output/ENCEN99.csv.gz
converting: ENCEN00.R --> data/csv_output/ENCEN00.csv.gz
converting: ENCEN01.R --> data/csv_output/ENCEN01.csv.gz
converting: ENCEN02.R --> data/

FileNotFoundError: [Errno 2] No such file or directory: 'ENCEN06.ZIP'