In [62]:
"""
Project for Olin College, Data Science, Spring 2017.

Data from:
https://www.nass.usda.gov/Surveys/Guide_to_NASS_Surveys/Bee_and_Honey/
http://usda.mannlib.cornell.edu/MannUsda/viewDocumentInfo.do?documentID=1191


unzip the csv zip files with

$  for VARIABLE in 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016; 
$  do   unzip *$VARIABLE.zip -d $VARIABLE; 
$  done

or pull them from my git.
"""

import numpy as np
import pandas
import os
import string

In [63]:
def get_all_csvs(csv_years=[2002+i for i in range(15)], csv_path="data/csvs/"):
    '''
    Returns a list of all csv files in the (hard-coded) year-directory combinations.
    '''
    all_csvs = []
    for i in csv_years:
        this_path = csv_path + str(i) + "/"
        these_files = os.listdir(this_path)
        these_csvs = [(this_path + i) for i in these_files if i[-4:] == ".csv"]
        all_csvs.extend(these_csvs)
    return all_csvs

In [64]:
class Csv_obj(object):
    def __init__(self, filename):
        self.name = filename
        year_start = filename.find("20")
        self.year = int(filename[year_start:year_start+4])
        self.data_type = filename[(filename.find("_")) : (filename.find(".csv"))]
        self.df = sanitize_then_load_csv(filename)
        
    def __str__(self):
        return str(self.year) + "\t" + self.data_type
    
    def isnull(self):
        if self.df is None:
            return True
        else:
            return False

In [71]:
def sanitize_then_load_csv(filename, min_col_length=9, max_col_length = None):
    """
    Gets around the metadata-y columns by scanning for the length of .split(',').
    TODO: make it ignore the "format"/grammatical commas which occur within a single entry in the csv.
    """
    f = open(filename)
    lines_to_skip = []
    for i, line in enumerate(f.readlines()):
        l_split = line.split(',') # will also split internal commas in strings. todo: fix. low priority.
        if len(l_split) < min_col_length:
            lines_to_skip.append(i)
        elif max_col_length != None:
            if len(l_split) > max_col_length:
                lines_to_skip.append(i)
    try:
        df = pandas.read_csv(filename, skiprows = lines_to_skip)
    except Exception as err:
        print "Cannot read: ", filename, "\t ", str(err)[str(err).find("Expected"):]
        df = None
    return df

In [72]:
all_csvs = get_all_csvs()

In [73]:
csv_objs = [Csv_obj(i) for i in all_csvs]

Cannot read:  data/csvs/2008/hony_all.csv 	  Expected 9 fields in line 155, saw 11

Cannot read:  data/csvs/2009/hony_all.csv 	  Expected 9 fields in line 155, saw 11

Cannot read:  data/csvs/2010/hony_all.csv 	  Expected 9 fields in line 155, saw 11

Cannot read:  data/csvs/2011/hony_all_tables.csv 	  Expected 9 fields in line 153, saw 11

Cannot read:  data/csvs/2012/hony_all_tables.csv 	  Expected 9 fields in line 152, saw 10

Cannot read:  data/csvs/2013/hony_all_tables.csv 	  Expected 9 fields in line 152, saw 10

Cannot read:  data/csvs/2014/hony_all_tables.csv 	  Expected 9 fields in line 151, saw 10

Cannot read:  data/csvs/2015/hony_all_tables.csv 	  Expected 9 fields in line 151, saw 10

Cannot read:  data/csvs/2016/hony_p00a_t005.csv 	  e
Cannot read:  data/csvs/2016/hony_all_tables.csv 	  Expected 9 fields in line 166, saw 10



In [74]:
csv_objs

[<__main__.Csv_obj at 0x7fae4cf4e190>,
 <__main__.Csv_obj at 0x7fae4ce22bd0>,
 <__main__.Csv_obj at 0x7fae4cf52790>,
 <__main__.Csv_obj at 0x7fae4d073490>,
 <__main__.Csv_obj at 0x7fae4cf4e7d0>,
 <__main__.Csv_obj at 0x7fae4ce18f50>,
 <__main__.Csv_obj at 0x7fae4ce18bd0>,
 <__main__.Csv_obj at 0x7fae4ce188d0>,
 <__main__.Csv_obj at 0x7fae4ce18090>,
 <__main__.Csv_obj at 0x7fae4ce181d0>,
 <__main__.Csv_obj at 0x7fae4ce18f10>,
 <__main__.Csv_obj at 0x7fae4ce18390>,
 <__main__.Csv_obj at 0x7fae4ce18250>,
 <__main__.Csv_obj at 0x7fae4ce18610>,
 <__main__.Csv_obj at 0x7fae4ce18750>,
 <__main__.Csv_obj at 0x7fae4ce187d0>,
 <__main__.Csv_obj at 0x7fae4ce18850>,
 <__main__.Csv_obj at 0x7fae4ce18590>,
 <__main__.Csv_obj at 0x7fae4ce183d0>,
 <__main__.Csv_obj at 0x7fae4ce18650>,
 <__main__.Csv_obj at 0x7fae4ce0f950>,
 <__main__.Csv_obj at 0x7fae4cdad4d0>,
 <__main__.Csv_obj at 0x7fae4cdad450>,
 <__main__.Csv_obj at 0x7fae4cdad650>,
 <__main__.Csv_obj at 0x7fae4cdad790>,
 <__main__.Csv_obj at 0x7