In [111]:
import os
import csv
import pprint
from collections import defaultdict

# Path of the raw contributions CSV that is passed to parse_file() at the bottom of the script.
INFILE = '2008election_wv.csv'
# Path of the cleaned CSV produced by write_file(); it carries the extra
# 'occupation_category' column in addition to the header names listed below.
OUTFILE = 'clean2008election_wv.csv'
# header names: cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,
#               contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num,tran_id,election_tp

def parse_file(datafile):
    """
    Reads in a CSV and creates a list of dictionaries representing each row of data

    The first line of the file supplies the keys (csv.DictReader semantics); every
    following line becomes one dictionary. A file with no data lines yields [].

    Args:
      datafile: Path of the input csv file to process

    Returns:
      data: A list of dictionaries representing each row in the csv file
    """
    import sys

    # The csv module expects a binary file on Python 2 but a text file opened
    # with newline='' on Python 3, so choose the mode that matches the interpreter.
    if sys.version_info[0] >= 3:
        csvfile = open(datafile, 'r', newline='')
    else:
        csvfile = open(datafile, 'rb')

    with csvfile:
        return list(csv.DictReader(csvfile))

def write_file(data, datafile):
    """
    Writes the rows to a CSV file using a fixed column order

    The columns are the original header names plus 'occupation_category', which is
    placed right after 'contbr_occupation'. A header line is always written.

    Args:
      data: An iterable of dictionaries, one per row. A key missing from a row is
            written as an empty field; a key that is not one of the known columns
            makes csv.DictWriter raise ValueError.
      datafile: Path of the output csv file to create (overwritten if it exists)
    """
    import sys

    fieldnames = ['cmte_id', 'cand_id', 'cand_nm', 'contbr_nm', 'contbr_city', 'contbr_st',
                  'contbr_zip', 'contbr_employer', 'contbr_occupation', 'occupation_category',
                  'contb_receipt_amt', 'contb_receipt_dt', 'receipt_desc', 'memo_cd',
                  'memo_text', 'form_tp', 'file_num', 'tran_id', 'election_tp']

    # Write to the path the caller asked for (not a module-level constant), using
    # the file mode the csv module expects on the running interpreter:
    # binary on Python 2, text with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(datafile, 'w', newline='')
    else:
        csvfile = open(datafile, 'wb')

    with csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

def audit_occupations(data):
    """
    Tags every row with an 'occupation_category' and prints the occupations found per category

    A row falls into a category when its 'contbr_occupation' value contains one of
    that category's keywords as a substring (case-sensitive, so 'SELF' also matches
    'SELF-EMPLOYED'). Rows are modified in place.

    NOTE: an occupation can match several categories (e.g. 'RETIRED COLLEGE PROFESSOR'
    matches both EDUCATION and RETIRED). The raw value is then listed under every
    matching category in the printed report, but the row keeps only the category
    that is visited last while iterating the OCCUPATIONS dictionary.

    Args:
      data: A list of row dictionaries; each must have a 'contbr_occupation' key.
            A value of None (a short CSV line) is treated as an empty occupation.

    Returns:
      data: The same list that was passed in, with 'occupation_category' set on
            every row ('OTHER' when no keyword matched and the row did not
            already carry a category).
    """
    EDUCATION_OCCUPATIONS = ['TEACHER', 'HIGH SCHOOL', 'PRINCIPAL', 'EDUCATION', 'INSTRUCTOR',
                             'PROFESSOR', 'LIBRARIAN', 'DEAN', 'GUIDANCE COUNSELOR', 'GRADUATE',
                             'SCHOOL COUNSELOR', 'SUPERINTENDENT', 'VISITING LECTURER']

    LEGAL_OCCUPATIONS = ['ATTY', 'ATTORNEY', 'LAWYER', 'LAW CLERK', 'LEGAL ASSISTANT',
                         'LEGAL SECRETARY', 'PARALEGAL']

    EXECUTIVE_OCCUPATIONS = ['CEO', 'CFO', 'CHAIRMAN', 'MANAGER', 'MANAGING DIRECTOR',
                             'MANAGEMENT', 'OPERATIONS DIRECTOR', 'OWNER', 'PARTNER',
                             'PRESIDENT', 'VP']

    UNEMPLOYED_OCCUPATIONS = ['NOT EMPLOYED', 'UNEMPLOYED']

    HOMEMAKER_OCCUPATIONS = ['HOMEMAKER', 'HOUSE HUSBAND', 'HOUSEWIFE', 'MOM']

    SELF_EMPLOYED_OCCUPATIONS = ['SELF']

    POLITICAL_OCCUPATIONS = ['JUDGE', 'CITY COUNCIL', 'POLITICAL']

    RELIGIOUS_OCCUPATIONS = ['MINISTER', 'PASTOR']

    RETIRED_OCCUPATIONS = ['RETIRED']

    STUDENT_OCCUPATIONS = ['STUDENT']

    OCCUPATIONS = {'EDUCATION': EDUCATION_OCCUPATIONS,
                   'LEGAL': LEGAL_OCCUPATIONS,
                   'EXECUTIVE': EXECUTIVE_OCCUPATIONS,
                   'UNEMPLOYED': UNEMPLOYED_OCCUPATIONS,
                   'HOMEMAKER': HOMEMAKER_OCCUPATIONS,
                   'SELF_EMPLOYED': SELF_EMPLOYED_OCCUPATIONS,
                   'POLITICAL': POLITICAL_OCCUPATIONS,
                   'RELIGIOUS': RELIGIOUS_OCCUPATIONS,
                   'RETIRED': RETIRED_OCCUPATIONS,
                   'STUDENT': STUDENT_OCCUPATIONS,
                   }

    # category name -> distinct raw occupation strings that matched it
    categories = defaultdict(set)

    for row in data:
        # csv.DictReader fills missing trailing fields with None; treat as blank.
        raw_occupation = row['contbr_occupation'] or ''

        for key, occupation_list in OCCUPATIONS.items():
            if any(occupation in raw_occupation for occupation in occupation_list):
                categories[key].add(raw_occupation)
                # Tag the row that was passed in, not a module-level global.
                row['occupation_category'] = key

        if 'occupation_category' not in row:
            row['occupation_category'] = 'OTHER'

    # print(...) with a single argument behaves the same on Python 2 and 3.
    for key in categories:
        print(key + ":")
        pprint.pprint(categories[key])
    return data
   
    

# Load the raw contributions, tag each row with an occupation category, and
# write the tagged rows out. audit_occupations() hands back the list it was
# given, so its return value can be passed straight to write_file().
election_data_raw = parse_file(INFILE)
write_file(audit_occupations(election_data_raw), OUTFILE)

RETIRED:
set(['RETIRED',
     'RETIRED ACCOUNTANT',
     'RETIRED COAL MINER',
     'RETIRED COLLEGE PROFESSOR',
     'RETIRED SCHOOL TEACHER   PRINCEPAL',
     'RETIRED SCHOOL TEACHER + PRINCEPAL',
     'SEMI-RETIRED SUBSTITUTE TEACHER'])
SELF_EMPLOYED:
set(['SELF', 'SELF EMPLOYED', 'SELF-EMPLOYED'])
UNEMPLOYED:
set(['NOT EMPLOYED', 'UNEMPLOYED'])
EXECUTIVE:
set(['ABSTRACTOR, OFFICE MANAGER',
     'ACCOUNT MANAGER',
     'APPLICATIONS MANAGER',
     'AREA MANAGER',
     'ASSISTANT OFFICE MANAGER',
     'ATTORNEY / PRESIDENT',
     'BAKERY OWNER/MANAGAER',
     'BOND MANAGER',
     'BOOK STORE OWNER',
     'BUISNESS MANAGER',
     'BUSINESS MANAGER',
     'BUSINESS OWNER',
     'CASE MANAGER',
     'CEO',
     'CEO, COMMUNITY ORG., PLAYWRIGHT',
     'CEO/FURNITURE MANUFACTURER',
     'CEO/OWNER',
     'CFO',
     'CHAIRMAN',
     'CHAIRMAN & C.E.O.',
     'CHAIRMAN OF THE BOARD',
     'CIVILIAN NAVY MANAGER',
     'CLIENT PROGRAM MANAGER',
     'CO-OWNER',
     'COAL COMPANY MANAGER',
