In [19]:
import csv
import sys

class TopCounts(object):
    """
    Computes top counts for a read in CSV file 
    
    Args:
        input_filename (str): CSV file
        occupation_output_filename (str): filename where to save occupation output 
        state_output_filename (str): filename where to save state output 
        delimiter (str): CSV delimiter character 
        quotechar (str): Quote character in CSV file 

    Attributes:
        input_filename (str): CSV filename
        occupation_output_filename (str): occupation output filename
        state_output_filename (str): state output filename
        delimiter (str): delimiter character for read and write  
        quotechar (str): Quote character
    """
    def __init__(self, input_filename, occupation_output_filename, state_output_filename, 
                 delimiter=';', quotechar='"'):
        self.input_filename = input_filename
        self.occupation_output_filename = occupation_output_filename
        self.state_output_filename = state_output_filename
        self.delimiter = delimiter
        self.quotechar = quotechar
        
        
    def compute_counts(self, certified_value='CERTIFIED', status='STATUS'):
        """
        Computes counts for desired column field based on column names 
        Args:
            param1 (int): The first parameter.
            param2 (:obj:`str`, optional): The second parameter. Defaults to None.
                Second line of description should be indented.
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            counts (dict): dictionary of {desired_col : count}
            N (int): total number of certified applications regardless of occupation
        """
        occupation_counts = {}
        state_counts = {}
        with open(self.input_filename, 'r') as f:
            # read file 
            lines = csv.reader(f, quotechar=self.quotechar, delimiter=self.delimiter,
                             quoting=csv.QUOTE_ALL)
            # get column names
            columns = next(lines)
            print(columns)
            status_idx, occ_index, state_index = self._get_indices(columns, status=status)
            # get total number of certified applications regardless of occupation
            N = 0
            for line in lines:
                if line[status_idx] == certified_value:
                    occ = line[occ_index]
                    state = line[state_index]
                    N += 1
                    # increment count based on occupation
                    if occ not in occupation_counts:
                        occupation_counts[occ] = 0
                    occupation_counts[occ] += 1
                    # increment count based on state
                    if state not in state_counts:
                        state_counts[state] = 0
                    state_counts[state] += 1
                    
        # return count dicts and total number of certified applications regardless of occupation
        return occupation_counts, state_counts, N

    def _get_indices(self, header_columns, status):
        """
        return indices value of status, occupation and state columns
        """
        for i, split_col in enumerate([col.split('_') for col in header_columns]):
            for split in split_col:
                # UPDATE 
                if status in split:
                    status_idx = i
                # handles both `SOC_NAME` and `LCA_CASE_SOC_NAME`
                if 'SOC' and 'NAME' in split:
                    occ_index = i
                # handles both `EMPLOYER_STATE` and `LCA_CASE_EMPLOYER_STATE`
                if 'EMPLOYER' and 'STATE' in split:
                    state_index = i
        return status_idx, occ_index, state_index

    def sort_values(self, counts, top_N=10):
        """
        Sorts `counts` to return `top_N` columns and counts 
        Args:
            counts (dict): Count dictionary
            top_N (int): Number of top columns 
        
        Returns:
            top (list): top N list of tuples [(desired_col, counts), ...]
        """
#         top = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:top_N]
        import operator
#         s = sorted(s, key = operator.itemgetter(1, 2))
#         top = sorted(counts.items(), key=operator.itemgetter(1, 0), reverse=True)[:top_N]
        top = sorted(counts.items(), key=lambda x:(-x[1],x[0]))[:top_N]
        return top
    
    def add_percentage(self, top, N):
        """
        Add percentage of applications that have been certified compared 
        to total number of certified applications regardless of state. 
        Args:
            top (list): top N list of tuples [(desired_col, counts), ...]
            N (int): total number of certified applications regardless of occupation
        
        Returns:
            (list): top N list of tuples [(desired_col, counts, percentage), ...]
        """
        print([top[i] + (((str(round((top[i][1] / N) * 100, 1)) + '%'),)) for i in range(len(top))])
        return [top[i] + (((str(round((top[i][1] / N) * 100, 1)) + '%'),)) for i in range(len(top))]
    
    def write_to_file(self, top, filename):
        """
        Saves output to output_filename
        Args:
            top (list): top N list of tuples [(desired_col, counts), ...]
        """
        name = filename.split('_')[-1].split('.')[0].upper()
        with open(filename, 'w') as f:
            # UPDATE 
            f.write('TOP_{}'.format(name) + self.delimiter + \
                    'NUMBER_CERTIFIED_APPLICATIONS'  + self.delimiter + \
                    'PERCENTAGE' + '\n')
            for line in top:
                line = self.delimiter.join(map(str, line))
                f.write(line + '\n')
        

def main():
    top_model = TopCounts(input_filename='../input/H1B_FY_2016.csv', 
                               occupation_output_filename='../output/top_10_occupations.txt',
                               state_output_filename='../output/top_10_states.txt', 
                               delimiter=';')
    occ_counts, state_counts, N = top_model.compute_counts(certified_value='CERTIFIED')

    top_occs = top_model.sort_values(occ_counts, top_N=10)
    top_states = top_model.sort_values(state_counts, top_N=10)
    top_occs = top_model.add_percentage(top_occs, N)
    top_states = top_model.add_percentage(top_states, N)
    top_model.write_to_file(top_occs, top_model.occupation_output_filename)
    top_model.write_to_file(top_states, top_model.state_output_filename)

if __name__ == '__main__':
    main()

['', 'CASE_NUMBER', 'CASE_STATUS', 'CASE_SUBMITTED', 'DECISION_DATE', 'VISA_CLASS', 'EMPLOYMENT_START_DATE', 'EMPLOYMENT_END_DATE', 'EMPLOYER_NAME', 'EMPLOYER_ADDRESS', 'EMPLOYER_CITY', 'EMPLOYER_STATE', 'EMPLOYER_POSTAL_CODE', 'EMPLOYER_COUNTRY', 'EMPLOYER_PROVINCE', 'EMPLOYER_PHONE', 'EMPLOYER_PHONE_EXT', 'AGENT_ATTORNEY_NAME', 'AGENT_ATTORNEY_CITY', 'AGENT_ATTORNEY_STATE', 'JOB_TITLE', 'SOC_CODE', 'SOC_NAME', 'NAIC_CODE', 'TOTAL_WORKERS', 'FULL_TIME_POSITION', 'PREVAILING_WAGE', 'PW_UNIT_OF_PAY', 'PW_WAGE_SOURCE', 'PW_SOURCE_YEAR', 'PW_SOURCE_OTHER', 'WAGE_RATE_OF_PAY_FROM', 'WAGE_RATE_OF_PAY_TO', 'WAGE_UNIT_OF_PAY', 'H-1B_DEPENDENT', 'WILLFUL_VIOLATOR', 'WORKSITE_CITY', 'WORKSITE_COUNTY', 'WORKSITE_STATE', 'WORKSITE_POSTAL_CODE', 'ORIGINAL_CERT_DATE']
[('SOFTWARE DEVELOPERS, APPLICATIONS', 106758, '18.7%'), ('COMPUTER SYSTEMS ANALYSTS', 88370, '15.5%'), ('COMPUTER PROGRAMMERS', 72112, '12.7%'), ('COMPUTER OCCUPATIONS, ALL OTHER', 48598, '8.5%'), ('SOFTWARE DEVELOPERS, SYSTEMS SOFTW