### Challenge Submission

### Goal 1:
> Create a lazy iterator that will produce a named tuple for each row of the data. <br>
Considerations:<br>
> - Don't hardcode the name of the fields for namedtuple <br>
> - The data type should be correct for each field, e.g. dates should be parsed as date objects <br>
> - Use lazy iteration; the entire file can't be loaded into memory <br>

In [96]:
from functools import namedtuple
from functools import partial
from datetime import datetime

# Parse clean header
def clean_header(s):
    """Normalise a raw header line into snake_case column names.

    Every space becomes an underscore and the text is lower-cased. Nothing
    is stripped, so a trailing newline in ``s`` is preserved.
    """
    underscored = '_'.join(s.split(' '))
    return underscored.lower()
 
# Parse rows skipping the header
def read_data(path):
    """Lazily yield the raw data lines of the file at ``path``.

    The first line is treated as the header and skipped. The remaining
    lines are yielded one at a time, exactly as read (trailing newline
    included), so the whole file is never held in memory. An empty file
    yields nothing.
    """
    with open(path) as f:
        # Passing a default keeps an empty file from raising StopIteration
        # inside this generator, which Python would turn into RuntimeError.
        next(f, None)
        yield from f
        
# Parse integer
def parse_int(value, *, default=None):
    """Convert ``value`` to an ``int``, or give back ``default``.

    ``default`` is returned when ``int(value)`` raises ``ValueError``
    (for example on an empty or non-numeric string).
    """
    try:
        number = int(value)
    except ValueError:
        return default
    else:
        return number
    
# Parse date
def parse_date(value, *, default=None):
    """Parse a month/day/year string into a ``datetime.date``.

    ``default`` is returned when ``value`` does not match the
    ``%m/%d/%Y`` format and ``strptime`` raises ``ValueError``.
    """
    try:
        moment = datetime.strptime(value, '%m/%d/%Y')
    except ValueError:
        return default
    return moment.date()
    
# Parse string
def parse_string(string, *, default=None):
    """Strip surrounding whitespace from ``string``.

    Returns the stripped text, or ``default`` when nothing is left after
    stripping (empty or whitespace-only input).
    """
    # str.strip() cannot raise ValueError, so no exception handling is
    # needed; an empty result is falsy and falls through to the default.
    return string.strip() or default
    
def parse_row(row, *, default=None):
    """Turn one raw CSV line into a ``Ticket``.

    The line is split on commas and each field is converted by the parser
    at the same position in the module-level ``column_parser``. ``default``
    is returned when the row has fewer fields than there are parsers, or
    when any parser yields ``None`` (a missing or malformed value in a
    column whose parser has no fallback).
    """
    fields = row.strip('\n').split(',')

    # zip() would silently truncate a short row, and building the Ticket
    # from too few values would then raise TypeError; treat such a row
    # like any other incomplete row instead.
    if len(fields) < len(column_parser):
        return default

    values = [parse(field) for parse, field in zip(column_parser, fields)]

    # Skip rows with missing values.
    if any(value is None for value in values):
        return default
    return Ticket(*values)
    
def parsed_data(path):
    """Lazily yield a parsed record for every usable row of ``path``.

    Each line produced by ``read_data`` goes through ``parse_row``; falsy
    results (rows that ``parse_row`` rejected) are dropped.
    """
    yield from filter(None, map(parse_row, read_data(path)))

In [155]:
if __name__ == '__main__':
    path = 'nyc_parking_tickets_extract.csv'

    # The first line of the file supplies the field names, so the record
    # layout is not hard-coded here. Only the header is read in this pass.
    with open(path) as f:
        column_header = clean_header(next(f))
    Ticket = namedtuple('Ticket', column_header)

    # One parser per column, in file order. Parsers bound with default=''
    # tolerate an empty field; the others return None for a missing or
    # malformed value, which makes parse_row reject the whole row.
    column_parser = (
        parse_int,                          # summons_number
        parse_string,                       # plate_id
        partial(parse_string, default=''),  # registration_state
        partial(parse_string, default=''),  # plate_type
        parse_date,                         # issue_date
        parse_int,                          # violation_code
        partial(parse_string, default=''),  # vehicle_body_type
        parse_string,                       # vehicle_make
        partial(parse_string, default=''),  # violation_description
    )

    # Lazy iterator over the parsed rows; nothing is read until it is consumed.
    data = parsed_data(path)
    
#     # test output
#     for _ in range(5):
#         print(next(data))

### Goal 2:
> - Calculate the number of violations per car make <br>
> - Use the lazy iteration from goal 1 <br>
> - Output should be a dict with key = car make and value = # violations <br>

In [156]:
# Tally tickets per vehicle make by streaming through the lazy iterator;
# only the running counts are kept in memory.
dt = {}

for ticket in data:
    make = ticket.vehicle_make
    dt[make] = dt.get(make, 0) + 1

# Show the counts ordered from the most to the least frequent make.
dict(sorted(dt.items(), key=lambda x: x[1], reverse=True))

{'TOYOT': 112,
 'HONDA': 106,
 'FORD': 104,
 'CHEVR': 76,
 'NISSA': 70,
 'DODGE': 45,
 'FRUEH': 44,
 'ME/BE': 38,
 'GMC': 35,
 'HYUND': 35,
 'BMW': 34,
 'LEXUS': 26,
 'INTER': 25,
 'JEEP': 22,
 'NS/OT': 18,
 'SUBAR': 18,
 'INFIN': 13,
 'LINCO': 12,
 'CHRYS': 12,
 'ACURA': 12,
 'AUDI': 12,
 'VOLVO': 12,
 'MITSU': 11,
 'ISUZU': 10,
 'CADIL': 9,
 'KIA': 8,
 'VOLKS': 8,
 'HIN': 6,
 'KENWO': 5,
 'ROVER': 5,
 'BUICK': 5,
 'MAZDA': 5,
 'MERCU': 4,
 'JAGUA': 3,
 'SMART': 3,
 'PORSC': 3,
 'WORKH': 2,
 'SATUR': 2,
 'SCION': 2,
 'SAAB': 2,
 'HINO': 2,
 'FIR': 1,
 'OLDSM': 1,
 'PETER': 1,
 'CITRO': 1,
 'GEO': 1,
 'YAMAH': 1,
 'BSA': 1,
 'MINI': 1,
 'PONTI': 1,
 'SPRI': 1,
 'PLYMO': 1,
 'UPS': 1,
 'FIAT': 1,
 'UD': 1,
 'UTILI': 1,
 'GMCQ': 1,
 'STAR': 1,
 'AM/T': 1,
 'MI/F': 1}

# Sandbox

In [73]:
# Print the cleaned header line to inspect the generated field names.
with open(path) as f:
    header_line = next(f)
    print(clean_header(header_line))

summons_number,plate_id,registration_state,plate_type,issue_date,violation_code,vehicle_body_type,vehicle_make,violation_description



In [51]:
path = 'nyc_parking_tickets_extract.csv'


def read_data():
    """Lazily yield the data lines of the file named by the global ``path``.

    The first line is treated as the header and skipped; the remaining
    lines are yielded one at a time exactly as read. An empty file yields
    nothing.
    """
    with open(path) as f:
        # Passing a default keeps an empty file from raising StopIteration
        # inside this generator, which Python would turn into RuntimeError.
        next(f, None)
        yield from f
        
raw_data = read_data()

# Preview at most five raw rows. Pairing with range() ends the loop cleanly
# when the file holds fewer than five data rows, instead of letting next()
# raise StopIteration.
for _, line in zip(range(5), raw_data):
    print(line)

4006478550,VAD7274,VA,PAS,10/5/2016,5,4D,BMW,BUS LANE VIOLATION

4006462396,22834JK,NY,COM,9/30/2016,5,VAN,CHEVR,BUS LANE VIOLATION

4007117810,21791MG,NY,COM,4/10/2017,5,VAN,DODGE,BUS LANE VIOLATION

4006265037,FZX9232,NY,PAS,8/23/2016,5,SUBN,FORD,BUS LANE VIOLATION

4006535600,N203399C,NY,OMT,10/19/2016,5,SUBN,FORD,BUS LANE VIOLATION



In [58]:
def parse_int(value, *, default=None):
    """Convert ``value`` to an ``int``, or give back ``default``.

    ``default`` is returned when ``int(value)`` raises ``ValueError``
    (for example on an empty or non-numeric string).
    """
    try:
        number = int(value)
    except ValueError:
        return default
    else:
        return number


parse_int('5')

5

In [63]:
def parse_date(value, *, default=None):
    """Parse a month/day/year string into a ``datetime.date``.

    ``default`` is returned when ``value`` does not match the
    ``%m/%d/%Y`` format and ``strptime`` raises ``ValueError``.
    """
    try:
        moment = datetime.strptime(value, '%m/%d/%Y')
    except ValueError:
        return default
    return moment.date()


parse_date('08/26/2020')

datetime.date(2020, 8, 26)

In [71]:
def parse_str(string, *, default=None):
    """Strip surrounding whitespace from ``string``.

    Returns the stripped text, or ``default`` when nothing is left after
    stripping (empty or whitespace-only input).
    """
    # str.strip() cannot raise ValueError, so no exception handling is
    # needed; an empty result is falsy and falls through to the default.
    return string.strip() or default


parse_str('1')

'1'

In [89]:
def parse_row(row, *, default=None):
    """Turn one raw CSV line into a ``Ticket``.

    The line is split on commas and each field is converted by the parser
    at the same position in the module-level ``column_parser``. ``default``
    is returned when the row has fewer fields than there are parsers, or
    when any parser yields ``None`` (a missing or malformed value in a
    column whose parser has no fallback).
    """
    fields = row.strip('\n').split(',')

    # zip() would silently truncate a short row, and building the Ticket
    # from too few values would then raise TypeError; treat such a row
    # like any other incomplete row instead.
    if len(fields) < len(column_parser):
        return default

    values = [parse(field) for parse, field in zip(column_parser, fields)]

    # Skip rows with missing values.
    if any(value is None for value in values):
        return default
    return Ticket(*values)
    
rows = read_data(path)

# Preview at most five parsed rows. The result gets its own name so the
# module-level parsed_data() generator function is not overwritten, and
# pairing with range() ends the loop cleanly on a file with fewer rows.
for _, row in zip(range(5), rows):
    parsed_row = parse_row(row)
    print(parsed_row)

4006478550 VAD7274 VA PAS 2016-10-05 5 4D BMW BUS LANE VIOLATION
Ticket(summons_number=4006478550, plate_id='VAD7274', registration_state='VA', plate_type='PAS', issue_date=datetime.date(2016, 10, 5), violation_code=5, vehicle_body_type='4D', vehicle_make='BMW', violation_description='BUS LANE VIOLATION')
4006462396 22834JK NY COM 2016-09-30 5 VAN CHEVR BUS LANE VIOLATION
Ticket(summons_number=4006462396, plate_id='22834JK', registration_state='NY', plate_type='COM', issue_date=datetime.date(2016, 9, 30), violation_code=5, vehicle_body_type='VAN', vehicle_make='CHEVR', violation_description='BUS LANE VIOLATION')
4007117810 21791MG NY COM 2017-04-10 5 VAN DODGE BUS LANE VIOLATION
Ticket(summons_number=4007117810, plate_id='21791MG', registration_state='NY', plate_type='COM', issue_date=datetime.date(2017, 4, 10), violation_code=5, vehicle_body_type='VAN', vehicle_make='DODGE', violation_description='BUS LANE VIOLATION')
4006265037 FZX9232 NY PAS 2016-08-23 5 SUBN FORD BUS LANE VIOLATION