### Project Solution - Goal 1

In [2]:
# Relative path of the CSV extract that every cell below opens.
file_name = 'nyc_parking_tickets_extract.csv'

In [3]:
# Peek at the header line and the first four data lines.
# Each line read from the file still ends with '\n' and print() appends
# another newline, which is why the output shows a blank line after each row.
with open(file_name) as f:
    for _ in range(5):
        print(next(f))

Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Violation Description

4006478550,VAD7274,VA,PAS,10/5/2016,5,4D,BMW,BUS LANE VIOLATION

4006462396,22834JK,NY,COM,9/30/2016,5,VAN,CHEVR,BUS LANE VIOLATION

4007117810,21791MG,NY,COM,4/10/2017,5,VAN,DODGE,BUS LANE VIOLATION

4006265037,FZX9232,NY,PAS,8/23/2016,5,SUBN,FORD,BUS LANE VIOLATION



In [9]:
# Read only the first two lines while the file is open, then split them
# into fields once the file has been closed again.
with open(file_name) as f:
    header_line = next(f)
    sample_line = next(f)

column_headers = header_line.strip('\n').split(',')
sample_data = sample_line.strip('\n').split(',')

In [10]:
column_headers

['Summons Number',
 'Plate ID',
 'Registration State',
 'Plate Type',
 'Issue Date',
 'Violation Code',
 'Vehicle Body Type',
 'Vehicle Make',
 'Violation Description']

In [11]:
sample_data

['4006478550',
 'VAD7274',
 'VA',
 'PAS',
 '10/5/2016',
 '5',
 '4D',
 'BMW',
 'BUS LANE VIOLATION']

In [14]:
# Normalise the CSV headers to lower-case, underscore-separated names.
column_names = [header.lower().replace(' ', '_') for header in column_headers]

In [15]:
column_names

['summons_number',
 'plate_id',
 'registration_state',
 'plate_type',
 'issue_date',
 'violation_code',
 'vehicle_body_type',
 'vehicle_make',
 'violation_description']

In [16]:
list(zip(column_names, sample_data))

[('summons_number', '4006478550'),
 ('plate_id', 'VAD7274'),
 ('registration_state', 'VA'),
 ('plate_type', 'PAS'),
 ('issue_date', '10/5/2016'),
 ('violation_code', '5'),
 ('vehicle_body_type', '4D'),
 ('vehicle_make', 'BMW'),
 ('violation_description', 'BUS LANE VIOLATION')]

Target data type for each column (`<--` marks the columns that must be converted from `str`):

- 'summons_number' - int  <--
- 'plate_id' - str
- 'registration_state' - str
- 'plate_type' - str
- 'issue_date' - date  <--
- 'violation_code' - int  <--
- 'vehicle_body_type' - str
- 'vehicle_make' - str
- 'violation_description' - str

In [17]:
from collections import namedtuple
# One field per entry of column_names, in the same order as the CSV columns.
Ticket = namedtuple('Ticket', column_names)

In [18]:
with open(file_name) as f:
    next(f)  # discard the header line
    raw_data_row = next(f)  # first data line, trailing '\n' still attached

In [19]:
raw_data_row

'4006478550,VAD7274,VA,PAS,10/5/2016,5,4D,BMW,BUS LANE VIOLATION\n'

In [20]:
def read_data():
    """Lazily yield the raw data lines of ``file_name``, skipping the header.

    Each yielded line is exactly as read from the file, so it still carries
    its trailing newline. The file stays open only while the generator is
    being consumed.
    """
    with open(file_name) as data_file:
        next(data_file)  # the first line holds the column headers
        for line in data_file:
            yield line

In [21]:
raw_data = read_data()

In [22]:
# Pull the first five lines from the generator; each still ends with '\n',
# so print() leaves a blank line after every row.
for _ in range(5):
    print(next(raw_data))

4006478550,VAD7274,VA,PAS,10/5/2016,5,4D,BMW,BUS LANE VIOLATION

4006462396,22834JK,NY,COM,9/30/2016,5,VAN,CHEVR,BUS LANE VIOLATION

4007117810,21791MG,NY,COM,4/10/2017,5,VAN,DODGE,BUS LANE VIOLATION

4006265037,FZX9232,NY,PAS,8/23/2016,5,SUBN,FORD,BUS LANE VIOLATION

4006535600,N203399C,NY,OMT,10/19/2016,5,SUBN,FORD,BUS LANE VIOLATION



In [23]:
def parse_int(value, *, default=None):
    """Convert ``value`` to an ``int``, returning ``default`` on failure.

    Args:
        value: The object to convert, normally a string field from the CSV.
        default: Keyword-only fallback returned when conversion fails.

    Returns:
        ``int(value)`` when the conversion succeeds, otherwise ``default``.
    """
    try:
        return int(value)
    except (ValueError, TypeError):
        # ValueError: text that is not an integer (e.g. '' or 'hello').
        # TypeError: a value int() cannot accept at all (e.g. None).
        return default

In [24]:
from datetime import datetime

In [25]:
def parse_date(value, *, default=None):
    """Parse a ``month/day/year`` string into a ``datetime.date``.

    Args:
        value: Text such as ``'10/5/2016'`` (format ``%m/%d/%Y``).
        default: Keyword-only fallback returned when parsing fails.

    Returns:
        The parsed ``date``, or ``default`` when ``value`` does not match
        the expected format or is not a string.
    """
    date_format = '%m/%d/%Y'
    try:
        return datetime.strptime(value, date_format).date()
    except (ValueError, TypeError):
        # ValueError: text that does not match the format.
        # TypeError: a non-string value such as None.
        return default

In [26]:
parse_int('hello', default='N/A')

'N/A'

In [27]:
parse_date('3/28/2018')

datetime.date(2018, 3, 28)

In [28]:
parse_date('234556', default='N/A')

'N/A'

In [29]:
def parse_string(value, *, default=None):
    """Strip surrounding whitespace from ``value``, falling back to ``default``.

    Args:
        value: The text to clean, normally a string field from the CSV.
        default: Keyword-only fallback returned when nothing usable remains.

    Returns:
        The stripped text, or ``default`` when the stripped text is empty or
        ``value`` has no ``strip`` method (e.g. ``None``).
    """
    try:
        cleaned = value.strip()
    except AttributeError:
        # Non-string input such as None has no .strip(); treat it as missing.
        return default
    return cleaned if cleaned else default

In [30]:
parse_string('   hello  ')

'hello'

In [31]:
parse_string('  ', default='N/A')

'N/A'

In [None]:
column_names

['summons_number',
 'plate_id',
 'registration_state',
 'plate_type',
 'issue_date',
 'violation_code',
 'vehicle_body_type',
 'vehicle_make',
 'violation_description']

In [34]:
from functools import partial

# One parser per CSV column, in the same order as column_names.
# The lambda form and the partial form are interchangeable ways of fixing
# default='' on parse_string; both are used here to show that they behave alike.
# Parsers left with default=None report a blank or invalid field as None.
column_parsers = (parse_int,  # summons_number
                  parse_string,  # plate_id
                  lambda x: parse_string(x, default=''),  # registration_state (lambda form)
                  partial(parse_string, default=''),  # plate_type (partial form)
                  parse_date,  # issue_date
                  parse_int,  # violation_code
                  partial(parse_string, default=''),  # vehicle_body_type
                  parse_string,  # vehicle_make
                  lambda x: parse_string(x, default=''))  # violation_description


In [35]:
def parse_row(row):
    """Split one raw CSV line and lazily convert each field.

    The line is stripped of its trailing newline, split on commas, and each
    field is paired positionally with the matching entry of
    ``column_parsers``. The result is a generator, so no parser runs until
    the caller iterates over it.
    """
    raw_values = row.strip('\n').split(',')
    return (convert(raw_value)
            for convert, raw_value in zip(column_parsers, raw_values))

In [37]:
rows = read_data()
for _ in range(5):
    row = next(rows)
    # Deliberately not named `parsed_data`: this notebook later defines a
    # generator function with that name, and re-running this cell afterwards
    # would otherwise rebind it to a plain value.
    parsed_fields = parse_row(row)
    print(list(parsed_fields))

[4006478550, 'VAD7274', 'VA', 'PAS', datetime.date(2016, 10, 5), 5, '4D', 'BMW', 'BUS LANE VIOLATION']
[4006462396, '22834JK', 'NY', 'COM', datetime.date(2016, 9, 30), 5, 'VAN', 'CHEVR', 'BUS LANE VIOLATION']
[4007117810, '21791MG', 'NY', 'COM', datetime.date(2017, 4, 10), 5, 'VAN', 'DODGE', 'BUS LANE VIOLATION']
[4006265037, 'FZX9232', 'NY', 'PAS', datetime.date(2016, 8, 23), 5, 'SUBN', 'FORD', 'BUS LANE VIOLATION']
[4006535600, 'N203399C', 'NY', 'OMT', datetime.date(2016, 10, 19), 5, 'SUBN', 'FORD', 'BUS LANE VIOLATION']


In [39]:
all([10, 'hello', ''])

False

In [40]:
l = [10, '', 0]

In [41]:
all(l)

False

In [42]:
all(item is not None for item in l)

True

In [45]:
l1 = [10, '', 0, None]

In [46]:
all(item is not None for item in l1)

False

In [48]:
def parse_row(row, *, default=None):
    """Parse one raw CSV line into a ``Ticket``.

    Args:
        row: A raw line from the data file; a trailing newline is removed.
        default: Keyword-only value returned for rows that cannot be used.

    Returns:
        A ``Ticket`` built from the converted fields, or ``default`` when the
        row has fewer fields than ``column_parsers`` or when any parser
        reports a missing value by returning ``None``.
    """
    fields = row.strip('\n').split(',')

    # zip() stops at the shorter input, so a short row would otherwise reach
    # Ticket(...) with too few values and raise TypeError instead of being
    # rejected. Fields beyond the last parser are still ignored by zip().
    if len(fields) < len(column_parsers):
        return default

    parsed_fields = [func(field)
                     for func, field in zip(column_parsers, fields)]

    # Only None marks a missing value; '' and 0 are legitimate parsed results.
    if any(item is None for item in parsed_fields):
        return default
    return Ticket(*parsed_fields)

In [49]:
rows = read_data()

In [50]:
for _ in range(5):
    row = next(rows)
    # Deliberately not named `parsed_data`: this notebook later defines a
    # generator function with that name, and re-running this cell afterwards
    # would otherwise rebind it to a Ticket.
    ticket = parse_row(row)
    print(ticket)

Ticket(summons_number=4006478550, plate_id='VAD7274', registration_state='VA', plate_type='PAS', issue_date=datetime.date(2016, 10, 5), violation_code=5, vehicle_body_type='4D', vehicle_make='BMW', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4006462396, plate_id='22834JK', registration_state='NY', plate_type='COM', issue_date=datetime.date(2016, 9, 30), violation_code=5, vehicle_body_type='VAN', vehicle_make='CHEVR', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4007117810, plate_id='21791MG', registration_state='NY', plate_type='COM', issue_date=datetime.date(2017, 4, 10), violation_code=5, vehicle_body_type='VAN', vehicle_make='DODGE', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4006265037, plate_id='FZX9232', registration_state='NY', plate_type='PAS', issue_date=datetime.date(2016, 8, 23), violation_code=5, vehicle_body_type='SUBN', vehicle_make='FORD', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4

In [51]:
# List every row that parse_row rejects, pairing each raw field with its
# column name so the missing values are easy to spot.
for row in read_data():
    parsed_row = parse_row(row)
    if parsed_row is not None:
        continue
    raw_fields = row.strip('\n').split(',')
    print(list(zip(column_names, raw_fields)), end='\n\n')

[('summons_number', '1413358512'), ('plate_id', '54295PC'), ('registration_state', 'NY'), ('plate_type', 'APP'), ('issue_date', '8/9/2016'), ('violation_code', '19'), ('vehicle_body_type', 'BUS'), ('vehicle_make', ''), ('violation_description', '')]

[('summons_number', '1418425369'), ('plate_id', 'JYW5248'), ('registration_state', 'PA'), ('plate_type', 'PAS'), ('issue_date', '3/21/2017'), ('violation_code', '21'), ('vehicle_body_type', 'SDN'), ('vehicle_make', ''), ('violation_description', '')]

[('summons_number', '1406925068'), ('plate_id', '19358JU'), ('registration_state', '99'), ('plate_type', 'COM'), ('issue_date', '8/23/2016'), ('violation_code', '46'), ('vehicle_body_type', 'DELV'), ('vehicle_make', ''), ('violation_description', '')]

[('summons_number', '8546468965'), ('plate_id', '37489BB'), ('registration_state', 'NY'), ('plate_type', 'OMR'), ('issue_date', '6/12/2017'), ('violation_code', '46'), ('vehicle_body_type', 'BUS'), ('vehicle_make', ''), ('violation_description'

In [53]:
def parsed_data():
    """Lazily yield the parsed result of every usable row in the data file.

    Each raw line from ``read_data()`` is passed to ``parse_row``; rows for
    which it returns a falsy value (such as its default ``None``) are skipped.
    """
    for raw_line in read_data():
        ticket = parse_row(raw_line)
        if not ticket:
            continue
        yield ticket

In [54]:
parsed_rows = parsed_data()

In [56]:
# Show the first five values produced by the parsed_rows generator.
for _ in range(5):
    print(next(parsed_rows))

Ticket(summons_number=4006478550, plate_id='VAD7274', registration_state='VA', plate_type='PAS', issue_date=datetime.date(2016, 10, 5), violation_code=5, vehicle_body_type='4D', vehicle_make='BMW', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4006462396, plate_id='22834JK', registration_state='NY', plate_type='COM', issue_date=datetime.date(2016, 9, 30), violation_code=5, vehicle_body_type='VAN', vehicle_make='CHEVR', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4007117810, plate_id='21791MG', registration_state='NY', plate_type='COM', issue_date=datetime.date(2017, 4, 10), violation_code=5, vehicle_body_type='VAN', vehicle_make='DODGE', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4006265037, plate_id='FZX9232', registration_state='NY', plate_type='PAS', issue_date=datetime.date(2016, 8, 23), violation_code=5, vehicle_body_type='SUBN', vehicle_make='FORD', violation_description='BUS LANE VIOLATION')
Ticket(summons_number=4