# Challenge submission

## Goal 1:
Create 4 lazy iterators, one for each of the 4 files, such that:
> each iterator returns named tuples <br>
> the data type of each field is appropriate <br>
> the 4 iterators are independent of each other <br>

Hints:
> Use the CSV module, specifically `csv.reader`, to parse the data correctly and deal with the quotes issue.

In [6]:
# Constants

# Paths of the four source CSV files.
fname_employment = 'project_4_description/employment.csv'
fname_personal_info = 'project_4_description/personal_info.csv'
fname_status = 'project_4_description/update_status.csv'
fname_vehicles = 'project_4_description/vehicles.csv'
fnames = (
    fname_employment,
    fname_personal_info,
    fname_status,
    fname_vehicles,
)

# Parsers: one conversion function per column, in column order.
personal_parser = (parse_str,) * 5
employment_parser = (parse_str,) * 4
vehicle_parser = (parse_str, parse_str, parse_str, parse_int)
update_status_parser = (parse_str, parse_date, parse_date)
# Listed in the same order as `fnames` so the sequences can be zipped.
parsers = (
    employment_parser,
    personal_parser,
    update_status_parser,
    vehicle_parser,
)

# Names given to the namedtuple class generated for each file.
personal_class_name = 'Personal'
employment_class_name = 'Employment'
vehicle_class_name = 'Vehicle'
update_status_class_name = 'UpdateStatus'
# Listed in the same order as `fnames` so the sequences can be zipped.
class_names = (
    employment_class_name,
    personal_class_name,
    update_status_class_name,
    vehicle_class_name,
)

In [12]:
# Course solution

import csv
import itertools
from datetime import datetime
from collections import namedtuple


# Lazily parses a CSV file, yielding one list of strings per record
def csv_parser(fname, *, delimiter=',', quotechar='"', include_header=False):
    """Yield each record of the CSV file *fname* as a list of strings.

    This is a generator: the file is opened when the first record is
    requested and closed once the generator is exhausted or closed.

    Args:
        fname: Path of the CSV file to read.
        delimiter: Field separator passed to ``csv.reader``.
        quotechar: Quote character passed to ``csv.reader``.
        include_header: When False (the default) the first record is
            skipped; when True it is yielded like any other record.

    Yields:
        list[str]: The fields of one CSV record.
    """
    # newline='' lets the csv module handle line endings itself, which it
    # needs in order to parse quoted fields containing embedded newlines.
    with open(fname, newline='') as f:
        reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar)
        if not include_header:
            # Skip one parsed *record* rather than one physical line, so a
            # quoted header spanning several lines is dropped as a whole.
            # The None default keeps an empty file from raising
            # StopIteration inside the generator (a RuntimeError).
            next(reader, None)
        yield from reader

# Converts a value to a string
def parse_str(s, *, default=None):
    """Return ``str(s)``, or *default* if the conversion raises ValueError."""
    try:
        text = str(s)
    except ValueError:
        text = default
    return text
    
# Converts a value to an integer
def parse_int(n, *, default=None):
    """Return ``int(n)``, or *default* if the conversion raises ValueError."""
    try:
        number = int(n)
    except ValueError:
        number = default
    return number
    
# Converts a date string to a datetime using the given format
def parse_date(value, *, fmt='%Y-%m-%dT%H:%M:%SZ', default=None):
    """Return *value* parsed with ``datetime.strptime(value, fmt)``.

    If the string does not match *fmt* (strptime raises ValueError),
    *default* is returned instead. The trailing ``Z`` in the default format
    is matched literally, so the resulting datetime carries no tzinfo.
    """
    try:
        moment = datetime.strptime(value, fmt)
    except ValueError:
        return default
    return moment


# Extracts the header record (field names) of a CSV file
def extract_field_names(fname):
    """Return the first record of the CSV file *fname* as a list of strings.

    Only the header is read; the rest of the file is not consumed.

    Raises:
        StopIteration: If the file contains no records at all.
    """
    reader = csv_parser(fname, include_header=True)
    try:
        return next(reader)
    finally:
        # csv_parser is a generator that is suspended inside its
        # `with open(...)` block after the first record. Closing it here
        # releases the file handle right away instead of leaving that to
        # garbage collection.
        reader.close()


# Builds a namedtuple class whose fields are the columns of a CSV file
def create_named_tuple_class(fname, class_name):
    """Return a namedtuple class named *class_name* for the file *fname*.

    The field names are taken from the header record of the file.
    """
    return namedtuple(class_name, extract_field_names(fname))


# Lazily yields the data rows of a CSV file as typed namedtuples
def iter_file(fname, class_name, parser):
    """Yield one namedtuple per data row of the CSV file *fname*.

    Args:
        fname: Path of the CSV file to read.
        class_name: Name of the namedtuple class generated from the header.
        parser: Sequence of one-argument conversion functions, applied to
            the columns positionally (first function to first column, ...).

    Yields:
        An instance of the generated namedtuple class holding the converted
        values of one row. The header row is not yielded.
    """
    record_cls = create_named_tuple_class(fname, class_name)

    for raw_row in csv_parser(fname):
        # Pair every cell with the converter for its column.
        converted = [convert(cell) for cell, convert in zip(raw_row, parser)]
        yield record_cls(*converted)
        


In [13]:
# Demo: print the first three records of each file

if __name__ == '__main__':
    for fname, class_name, parser in zip(fnames, class_names, parsers):
        file_iter = iter_file(fname, class_name, parser)

        # islice simply stops early when a file holds fewer than three
        # rows, whereas repeated next() calls would raise StopIteration.
        for record in itertools.islice(file_iter, 3):
            print(record)

        # Blank lines separate the output of consecutive files.
        print('\n\n')

Employment(employer='Stiedemann-Bailey', department='Research and Development', employee_id='29-0890771', ssn='100-53-9824')
Employment(employer='Nicolas and Sons', department='Sales', employee_id='41-6841359', ssn='101-71-4702')
Employment(employer='Connelly Group', department='Research and Development', employee_id='98-7952860', ssn='101-84-0356')



Personal(ssn='100-53-9824', first_name='Sebastiano', last_name='Tester', gender='Male', language='Icelandic')
Personal(ssn='101-71-4702', first_name='Cayla', last_name='MacDonagh', gender='Female', language='Lao')
Personal(ssn='101-84-0356', first_name='Nomi', last_name='Lipprose', gender='Female', language='Yiddish')



UpdateStatus(ssn='100-53-9824', last_updated=datetime.datetime(2017, 10, 7, 0, 14, 42), created=datetime.datetime(2016, 1, 24, 21, 19, 30))
UpdateStatus(ssn='101-71-4702', last_updated=datetime.datetime(2017, 1, 23, 11, 23, 17), created=datetime.datetime(2016, 1, 27, 4, 32, 57))
UpdateStatus(ssn='101-84-0356', last_updat

<br>

## Goal 2
Create a single iterable that combines all the data from all four files.
> The output will be 1 row per ssn containing data from all 4 previous iterators.

<br>

In [15]:
# Constants

# Field inclusion/exclusion masks: one boolean per column of each file, in
# column order. True keeps the column in the combined row, False drops it.
employment_fields_compress = [
    True,   # employer
    True,   # department
    True,   # employee_id
    False,  # ssn (dropped here; kept once via the personal file)
]
personal_fields_compress = [
    True,  # ssn
    True,  # first_name
    True,  # last_name
    True,  # gender
    True,  # language
]
update_status_fields_compress = [
    False,  # ssn (dropped)
    True,   # last_updated
    True,   # created
]
vehicle_fields_compress = [
    False,  # first column, presumably ssn (dropped) - TODO confirm
    True,   # vehicle_make
    True,   # vehicle_model
    True,   # model_year
]
# Same file order as `fnames`: employment, personal, update status, vehicles.
compress_fields = (
    employment_fields_compress,
    personal_fields_compress,
    update_status_fields_compress,
    vehicle_fields_compress,
)

In [28]:
def create_combo_named_tuple_class(fnames, compress_fields):
    """Return the namedtuple class ``Data`` used for combined rows.

    Args:
        fnames: Paths of the CSV files, in the order their columns appear
            in a combined row.
        compress_fields: One sequence of booleans per file, in the same
            order as *fnames*; a true entry keeps the matching column.

    Returns:
        A namedtuple class named ``Data`` whose fields are the kept header
        names of all files, concatenated in file order.
    """
    # Header names of every file, flattened into one stream.
    all_field_names = itertools.chain.from_iterable(
        map(extract_field_names, fnames)
    )
    # Per-file masks flattened the same way, so both streams line up.
    selectors = itertools.chain.from_iterable(compress_fields)
    kept_names = itertools.compress(all_field_names, selectors)
    return namedtuple('Data', kept_names)


def iter_combined(fnames, class_names, parsers, compress_fields):
    """Yield one combined ``Data`` row per set of matching file rows.

    The files are read in lockstep: the n-th row of every file is merged
    into the n-th combined row, and iteration stops as soon as any file runs
    out of rows. This presumably relies on all files listing the same
    records in the same order - verify against the data.

    Args:
        fnames: Paths of the CSV files.
        class_names: namedtuple class name for each file.
        parsers: Per-file sequences of column conversion functions.
        compress_fields: Per-file boolean masks; a true entry keeps the
            matching column in the combined row.

    Yields:
        Instances of the combined namedtuple class containing only the
        columns selected by *compress_fields*.
    """
    combined_cls = create_combo_named_tuple_class(fnames, compress_fields)

    # One lazy row iterator per file, advanced together by zip.
    per_file_iters = (
        iter_file(fname, class_name, parser)
        for fname, class_name, parser in zip(fnames, class_names, parsers)
    )
    lockstep_rows = zip(*per_file_iters)

    # Flatten the per-file masks once into a tuple: unlike a chain iterator
    # it is not exhausted after the first row and can be reused every time.
    selectors = tuple(itertools.chain.from_iterable(compress_fields))

    for records in lockstep_rows:
        # Concatenate the per-file namedtuples into a single flat row...
        flat_row = itertools.chain.from_iterable(records)
        # ...then drop the unwanted columns (the repeated ssn values).
        yield combined_cls(*itertools.compress(flat_row, selectors))

In [31]:
# Demo: print the first five combined rows
result = iter_combined(fnames, class_names, parsers, compress_fields)

for combined_row in itertools.islice(result, 5):
    print(combined_row)

Data(employer='Stiedemann-Bailey', department='Research and Development', employee_id='29-0890771', ssn='100-53-9824', first_name='Sebastiano', last_name='Tester', gender='Male', language='Icelandic', last_updated=datetime.datetime(2017, 10, 7, 0, 14, 42), created=datetime.datetime(2016, 1, 24, 21, 19, 30), vehicle_make='Oldsmobile', vehicle_model='Bravada', model_year=1993)
Data(employer='Nicolas and Sons', department='Sales', employee_id='41-6841359', ssn='101-71-4702', first_name='Cayla', last_name='MacDonagh', gender='Female', language='Lao', last_updated=datetime.datetime(2017, 1, 23, 11, 23, 17), created=datetime.datetime(2016, 1, 27, 4, 32, 57), vehicle_make='Ford', vehicle_model='Mustang', model_year=1997)
Data(employer='Connelly Group', department='Research and Development', employee_id='98-7952860', ssn='101-84-0356', first_name='Nomi', last_name='Lipprose', gender='Female', language='Yiddish', last_updated=datetime.datetime(2017, 10, 4, 11, 21, 30), created=datetime.datetime