In [1]:
import utils
import constants

### Goal 1
Create lazy iterators for each of the 4 files that:
- return named tuples for each row
- cast the values to appropriate data types

In [2]:
# Build one row iterator per input file, keyed by the part of the file name
# that precedes the first '.'.
file_iterators = {
    fname.split('.')[0]: utils.process_data(fname, dtypes, classname)
    for fname, dtypes, classname in zip(
        constants.fnames, constants.dtypes, constants.class_names
    )
}

### Goal 2
Create a single iterable that combines all data across the 4 files

In [3]:
# Gather the headers of every CSV, in file order, into one flat list ...
all_headers = [
    header
    for fname in constants.fnames
    for header in utils.get_headers(fname)
]
# ... then drop repeated names while keeping first-seen order.
unique_headers = utils.remove_dupes_from_list(all_headers)

# One combined iterable that yields a single row for each row across all CSVs.
all_data = utils.combine_multi(
    *file_iterators.values(),
    classname='Person',
    headers=unique_headers,
)

### Goal 3
Modify the iterator from **Goal 2** to filter out stale records (i.e. those with a last update date before 3/1/2017)

In [4]:
# Same inputs as the Goal 2 call (every per-file iterator, the 'Person' class
# name and the de-duplicated headers), routed through the "no stale" variant.
# NOTE(review): the 3/1/2017 cutoff is presumably applied inside
# utils.combine_multi_no_stale -- confirm in utils.
all_data_no_stale = utils.combine_multi_no_stale(*file_iterators.values(), classname='Person', headers=unique_headers)

In [5]:
# Using the v2 fn

# # Setup
# from datetime import datetime
# stale_cutoff = datetime(2017, 3, 1)
# key = lambda row: row.last_updated >= stale_cutoff

# # Filtered data iterator
# all_data_no_stale = utils.combine_multi_no_stale_v2(*file_iterators.values(), classname='Person', headers=unique_headers, key=key)

In [6]:
# Confirm that no record in the filtered data was last updated before 3/1/2017.
from datetime import datetime
stale_cutoff = datetime.strptime('3/1/2017', '%m/%d/%Y')
# NOTE: this check walks all_data_no_stale. If that object is a one-shot
# iterator it is exhausted afterwards and must be rebuilt before it is used
# again. all() stops at the first failing row, so no intermediate list is needed.
all(dat.last_updated >= stale_cutoff for dat in all_data_no_stale)

True

### Goal 4
Generate, for each gender, a list of car makes ranked by number of owners.

The top-owned makes for females should be Ford and Chevrolet (42 each). The top-owned make for males should be Ford (40).

In [12]:
from collections import defaultdict

# Build the filtered iterable again so the tally below starts from a fresh pass.
all_data_no_stale = utils.combine_multi_no_stale(
    *file_iterators.values(),
    classname='Person',
    headers=unique_headers,
)

# Owner counts per vehicle make: one table for rows whose gender is 'Male',
# one for every other row.
cars_m, cars_f = defaultdict(int), defaultdict(int)

for person in all_data_no_stale:
    tally = cars_m if person.gender == 'Male' else cars_f
    tally[person.vehicle_make] += 1

In [8]:
# Makes tallied in cars_m, ordered from highest to lowest count.
sorted(cars_m, key=cars_m.get, reverse=True)

['Ford',
 'Chevrolet',
 'GMC',
 'Mitsubishi',
 'Dodge',
 'Toyota',
 'Mercedes-Benz',
 'Volkswagen',
 'Audi',
 'Buick',
 'Mazda',
 'BMW',
 'Pontiac',
 'Mercury',
 'Volvo',
 'Cadillac',
 'Honda',
 'Subaru',
 'Hyundai',
 'Saab',
 'Acura',
 'Infiniti',
 'Jeep',
 'Lexus',
 'Nissan',
 'Oldsmobile',
 'Lincoln',
 'Kia',
 'Lotus',
 'Jaguar',
 'Plymouth',
 'Porsche',
 'Lamborghini',
 'Aston Martin',
 'Isuzu',
 'Maserati',
 'Chrysler',
 'Saturn',
 'Bentley',
 'Land Rover',
 'Maybach',
 'Panoz',
 'Geo',
 'Suzuki',
 'Scion',
 'Jensen',
 'Smart',
 'Rolls-Royce',
 'Corbin',
 'Daewoo',
 'Aptera',
 'Eagle',
 'Austin']

In [9]:
# Number of rows tallied in cars_m whose vehicle make is 'Ford'
# (the Goal 4 statement expects 40).
cars_m['Ford']

40

In [10]:
# Makes tallied in cars_f, ordered from highest to lowest count.
sorted(cars_f, key=cars_f.get, reverse=True)

['Chevrolet',
 'Ford',
 'GMC',
 'Mitsubishi',
 'Toyota',
 'Mercedes-Benz',
 'Dodge',
 'Lexus',
 'Pontiac',
 'Volvo',
 'Audi',
 'Mazda',
 'BMW',
 'Nissan',
 'Suzuki',
 'Buick',
 'Volkswagen',
 'Acura',
 'Kia',
 'Infiniti',
 'Land Rover',
 'Honda',
 'Oldsmobile',
 'Chrysler',
 'Cadillac',
 'Subaru',
 'Jeep',
 'Mercury',
 'Lotus',
 'Bentley',
 'Hyundai',
 'Lincoln',
 'Isuzu',
 'Plymouth',
 'Saturn',
 'Porsche',
 'Saab',
 'Jaguar',
 'Scion',
 'Aston Martin',
 'Lamborghini',
 'Bugatti',
 'Rolls-Royce',
 'Eagle',
 'Geo',
 'Morgan',
 'Austin',
 'Panoz']

In [11]:
# Goal 4 expects Ford and Chevrolet to tie as the most common makes in cars_f,
# so display the count for each of the two makes (not Chevrolet twice).
cars_f['Ford'], cars_f['Chevrolet']

(42, 42)