In [35]:
#!/usr/bin/python

# Mapper for session generation.
# Here we examine event log entries

import sys
import re

INPUT = sys.stdin

field_name_list = ['user_id', 'event_action', 'event_target', 'timestamp', 'vin', 'condition', 'year', 'make', 'model', 'price', 'mileage' ]

def read_input(file):
    for line in file:
        # split the line into individual fields (fields are delimited by tab).
        temp = line.strip().split('\t')
        event_action, event_target = temp[1].split(' ',1)  #Split event into action and target on space
        temp[1] = event_action
        temp.insert(2, event_target)
        yield temp
        
def digest_log_entry(field_value_list):
    field_value_dict = {}
    for i in range(len(field_name_list)):
        field_value_dict[field_name_list[i]] = field_value_list[i]
    return field_value_dict

def main():
    # input comes from STDIN (standard input)
    # data is the generator that produces individual inputs
    data = read_input(open("dataSet4Tiny.tsv"))

    # For each log entry, digest all the fields,
    # output the user_id as the key,
    # output the digested log entry (a dictionary) as the value
  
    for log_entry in data:      #So here data is an entire list of lines
        digested_log_entry = digest_log_entry(log_entry)  #
        print '%s\t%s'% ( digested_log_entry['user_id'], digested_log_entry)  #
    



if __name__ == "__main__":
    main()


553320448	{'mileage': '33143', 'user_id': '553320448', 'timestamp': '2016-09-20 00:18:51.000000', 'make': 'Nissan', 'vin': '1N6AD0ERXCC460850', 'event_target': 'alternatives', 'year': '2012', 'event_action': 'display', 'model': 'Frontier', 'price': '0.0000', 'condition': 'Used'}
553320372	{'mileage': '33143', 'user_id': '553320372', 'timestamp': '2016-09-20 08:41:56.000000', 'make': 'Nissan', 'vin': '1N6AD0ERXCC460850', 'event_target': 'alternatives', 'year': '2012', 'event_action': 'display', 'model': 'Frontier', 'price': '0.0000', 'condition': 'Used'}
553320372	{'mileage': '33143', 'user_id': '553320372', 'timestamp': '2016-09-20 08:41:57.000000', 'make': 'Nissan', 'vin': '1N6AD0ERXCC460850', 'event_target': 'market report', 'year': '2012', 'event_action': 'visit', 'model': 'Frontier', 'price': '0.0000', 'condition': 'Used'}


In [39]:
#!/usr/bin/python

# Reducer for session generation.
# Here we assemble user sessions

import sys

def read_key_value(file):
    for line in file:
        # split the line into components, before and after the tab
        yield line.strip().split('\t', 1)

def session_classify(session):
    for i in session:
        if i['event_target'] == "contact form":
            return 'submitter'
        elif i['event_action'] == 'click':
            return 'clicker'
        elif i['event_action'] == 'show' or i['event_action'] == 'display':
            return 'shower'
        elif i['event_action'] == 'visit':
            return 'visitor'
        else:
            return 'other'
    
def get_cars_viewed(event_list):
    seen_vehicles = []
    vehicle_dictionary = {}
    for i in event_list:
        if not (i['vin'] in seen_vehicles):
            seen_vehicles.append(i['vin'])
            vehicle_dictionary[i['vin']] = {'condition':i['condition'], 'year':i['year'], 'make':i['make'], 'model':i['model'], 'price':i['price'], 'mileage':i['mileage']}
    return vehicle_dictionary

def main():
    current_user_id = None
    event_list = []

    for user_id, event_string in read_key_value(open("map_output.txt")):
        # eval() converts a data structure described on a string
        # into that internal data structure (for example, a dictionary).
        event = eval(event_string)
        print event
        
        # Assemble
        if user_id == current_user_id:
            event_list.append(event)
            continue
        else:
            #doesn't do anything the first time thru and goes straight to setting the current userid
            if current_user_id:
                classification = session_classify(event_list)
                cars_viewed = get_cars_viewed(event_list)
                for i in event_list:
                    i.pop('condition')
                    i.pop('year')
                    i.pop('make')
                    i.pop('model')
                    i.pop('price')
                    i.pop('mileage')
                    i.pop('user_id')
                print '{}:{}\t{}\t{}'.format(current_user_id, classification, event_list, cars_viewed)
            current_user_id = user_id
            event_list = [event]
    
    #This if statement only exists because when switching users the loop prior will
    #just set the user and exit on the last entry
    if user_id == current_user_id:
        classification = session_classify(event_list)
        cars_viewed = get_cars_viewed(event_list)
        for i in event_list:
            i.pop('condition')
            i.pop('year')
            i.pop('make')
            i.pop('model')
            i.pop('price')
            i.pop('mileage')
            i.pop('user_id')
        print '{}:{}\t{}\t{}'.format(current_user_id, classification, event_list, cars_viewed)

if __name__ == "__main__":
    main()

{'mileage': '33143', 'timestamp': '2016-09-20 00:18:51.000000', 'price': '0.0000', 'vin': '1N6AD0ERXCC460850', 'year': '2012', 'condition': 'Used', 'user_id': '553320448', 'make': 'Nissan', 'event_target': 'alternatives', 'event_action': 'display', 'model': 'Frontier'}
{'mileage': '33143', 'timestamp': '2016-09-20 08:41:56.000000', 'price': '0.0000', 'vin': '1N6AD0ERXCC460850', 'year': '2012', 'condition': 'Used', 'user_id': '553320372', 'make': 'Nissan', 'event_target': 'alternatives', 'event_action': 'display', 'model': 'Frontier'}
{'mileage': '33143', 'timestamp': '2016-09-20 08:41:57.000000', 'price': '0.0000', 'vin': '1N6AD0ERXCC460850', 'year': '2012', 'condition': 'Used', 'user_id': '553320372', 'make': 'Nissan', 'event_target': 'market report', 'event_action': 'visit', 'model': 'Frontier'}


'badge detail'