In [1]:
# Run this cell first: it loads one data file from the hackathon S3 bucket.
#
# Data is for "cebu" and "manila" for 2016, split by week;
# see "data_list.txt" for a complete list of available files.

import data  # download utils in ./data.py

# which weekly data file to download
city = "cebu"
weekLabel = "2015-12-28"

# getData's result is kept in `path`; the cells below pass it to open()
path = data.getData(city, weekLabel)

Downloading data/cebu/week_2015-12-28.csv...
Downloaded data/cebu/week_2015-12-28.csv (4755033 lines)


In [2]:
# once a file is loaded use this to view the first few lines

import csv

# number of data rows to show after the header row
previewRows = 10

# load csv file and print the header row followed by the first ten data rows
# (binary mode "rb" is what the Python 2 csv module recommends for its input file)
with open(path, "rb") as csvfile:
    dataReader = csv.reader(csvfile, delimiter=',')

    for rowIndex, row in enumerate(dataReader):
        print(', '.join(row))

        # rowIndex 0 is the header, so once rowIndex reaches previewRows
        # the header plus previewRows data rows have been printed
        if rowIndex >= previewRows:
            break

# no explicit close() needed: leaving the "with" block closes csvfile


date, edge_id, day_of_week, hour_of_day, avg_speed
2015-12-31, 1001709331, 3, 22, 2.50
2016-01-03, 1001709345, 6, 7, 6.50
2016-01-03, 1001709345, 6, 19, 1.50
2016-01-01, 1001705785, 4, 11, 23.50
2015-12-29, 1001705786, 1, 22, 18.50
2016-01-01, 1001705785, 4, 11, 23.50
2015-12-29, 1001705786, 1, 22, 18.50
2016-01-01, 1001705785, 4, 11, 23.50
2015-12-29, 1001705786, 1, 22, 18.50
2015-12-29, 1001664325, 1, 3, 1.50


In [3]:
import numpy

# load csv file and calculate daily average speed

days = ['Mon', 'Tues', 'Weds', 'Thurs', 'Fri', 'Sat', 'Sun']

# In CSV day_of_week is a number from 0 to 6 representing day of week (0 = monday, 6 = sunday)

# there are lots of rows in the files
# not feasible to keep all the numbers in memory, so keep only a per-day
# row count and a per-day speed sum, and divide once at the end
dayOfWeekCount = numpy.zeros(7)
dayOfWeekSum = numpy.zeros(7)

# filled in below; a day with no rows keeps its initial 0
dayOfWeekAvg = numpy.zeros(7)

# a freshly opened file is already positioned at the top, and the "with"
# block closes it on exit, so no seek(0) / close() calls are needed
with open(path, "rb") as csvfile:
    dataReader = csv.reader(csvfile, delimiter=',')

    # skip csv header
    next(dataReader, None)

    print("Processing...")

    for row in dataReader:
        # csv.reader yields an empty list for a blank line; skip it
        # instead of failing on row[2]
        if not row:
            continue

        # column 2 is day_of_week, column 4 is avg_speed
        dayOfWeek = int(row[2])

        # increment the count for day of week
        dayOfWeekCount[dayOfWeek] += 1

        # increment sum for speed
        dayOfWeekSum[dayOfWeek] += float(row[4])

print("Average speeds by day (kph):")

# loop over arrays and calculate averages
for day in range(0, 7):
    if dayOfWeekCount[day] == 0:
        # avoid dividing by zero when a day has no rows
        print(days[day] + ": (no data)")
    else:
        averageSpeed = dayOfWeekSum[day] / dayOfWeekCount[day]

        dayOfWeekAvg[day] = averageSpeed
        print(days[day] + ": " + str(averageSpeed))




Processing...
Average speeds by day (kph):
Mon: 25.2782757264
Tues: 24.8899695606
Weds: 24.7548416524
Thurs: 24.617762767
Fri: 24.6281454738
Sat: 25.0311064731
Sun: 26.3122175455


In [4]:
import numpy

# load csv file and calculate hourly average speed

# In CSV hour_of_day is a number from 0 to 23, representing the hour bin for data
# hours are expressed in UTC, Cebu and Manila are UTC+8 so the hour needs to be adjusted to local time
# (the variable name's spelling is kept unchanged in case other cells refer to it)
utcAdustment = 8

# there are lots of rows in the files
# not feasible to keep all the numbers in memory, so keep only a per-hour
# row count and a per-hour speed sum, and divide once at the end
hourOfDayCount = numpy.zeros(24)
hourOfDaySum = numpy.zeros(24)

# filled in below; an hour with no rows keeps its initial 0
hourOfDayAvg = numpy.zeros(24)

# a freshly opened file is already positioned at the top, and the "with"
# block closes it on exit, so no seek(0) / close() calls are needed
with open(path, "rb") as csvfile:
    dataReader = csv.reader(csvfile, delimiter=',')

    # skip csv header
    next(dataReader, None)

    print("Processing...")

    for row in dataReader:
        # csv.reader yields an empty list for a blank line; skip it
        # instead of failing on row[3]
        if not row:
            continue

        # adjust time from UTC to UTC+8 local time (column 3 is hour_of_day)
        localHour = int(row[3]) + utcAdustment

        # if hour is next day wrap to 24 hours e.g. 26:00 becomes 02:00 the next day
        if localHour > 23:
            localHour -= 24

        # increment the count for hour of day
        hourOfDayCount[localHour] += 1

        # increment sum for speed (column 4 is avg_speed)
        hourOfDaySum[localHour] += float(row[4])

print("Average speeds by hour of day (kph):")

# loop over arrays and calculate averages
for hour in range(0, 24):
    if hourOfDayCount[hour] == 0:
        # hour is an int: format it with str() (indexing it, as the
        # previous version did, raises TypeError on the first empty bin)
        print(str(hour) + ": (no data)")
    else:
        averageSpeed = hourOfDaySum[hour] / hourOfDayCount[hour]

        hourOfDayAvg[hour] = averageSpeed
        print(str(hour) + ": " + str(averageSpeed))

Processing...
Average speeds by hour of day (kph):
0: 28.5521958388
1: 30.159753268
2: 31.2186392225
3: 31.7999383614
4: 30.9081190649
5: 29.6909303726
6: 27.0200437659
7: 24.7745464369
8: 24.3639551084
9: 23.8050444552
10: 23.674758713
11: 23.6664346777
12: 24.0220475828
13: 24.0007665323
14: 23.443511155
15: 22.8919104281
16: 22.1441813486
17: 21.3324536062
18: 21.0921663624
19: 22.0492477638
20: 22.9284215609
21: 23.8180028013
22: 25.068702444
23: 27.12169148
