# MTA Data Challenges

In [79]:
from __future__ import division
import csv
from datetime import datetime
from collections import Counter
import matplotlib.pyplot as plt

### Challenge 1

In [None]:
# !curl -O http://web.mta.info/developers/data/nyct/turnstile/turnstile_150404.txt

In [16]:
with open('turnstile_150404.txt') as f:
    reader = csv.reader(f)
    rows = [[cell.strip() for cell in row] for row in reader] 

In [17]:
assert rows.pop(0) == ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME',
                       'DIVISION', 'DATE', 'TIME', 'DESC', 'ENTRIES',
                       'EXITS']

In [60]:
raw_readings = {}
for row in rows:
    raw_readings.setdefault(tuple(row[:4]), []).append(tuple(row[4:]))
#print raw_readings

`raw_readings` is a solution to Challenge 1.

### Challenge 2

In [19]:
datetime_cumulative = {turnstile: [(datetime.strptime(date + time,
                                                      '%m/%d/%Y%X'),
                                    int(in_cumulative))
                                   for _, _, date, time,
                                       _, in_cumulative, _ in rows]
                       for turnstile, rows in raw_readings.items()}

In [20]:
for rows in datetime_cumulative.values():
    assert rows == sorted(rows)

In [21]:
datetime_count_times = {turnstile: [[rows[i][0],
                                     rows[i+1][1] - rows[i][1],
                                     rows[i+1][0] - rows[i][0]]
                                    for i in range(len(rows) - 1)]
                        for turnstile, rows in datetime_cumulative.items()}

In [22]:
all_counts = [count for rows in datetime_count_times.values() for _, count, _ in rows]
all_counts.sort()
print all_counts[-5:]

[3132, 3502, 3854, 407303, 407345]


In [40]:
#print all_counts[:1200]

In [24]:
all_times = [duration.seconds / 60 / 60
             for rows in datetime_count_times.values()
             for _, _, duration in rows]
print Counter(all_times).most_common(10)

[(4.0, 173343), (4.2, 10545), (8.0, 192), (4.433333333333334, 155), (4.000277777777778, 33), (3.999722222222222, 33), (0.02222222222222222, 32), (0.02777777777777778, 20), (0.15277777777777776, 17), (2.7925, 17)]


In [25]:
datetime_counts = {turnstile: [(time, count)
                               for (time, count, _) in rows
                               if 0 <= count <= 5000]
                   for turnstile, rows in datetime_count_times.items()}

`datetime_counts` is a solution to Challenge 2.

In [26]:
all_good_counts = [count for rows in datetime_counts.values() for _, count in rows]
print len(all_good_counts) / len(all_counts)

0.994133610238


In [27]:
all_good_counts.sort()
print all_good_counts[-5:]

[3045, 3052, 3132, 3502, 3854]


In [28]:
print all_good_counts[:5]

[0, 0, 0, 0, 0]


### Challenge 3

In [36]:
day_counts = {}
for turnstile, rows in datetime_counts.items():
    by_day = {}
    for time, count in rows:
        day = time.date()
        by_day[day] = by_day.get(day, 0) + count
    day_counts[turnstile] = sorted(by_day.items())
    

print day_counts.values()[1]
type(day_counts.values()[1])

[(datetime.date(2015, 3, 28), 1024), (datetime.date(2015, 3, 29), 869), (datetime.date(2015, 3, 30), 2410), (datetime.date(2015, 3, 31), 2427), (datetime.date(2015, 4, 1), 2419), (datetime.date(2015, 4, 2), 2582), (datetime.date(2015, 4, 3), 1809)]


list

In [39]:
#Challege 5
station_counts = {}

for daycounts_key, daycounts_value in day_counts.items():
    daycounts_key_list = list(daycounts_key)
    daycounts_key_list.remove(daycounts_key_list[2])
    station_key = tuple(daycounts_key_list)
    station_counts.setdefault(station_key, []).extend(daycounts_value)
    #print station_key
# for daycounts_key, daycounts_value in day_counts.items():
#     station_key = daycounts_key[0],daycounts_key[1],daycounts_key[3]
#     print station_key

    

In [51]:
#challege 5
merge_date_counts = {}
for station_counts_key, station_counts_value in station_counts.items():
    merge_day = {}
    for merge_time, merge_count in station_counts_value:
        new_date = merge_time
        merge_day[new_date] = merge_day.get(new_date, 0) + merge_count
    merge_date_counts[station_counts_key] = sorted(merge_day.items())
print merge_date_counts.items()[0]

(('N325A', 'R218', 'ELMHURST AVE'), [(datetime.date(2015, 3, 28), 9685), (datetime.date(2015, 3, 29), 8384), (datetime.date(2015, 3, 30), 13796), (datetime.date(2015, 3, 31), 13979), (datetime.date(2015, 4, 1), 14182), (datetime.date(2015, 4, 2), 14296), (datetime.date(2015, 4, 3), 11503)])


In [53]:
station_weekly_counts = {}

for mdc_key, mdc_value in merge_date_counts.items():
    for mdc_time, mdc_counts in mdc_value:
        station_weekly_counts[mdc_key] = station_weekly_counts.get(mdc_key,0) + mdc_counts
print station_weekly_counts.items()[0]

(('N325A', 'R218', 'ELMHURST AVE'), 85825)


In [78]:
#challenge 6
entire_station_data = {}
for station_unit_key, station_unit_value in station_weekly_counts.items():
    esd_key = (station_unit_key[2])
    entire_station_data[esd_key] = entire_station_data.get(esd_key,0) + station_unit_value

all_stations_sorted = sorted(station_weekly_counts.items(), key = lambda x : x[1])
#type(all_stations_sorted)
# print all_stations_sorted[-10:]
# print " "
# print " "
all_stations_sorted = sorted(entire_station_data.items(),key = lambda x : x[1])
print all_stations_sorted[-10:]
plot_stations = all_stations_sorted[-10:]
type(all_stations_sorted)

# for daycounts_key, daycounts_value in day_counts.items():
#     station_key = daycounts_key[0],daycounts_key[1],daycounts_key[3]
#     print station_key


[('59 ST-COLUMBUS', 454134), ('125 ST', 484752), ('96 ST', 485493), ('42 ST-PA BUS TE', 576786), ('42 ST-TIMES SQ', 606133), ('14 ST-UNION SQ', 678753), ('86 ST', 697851), ('34 ST-HERALD SQ', 737871), ('42 ST-GRD CNTRL', 882054), ('34 ST-PENN STA', 1057842)]


list

In [138]:
plot_top_station = []
labels_top_station = plot_top_station
print plot_top_station
print plot_top_value
plot_top_value = []
plot_x = range(0,10)

for i in range(0,len(plot_stations)):
    
    plot_top_station.append(plot_stations[i][0])
    plot_top_value.append(plot_stations[i][1])
    

plt.bar(plot_x, plot_top_value)
plt.xticks(plot_x,labels_top_station,rotation=45)
plt.title("Top Ten Busiest Subway Station in NYC")
plt.xlabel("Station")
plt.ylabel("Turnstile Entrances")
plt.show()



[]
[454134, 484752, 485493, 576786, 606133, 678753, 697851, 737871, 882054, 1057842]


`day_counts` is a solution to Challenge 3.