IDEA:

During the morning rush hour, more people will ride bikes into Midtown than will ride bikes out of Midtown. A similar statement could be said in reverse about the afternoon rush hour.

$H_0:$ On weekdays between the hours of 7:00 AM and 10:00 AM, the number of bike trips originating in Midtown Manhattan, as defined by 59th Street to the north, 9th Avenue to west, 36th Street to the south and 2nd Avenue to the east, and ending outside will be at least as great as the number of bike trips originating outside of Midtown and finishing inside.

$H_1:$ On weekdays  between the hours of 7:00 AM and 10:00 AM, the number of bike trips originating outside of Midtown Manhattan ending inside will be greater than the number of bike trips originating inside and ending outside. This will be significant with $p < 0.01$.

In [1]:
from __future__ import print_function
import os
import sys
import numpy as np
import pandas as pd
import pylab as pl
%pylab inline

puidata = os.getenv("PUIDATA")
sitename = "http://s3.amazonaws.com/tripdata"
bikemon = "201508" # Would like this to be settable during runtime
# Define corners of Midtown
nwcorn = (40.7693944, -73.9870549) # W 59th & 8th
swcorn = (40.7544542, -73.99989933)# W 36th & 8th
secorn = (40.7475701, -73.9742955) # E 36th & 2nd
necorn = (40.7591923, -73.9665707) # E 59th & 2nd
boundary = (nwcorn, swcorn, secorn, necorn)

def getBikeDataCSV(mon):
    # Based loosely on Federica's code
    basename = mon + "-citibike-tripdata"
    zipname = basename + ".zip"
    os.chdir(puidata) # Now all file operations without an absolute path will take place in puidata
    fullfile = puidata + "/" + zipname
    # Check if the file is present
    print("Checking presence of " + zipname)
    print(os.getenv("PWD"))
    if os.path.isfile(basename + ".csv"):
        print("Found file " + basename + ".csv")
        if os.path.isfile(mon + "-citibike-tripdata.csv"):
            if os.system("mv " + mon + "-citibike-tripdata.csv " + puidata):
                print("Could not move file to " + puidata)
                return
    else:
        print("Downloading file " + zipname)
        if not os.path.isfile(fullfile):
            os.system("curl -O " + sitename + "/" + zipname)
            if os.path.isfile(fullfile):
                print("Got " + zipname + "\n")
                os.system("unzip -q " + fullfile)
            else:
                print("Failed\n")
                return

Populating the interactive namespace from numpy and matplotlib


In [8]:
def in_rect(ptlat, ptlon, contour):
    # ptlat - scalar latitude of the coordinate being tested
    # ptlon - scalar longitude of the coordinate being tested
    # contour - tuple of coordinates bounding polygon
    # First look for two pairs of points in contour where
    # which ptlat is between
    bet_segs = []
    eq_pt = []
    try:
        for corner in range(0, len(contour)):
            if between(ptlat, contour[corner][0], contour[(corner + 1) % len(contour)][0]):
                bet_segs.append(corner)
            elif ptlat == contour[corner][0]:
                eq_pt.append(corner)
    except ValueError:
        return False
    # If there were no segments, return False
    if len(bet_segs) <= 1 and len(eq_pt) == 0:
        return False
    # If there is one equal to a vertex and none inside segment,
    # return whether it is on the vertex
    if len(eq_pt) == 1 and len(bet_segs) == 0:
        return ptlon == contour[eq_pt[0]][1]
    # If there are two equal to a vertex and none inside segment,
    # return whether it is on the line connecting those vertices, including ends
    if len(eq_pt) == 2 and len(bet_segs) == 0:
        return between(ptlon, contour[eq_pt[0]][1], contour[eq_pt[1]][1]
                      ) or ptlon == contour[eq_pt[0]][1] or ptlon == contour[eq_pt[1]][1]
    # If there are two or more segments, find the longitude-intercepts
    # for each of those segments with the latitude and return whether or
    # not the point is on a longitude in between
    if len(bet_segs) >= 2:
        xint_list = []
        # Find the longitude on each segment in the list intersecting the latitude
        for ver in bet_segs:
            xint = x_intercept(ptlat, ptlon, contour[ver], contour[(ver+1) % len(contour)])
            xint_list.append(xint)
        lessthan_lon = 0
        # If the number of segments with an x-intercept greater than the points x-coordinate
        # is odd, the point is in the polygon
        for xi in xint_list:
            if xi == ptlon:
                # On the segment
                return True
            if xi < ptlon:
                lessthan_lon += 1
        return (lessthan_lon % 2) == 1

def x_intercept(ptlat, ptlon, seg_ends):
    # Return the x-coordinate of the segment defined by seg_ends that has a y-coordinate of ptlat
    if seg_ends[0][0] == seg_ends[1][0]:
        if ptlat == seg_ends[0][0]:
            if between(ptlon, seg_ends[0][1], seg_ends[1][1]):
                return ptlon
            else:
                return Null
        else:
            return Null
    if seg_ends[0][1] == seg_ends[1][1]:
        return seg_ends[0][1]
    y_frac = (ptlat - seg_ends[0][0]) / (seg_ends[1][1] - seg_ends[0][1])
    x_int = segends[0][1] + y_frac * (seg_ends[1][1] - seg_ends[0][1])
    return x_int

def between (compare, st, en):
    if st < compare and compare < en:
        return True
    if en < compare and compare < st:
        return True
    return False


In [23]:
print(os.getenv("PUIDATA"))
#puidata = os.getenv("PUI2016") + "/data" # This should be fixed in subsequent iterations
# Obtain the data from online for the specified months
getBikeDataCSV(bikemon)
bamboo = pd.read_csv(puidata + "/" + bikemon + "-citibike-tripdata.csv")
# Remove the unneeded fields
bamboo = bamboo.drop(["tripduration", "bikeid", "usertype", "birth year", "gender"], axis=1)
"""
stationlist = {}
fh = open(puidata + "/station-list.txt", "w")
for i in range(0, bamboo["start station name"].size):
    for col in "start", "end":
        if not bamboo[col + " station name"][i] in stationlist.keys():
            mid = in_rect(bamboo[col + " station latitude"], bamboo[col + " station longitude"], boundary)
            stationlist[bamboo[col + " station name"][i]] = mid
            fh.write(bamboo[col + " station name"][i] + ",")
            if mid:
                fh.write("In\n")
            else:
                fh.write("Out\n")
"""
# Create the fields that will hold the calculated values whether the start and end are in Midtown
bamboo["st_in"] = pd.Series(np.zeros(len(bamboo["starttime"])))
bamboo["en_in"] = pd.Series(np.zeros(len(bamboo["starttime"])))
print("Selecting by location")
# Previously, there was a bug in the in_rect() subroutine that prevented getting accurate assessments
# of what was in Midtown. It was fixed offline, but could not test it with pandas due to Xauth problem on gw
bamboo.loc[in_rect(bamboo["end station latitude"], bamboo["end station longitude"],
                   boundary), "en_in"] = 1
bamboo.loc[in_rect(bamboo["start station latitude"], bamboo["start station longitude"],
                   boundary), "st_in"] = 1
print("Identified points in and out of Midtown")
bamboo.head(10)

/home/cusp/ss4977/PUIdata
Checking presence of 201508-citibike-tripdata.zip
/wingrdp/homedirs/ss4977/PUI2016_ss4977/HW3_ss4977
Found file 201508-citibike-tripdata.csv
Could not move file to /home/cusp/ss4977/PUI2016/data
Selecting by location
Identified points in and out of Midtown


Unnamed: 0,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,st_in,en_in
0,8/1/2015 00:00:04,8/1/2015 00:20:07,168,W 18 St & 6 Ave,40.739713,-73.994564,385,E 55 St & 2 Ave,40.757973,-73.966033,0.0,1.0
1,8/1/2015 00:00:05,8/1/2015 00:05:06,450,W 49 St & 8 Ave,40.762272,-73.987882,479,9 Ave & W 45 St,40.760193,-73.991255,0.0,0.0
2,8/1/2015 00:00:06,8/1/2015 00:07:18,312,Allen St & E Houston St,40.722055,-73.989111,296,Division St & Bowery,40.714131,-73.997047,0.0,0.0
3,8/1/2015 00:00:09,8/1/2015 00:04:43,382,University Pl & E 14 St,40.734927,-73.992005,229,Great Jones St,40.727434,-73.99379,0.0,0.0
4,8/1/2015 00:00:17,8/1/2015 00:21:13,352,W 56 St & 6 Ave,40.763406,-73.977225,432,E 7 St & Avenue A,40.726218,-73.983799,0.0,0.0
5,8/1/2015 00:00:24,8/1/2015 00:12:44,212,W 16 St & The High Line,40.743349,-74.006818,498,Broadway & W 32 St,40.748549,-73.988084,0.0,0.0
6,8/1/2015 00:00:30,8/1/2015 00:07:43,388,W 26 St & 10 Ave,40.749718,-74.00295,284,Greenwich Ave & 8 Ave,40.739017,-74.002638,0.0,0.0
7,8/1/2015 00:00:33,8/1/2015 00:26:49,492,W 33 St & 7 Ave,40.7502,-73.990931,492,W 33 St & 7 Ave,40.7502,-73.990931,0.0,0.0
8,8/1/2015 00:00:39,8/1/2015 00:14:43,387,Centre St & Chambers St,40.712733,-74.004607,391,Clark St & Henry St,40.697601,-73.993446,0.0,0.0
9,8/1/2015 00:00:49,8/1/2015 00:08:37,285,Broadway & E 14 St,40.734546,-73.990741,284,Greenwich Ave & 8 Ave,40.739017,-74.002638,0.0,0.0


In [None]:
# Remove all entries where both ends either are both in Midtown or both not in Midtown
dFrame = bamboo[bamboo["st_in"] != bamboo["en_in"]]
print(dFrame.head(10))
print("Reduced dataset")
# Identify which entries are on days of the week and hours of interest
dFrame["st_date"] = pd.to_datetime(dFrame["starttime"])
dFrame["en_date"] = pd.to_datetime(dFrame["stoptime"])
for nkey in "weekday", "day", "sttime", "etime":
    dFrame[nkey] = pd.Series(np.zeros(len(dFrame["st_date"])))
dFrame["weekday"] = dFrame["st_date"].dt.weekday
dFrame["day"] = dFrame["st_date"].dt.day
dFrame["sttime"] = dFrame["st_date"].dt.hour
dFrame["etime"] = dFrame["en_date"].dt.hour
#shoots = bamboo.loc[(dFrame["st_in"] != dFrame["en_in"]) & (dFrame["weekday"] < 5) &
#                    (((dFrame["sttime"] >= 7) & (dFrame["sttime"] < 10) |
#                     ((dFrame["etime"] >= 7) & (dFrame["etime"] < 10))))]
#shoots.head(10)
dFrame.head(10)

In [26]:
fh = open(puidata + "/station-list.txt", "w")
for i in range(0, bamboo["start station name"].size):
    for col in "start", "end":
        if not bamboo[col + " station name"][i] in stationlist.keys():
            mid = in_rect(bamboo[col + " station latitude"], bamboo[col + " station longitude"], boundary)
            stationlist[bamboo[col + " station name"][i]] = mid
            fh.write(bamboo[col + " station name"][i] + ",")
            if mid:
                fh.write("In\n")
            else:
                fh.write("Out\n")
fh.close

<function close>

In [None]:
# Calculate aggregated statistics. Will test when X is working from gw
enter = shoots[shoots["en_in"] == 1]
leave = shoots[shoots["st_in"] == 1]
enter_stats = enter.groupby(['day', 'sttime']).agg({'en_in': [np.size]})
leave_stats = leave.groupby(['day', 'sttime']).agg({'st_in': [np.size]})
pl.hist(enter_stats["en_in"], bins=10,
        label="Bike rides into Midtown by rush-hour hour")
pl.hist(leave_stats["st_in"], bins=10,
        label="Bike rides out of Midtown by rush-hour hour")
ent_mean = enter_stats['en_in'].mean()
ent_std = enter_stats['en_in'].std()
lv_mean = leave_stats['st_in'].mean()
lv_std = leave_stats['st_in'].std()

In [30]:
listofstation = stationlist.keys()
for i in range(0,20):
    if stationlist[listofstation[i]]:
        print(listofstation[i] + ": In")
    else:
        print(listofstation[i] + ": Out")

Broadway & W 37 St: Out
Clinton Ave & Flushing Ave: Out
W 49 St & 8 Ave: Out
W 20 St & 7 Ave: Out
Great Jones St: Out
Bedford Ave & Nassau Ave: Out
McKibbin St & Manhattan Ave: Out
Watts St & Greenwich St: Out
S 5 Pl & S 4 St: Out
Greenwich St & W Houston St: Out
S 4 St & Wythe Ave: Out
N Henry St & Richardson St: Out
Hudson St & Reade St: Out
E 12 St & 3 Ave: Out
Willoughby Ave & Hall St: Out
W 44 St & 5 Ave: Out
1 Ave & E 18 St: Out
Greenwich St & Warren St: Out
Greenpoint Ave & Manhattan Ave: Out
Allen St & E Houston St: Out
