In [14]:
import pandas as pd
import numpy as np


In [15]:
def reformat(data):
    '''
    Turn a GTFS stop_times table into one row per hop between consecutive stops.

    Each input row is paired with the row that physically follows it in
    `data` (positional order, independent of the index labels).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'departure_time', 'stop_sequence' and 'stop_id'.
        Rows are expected to be ordered by trip and then by stop_sequence.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of `data` with these changes:
          * 'start_id'       - stop_id of the current row (start of the hop)
          * 'stop_id'        - stop_id of the following row (end of the hop)
          * 'departure_time' - departure_time of the following row
          * 'stop_sequence2' - stop_sequence of the following row
        Only rows whose following row has stop_sequence exactly one higher are
        kept. When a 'trip_id' column is present the following row must also
        belong to the same trip, so the last stop of one trip is never paired
        with a stop of the next trip.
    '''
    reformattedData = data.copy()

    # shift(-1) pulls the next row's value up by position and keeps the
    # original index, so the assignments below cannot be mis-aligned by index
    # labels. The final row has no successor and receives NaN.
    reformattedData['stop_sequence2'] = data['stop_sequence'].shift(-1)
    reformattedData['departure_time'] = data['departure_time'].shift(-1)
    reformattedData['stop_id'] = data['stop_id'].shift(-1)
    reformattedData['start_id'] = data['stop_id']

    # A NaN successor compares False, which drops the final row automatically.
    goodIndexes = (reformattedData['stop_sequence'] + 1 == reformattedData['stop_sequence2'])
    if 'trip_id' in data.columns:
        sameTrip = (data['trip_id'] == data['trip_id'].shift(-1))
        goodIndexes = goodIndexes & sameTrip

    return reformattedData[goodIndexes]
     
    

In [16]:
#helper functions for cleaning up time when day changes
def hh_mm_ss2seconds(hh_mm_ss):
    '''
    Convert a colon-separated time string to a number of seconds.

    input: string such as 'HH:MM:SS'; every field must parse as an int.
           The hour field is not capped, so '25:10:00' is accepted.
           Shorter forms work too: 'MM:SS' and 'SS'.
    output: int, total seconds
    raises: ValueError if a field is not an integer
    '''
    # Plain accumulation instead of reduce(): reduce is only a builtin on
    # Python 2, so this form runs unchanged on Python 2 and 3.
    seconds = 0
    for field in hh_mm_ss.split(':'):
        seconds = seconds * 60 + int(field)
    return seconds

def cleanTimes(t):
    '''
    Repair a duration that went negative because the clock rolled over to
    the next day.

    input: duration in seconds
    output: the duration plus 24 hours when it is below zero,
            otherwise the duration unchanged
    '''
    secondsPerDay = 24*60*60
    return t + secondsPerDay if t < 0 else t

In [17]:
# Data source: http://web.mta.info/developers/developer-data-terms.html#data
# under GTFS schedule data, New York City Transit Bus and New York City Transit Train - last updated march 24 2016

# ---- bus feeds: one folder per borough, each with stop_times and stops ----
brookStopTimes = pd.read_csv("./data/rawData/bus/google_transit_brooklyn/stop_times.txt")
brookStops = pd.read_csv("./data/rawData/bus/google_transit_brooklyn/stops.txt")

queensStopTimes = pd.read_csv("./data/rawData/bus/google_transit_queens/stop_times.txt")
queensStops = pd.read_csv("./data/rawData/bus/google_transit_queens/stops.txt")

bronxStopTimes = pd.read_csv("./data/rawData/bus/google_transit_bronx/stop_times.txt")
bronxStops = pd.read_csv("./data/rawData/bus/google_transit_bronx/stops.txt")

manhattanStopTimes = pd.read_csv("./data/rawData/bus/google_transit_manhattan/stop_times.txt")
manhattanStops = pd.read_csv("./data/rawData/bus/google_transit_manhattan/stops.txt")

statStopTimes = pd.read_csv("./data/rawData/bus/google_transit_staten_island/stop_times.txt")
statStops = pd.read_csv("./data/rawData/bus/google_transit_staten_island/stops.txt")

# ---- train feed: a single folder covers every line ----
stopTimes = pd.read_csv("./data/rawData/train/google_transit/stop_times.txt")
stops = pd.read_csv("./data/rawData/train/google_transit/stops.txt")
routes = pd.read_csv("./data/rawData/train/google_transit/routes.txt")
trips = pd.read_csv("./data/rawData/train/google_transit/trips.txt")

# ---- stop_times -> stop-to-stop segments, one feed at a time ----
bronx = reformat(bronxStopTimes)
stat = reformat(statStopTimes)
man = reformat(manhattanStopTimes)
queens = reformat(queensStopTimes)
brook = reformat(brookStopTimes)

# Label every segment with its mode of transport.
allRoutesTrain = reformat(stopTimes).assign(type="train")
allRoutesBus = pd.concat([bronx, stat, man, queens, brook]).assign(type="bus")

# Label every stop with the kind of stop it is.
busStops = pd.concat([brookStops, queensStops, bronxStops, manhattanStops, statStops]).assign(type="busStop")
stops = stops.assign(type="trainStop")

# ---- combined tables: train rows first, then bus rows ----
allStops = pd.concat([stops, busStops])
allRoutes = pd.concat([allRoutesTrain, allRoutesBus])



In [18]:

# Attach the destination stop's record to every segment (joined on stop_id)
# and label its coordinates as the drop-off point.
dropoffNames = {'stop_lat': 'dropoff_latitude', 'stop_lon': 'dropoff_longitude'}
result = allRoutes.merge(allStops, on='stop_id').rename(columns=dropoffNames)

# Attach the origin stop's record (segment start_id == stop's stop_id) and
# label its coordinates as the pick-up point. Columns present on both sides
# of this second join get the _x (segment side) / _y (stop side) suffixes,
# which is why the destination id is called stop_id_x below.
# NOTE(review): both inputs of the first join carry a 'type' column, so the
# un-suffixed 'type' kept here appears to come from the origin stop table
# ("trainStop"/"busStop") rather than from the segment ("train"/"bus") - confirm
# that this is intended.
pickupNames = {'stop_lat': 'pickup_latitude', 'stop_lon': 'pickup_longitude'}
keptColumns = ['trip_id', 'stop_id_x', 'start_id', 'type', 'arrival_time', 'departure_time']
mtaNames = {'start_id': 'mta_start_id', 'stop_id_x': 'mta_stop_id'}
result2 = (
    result
    .merge(allStops, left_on='start_id', right_on='stop_id')
    .rename(columns=pickupNames)
    [keptColumns]
    .rename(columns=mtaNames)
)



In [19]:
#compute duration of trip
times = pd.DataFrame()
times['ar'] = result2['arrival_time'].apply(lambda x:hh_mm_ss2seconds(x) )
times['de'] = result2['departure_time'].apply(lambda x:hh_mm_ss2seconds(x) )
times['dif']= times['de']-times['ar']
#handle case where time stamp switched between days
result2['duration'] = times['dif'].apply(lambda x: cleanTimes(x)/60.)


result2 = result2[['mta_stop_id','mta_start_id','type','duration']]
result2.to_csv('./data/mergedData/busTrainTrips.csv')

allStops.rename(columns = {'stop_id':'id','stop_lat':'lat','stop_lon':'long'}, inplace = True)
allStops =allStops[['id','lat','long','type']]

allStops = allStops.reset_index()

allStops.to_csv('/home/michael/Desktop/bigDataProject/finalVersion/data/mergedData/busTrainStops.csv')

In [7]:
def stopReducer(data,radius):
    '''
    Greedy single-pass grouping of stops that lie close together.

    Rows are visited in positional order. Every row that has no group yet
    starts a new group and pulls in all other still-ungrouped rows that
      * have the same 'roughLatLon' value, and
      * lie strictly within `radius` of it in L1 (manhattan) distance,
        i.e. |lat difference| + |long difference| < radius.
    A row that has been assigned is never moved to a later group. Because of
    the 'roughLatLon' condition, two nearby stops that fall into different
    coarse cells are not combined.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'lat', 'long' and 'roughLatLon' (hashable values,
        e.g. tuples). Modified in place.
    radius : number
        Distance threshold, in the same units as 'lat' / 'long'.

    Returns
    -------
    pandas.DataFrame
        The same `data` object. Column 'assigned' holds the integer group
        number (0, 1, 2, ...) of every row. Columns 'x', 'y' and 'L1' are
        scratch columns holding the distances to the last row that started a
        group; they are written only for compatibility with earlier output.

    The row that starts a group always belongs to it, so every row ends up
    with a group number even when radius <= 0 or a coordinate is NaN.
    '''
    total = len(data)
    lats = data['lat'].values
    lons = data['long'].values

    # Translate each coarse-cell key into a small integer once, so the
    # "same cell" test inside the loop is a cheap vectorised comparison.
    cellIds = dict()
    cells = np.array(
        [cellIds.setdefault(key, len(cellIds)) for key in data['roughLatLon']],
        dtype=int)

    # Group labels are built in a plain object array ("a" = not assigned yet)
    # and written to the frame once at the end.
    assigned = np.empty(total, dtype=object)
    assigned[:] = "a"
    unassigned = np.ones(total, dtype=bool)

    latGap = lonGap = l1 = None
    groupNumber = 0
    for rowNumber in range(total):
        if not unassigned[rowNumber]:
            continue  # already swallowed by an earlier group

        latGap = np.absolute(lats - lats[rowNumber])
        lonGap = np.absolute(lons - lons[rowNumber])
        l1 = latGap + lonGap

        members = (l1 < radius) & unassigned & (cells == cells[rowNumber])
        members[rowNumber] = True  # the seed row is always part of its own group

        assigned[members] = groupNumber
        unassigned[members] = False
        groupNumber += 1

    data['assigned'] = assigned
    if l1 is not None:
        data["x"] = latGap
        data["y"] = lonGap
        data['L1'] = l1

    return data


In [16]:
allStops['assigned']="a" # mark unassigned rows
# Coarse grid cell of each stop: coordinates truncated to two decimal places.
allStops['roughLat']=allStops['lat'].apply(lambda x: float(int(x*100))/100)
allStops['roughLon']=allStops['long'].apply(lambda x: float(int(x*100))/100)
# list(...) is required on Python 3, where zip() returns an iterator that
# cannot be assigned to a column; on Python 2 it is a no-op.
allStops['roughLatLon']=list(zip(allStops['roughLat'] , allStops['roughLon']))

# Group stops closer than .002 (L1 distance in lat/long units) inside each cell.
a = stopReducer(allStops,.002)

In [33]:
# Keep only the position, type, group number and id of every stop,
# then count how many distinct stop ids there are.
reducedStops = a.loc[:, ['lat','long','type','assigned','id']]
reducedStops['id'].nunique()

14201

In [20]:
# Read back the stop-to-stop segment table that this notebook wrote earlier.
# NOTE(review): this rebinds `trips`, which previously held the train feed's
# trips.txt table.
trips = pd.read_csv('./data/mergedData/busTrainTrips.csv')

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,mta_stop_id,mta_start_id,type,duration
0,0,902N,901N,trainStop,1.500000
1,1,902N,901N,trainStop,1.500000
2,2,902N,901N,trainStop,1.500000
3,3,902N,901N,trainStop,1.500000
4,4,902N,901N,trainStop,1.500000
5,5,902N,901N,trainStop,1.500000
6,6,902N,901N,trainStop,1.500000
7,7,902N,901N,trainStop,1.500000
8,8,902N,901N,trainStop,1.500000
9,9,902N,901N,trainStop,1.500000


In [25]:
# Show the column names currently present on allStops.
allStops.columns.values

array(['id', 'lat', 'long', 'type'], dtype=object)

In [10]:
# reset_index() gives the copy a fresh 0..n-1 index and keeps the previous
# index labels as a regular column.
allStops2 = allStops.reset_index()

In [11]:
# Spot check: id of the stop whose index label is 5.
allStops2.loc[5, 'id']


'103S'