# Data Download & Processing

This notebook builds a common dataset for our analysis. It performs the following steps:
- download MTA bus data from May 1 to 31, 2017
- process data in each file 
- select only M15 and M15 SBS buses
- concatenate daily dataset into one consolidated dataset

---

# Download MTA Bus Data

- Download
- Unzip
- Remove zipped files (TODO: the cells below currently keep the archives in `./download`)

In [10]:
# Download Bus Time for all May, 2017
# Only do it at set up


for i in [9]:
    
    if i < 10: 
        file = 'bus_time_2017050' + str(i) + '.csv.xz'
    else: 
        file = 'bus_time_201705' + str(i) + '.csv.xz'
    
    url = 'http://data.mytransit.nyc.s3.amazonaws.com/bus_time/2017/2017-05/' + file 
    
    print("--- Downloading MTA Bus Data of May %i, 2017 ---" %i)
    !wget -nc $url -P './download'
    
    print("--- Unzipping MTA Bus Data of May %i, 2017 ---" %i)
    path = "./download/" + file
    !7za e $path -o./data -aoa


--- Downloading MTA Bus Data of May 9, 2017 ---
--2017-11-22 15:20:55--  http://data.mytransit.nyc.s3.amazonaws.com/bus_time/2017/2017-05/bus_time_20170509.csv.xz
Resolving data.mytransit.nyc.s3.amazonaws.com (data.mytransit.nyc.s3.amazonaws.com)... 54.231.33.155
Connecting to data.mytransit.nyc.s3.amazonaws.com (data.mytransit.nyc.s3.amazonaws.com)|54.231.33.155|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41957164 (40M) [text/csv]
Saving to: ‘./download/bus_time_20170509.csv.xz’


2017-11-22 15:20:57 (44.3 MB/s) - ‘./download/bus_time_20170509.csv.xz’ saved [41957164/41957164]

--- Unzipping MTA Bus Data of May 9, 2017 ---

7-Zip (A) [64] 9.20  Copyright (c) 1999-2010 Igor Pavlov  2010-11-18
p7zip Version 9.20 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,8 CPUs)

Processing archive: ./download/bus_time_20170509.csv.xz

Extracting  bus_time_20170509.csv

Everything is Ok

Size:       353835337
Compressed: 41957164


In [12]:
# Fetch the Manhattan GTFS bundle into ./download (-nc skips the request when
# the zip is already there), then unpack it in place; `unzip -o` overwrites
# existing files without prompting.
# Note: `file` holds a URL here, not a local path.
file = 'http://web.mta.info/developers/data/nyct/bus/google_transit_manhattan.zip'
!wget -nc $file -P './download'
!cd download; unzip -o google_transit_manhattan.zip

File ‘./download/google_transit_manhattan.zip’ already there; not retrieving.

Archive:  google_transit_manhattan.zip
  inflating: agency.txt              
  inflating: calendar.txt            
  inflating: calendar_dates.txt      
  inflating: stops.txt               
  inflating: trips.txt               
  inflating: stop_times.txt          
  inflating: shapes.txt              
  inflating: routes.txt              


In [60]:
# Disabled step: the commands below are wrapped in a string literal, so
# nothing is executed and the cell merely echoes the text.
'''

path = './download/gtfs_nyct_bus_20150103.zip'
!7za e $path -o./data

'''

"\n\npath = './download/gtfs_nyct_bus_20150103.zip'\n!7za e $path -o./data\n\n"

---

In [8]:
import pandas as pd
import numpy as np

In [13]:
# Read the three GTFS text tables from ./download into DataFrames; the
# stop_times table is kept under the name `schedules`.
trips, stops, schedules = (
    pd.read_csv(gtfs_path)
    for gtfs_path in (
        "./download/trips.txt",
        "./download/stops.txt",
        "./download/stop_times.txt",
    )
)

49753

### Process and Concatenate 3 Days of Bus Data for Testing

In [5]:
import os
# Bare attribute reference: this evaluates to the function object without
# calling it, so the line has no effect beyond echoing the function's repr.
os.path.isfile

In [11]:
import os

import pandas as pd

# Weekday labels keyed by pandas' dayofweek numbering (0 = Monday).
days = {0:'Mon',1:'Tues',2:'Weds',3:'Thurs',4:'Fri',5:'Sat',6:'Sun'}


def split_m15_reports(raw, min_reports=15):
    """Reduce one day of Bus Time reports to the M15 and M15 SBS trips.

    Parameters
    ----------
    raw : pandas.DataFrame
        Reports with at least the columns ``block_assigned``, ``timestamp``
        and ``trip_id``. The frame is not modified.
    min_reports : int, default 15
        A trip is kept only when it has strictly MORE than this many reports
        once rows with ``block_assigned != 1`` are discarded (so a trip with
        exactly 15 reports is dropped by default).

    Returns
    -------
    (m15, m15_sbs) : tuple of pandas.DataFrame
        Rows whose ``trip_id`` contains "_M15" and "_SBS15" respectively,
        with the extra columns ``Day``, ``Hour``, ``DOW`` and ``DOW_Label``.
    """
    assigned = raw[raw['block_assigned'] == 1]

    # Narrow to the two routes before doing any per-row work. Every report of
    # a trip carries the same trip_id, so selecting by trip_id first leaves
    # the per-trip report counts (and therefore the result) unchanged.
    # na=False treats a missing trip_id as "no match" instead of failing.
    trip_ids = assigned['trip_id']
    wanted = (trip_ids.str.contains("_M15", regex=False, na=False)
              | trip_ids.str.contains("_SBS15", regex=False, na=False))

    # Work on an independent copy so the column assignments below do not
    # write into a slice of `raw` (SettingWithCopyWarning).
    df = assigned[wanted].copy()

    # convert timestamp to DateTime and derive calendar fields
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['Day'] = df['timestamp'].dt.day
    df['Hour'] = df['timestamp'].dt.hour
    df['DOW'] = df['timestamp'].dt.dayofweek
    df['DOW_Label'] = df['DOW'].map(days)

    # throw away trips with `min_reports` or fewer reports
    reports_per_trip = df.groupby('trip_id')['trip_id'].transform('size')
    df = df[reports_per_trip > min_reports]

    m15 = df[df['trip_id'].str.contains("_M15", regex=False, na=False)]
    m15_sbs = df[df['trip_id'].str.contains("_SBS15", regex=False, na=False)]
    return m15, m15_sbs


def process_bus_day(day, data_dir='./data'):
    """Filter one day's Bus Time CSV and write the M15 / M15 SBS extracts.

    Reads ``<data_dir>/bus_time_201705DD.csv`` and writes
    ``..._m15_filtered.csv`` and ``..._m15_sbs_filtered.csv`` next to it.

    Parameters
    ----------
    day : int
        Day of May 2017 (zero-padded to two digits in the file name).
    data_dir : str, default './data'
        Directory holding the extracted daily CSV.

    Returns
    -------
    bool
        True when the outputs were written, False when both already existed
        and the day was skipped.

    Raises
    ------
    Whatever ``pandas.read_csv`` / ``to_csv`` raise (e.g. a missing input).
    """
    file_base = '{}/bus_time_201705{:02d}'.format(data_dir, day)
    file = file_base + ".csv"
    outputs = (file_base + "_m15_filtered.csv",
               file_base + "_m15_sbs_filtered.csv")

    # Skip only when BOTH extracts exist; checking just the first one would
    # never regenerate an SBS file lost to an interrupted earlier run.
    if all(os.path.isfile(out) for out in outputs):
        for out in outputs:
            print("Output already exists at {}".format(out))
        print("--- Moving on --- ")
        return False

    # ingest into a data frame
    print(" --- Processing %s ---" %file)
    raw = pd.read_csv(file)
    print(" --- Done (1/3) ---")

    m15, m15_sbs = split_m15_reports(raw)
    del raw  # release the full-day frame before writing
    print(" --- Done (2/3) --- ")

    # The index is written too (to_csv default), as before, so the files keep
    # the same layout for whatever reads them later.
    m15.to_csv(outputs[0])
    m15_sbs.to_csv(outputs[1])
    print(" --- Done (3/3) --- ")

    del m15
    del m15_sbs
    print("--- Done (cleanup) --- ")
    return True


for i in [9]:
    try:
        process_bus_day(i)
    except Exception as ex:
        # Best effort: say which day failed and why, then carry on with the
        # remaining days instead of aborting the whole run.
        print("--- Failed to process May %i, 2017: %r ---" % (i, ex))

 --- Processing ./data/bus_time_20170509.csv ---
 --- Done (1/3) ---
 --- Done (2/3) --- 
 --- Done (3/3) --- 
--- Done (cleanup) --- 


In [30]:
# Distinct trip ids among the reports whose trip_id contains "_SBS15".
# (Swap the pattern for "_M15" to list the other group instead.)
total_df.loc[total_df['trip_id'].str.contains("_SBS15"), 'trip_id'].unique()

array(['OF_B7-Sunday-119100_SBS15_446', 'OF_B7-Sunday-114900_SBS15_449',
       'OF_B7-Sunday-116600_SBS15_450', 'OF_B7-Sunday-112700_SBS15_455',
       'OF_B7-Sunday-113500_SBS15_461', 'OF_B7-Sunday-115200_SBS15_462',
       'OF_B7-Sunday-118100_SBS15_464', 'OF_B7-Sunday-111800_SBS15_454',
       'OF_B7-Sunday-118300_SBS15_435', 'OF_B7-Sunday-111500_SBS15_434',
       'OF_B7-Sunday-115700_SBS15_437', 'OF_B7-Sunday-117100_SBS15_445',
       'OF_B7-Sunday-120200_SBS15_455', 'OF_B7-Sunday-119200_SBS15_454',
       'OF_B7-Sunday-120100_SBS15_447', 'OF_B7-Sunday-121100_SBS15_448',
       'OF_B7-Sunday-122100_SBS15_449', 'OF_B7-Sunday-123200_SBS15_462',
       'OF_B7-Sunday-117400_SBS15_451', 'OF_B7-Sunday-121200_SBS15_461',
       'OF_B7-Sunday-123200_SBS15_450', 'OF_B7-Sunday-125200_SBS15_445',
       'OF_B7-Sunday-124400_SBS15_451', 'OF_B7-Sunday-126200_SBS15_464',
       'OF_B7-Sunday-125600_SBS15_454', 'OF_B7-Sunday-126800_SBS15_455',
       'OF_B7-Sunday-128200_SBS15_447', 'OF_B7-Sund

In [19]:
# Inner-join every bus report with the matching row of `trips` on trip_id.
join_df = total_df.merge(trips, on='trip_id', how='inner')
# An inner join silently returns zero rows when no key matches, so make that
# case visible instead of letting an empty frame flow into later cells.
if join_df.empty and len(total_df) > 0:
    print("WARNING: merge on trip_id matched 0 of %d reports; check that "
          "`trips` uses the same trip_id values as `total_df`" % len(total_df))

In [20]:
# (rows, columns) of the merged frame; zero rows means no trip_id matched.
join_df.shape

(0, 21)

In [31]:
# Spot-check the block_assigned flag on ten randomly drawn reports.
total_df['block_assigned'].sample(n=10)

817254     1
1282344    1
89769      1
2221719    1
3000728    1
924368     1
232103     1
2010816    1
2360826    0
3177518    1
Name: block_assigned, dtype: int64

In [62]:
# Peek at ten randomly drawn rows of the trips table.
trips.sample(n=10)

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,shape_id
63235,B9,JG_A5-Saturday,JG_A5-Saturday-115800_B9_25,BAY RDGE SHORE RD via AVENUE M via 60 ST,1,B90040
150771,M79,MQ_H5-Weekday,MQ_H5-Weekday-125100_M79_20,WEST SIDE-RIVERSIDE DR CROSSTOWN,1,M790026
132639,M23,MQ_A5-Sunday,MQ_A5-Sunday-079900_M23_15,EAST SIDE FDR DR CROSSTOWN,0,M230005
116424,M103,OH_A5-Saturday,OH_A5-Saturday-108700_M101_102,CITY HALL via LEX AV,1,M1030172
66534,BX11,WF_H5-Weekday,WF_H5-Weekday-146000_BX35_235,WASHINGTON HEIGHTS G W BRIDGE,1,BX110042
109491,BX8,WF_A5-Sunday,WF_A5-Sunday-129000_BX8_20,WESTCHESTER SQ,1,BX080082
175580,Q36,QV_A5-Weekday,QV_A5-Weekday-104500_Q36_5,JAMAICA 165 ST TERM,1,Q360148
150243,M79,MQ_A5-Weekday,MQ_A5-Weekday-127000_M79_18,YORKVILLE EAST END AV CROSSTOWN,0,M790024
198611,Q85,JA_A5-Weekday-SDon,JA_A5-Weekday-SDon-120700_MISC_567,JAMAICA PRSNS - ARCHR,1,Q851148
134690,M2,MV_J5-Weekday,MV_J5-Weekday-142000_M2_143,EAST VILLAGE 8 ST via 5 AV,1,M020171
