# Data Download & Processing

The objective of this script is to create a common dataset for our analysis. It includes the following functions:
- download MTA bus data from May 1 to 31, 2017
- process data in each file 
- select only M15 and M15 SBS buses
- concatenate daily dataset into one consolidated dataset

---

# Download MTA Bus Data

- Download
- Unzip
- Remove zipped files

In [1]:
# Download Bus Time for all May, 2017
# Only do it at set up


for i in range(1, 32):
    
    if i < 10: 
        file = 'bus_time_2017050' + str(i) + '.csv.xz'
    else: 
        file = 'bus_time_201705' + str(i) + '.csv.xz'
    
    url = 'http://data.mytransit.nyc.s3.amazonaws.com/bus_time/2017/2017-05/' + file 
    
    print("--- Downloading MTA Bus Data of May %i, 2017 ---" %i)
    !wget -nc $url -P './download'
    
    print("--- Unzipping MTA Bus Data of May %i, 2017 ---" %i)
    path = "./download/" + file
    !7za e $path -o./data -aoa


--- Downloading MTA Bus Data of May 1, 2017 ---
File “./download/bus_time_20170501.csv.xz” already there; not retrieving.

--- Unzipping MTA Bus Data of May 1, 2017 ---

7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,64 CPUs Intel(R) Xeon(R) CPU E5-4640 0 @ 2.40GHz (206D7),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan ./download                    1 file, 45612524 bytes (44 MiB)

Extracting archive: ./download/bus_time_20170501.csv.xz
--
Path = ./download/bus_time_20170501.csv.xz
Type = xz
Physical Size = 45612524
Method = LZMA2:26 CRC64
Streams = 1
Blocks = 1

      0% - bus_time_20170501.c                            1% - bus_time_20170501.c                            3% - bus_time_20170501.c                            4% - bus_time_20170501.c                            7% - bus_time_20170501.c                            9% - bus_time_20170501.c                           12% - bus_

In [2]:
file = 'http://web.mta.info/developers/data/nyct/bus/google_transit_manhattan.zip'
!wget -nc $file -P './download'
!cd download; unzip -o google_transit_manhattan.zip

--2017-11-28 13:51:58--  http://web.mta.info/developers/data/nyct/bus/google_transit_manhattan.zip
Resolving web.mta.info... 23.215.130.154, 23.215.130.192
Connecting to web.mta.info|23.215.130.154|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: “./download/google_transit_manhattan.zip”

    [                      <=>              ] 13,385,446  3.07M/s   in 4.5s    

2017-11-28 13:52:03 (2.86 MB/s) - “./download/google_transit_manhattan.zip” saved [13385446]

Archive:  google_transit_manhattan.zip
  inflating: agency.txt              
  inflating: calendar.txt            
  inflating: calendar_dates.txt      
  inflating: stops.txt               
  inflating: trips.txt               
  inflating: stop_times.txt          
  inflating: shapes.txt              
  inflating: routes.txt              


In [60]:
'''

path = './download/gtfs_nyct_bus_20150103.zip'
!7za e $path -o./data

'''

"\n\npath = './download/gtfs_nyct_bus_20150103.zip'\n!7za e $path -o./data\n\n"

---

In [3]:
import pandas as pd
import numpy as np

In [4]:
trips = pd.read_csv("./download/trips.txt")
stops = pd.read_csv("./download/stops.txt")
schedules = pd.read_csv("./download/stop_times.txt")

### Process and Concatenate 3 Days of Bus Data for Testing

In [5]:
import os
os.path.isfile

<function genericpath.isfile>

In [6]:
# Basic Feature Engineering
# Outputs:
# 1) 31 daily dataset with all bus lines
# 2) 31 daily M15 datasets
# 3) 31 daily M15SBS datasets

import pandas as pd
for i in range(1, 32):
    try:
        if i < 10: 
            file_base = './data/bus_time_2017050' + str(i)
            file = file_base + ".csv"
        else: 
            file_base = './data/bus_time_201705' + str(i)
            file = file_base + ".csv"
            
        if os.path.isfile(file_base + "_m15_filtered.csv"):
            print("Output already exists at {}".format(file_base + "_m15_filtered.csv"))
            print("--- Moving on --- ")
            continue
            
        # ingest into a data frame
        print(" --- Processing %s ---" %file)
        df = pd.read_csv(file)
        df = df[df['block_assigned'] == 1]
        # convert timestamp to DateTime
        df['timestamp'] = pd.to_datetime(df['timestamp'], infer_datetime_format = True)
        df['Day'] = df['timestamp'].apply(lambda x: x.day)
        df['Hour'] =df['timestamp'].apply(lambda x: x.hour)
        df['DOW'] = df['timestamp'].dt.dayofweek

        days = {0:'Mon',1:'Tues',2:'Weds',3:'Thurs',4:'Fri',5:'Sat',6:'Sun'}
        df['DOW_Label'] = df['DOW'].apply(lambda x: days[x])
        
        print(" --- Done (1/3) ---")
        
        #throw away trips with <15 reports
        df = df.groupby(df.trip_id).filter(lambda group: len(group) > 15)
        for trip_id, group in df.groupby("trip_id"):
            sort = group.sort_values("timestamp")
            if sort.first().latitude > sort.last().latitude:
                group["direction"] = 0 # southbound
            else:
                group["direction"] = 1 # northbound
        
        print(" --- Done (2/3) --- ")
        m15 = df[df.trip_id.str.contains("_M15")]
        m15_sbs = df[df.trip_id.str.contains("_SBS15")]
        
        m15.to_csv(file_base + "_m15_filtered.csv")
        m15_sbs.to_csv(file_base + "_m15_sbs_filtered.csv")
        print(" --- Done (3/3) --- ")
        del df
        del m15
        del m15_sbs
        print("--- Done (cleanup) --- ")
        
    except Exception as ex:
        print(ex)
        pass

 --- Processing ./data/bus_time_20170501.csv ---


  interactivity=interactivity, compiler=compiler, result=result)


 --- Done (1/3) ---
 --- Done (2/3) --- 
 --- Done (3/3) --- 
--- Done (cleanup) --- 
 --- Processing ./data/bus_time_20170502.csv ---
 --- Done (1/3) ---
 --- Done (2/3) --- 
 --- Done (3/3) --- 
--- Done (cleanup) --- 
 --- Processing ./data/bus_time_20170503.csv ---
 --- Done (1/3) ---
 --- Done (2/3) --- 
 --- Done (3/3) --- 
--- Done (cleanup) --- 
 --- Processing ./data/bus_time_20170504.csv ---
 --- Done (1/3) ---
 --- Done (2/3) --- 
 --- Done (3/3) --- 
--- Done (cleanup) --- 
 --- Processing ./data/bus_time_20170505.csv ---
 --- Done (1/3) ---
 --- Done (2/3) --- 
 --- Done (3/3) --- 
--- Done (cleanup) --- 
 --- Processing ./data/bus_time_20170506.csv ---
 --- Done (1/3) ---
 --- Done (2/3) --- 
 --- Done (3/3) --- 
--- Done (cleanup) --- 
 --- Processing ./data/bus_time_20170507.csv ---
 --- Done (1/3) ---
 --- Done (2/3) --- 
 --- Done (3/3) --- 
--- Done (cleanup) --- 
 --- Processing ./data/bus_time_20170508.csv ---
 --- Done (1/3) ---
 --- Done (2/3) --- 
 --- Done (3/3

In [12]:
# Concatenate all M15 datasets of May 2017

m15_final_df = pd.DataFrame()
m15_sbs_final_df = pd.DataFrame()

for i in range(1, 32):
    if i < 10: 
        m15_file = './data/bus_time_2017050' + str(i) + "_m15_filtered.csv"
        m15_sbs_file = './data/bus_time_2017050' + str(i) + "_m15_sbs_filtered.csv"
    else: 
        m15_file = './data/bus_time_201705' + str(i) + "_m15_filtered.csv"
        m15_sbs_file = './data/bus_time_201705' + str(i) + "_m15_sbs_filtered.csv"
    
    # read in each daily file
    m15_df = pd.read_csv(m15_file)
    m15sbs_df = pd.read_csv(m15_sbs_file)
    
    # concatenate daily fales
    m15_final_df = pd.concat([m15_final_df, m15_df])
    m15_sbs_final_df = pd.concat([m15_sbs_final_df, m15sbs_df])
    
    print("---  concate {} ---".format(m15_file))
    print("---  concate {} ---".format(m15_sbs_file))
    
    # clean up 
    del m15_df
    del m15sbs_df

---  concate ./data/bus_time_20170501_m15_filtered.csv ---
---  concate ./data/bus_time_20170501_m15_sbs_filtered.csv ---
---  concate ./data/bus_time_20170502_m15_filtered.csv ---
---  concate ./data/bus_time_20170502_m15_sbs_filtered.csv ---
---  concate ./data/bus_time_20170503_m15_filtered.csv ---
---  concate ./data/bus_time_20170503_m15_sbs_filtered.csv ---
---  concate ./data/bus_time_20170504_m15_filtered.csv ---
---  concate ./data/bus_time_20170504_m15_sbs_filtered.csv ---
---  concate ./data/bus_time_20170505_m15_filtered.csv ---
---  concate ./data/bus_time_20170505_m15_sbs_filtered.csv ---
---  concate ./data/bus_time_20170506_m15_filtered.csv ---
---  concate ./data/bus_time_20170506_m15_sbs_filtered.csv ---
---  concate ./data/bus_time_20170507_m15_filtered.csv ---
---  concate ./data/bus_time_20170507_m15_sbs_filtered.csv ---
---  concate ./data/bus_time_20170508_m15_filtered.csv ---
---  concate ./data/bus_time_20170508_m15_sbs_filtered.csv ---
---  concate ./data/bus_

In [16]:
# Dataset Validation
m15_final_df.shape

(645808, 17)

In [20]:
m15_final_df['timestamp'] = pd.to_datetime(m15_final_df['timestamp'], infer_datetime_format= True)
print(m15_final_df['timestamp'].min())
print(m15_final_df['timestamp'].max())

2017-05-01 00:00:00
2017-05-31 23:59:41


In [None]:
m15_final_df.sample()

In [21]:
m15_sbs_final_df['timestamp'] = pd.to_datetime(m15_sbs_final_df['timestamp'], infer_datetime_format= True)
print(m15_sbs_final_df['timestamp'].min())
print(m15_sbs_final_df['timestamp'].max())

2017-05-01 00:00:01
2017-05-31 23:59:41


In [22]:
# Dataset Export
m15_final_df.to_csv("./data/m15_final.csv")
m15_sbs_final_df.to_csv("./data/m15_sbs_final.csv")

In [1]:
import pandas as pd


In [4]:
m15_oneday = pd.read_csv("data/bus_time_20170501_m15_filtered.csv")

In [17]:
df = m15_oneday
df["timestamp"] = pd.to_datetime(df['timestamp'])
for trip_id, group in df.groupby("trip_id"):
    sort = group.sort_values("timestamp")
    trip =  df["trip_id"] == trip_id
    if sort.iloc[0].latitude > sort.iloc[-1].latitude:
        df.loc[trip, "direction"] = 0 # southbound
    else:
        df.loc[trip, "direction"] = 1 # northbound


In [21]:
df.groupby("trip_id").first()

Unnamed: 0_level_0,Unnamed: 0,timestamp,vehicle_id,latitude,longitude,bearing,progress,service_date,block_assigned,next_stop_id,dist_along_route,dist_from_stop,Day,Hour,DOW,DOW_Label,direction
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
OH_B7-Sunday-113300_M15_36,19,2017-05-01 00:00:00,5254,40.731723,-73.985397,233.71,0,20170430,1,404105,9289.01,121.04,1,0,0,Mon,0.0
OH_B7-Sunday-113300_M15_49,151,2017-05-01 00:00:03,5549,40.777583,-73.948857,53.97,0,20170430,1,401716,10650.91,187.72,1,0,0,Mon,1.0
OH_B7-Sunday-113400_M15_23,479,2017-05-01 00:00:10,5544,40.789946,-73.939831,53.84,0,20170430,1,401724,10146.39,224.11,1,0,0,Mon,1.0
OH_B7-Sunday-114300_M15_29,1340,2017-05-01 00:00:28,5556,40.724012,-73.990985,262.81,0,20170430,1,803078,10214.33,67.51,1,0,0,Mon,0.0
OH_B7-Sunday-115300_M15_28,58,2017-05-01 00:00:01,5585,40.753502,-73.966428,54.36,0,20170430,1,401701,7417.29,13.53,1,0,0,Mon,1.0
OH_B7-Sunday-115300_M15_30,668,2017-05-01 00:00:14,5705,40.767240,-73.956407,54.19,0,20170430,1,401709,7147.28,109.86,1,0,0,Mon,1.0
OH_B7-Sunday-115300_M15_44,819,2017-05-01 00:00:17,5256,40.749140,-73.972706,234.21,0,20170430,1,401772,7148.00,193.73,1,0,0,Mon,0.0
OH_B7-Sunday-116300_M15_47,907,2017-05-01 00:00:19,5547,40.761396,-73.963764,233.86,0,20170430,1,401765,5704.99,308.01,1,0,0,Mon,0.0
OH_B7-Sunday-117300_M15_51,1234,2017-05-01 00:00:26,5638,40.712280,-73.994693,4.86,0,20170430,1,402090,2139.04,49.25,1,0,0,Mon,1.0
OH_B7-Sunday-117400_M15_37,867,2017-05-01 00:00:18,5613,40.748731,-73.969845,53.68,0,20170430,1,401698,4758.19,71.19,1,0,0,Mon,1.0
