# Overview

This is the script for Applied Data Science class project. The team performed in-depth analysis and comparison of bus speed between local and express buses. This analysis focuses on M15 and M15SBS that runs on East Manhattan.

It includes the following: 
- Load Data
- Data Processing
- Bus Speed Calculation
- Analysis
- Statistical Testing

In [1]:
%matplotlib inline
import matplotlib.pylab as plt
import seaborn as sns
#import geopandas as gpd
import pandas as pd
import numpy as np
from datetime import timedelta, datetime

#from fiona.crs import from_epsg
import shapely
from shapely.geometry import *


from geopy.distance import vincenty



---

# Load Data

- Remove Redundent Columns
- Specify DateTime Columns
- Convert time from UTC to EDT

In [63]:
def read_data(url):
    '''
    This function reads the specified data from URL, drop unwanted columns, and convert UTC to EST
    Input: url to the dataset
    Output: Pandas Dataframe of bus trip data
    
    '''
    try:
        df = pd.read_csv(url, infer_datetime_format = True)
        df = df.drop(["Unnamed: 0", "Unnamed: 0.1"], axis = 1)
        df['timestamp_utc'] = pd.to_datetime(df['timestamp'])
        df['timestamp_est'] = df['timestamp_utc'] - timedelta(hours=5)
    except Exception as e:
        print(e)
        pass

    return df


# Run the Data Load Scritp
url_m15 = "https://raw.githubusercontent.com/cmoscardi/ads_2017/master/data/m15_final.csv"
url_m15sbs = "./data/m15_sbs_final.csv"

m15_df, m15sbs_df = read_data(url_m15), read_data(url_m15sbs)


In [57]:
m15_df.head()

Unnamed: 0,timestamp,vehicle_id,latitude,longitude,bearing,progress,service_date,trip_id,block_assigned,next_stop_id,dist_along_route,dist_from_stop,Day,Hour,DOW,DOW_Label,timestamp_utc,timestamp_est
0,2017-05-01 00:00:00,5254,40.731723,-73.985397,233.71,0,20170430,OH_B7-Sunday-113300_M15_36,1,404105,9289.01,121.04,1,0,0,Mon,2017-05-01 00:00:00,2017-04-30 19:00:00
1,2017-05-01 00:00:01,5585,40.753502,-73.966428,54.36,0,20170430,OH_B7-Sunday-115300_M15_28,1,401701,7417.29,13.53,1,0,0,Mon,2017-05-01 00:00:01,2017-04-30 19:00:01
2,2017-05-01 00:00:03,5549,40.777583,-73.948857,53.97,0,20170430,OH_B7-Sunday-113300_M15_49,1,401716,10650.91,187.72,1,0,0,Mon,2017-05-01 00:00:03,2017-04-30 19:00:03
3,2017-05-01 00:00:03,5959,40.705924,-74.006331,34.83,0,20170430,OH_B7-Sunday-119300_M15_34,1,401667,867.48,67.73,1,0,0,Mon,2017-05-01 00:00:03,2017-04-30 19:00:03
4,2017-05-01 00:00:10,5544,40.789946,-73.939831,53.84,0,20170430,OH_B7-Sunday-113400_M15_23,1,401724,10146.39,224.11,1,0,0,Mon,2017-05-01 00:00:10,2017-04-30 19:00:10


In [58]:
m15_df.shape

(645808, 18)

In [59]:
m15_df.dtypes

timestamp                   object
vehicle_id                   int64
latitude                   float64
longitude                  float64
bearing                    float64
progress                     int64
service_date                 int64
trip_id                     object
block_assigned               int64
next_stop_id                 int64
dist_along_route           float64
dist_from_stop             float64
Day                          int64
Hour                         int64
DOW                          int64
DOW_Label                   object
timestamp_utc       datetime64[ns]
timestamp_est       datetime64[ns]
dtype: object

In [60]:
m15sbs_df.head()

Unnamed: 0,timestamp,vehicle_id,latitude,longitude,bearing,progress,service_date,trip_id,block_assigned,next_stop_id,dist_along_route,dist_from_stop,Day,Hour,DOW,DOW_Label,timestamp_utc,timestamp_est
0,2017-05-01 00:00:01,1260,40.778163,-73.951532,233.87,0,20170430,OF_B7-Sunday-119100_SBS15_446,1,401756,3935.6,668.81,1,0,0,Mon,2017-05-01 00:00:01,2017-04-30 19:00:01
1,2017-05-01 00:00:02,1263,40.782688,-73.945127,54.26,0,20170430,OF_B7-Sunday-114900_SBS15_449,1,903102,11361.77,258.57,1,0,0,Mon,2017-05-01 00:00:02,2017-04-30 19:00:02
2,2017-05-01 00:00:03,1257,40.738593,-73.977293,53.86,0,20170430,OF_B7-Sunday-116600_SBS15_450,1,404995,5748.06,249.92,1,0,0,Mon,2017-05-01 00:00:03,2017-04-30 19:00:03
3,2017-05-01 00:00:08,1268,40.725444,-73.98997,233.85,0,20170430,OF_B7-Sunday-113500_SBS15_461,1,803078,10214.33,248.88,1,0,0,Mon,2017-05-01 00:00:08,2017-04-30 19:00:08
4,2017-05-01 00:00:08,1272,40.724697,-73.990513,234.26,0,20170430,OF_B7-Sunday-115200_SBS15_462,1,803078,10214.33,154.07,1,0,0,Mon,2017-05-01 00:00:08,2017-04-30 19:00:08


In [61]:
m15sbs_df.shape

(645572, 18)

In [62]:
m15sbs_df.dtypes

timestamp                   object
vehicle_id                   int64
latitude                   float64
longitude                  float64
bearing                    float64
progress                     int64
service_date                 int64
trip_id                     object
block_assigned               int64
next_stop_id                 int64
dist_along_route           float64
dist_from_stop             float64
Day                          int64
Hour                         int64
DOW                          int64
DOW_Label                   object
timestamp_utc       datetime64[ns]
timestamp_est       datetime64[ns]
dtype: object

---

# Data Cleaning and Selection

- Select on Block Assignment > 0; if Block Assignment is 0, then the bus is not in service even it's runnin
- Filter out trips with less than 15 records
- create direction indicator


In [64]:
def data_processing(df):
    
    '''
    This function is the workflow of data cleaning and processing. It selects only buses in service, 
    filter out trips with enough samples, and infers the trip direction
    Input: data frame for processing
    Output: clean data frame with new columns
    '''
    
    try:
        # Select only Block Assignment > 0
        clean_df = df[df.block_assigned > 0]

        # Filter out trips with less than 15 records
        clean_df = clean_df.groupby(clean_df.trip_id).filter(lambda group: len(group) > 15)

        # Infer Direciton

        for trip_id, group in clean_df.groupby("trip_id"):
            sort = group.sort_values("timestamp")
            trip =  clean_df["trip_id"] == trip_id
            if sort.iloc[0].latitude > sort.iloc[-1].latitude:
                clean_df.loc[trip, "direction"] = 0 # southbound
            else:
                clean_df.loc[trip, "direction"] = 1 # northbound
                
    except Exception as e: 
        print(e)
        pass
            
    return clean_df

# Run the Data Processing Script
clean_m15, clean_m15sbs = data_processing(m15_df), data_processing(m15sbs_df)

In [65]:
clean_m15.shape

(645808, 19)

In [66]:
clean_m15.groupby("direction").size()

direction
0.0    351599
1.0    294209
dtype: int64

In [67]:
clean_m15sbs.groupby("direction").size()

direction
0.0    340017
1.0    305555
dtype: int64

---

# Analysis

- Calculate speed between stops
- 