# Toronto Transit Data GTFS (General Transit Feed Specification)


---


**Author**: Group 1

**Creation Date**: December 4, 2021

**Revision Date**: December 11, 2021


---


**Data Source**: Toronto Transit Commission (TTC) Routes and Schedules

**Data**: https://open.toronto.ca/dataset/ttc-routes-and-schedules/

**Data Dictionary**: 

**Data Licence**: See below for full data licence details.

---



This notebook combines Toronto Transit Commission (TTC) stop locations (`stops.txt`) with data about serious vehicle collisions in the city of Toronto, Ontario (`KSI_Incidents.csv`).

This notebook will prepare the data for analysis.



Note: To run this notebook, create a shortcut to the shared drive in your Google Drive and set the `file_path` variable to the path of that shortcut.

In [None]:
# Mount Google Drive at /content/drive so the data files referenced through
# `file_path` in the cells below can be read from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import datetime
from IPython.display import display
import plotly.express as px
import plotly.graph_objects as go
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh') 
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
import csv

!pip install geopandas
from shapely.geometry import Polygon
from shapely.geometry import Point
import geopandas as gpd



In [None]:
# Names of the two input files stored in the shared data folder.
file_incidents = 'KSI_Incidents.csv'
file_ttc = 'stops.txt'

# Folder on the mounted Drive that contains the input files. The original note
# marks this path as specific to one user ("ECHO"); change it to your own shortcut.
file_path = '/content/drive/MyDrive/Data Science/1-Foundations of Data Science/Foundations of Data Science - Group Project/Data Files/'

In [None]:
# Importing stops data file
file_ttc=pd.read_csv(file_path+file_ttc)
file_ttc.head(3)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,262,662,Danforth Rd at Kennedy Rd,,43.714379,-79.260939,,,,,,2
1,263,929,Davenport Rd at Bedford Rd,,43.674448,-79.399659,,,,,,1
2,264,940,Davenport Rd at Dupont St,,43.675511,-79.401938,,,,,,2


In [None]:
# Print column dtypes and non-null counts for the loaded stops table.
file_ttc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9476 entries, 0 to 9475
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   stop_id              9476 non-null   int64  
 1   stop_code            9476 non-null   int64  
 2   stop_name            9476 non-null   object 
 3   stop_desc            0 non-null      float64
 4   stop_lat             9476 non-null   float64
 5   stop_lon             9476 non-null   float64
 6   zone_id              0 non-null      float64
 7   stop_url             0 non-null      float64
 8   location_type        0 non-null      float64
 9   parent_station       0 non-null      float64
 10  stop_timezone        0 non-null      float64
 11  wheelchair_boarding  9476 non-null   int64  
dtypes: float64(8), int64(3), object(1)
memory usage: 888.5+ KB


In [None]:
# Keep the stop identifiers, name and coordinates, and give the coordinate
# columns the generic names `latitude` / `longitude`.
ttc_stop = (
    file_ttc
    .loc[:, ['stop_id', 'stop_code', 'stop_name', 'stop_lat', 'stop_lon']]
    .rename(columns={'stop_lat': 'latitude', 'stop_lon': 'longitude'})
)
ttc_stop.head()

Unnamed: 0,stop_id,stop_code,stop_name,latitude,longitude
0,262,662,Danforth Rd at Kennedy Rd,43.714379,-79.260939
1,263,929,Davenport Rd at Bedford Rd,43.674448,-79.399659
2,264,940,Davenport Rd at Dupont St,43.675511,-79.401938
3,265,1871,Davisville Ave at Cleveland St,43.702088,-79.378112
4,266,11700,Disco Rd at Attwell Dr,43.701362,-79.594843


In [None]:
# Build one shapely Point per stop, with longitude as x and latitude as y,
# and store it in a `geometry` column.
ttc_stop['geometry'] = [
    Point(lon_lat) for lon_lat in zip(ttc_stop.longitude, ttc_stop.latitude)
]
ttc_stop.head(3)


Unnamed: 0,stop_id,stop_code,stop_name,latitude,longitude,geometry
0,262,662,Danforth Rd at Kennedy Rd,43.714379,-79.260939,POINT (-79.26093900000001 43.714379)
1,263,929,Davenport Rd at Bedford Rd,43.674448,-79.399659,POINT (-79.399659 43.674448)
2,264,940,Davenport Rd at Dupont St,43.675511,-79.401938,POINT (-79.401938 43.675511)


In [None]:
# Reproject the stops from EPSG:4326 (WGS84 longitude/latitude) to EPSG:2019
# (MTM, NAD27) so that the geometry holds projected coordinates.
#
# The CRS is given as an "<authority>:<code>" string. The {'init': 'epsg:...'}
# dict form is deprecated by pyproj and emits a warning on every call.
# The points were built as (longitude, latitude), i.e. x/y order.
# NOTE(review): geopandas is expected to keep x/y order for the transform - confirm the projected output is unchanged.
crs = 'EPSG:4326'
ttc_stop = gpd.GeoDataFrame(ttc_stop, crs=crs, geometry='geometry')
ttc_stop = ttc_stop.to_crs('EPSG:2019')


'+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6


'+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6



In [None]:
# Copy the projected point coordinates into plain numeric columns; per the
# original note they are meant for a later nearest-stop-to-address lookup.
ttc_stop['x'] = ttc_stop['geometry'].x
ttc_stop['y'] = ttc_stop['geometry'].y
ttc_stop.head(3)

Unnamed: 0,stop_id,stop_code,stop_name,latitude,longitude,geometry,x,y
0,262,662,Danforth Rd at Kennedy Rd,43.714379,-79.260939,POINT (324064.544 4841414.808),324064.544438,4841415.0
1,263,929,Davenport Rd at Bedford Rd,43.674448,-79.399659,POINT (312891.267 4836955.833),312891.267226,4836956.0
2,264,940,Davenport Rd at Dupont St,43.675511,-79.401938,POINT (312707.354 4837073.706),312707.35433,4837074.0


In [None]:
# Read the incident CSV from the shared data folder and preview the first rows.
file_incident = pd.read_csv(f'{file_path}{file_incidents}')
file_incident.head()

Unnamed: 0,ACCNUM,ACCLASS,INVOLVED,MOTIVE,X,Y,LATITUDE,LONGITUDE,DATE_TIME,SEASON,YEAR,MONTH,MONTH_NAME,DAY_NAME,HOUR,HOUR_INTERVAL,ROAD_CLASS,ACCLOC,TRAFFCTL,VISIBILITY,LIGHT,RDSFCOND,IMPACTYPE,DISTRICT,NEIGHBOURHOOD,HOOD_ID,LOCCOORD,PEOPLE_COUNT,FATAL_INJURY_COUNT,MAJOR_INJURY_COUNT,MINOR_INJURY_COUNT,NO_INJURY_COUNT,NULL_INJURY_COUNT,DRIVERS_COUNT,PEDESTRIAN_COUNT,PASSENGER_COUNT,CYCLIST_COUNT,OTHER_INVTYPE_COUNT,NULL_INVTYPE_COUNT,PEOPLE_VEH_COUNT,AUTOMOBILE_VEH_COUNT,RECREATIONAL_VEH_COUNT,OTHER_VEH_COUNT,NULL_VEH_COUNT
0,25301,Non-Fatal Injury,"Pedestrian, Automobile",Not Recorded,-8836220.0,5420822.0,43.710967,-79.377116,2020-01-04 18:50:00,Winter,2020,1,January,Saturday,18,18:00 to 18:59,Major Arterial,Intersection Related,Traffic Signal,Rain,Dark,Wet,Pedestrian Collisions,North York,Leaside-Bennington (56),56,Intersection,2,0,1,0,1,0,1,1,0,0,0,0,0,1,0,0,0
1,26294,Fatal,"Pedestrian, Automobile",Not Recorded,-8836047.0,5412910.0,43.659568,-79.37556,2020-01-04 22:14:00,Winter,2020,1,January,Saturday,22,22:00 to 22:59,Major Arterial,Non Intersection,No Control,Clear,"Dark, artificial",Dry,Pedestrian Collisions,Toronto and East York,Moss Park (73),73,Mid-Block,2,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0
2,37330,Non-Fatal Injury,"Pedestrian, Automobile",Aggressive and Distracted Driving Collision,-8842513.0,5411391.0,43.649699,-79.43365,2020-01-06 15:55:00,Winter,2020,1,January,Monday,15,15:00 to 15:59,Minor Arterial,At Intersection,Traffic Signal,Clear,"Dawn, artificial",Dry,Pedestrian Collisions,Toronto East York,Little Portugal (84),84,Intersection,2,0,1,0,1,0,1,1,0,0,0,0,0,1,0,0,0
3,45664,Non-Fatal Injury,Automobile,Aggressive and Distracted Driving Collision,-8827355.0,5423072.0,43.725577,-79.297481,2020-01-07 18:50:00,Winter,2020,1,January,Tuesday,18,18:00 to 18:59,Major Arterial,At Intersection,Traffic Signal,Clear,Dark,Dry,Turning Movement,Scarborough,Clairlea-Birchmount (120),120,Intersection,3,0,1,1,1,0,2,0,1,0,0,0,0,2,0,0,0
4,56815,Non-Fatal Injury,Automobile,"Speeding Related Collision, Red Light Related...",-8858314.0,5419422.0,43.701876,-79.575588,2020-01-09 11:00:00,Winter,2020,1,January,Thursday,11,11:00 to 11:59,Major Arterial,At Intersection,Traffic Signal,Clear,Daylight,Dry,Angle,Etobicoke York,West Humber-Clairville (1),1,Intersection,4,0,1,0,3,0,4,0,0,0,0,0,0,3,0,1,0


In [None]:
# Keep only the accident class, fatality count and coordinates of each incident.
# .copy() makes `incident` an independent DataFrame: later cells add columns to
# it, and assigning into a bare column slice of `file_incident` raises pandas'
# SettingWithCopyWarning.
incident = file_incident[['ACCLASS', 'FATAL_INJURY_COUNT', 'LATITUDE', 'LONGITUDE']].copy()
incident.head(3)

Unnamed: 0,ACCLASS,FATAL_INJURY_COUNT,LATITUDE,LONGITUDE
0,Non-Fatal Injury,0,43.710967,-79.377116
1,Fatal,1,43.659568,-79.37556
2,Non-Fatal Injury,0,43.649699,-79.43365


In [None]:
# Add copies of the coordinates rounded to 3 decimal places (`*_3` columns) to
# both tables; the original coordinate columns are left unchanged.
ttc_stop['latitude_3'] = ttc_stop['latitude'].round(3)
ttc_stop['longitude_3'] = ttc_stop['longitude'].round(3)
incident['LATITUDE_3'] = incident['LATITUDE'].round(3)
incident['LONGITUDE_3'] = incident['LONGITUDE'].round(3)

In [None]:
# Preview the first three rows of the stops table.
ttc_stop.head(n=3)

Unnamed: 0,stop_id,stop_code,stop_name,latitude,longitude,geometry,x,y,latitude_3,longitude_3
0,262,662,Danforth Rd at Kennedy Rd,43.714379,-79.260939,POINT (324064.544 4841414.808),324064.544438,4841415.0,43.714,-79.261
1,263,929,Davenport Rd at Bedford Rd,43.674448,-79.399659,POINT (312891.267 4836955.833),312891.267226,4836956.0,43.674,-79.4
2,264,940,Davenport Rd at Dupont St,43.675511,-79.401938,POINT (312707.354 4837073.706),312707.35433,4837074.0,43.676,-79.402


In [None]:
# Preview the first three rows of the incident table.
incident.head(3)

Unnamed: 0,ACCLASS,FATAL_INJURY_COUNT,LATITUDE,LONGITUDE,LATITUDE_3,LONGITUDE_3
0,Non-Fatal Injury,0,43.710967,-79.377116,43.711,-79.377
1,Fatal,1,43.659568,-79.37556,43.66,-79.376
2,Non-Fatal Injury,0,43.649699,-79.43365,43.65,-79.434


In [None]:
# Build a "<lat>,<lon>" text key from the rounded coordinates on both tables;
# the key column is named `coordinator`.
ttc_stop['coordinator'] = (
    ttc_stop['latitude_3'].astype(str)
    .str.cat(ttc_stop['longitude_3'].astype(str), sep=",")
)
incident['coordinator'] = (
    incident['LATITUDE_3'].astype(str)
    .str.cat(incident['LONGITUDE_3'].astype(str), sep=",")
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Left-join incidents to stops on the rounded-coordinate key.
# An incident whose key matches several stops appears once per matching stop;
# an incident with no matching stop keeps NaN in the stop columns.
stop = ttc_stop[['stop_name', 'latitude', 'longitude', 'coordinator']]
incident = incident[
    ['ACCLASS', 'FATAL_INJURY_COUNT', 'LATITUDE', 'LONGITUDE', 'coordinator']
]
result = incident.merge(stop, how='left', on='coordinator')
result

Unnamed: 0,ACCLASS,FATAL_INJURY_COUNT,LATITUDE,LONGITUDE,coordinator,stop_name,latitude,longitude
0,Non-Fatal Injury,0,43.710967,-79.377116,"43.711,-79.377",Eglinton Ave East at Bayview Ave,43.711241,-79.376822
1,Fatal,1,43.659568,-79.375560,"43.66,-79.376",,,
2,Non-Fatal Injury,0,43.649699,-79.433650,"43.65,-79.434",Dundas St West at Sheridan Ave,43.649721,-79.433841
3,Non-Fatal Injury,0,43.649699,-79.433650,"43.65,-79.434",Dundas St West at Sheridan Ave,43.649825,-79.433524
4,Non-Fatal Injury,0,43.725577,-79.297481,"43.726,-79.297",Pharmacy Ave at Eglinton Ave East,43.725514,-79.297383
...,...,...,...,...,...,...,...,...
8962,Non-Fatal Injury,0,43.658438,-79.384853,"43.658,-79.385",,,
8963,Non-Fatal Injury,0,43.691519,-79.570796,"43.692,-79.571",Dixon Rd at Martin Grove Rd,43.691791,-79.570592
8964,Non-Fatal Injury,0,43.691519,-79.570796,"43.692,-79.571",Martin Grove Rd at Dixon Rd North Side,43.691959,-79.570844
8965,Non-Fatal Injury,0,43.691519,-79.570796,"43.692,-79.571",Martin Grove Rd at Dixon Rd,43.691857,-79.571186


In [None]:
# Drop every row that contains a missing value in any column; after the left
# join this removes the incidents that found no matching stop.
result = result.dropna(axis='index', how='any')
result

Unnamed: 0,ACCLASS,FATAL_INJURY_COUNT,LATITUDE,LONGITUDE,coordinator,stop_name,latitude,longitude
0,Non-Fatal Injury,0,43.710967,-79.377116,"43.711,-79.377",Eglinton Ave East at Bayview Ave,43.711241,-79.376822
2,Non-Fatal Injury,0,43.649699,-79.433650,"43.65,-79.434",Dundas St West at Sheridan Ave,43.649721,-79.433841
3,Non-Fatal Injury,0,43.649699,-79.433650,"43.65,-79.434",Dundas St West at Sheridan Ave,43.649825,-79.433524
4,Non-Fatal Injury,0,43.725577,-79.297481,"43.726,-79.297",Pharmacy Ave at Eglinton Ave East,43.725514,-79.297383
5,Non-Fatal Injury,0,43.725577,-79.297481,"43.726,-79.297",Eglinton Ave East at Pharmacy Ave East Side,43.725616,-79.296932
...,...,...,...,...,...,...,...,...
8960,Non-Fatal Injury,0,43.753241,-79.516650,"43.753,-79.517",2900 Jane St,43.752642,-79.516653
8963,Non-Fatal Injury,0,43.691519,-79.570796,"43.692,-79.571",Dixon Rd at Martin Grove Rd,43.691791,-79.570592
8964,Non-Fatal Injury,0,43.691519,-79.570796,"43.692,-79.571",Martin Grove Rd at Dixon Rd North Side,43.691959,-79.570844
8965,Non-Fatal Injury,0,43.691519,-79.570796,"43.692,-79.571",Martin Grove Rd at Dixon Rd,43.691857,-79.571186


In [None]:
# Print column dtypes and non-null counts for the joined incident/stop table.
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6278 entries, 0 to 8966
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ACCLASS             6278 non-null   object 
 1   FATAL_INJURY_COUNT  6278 non-null   int64  
 2   LATITUDE            6278 non-null   float64
 3   LONGITUDE           6278 non-null   float64
 4   coordinator         6278 non-null   object 
 5   stop_name           6278 non-null   object 
 6   latitude            6278 non-null   float64
 7   longitude           6278 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 441.4+ KB


In [None]:
# Interactive OpenStreetMap scatter of the joined rows, positioned at the
# matched stop's coordinates and coloured by accident class. The figure is
# displayed inline and also written to an HTML file in the data folder.
fig = px.scatter_mapbox(
    result,
    lat='latitude',
    lon='longitude',
    color='ACCLASS',
    hover_name='stop_name',
    title='Toronto Transit Locations',
    mapbox_style='open-street-map',
    center={'lat':43.73, 'lon':-79.4},
    zoom=10.15,
    opacity=0.5,
    width=1000,
    height=800,
)

fig.show()
fig.write_html(file_path+'Toronto Transit map.html')
