In [2]:
import pandas as pd
import os
import numpy as np
import pandas_profiling

# This line is needed to display plots inline in Jupyter Notebook
%matplotlib inline

# Required for basic python plotting functionality
import matplotlib.pyplot as plt

# Required for formatting dates later in the case
import datetime
import matplotlib.dates as mdates

# Required to display image inline
from IPython.display import Image

# Advanced plotting functionality with seaborn
import seaborn as sns
sns.set(style="whitegrid") # can set style depending on how you'd like it to look

import folium  #needed for interactive map
from folium.plugins import HeatMap

import geopandas
from shapely.geometry import Polygon
from shapely.geometry import Point
from geopandas import GeoDataFrame

In [4]:
path = 'DataSet/Original'
file = 'mta_trips.csv'

mta_trips = pd.read_csv(path + '/' + file, sep = ',', doublequote = True)
print(mta_trips.shape)

  interactivity=interactivity, compiler=compiler, result=result)


(7554197, 10)


In [3]:
mta_trips.head()

Unnamed: 0,station,line_name,division,audit_type,unit_id,datetime,new_entries,new_exits,latitude,longitude
0,WHITEHALL ST,R1,BMT,REGULAR,R001_A058_01-00-00,03/29/2014 05:00:00,4,6,40.703087,-74.012994
1,WHITEHALL ST,R1,BMT,REGULAR,R001_A058_01-00-00,03/29/2014 09:00:00,1,13,40.703087,-74.012994
2,WHITEHALL ST,R1,BMT,REGULAR,R001_A058_01-00-00,03/29/2014 13:00:00,1,8,40.703087,-74.012994
3,WHITEHALL ST,R1,BMT,REGULAR,R001_A058_01-00-00,03/29/2014 17:00:00,1,8,40.703087,-74.012994
4,WHITEHALL ST,R1,BMT,REGULAR,R001_A058_01-00-00,03/29/2014 21:00:00,2,3,40.703087,-74.012994


In [4]:
mta_trips['datetime'] = pd.to_datetime(mta_trips['datetime'], format = '%m/%d/%Y %H:%M:%S', errors = 'coerce')

In [6]:
mta_trips['line_name'] = mta_trips['line_name'].astype('str')
mta_trips.dtypes
mta_trips.to_parquet(path + '/Parquet/mta_trips_filtered.parquet', engine = 'pyarrow')

In [7]:
# Opens the map as a GeoDataFrame
path = 'C:/Users/johns/Documents/DS4A_workspace/Datathon/Dataset'
nyc_bouroughs = geopandas.read_file(path + '/Neighborhood Tabulation Areas.geojson')

In [8]:
geometry = [Point(xy) for xy in zip(mta_trips.longitude, mta_trips.latitude)]
# df = df.drop(['Lon', 'Lat'], axis=1)
crs = {'init': 'epsg:4326'}
gdf = GeoDataFrame(mta_trips, crs=crs, geometry=geometry)

In [9]:
print(gdf.shape)
gdf_boroughs = geopandas.sjoin(gdf, nyc_bouroughs, how="inner", op='intersects')
print(gdf.shape)

(7554197, 11)
(7554197, 11)


In [10]:
gdf_final = gdf_boroughs.drop(['geometry'], axis = 1)

In [11]:
gdf_final['line_name'] = gdf_final['line_name'].astype('str')
gdf_final.dtypes

station                object
line_name              object
division               object
audit_type             object
unit_id                object
datetime       datetime64[ns]
new_entries             int64
new_exits               int64
latitude              float64
longitude             float64
index_right             int64
ntacode                object
shape_area             object
county_fips            object
ntaname                object
shape_leng             object
boro_name              object
boro_code              object
dtype: object

In [12]:
gdf_final.to_parquet(path + '/Parquet/mta_trips_nta.parquet', engine = 'pyarrow')

In [13]:
mta_trips = pd.read_parquet(path + '/Parquet/mta_trips_nta.parquet', engine = 'pyarrow')

In [14]:
max_amount = float(mta_trips['new_entries'].max())

folium_hmap = folium.Map(location=[40.738, -73.98],
                        zoom_start=13,
                        tiles="OpenStreetMap")

hm_wide = HeatMap( list(zip(mta_trips['latitude'], mta_trips['longitude'], 
                            mta_trips['new_entries'])),
                   min_opacity=0.2,
                   max_val=max_amount,
                   radius=8, blur=6, 
                   max_zoom=15, 
                 )

folium_hmap.add_child(hm_wide);
folium_hmap.save(path + "/maps/heatmap_mta_trips.html")

In [15]:
folium_map = folium.Map(location=[40.738, -73.98],
                        zoom_start=13,
                        tiles="OpenStreetMap")

mta_trips = mta_trips.reset_index()

for i in range(0,1000):
    marker = folium.CircleMarker(location=[mta_trips["latitude"][i],
                                           mta_trips["longitude"][i]],
                                 radius=5,color="b",fill=True)
    marker.add_to(folium_map)

folium_map.save(path + "/maps/markmap_mta_trips.html")