In [None]:
import pandas as pd
import numpy as np
import glob
from matplotlib import pyplot as plt
import time
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [None]:
# Load every CSV export under data/ into a single DataFrame and time the load.
#
# The file list is sorted so the concatenated row order (and everything
# downstream that depends on it, e.g. "first row per gantry") is the same on
# every run; glob.glob on its own returns matches in arbitrary order.
t1=time.time()
all_files = sorted(glob.glob("data/*.csv"))
if not all_files:
    # Fail here with a clear message instead of letting pd.concat complain
    # about an empty list further down.
    raise FileNotFoundError("No CSV files found matching 'data/*.csv'")

li = []
for filename in all_files:
    print(filename)
    # The nested list asks pandas to merge the Date and IntervalStart columns
    # into one parsed column, which the rest of the notebook reads as
    # "Date_IntervalStart".
    # NOTE(review): combining columns via parse_dates is reported as deprecated
    # in recent pandas releases -- confirm against the installed version.
    li.append(
        pd.read_csv(filename, index_col=None, header=0,
                    parse_dates=[['Date', 'IntervalStart']])
    )

# ignore_index=True gives the combined frame a fresh 0..N-1 row index.
df = pd.concat(li, axis=0, ignore_index=True)
t2=time.time()
print("Time to Load:", t2-t1)

Let's look at some of the data, just to get a feel for it.

What are we working with?

Is it a timeseries? Spatial? XYZ? Labels? Classes? Bit of everything? 

In [None]:
# Show the first few rows of the combined frame to see which columns we have.
df.head()

In [None]:
# Display pandas' summary statistics for the combined frame.
df.describe()

In [None]:
# Grand total of the TotalVolume column, rendered with thousands separators.
f"Yikes! That is {df['TotalVolume'].sum():,} total trips on sydney Toll roads (not including M5) since 2009"

Let's plot all the 36,447,919 rows of data. Quite a few different ways to plot this....

In [None]:
# First look: traffic volume against time, one tiny dot per row.
# Good for spotting the busy periods at a glance, although with this many
# points most of the fine detail is overprinted.
plt.figure(figsize=(12, 8))
plt.gca().plot(
    df["Date_IntervalStart"],
    df["TotalVolume"].values,
    c="black",
    marker=".",
    ms=0.2,
    lw=0.0,  # markers only, no connecting line
)

plt.gca().set_xlabel("Date", fontsize=12)
plt.gca().set_ylabel("Traffic Volume")
plt.xticks(rotation=15)

plt.show()

There are some NaNs in the data, so let's just ditch them.

In [None]:
# Discard every row that has a missing value in any column, then report
# the size of what is left.
df = df.dropna(axis=0, how="any")
print("Dataframe shape: ", df.shape)

There are also some weird point locations that are apparently at lat/lon 0,0. I don't believe it, so ditch them too.

In [None]:
# Plot longitude against row position so any zero-valued entries are visible.
plt.plot(df.GantryGPSLongitude.values)
plt.show()

# Keep only the rows whose longitude is non-zero.
df = df.loc[df["GantryGPSLongitude"] != 0]
print("Dataframe shape: ", df.shape)

# To double-check the result, re-plot the filtered longitudes:
# plt.plot(df["GantryGPSLongitude"].values)
# plt.show()

What is a useful way to represent the data?

We have a bunch of toll points, and for each one a time series representing how busy it is.

In [None]:
# Collect the distinct toll point (gantry) names.
gantryUnique = df.GantryLocation.unique()

In [None]:
# Total traffic volume and coordinates for each toll point.
#
# gantSums has one row per entry of gantryUnique (same order) and three
# columns: [latitude, longitude, total volume].
#
# A single groupby pass replaces the previous per-gantry loop, which built the
# same boolean mask over the whole frame twice for every gantry. sort=False
# keeps groups in order of first appearance, and the reindex pins the row
# order to gantryUnique explicitly. "first" takes the first non-null
# coordinate of each gantry, which matches the old "row at first valid index"
# lookup once NaN rows have been dropped.
gantSums = np.asarray(
    df.groupby("GantryLocation", sort=False)
    .agg({
        "GantryGPSLatitude": "first",
        "GantryGPSLongitude": "first",
        "TotalVolume": "sum",
    })
    .reindex(gantryUnique)[["GantryGPSLatitude", "GantryGPSLongitude", "TotalVolume"]],
    dtype=float,
)

# Keep the index -> gantry name listing so rows of gantSums can be identified.
for i, gant in enumerate(gantryUnique):
    print(i, gant)

print(gantSums)

In [None]:
# Figure and main axes
fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(111)
ax.set_title("Sydney Toll Road Gantry Use", fontsize=24)

# Base map: an OpenStreetMap image exported with Mobile Atlas Creator
# (.png plus an OziExplorer .map file, which is where the extent comes from).
# Extent order is (lon min, lon max, lat min, lat max).
fname = "Sydney.png"
img = plt.imread(fname)
img_extent = (150.732422, 151.347313, -34.015957, -33.651208)
ax.imshow(img, origin="upper", extent=img_extent, zorder=0)

# One marker per toll point, coloured by its total volume.
# gantSums columns: 0 = latitude, 1 = longitude, 2 = total volume.
tolls = ax.scatter(
    gantSums[:, 1],
    gantSums[:, 0],
    c=gantSums[:, 2],
    s=160,
    cmap="inferno_r",
    edgecolors="k",
)

# Colour bar in a thin strip attached to the right-hand side of the map.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="2%", pad=0.05)
cbar = fig.colorbar(tolls, cax=cax)
cbar.set_label("Number of Cars", fontsize=20)
# Optional: fix the colour range, e.g. tolls.set_clim(0, 2000000)

# Replace the default ticks with a couple of round values.
ax.set_xticks([150.8, 151.3])
ax.set_yticks([-34.0, -33.7])

# Axis labels
ax.set_xlabel("Longitude", fontsize=20)
ax.set_ylabel("Latitude", fontsize=20)

plt.show()

In [None]:
# Order the rows by their index labels (ascending).
# NOTE(review): this sorts on the row index, not on Date_IntervalStart --
# confirm that is what was intended.
dfsort = df.sort_index(axis=0, ascending=True)

In [None]:
# Restrict to calendar year 2018 using a half-open window
# [2018-01-01 00:00:00, 2019-01-01 00:00:00).
# The lower bound is inclusive so the interval that starts exactly at midnight
# on 1 January 2018 is counted; a strict ">" silently dropped it.
dfshort = dfsort[
    (dfsort["Date_IntervalStart"] >= "2018-01-01 00:00:00")
    & (dfsort["Date_IntervalStart"] < "2019-01-01 00:00:00")
]

In [None]:
# Distinct toll point (gantry) names present in the 2018 subset.
gantryUnique = dfshort["GantryLocation"].unique()

# Total traffic volume and coordinates for each toll point in the subset.
#
# gantSums has one row per entry of gantryUnique (same order) and three
# columns: [latitude, longitude, total volume].
#
# A single groupby pass replaces the previous per-gantry loop, which built the
# same boolean mask over the whole frame twice for every gantry. sort=False
# keeps groups in order of first appearance, and the reindex pins the row
# order to gantryUnique explicitly. "first" takes the first non-null
# coordinate of each gantry, which matches the old "row at first valid index"
# lookup once NaN rows have been dropped.
gantSums = np.asarray(
    dfshort.groupby("GantryLocation", sort=False)
    .agg({
        "GantryGPSLatitude": "first",
        "GantryGPSLongitude": "first",
        "TotalVolume": "sum",
    })
    .reindex(gantryUnique)[["GantryGPSLatitude", "GantryGPSLongitude", "TotalVolume"]],
    dtype=float,
)

# Keep the index -> gantry name listing so rows of gantSums can be identified.
for i, gant in enumerate(gantryUnique):
    print(i, gant)

print(gantSums)
# Grand total across all toll points in the subset.
print(np.sum(gantSums[:,2]))

In [None]:
# Figure and main axes
fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(111)
ax.set_title("Sydney Toll Road Gantry Use for 2018", fontsize=24)

# Base map image exported with Mobile Atlas Creator (.png plus an OziExplorer
# .map file, which is where the extent comes from).
# Extent order is (lon min, lon max, lat min, lat max).
fname = "Sydney.png"
img = plt.imread(fname)
img_extent = (150.732422, 151.347313, -34.015957, -33.651208)
ax.imshow(img, origin="upper", extent=img_extent, zorder=0)

# One marker per toll point, coloured by its total volume.
# gantSums columns: 0 = latitude, 1 = longitude, 2 = total volume.
tolls = ax.scatter(
    gantSums[:, 1],
    gantSums[:, 0],
    c=gantSums[:, 2],
    s=160,
    cmap="inferno_r",
    edgecolors="k",
)

# Colour bar in a thin strip attached to the right-hand side of the map.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="2%", pad=0.05)
cbar = fig.colorbar(tolls, cax=cax)
cbar.set_label("Number of Cars", fontsize=20)
# Optional: fix the colour range, e.g. tolls.set_clim(0, 2000000)

# Replace the default ticks with a couple of round values.
ax.set_xticks([150.8, 151.3])
ax.set_yticks([-34.0, -33.7])

# Axis labels
ax.set_xlabel("Longitude", fontsize=20)
ax.set_ylabel("Latitude", fontsize=20)

plt.show()

### Some other things I started with...

In [None]:
# 2-D histogram of the row coordinates: how many rows fall in each cell of a
# 100 x 100 longitude/latitude grid.
plt.figure(figsize=(12, 8))

x = df.GantryGPSLongitude.values
y = df.GantryGPSLatitude.values

heatmap, xedges, yedges = np.histogram2d(x, y, bins=(100, 100))

# Image bounds: outermost bin edges along each axis.
extent = [*xedges[[0, -1]], *yedges[[0, -1]]]

plt.clf()
# histogram2d puts x along the first array axis, so transpose for imshow,
# which draws the first axis vertically.
plt.imshow(heatmap.T, origin="lower", extent=extent, cmap="inferno")
plt.show()

In [None]:
# Mask the empty cells (count == 0) of the histogram.
heatmap0 = np.ma.masked_where(np.equal(heatmap, 0), heatmap)

In [None]:
# Flatten the 2-D histogram into three parallel 1-D arrays (lon, lat, use)
# so each bin can be drawn as one scatter point.
#
# xedges come from the longitude values and yedges from the latitude values
# passed to np.histogram2d, so longitudes are derived from xedges and
# latitudes from yedges. Each point sits at its bin centre (midpoint of
# adjacent edges); a linspace over the edges with a hard-coded count would
# drift by up to half a bin and break if the bin count changed.
lons = 0.5 * (xedges[:-1] + xedges[1:])
lats = 0.5 * (yedges[:-1] + yedges[1:])

# indexing="ij" makes xv[i, j] == lons[i] and yv[i, j] == lats[j], the same
# layout as the histogram (first axis = x bins, second axis = y bins), so the
# flattened arrays below line up element for element.
xv, yv = np.meshgrid(lons, lats, indexing="ij")

lon = xv.flatten()
lat = yv.flatten()
use = heatmap0.flatten()

In [None]:
# Draw each histogram bin as a point coloured by its (masked) count.
plt.scatter(x=lon, y=lat, c=use)

In [None]:
# Masked heatmap drawn over a fixed, hand-entered bounding box.
plt.figure(figsize=(12, 8))

# Bounds are (lon min, lon max, lat min, lat max).
# NOTE(review): these hard-coded values are used instead of the histogram's
# own edge range -- confirm they are the intended bounds.
extent = (150.644531, 151.347656, -34.089061, -33.651208)

plt.clf()
plt.imshow(heatmap0.T, origin="lower", extent=extent, cmap="inferno")
plt.show()