In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

## Parsing the Excel file

The first header's row only contains the timestamp starting from 00:00 with an increment of 15 minutes.

We can ignore it since we can re-calculate the value from the id of velocity and remove that header increase in the readability of the table.

In [2]:
df = pd.read_excel("data/Scats Data October 2006.xls", sheet_name="Data", header=[1])
df

Unnamed: 0,SCATS Number,Location,CD_MELWAY,NB_LATITUDE,NB_LONGITUDE,HF VicRoads Internal,VR Internal Stat,VR Internal Loc,NB_TYPE_SURVEY,Date,...,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95
0,970,WARRIGAL_RD N of HIGH STREET_RD,060 G10,-37.86703,145.09159,249,182,1,1,2006-10-01 00:15:00,...,114,97,97,66,81,50,59,47,29,34
1,970,WARRIGAL_RD N of HIGH STREET_RD,060 G10,-37.86703,145.09159,249,182,1,1,2006-10-02 00:15:00,...,111,102,107,114,80,60,62,48,44,26
2,970,WARRIGAL_RD N of HIGH STREET_RD,060 G10,-37.86703,145.09159,249,182,1,1,2006-10-03 00:15:00,...,130,132,114,86,93,90,73,57,29,40
3,970,WARRIGAL_RD N of HIGH STREET_RD,060 G10,-37.86703,145.09159,249,182,1,1,2006-10-04 00:15:00,...,115,113,132,101,113,90,78,66,52,44
4,970,WARRIGAL_RD N of HIGH STREET_RD,060 G10,-37.86703,145.09159,249,182,1,1,2006-10-05 00:15:00,...,171,120,116,113,99,91,61,55,49,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4187,4821,VICTORIA_ST W OF BURNLEY_ST,002HF02,-37.81296,145.00830,6673,1513,7,1,2006-10-27 00:15:00,...,122,121,127,103,122,124,117,99,108,88
4188,4821,VICTORIA_ST W OF BURNLEY_ST,002HF02,-37.81296,145.00830,6673,1513,7,1,2006-10-28 00:15:00,...,93,93,93,105,105,112,82,97,106,107
4189,4821,VICTORIA_ST W OF BURNLEY_ST,002HF02,-37.81296,145.00830,6673,1513,7,1,2006-10-29 00:15:00,...,87,118,83,76,66,64,77,60,49,45
4190,4821,VICTORIA_ST W OF BURNLEY_ST,002HF02,-37.81296,145.00830,6673,1513,7,1,2006-10-30 00:15:00,...,90,88,89,80,74,48,67,62,50,62


In [3]:
first_velo_col_pos = df.columns.get_loc("V00")
flow_group = np.char.mod("V%02d", np.arange(0, 96))
grouped = df.groupby(['NB_LATITUDE', 'NB_LONGITUDE'])[flow_group].apply(lambda x: x.values.tolist())

In [94]:
flow_data = grouped.values
flow_max = np.array(flow_data.max()).max()
flow_min = np.array(flow_data.min()).min()
def flow_scaler(x):
	return (x - flow_min) / (flow_max - flow_min)

def flow_rescaler(x):
	return x * (flow_max - flow_min) + flow_min
# # MinMaxScaler.fit(flow_data)
# flow_data

In [5]:
latlong_data = np.array(grouped.index.to_list())
latlong_data

array([[-37.8676   , 145.09146  ],
       [-37.86735  , 145.09195  ],
       [-37.86723  , 145.09103  ],
       [-37.86703  , 145.09159  ],
       [-37.86155  , 145.05751  ],
       [-37.86152  , 145.05851  ],
       [-37.8612671, 145.058038 ],
       [-37.86088  , 145.05744  ],
       [-37.85513  , 145.09376  ],
       [-37.85489  , 145.09413  ],
       [-37.85467  , 145.09384  ],
       [-37.85221  , 145.09425  ],
       [-37.85193  , 145.09463  ],
       [-37.85187  , 145.09407  ],
       [-37.8516827, 145.0943457],
       [-37.84741  , 145.05263  ],
       [-37.84726  , 145.05308  ],
       [-37.84714  , 145.05205  ],
       [-37.84686  , 145.04508  ],
       [-37.84683  , 145.05275  ],
       [-37.84632  , 145.04378  ],
       [-37.83799  , 145.09681  ],
       [-37.83777  , 145.06115  ],
       [-37.83756  , 145.06057  ],
       [-37.83755  , 145.09741  ],
       [-37.83741  , 145.09626  ],
       [-37.83738  , 145.06119  ],
       [-37.83725  , 145.06055  ],
       [-37.83695  ,

In [95]:

train = []

lags = 7

i = 0
for flow in grouped.values:
	flow = np.array(flow, dtype=float).flatten()
	flow = np.vectorize(flow_scaler)(flow)
	indices = np.arange(lags, len(flow))
	offset = np.arange(-lags, 1)
	flow = flow[indices[:, np.newaxis] + offset]
	latlong = np.tile(latlong_data[i], (len(flow), 1))
	combined_arr = np.hstack((latlong, flow))
	train.extend(combined_arr)
	i += 1
	# print(latlong.shape)
train = np.array(train)
train

array([[-3.78676000e+01,  1.45091460e+02,  1.44654088e-01, ...,
         7.86163522e-02,  7.38993711e-02,  5.66037736e-02],
       [-3.78676000e+01,  1.45091460e+02,  1.46226415e-01, ...,
         7.38993711e-02,  5.66037736e-02,  5.34591195e-02],
       [-3.78676000e+01,  1.45091460e+02,  1.41509434e-01, ...,
         5.66037736e-02,  5.34591195e-02,  5.66037736e-02],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  1.21069182e-01, ...,
         5.97484277e-02,  5.34591195e-02,  4.40251572e-02],
       [ 0.00000000e+00,  0.00000000e+00,  7.23270440e-02, ...,
         5.34591195e-02,  4.40251572e-02,  4.08805031e-02],
       [ 0.00000000e+00,  0.00000000e+00,  1.24213836e-01, ...,
         4.40251572e-02,  4.08805031e-02,  3.45911950e-02]])

## Getting the positon of the first collums of velocity

Return the array of arrays containing velocity from V00 to V95

In [8]:
X = train[:, :-1]
y = train[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size = .75)

(X_train, y_train, X_test, y_test)

(array([[-3.78240500e+01,  1.45033490e+02,  4.66981132e-01, ...,
          3.08176101e-01,  2.46855346e-01,  2.72012579e-01],
        [-3.78318600e+01,  1.45046680e+02,  1.68238994e-01, ...,
          9.11949686e-02,  1.13207547e-01,  1.22641509e-01],
        [-3.78145700e+01,  1.45021610e+02,  3.30188679e-01, ...,
          3.86792453e-01,  8.78930818e-01,  3.69496855e-01],
        ...,
        [-3.78254200e+01,  1.45043460e+02,  2.16981132e-01, ...,
          2.31132075e-01,  2.37421384e-01,  2.15408805e-01],
        [-3.78318600e+01,  1.45046680e+02,  2.21698113e-01, ...,
          2.16981132e-01,  2.10691824e-01,  2.35849057e-01],
        [-3.78088700e+01,  1.45027930e+02,  1.05345912e-01, ...,
          1.44654088e-01,  1.68238994e-01,  2.20125786e-01]]),
 array([0.23113208, 0.0927673 , 0.39465409, ..., 0.21855346, 0.22169811,
        0.21226415]),
 array([[-3.78215500e+01,  1.45015030e+02,  3.03459119e-01, ...,
          2.67295597e-01,  3.01886792e-01,  2.76729560e-01],
        