In [None]:
import numpy as np
import csv
import datetime
import time

In [None]:
def addElement(array, index, ele):
    if array and index < len(array):
        array[index] += ele
        return array
    newArray = [0] * (index + 1)
    if array:
        newArray[:len(array)] = array
    newArray[index] = ele
    return newArray

In [None]:
trafficRecordsById = {}
startTime = 1433142000

with open("new.csv") as f:
    trafficRecords = csv.reader(f, delimiter=',')
    count = 0
    for trafficRecord in trafficRecords:
        if count % 100000 == 0:
            print (count / 24015249 * 100, "%")
#         if count == 30000:
#             break
        recordTime = trafficRecord[1]
        checkPointId = trafficRecord[2]
        lane = int(trafficRecord[3])
        traffic = int(trafficRecord[4])

        relativeTime = time.mktime(datetime.datetime.strptime(recordTime, "%Y-%m-%d %H:%M:%S").timetuple()) - startTime
#         print (relativeTime)
        intervalId = relativeTime // 300
#         print (intervalId)
        
        trafficRecordsForId = trafficRecordsById.get(checkPointId)
        if not trafficRecordsForId:
            trafficByLane = addElement(None, lane, traffic)
            trafficRecordsById[checkPointId] = [(intervalId, trafficByLane)]
        else:
            prevIntervalId = trafficRecordsForId[-1][0]
            prevTrafficByLane = trafficRecordsForId[-1][1]
            if (intervalId == prevIntervalId):
                trafficRecordsForId[-1] = (intervalId, addElement(prevTrafficByLane, lane, traffic))
            else:
                trafficRecordsForId.append((intervalId, addElement(None, lane, traffic)))
        count += 1
                

import pickle

pickle.dump(trafficRecordsById, open("trafficRecordsById.p", "wb"))
    

In [1]:
import pickle
trafficRecordsByIdLoaded = pickle.load(open("trafficRecordsById.p", "rb"))

In [None]:
def checkMissingSpots(checkPointId):
    trafficRecordsForId = trafficRecordsByIdLoaded.get(checkPointId)
    prevIndex = -1
    for trafficRecord in trafficRecordsForId:
        intervalId = trafficRecord[0]
        if intervalId != prevIndex + 1:
            print (prevIndex, intervalId)
        prevIndex = intervalId

In [None]:
import matplotlib.pyplot as plt
import bisect
def plotTrafficDistribution(checkPointId, timeRange=(None, None), plot=True):
    trafficRecordsForId = trafficRecordsByIdLoaded.get(checkPointId)
    if trafficRecordsForId is None:
        return
    keys = [ele[0] for ele in trafficRecordsForId]
    
    startIntervalId = timeRange[0]
    endIntervalId = timeRange[1]

    startIndex = 0
    endIndex = len(trafficRecordsForId)
    
    if startIntervalId is not None:
        startIndex = bisect.bisect_left(keys, startIntervalId)
    
    if endIntervalId is not None:
        endIndex = bisect.bisect_right(keys, endIntervalId)
        
    if plot:
        x, y = zip(*((ele[0], sum(ele[1])) for ele in trafficRecordsForId[startIndex: endIndex]))

        plt.plot(x, y)
        plt.show()
        
    return trafficRecordsForId[startIndex: endIndex]
    
plotTrafficDistribution("704", (0, 576))    
    

In [None]:
checkMissingSpots("704")

In [4]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

def mape(y_true, y_pred):
    yTrueFiltered = []
    yPredFiltered = []
    
    for index, trueValue in enumerate(y_true):
        if trueValue < 5:
            continue
        yTrueFiltered.append(trueValue)
        yPredFiltered.append(y_pred[index])
        
    yTrueFiltered = np.array(yTrueFiltered)
    yPredFiltered = np.array(yPredFiltered)
    return np.mean(np.abs((yTrueFiltered - yPredFiltered) / yTrueFiltered) * 100)

def intervalToRecords(trafficRecordsForId):
    intervalToRecordsMap = {}
    for trafficRecord in trafficRecordsForId:
        intervalToRecordsMap[trafficRecord[0]] = trafficRecord[1]
    return intervalToRecordsMap

def createDatasetSinglePoint(checkPointId):
    trafficRecordsForId = trafficRecordsByIdLoaded.get(checkPointId)
    intervalToRecordsMap = intervalToRecords(trafficRecordsForId)
    X = []
    y = []

    for trafficRecord in trafficRecordsForId:
        intervalId = trafficRecord[0]
        timeInterval = intervalId % (288 * 7)
        traffic1 = sum(intervalToRecordsMap.get(intervalId - 1)) if intervalToRecordsMap.get(intervalId - 1) is not None else None
        traffic2 = sum(intervalToRecordsMap.get(intervalId - 2)) if intervalToRecordsMap.get(intervalId - 2) is not None else None
        traffic3 = sum(intervalToRecordsMap.get(intervalId - 3)) if intervalToRecordsMap.get(intervalId - 3) is not None else None
        traffic4 = sum(intervalToRecordsMap.get(intervalId - 4)) if intervalToRecordsMap.get(intervalId - 4) is not None else None
        traffic288 = sum(intervalToRecordsMap.get(intervalId - 288)) if intervalToRecordsMap.get(intervalId - 288) is not None else None
        
        trafficDelta1 = (traffic1 - traffic2) if None not in (traffic1, traffic2) else None 
        trafficDelta2 = (traffic2 - traffic3) if None not in (traffic2, traffic3) else None
        
#         print (traffic1, traffic2, traffic3)
        if None in (traffic1, traffic2, traffic3, traffic4, traffic288, trafficDelta1, trafficDelta2):
            continue
        y.append(sum(trafficRecord[1]))
        X.append([
            timeInterval,
            traffic1,
            traffic2,
            traffic3,
            traffic4,
            traffic288,
            trafficDelta1,
            trafficDelta2])
    return X, y

def createDatasetWithSurroundingPoints(checkPointId):
    trafficRecordsForId = trafficRecordsByIdLoaded.get(checkPointId)
    intervalToRecordsMap = intervalToRecords(trafficRecordsForId)
    
    intervalToRecordsMapsSurroundings = list(map(intervalToRecords, map(trafficRecordsByIdLoaded.get, [checkPointId, "707", "405"])))
#     print (len(intervalToRecordsMapsSurroundings))
    X = []
    y = []
    for trafficRecord in trafficRecordsForId:
        intervalId = trafficRecord[0]
        timeInterval = intervalId % 288
        toAppend = [timeInterval]
        for intervalToRecordsMapsSurrounding in intervalToRecordsMapsSurroundings:
            traffic1 = sum(intervalToRecordsMapsSurrounding.get(intervalId - 1)) if intervalToRecordsMapsSurrounding.get(intervalId - 1) is not None else None
            traffic2 = sum(intervalToRecordsMapsSurrounding.get(intervalId - 2)) if intervalToRecordsMapsSurrounding.get(intervalId - 2) is not None else None
            traffic3 = sum(intervalToRecordsMapsSurrounding.get(intervalId - 3)) if intervalToRecordsMapsSurrounding.get(intervalId - 3) is not None else None
            traffic4 = sum(intervalToRecordsMapsSurrounding.get(intervalId - 4)) if intervalToRecordsMapsSurrounding.get(intervalId - 4) is not None else None
            traffic288 = sum(intervalToRecordsMapsSurrounding.get(intervalId - 288)) if intervalToRecordsMapsSurrounding.get(intervalId - 288) is not None else None
            
            trafficDelta1 = (traffic1 - traffic2) if None not in (traffic1, traffic2) else None 
            trafficDelta2 = (traffic2 - traffic3) if None not in (traffic2, traffic3) else None
            
            toAppend.extend([traffic1, traffic2, traffic3, traffic4, traffic288, trafficDelta1, trafficDelta2])
        if None in toAppend:
            continue
        y.append(sum(trafficRecord[1]))
#         print (toAppend)
        X.append(toAppend)
    return X, y

def gbm(checkPointId):
    trafficRecordsForId = trafficRecordsByIdLoaded.get(checkPointId)
    X, y = createDatasetSinglePoint(checkPointId)
    splitPoint = len(trafficRecordsForId) // 10 * 8
    X_train, X_test = X[:splitPoint], X[splitPoint:]
    y_train, y_test = y[:splitPoint], y[splitPoint:]
    est = GradientBoostingRegressor(n_estimators=300, learning_rate=0.02,
        max_depth=5, random_state=0, loss='huber').fit(X_train, y_train)
    print (mape(y_test, est.predict(X_test)))
    print (mean_squared_error(y_test, est.predict(X_test)))
    
gbm("704")


13.1911240257
211.074833496


In [None]:
import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


def lstm(checkPointId):
    trafficRecordsForId = trafficRecordsByIdLoaded.get(checkPointId)
    numpy.random.seed(7)

    X, y = createDataset(checkPointId)
    X = np.array(X)
    y = np.array(y)
    y = np.reshape(y, (-1, 1))
#     print (y)
    
    dataSet = np.concatenate((X, y), axis=1)

    scaler = MinMaxScaler(feature_range=(0, 1))
    dataSet = scaler.fit_transform(dataSet)

    X, y = np.hsplit(dataSet, np.array([7]))
    
    splitPoint = len(trafficRecordsForId) // 10 * 9
    X_train, X_test = X[:splitPoint], X[splitPoint:]
    y_train, y_test = y[:splitPoint], y[splitPoint:]
    
    X_train = numpy.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
    X_test = numpy.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

#     print (X_train.shape)
    model = Sequential()
    model.add(LSTM(128, input_shape=(40, 3)))
#     model.add(Dense(1))
#     model.compile(loss='mean_squared_error', optimizer='adam')
#     model.fit(X_train, y_train, epochs=100, batch_size=1, verbose=2)
    
    print (mape(y_test, model.predict(X_test)))

lstm("704") 