# Dynamic Heatmap Visualization of Big Geospatial Data

by Niklas Stoehr, 18/05/2018

twitterMap demo: http://cloudberry.ics.uci.edu/apps/twittermap

import dependencies

In [None]:
import numpy as np
from random import randint
import math
import pandas as pd
import re
import time
import datetime
from datetime import datetime
import timeit
import json
import requests
import matplotlib.pyplot as plt
import matplotlib as mpl
from math import *
from pyemd import emd

User specification: database server, search keyword and grid granularity (cell size),
followed by the measurements (west/east/north/south bounds) of the map.

Keywords to try: iphone, election, me, christmas, monday, coffee, zika, nyc, superbowl, rain

In [None]:
# --- user specification ------------------------------------------------------
# Host that answers the queries, the keyword searched for in the tweet text,
# and the edge length of one grid cell (same unit as the map bounds below).
server, keyword = "ipubmed2.ics.uci.edu", "coffee"
cellSize = 1.0

# --- map measurement ---------------------------------------------------------
# Bounds of the rendered map: west/east limit the x axis, north/south the y axis.
farwest, fareast = -135, -65
farnorth, farsouth = 60, 20

# --- derived grid dimensions -------------------------------------------------
# Extent of the map along each axis, truncated to whole units.
WestEastExtent = int(abs(farwest - fareast))
NorthSouthExtent = int(abs(farnorth - farsouth))

# Number of whole cells along each axis, and in the grid overall.
xCellLength = int(WestEastExtent / cellSize)
yCellLength = int(NorthSouthExtent / cellSize)
gridCells = xCellLength * yCellLength

## Decoupled Approach

query data and transmit

In [None]:
# Decoupled approach, step 1: ask the database for one location per matching
# tweet and transmit every point to this client.
#
# The elapsed time is taken with timeit.default_timer(). time.clock() reports
# processor time on Unix, which leaves out the time spent waiting for the
# server, and it no longer exists in Python 3.8+.
startTime1 = timeit.default_timer()

# NOTE(review): keyword is concatenated straight into the query text; that is
# fine for the hard-coded notebook setting but must not be fed untrusted input.
sqlQuery = (
    "use twitter;\n"
    "select get_points(place.bounding_box)[0] as geoPoint from ds_tweet"
    " where place.bounding_box is not unknown"
    " and ftcontains(text, ['" + str(keyword) + "'], {'mode':'any'});"
)
headers = {'Content-type': 'query/service'}

requestAnswer = requests.post('http://' + server + ':19002/query/service', headers=headers, data=sqlQuery)
jsonData1 = requestAnswer.json()

# number of result rows as reported by the server (one row per tweet)
tweets1 = jsonData1['metrics']['resultCount']

stopTime1 = timeit.default_timer()
queryTime1 = (stopTime1 - startTime1)

# single-argument print(...) behaves the same on Python 2 and Python 3
print(json.dumps(jsonData1, indent=4, sort_keys=True))

normalize data, build grid and visualize

In [None]:
# Decoupled approach, step 2: bin every transmitted point into the heatmap
# grid on the client and render the result.
startTime1 = timeit.default_timer()
error = 0  # points that fall outside the configured map extent

# grid1[row][column]; row 0 is the band at farnorth, column 0 the band at farwest
grid1 = np.zeros([yCellLength, xCellLength])

# fill grid
for tweet in jsonData1['results']:
    # geoPoint is read as [x, y] -- presumably [longitude, latitude]; TODO confirm
    pointX = tweet['geoPoint'][0]
    pointY = tweet['geoPoint'][1]

    # Signed cell offsets from the north-west corner. The offsets must keep
    # their sign: taking abs() would mirror points lying west of farwest or
    # north of farnorth back into the map instead of rejecting them.
    normalizeX = int(math.floor((pointX - farwest) / cellSize))
    normalizeY = int(math.floor((farnorth - pointY) / cellSize))

    # Explicit bounds check instead of catching IndexError: numpy accepts
    # negative indices and would silently wrap them to the opposite edge.
    if 0 <= normalizeX < xCellLength and 0 <= normalizeY < yCellLength:
        grid1[normalizeY][normalizeX] += 1
    else:
        error += 1

print(pd.DataFrame(grid1))

# smallest and largest cell count define the colour range
largestCNT = grid1.max()
smallestCNT = grid1.min()

# create colormap
cmap = 'hot'
norm = mpl.colors.Normalize(vmin=smallestCNT, vmax=largestCNT)
fig, ax = plt.subplots()

ax.imshow(grid1, cmap=cmap, norm=norm, interpolation='spline36')

print(yCellLength)

# draw gridlines
# NOTE(review): axis='none' is not one of the documented values
# ('both', 'x', 'y'); confirm it is accepted by the installed matplotlib.
ax.grid(which='major', axis='none', linestyle='', color='k', linewidth=1)
# roughly four ticks per axis; the step is kept >= 1 so that grids with fewer
# than four cells do not produce a zero step
ax.set_xticks(np.arange(0, xCellLength, max(1, xCellLength // 4)))
ax.set_yticks(np.arange(0, yCellLength, max(1, yCellLength // 4)))

stopTime1 = timeit.default_timer()
gridTime1 = (stopTime1 - startTime1)

print("\n\t\tdecoupled approach")
plt.show()

## Integrated Approach

query data and transmit

In [None]:
# Integrated approach, step 1: let the database aggregate the tweets into
# spatial cells and transmit only one (cell, count) row per non-empty cell.
#
# The elapsed time is taken with timeit.default_timer(). time.clock() reports
# processor time on Unix, which leaves out the time spent waiting for the
# server, and it no longer exists in Python 3.8+.
startTime2 = timeit.default_timer()

startDate = '2017-06-25'  # alternative: '2017-01-24'
endDate = '2017-06-30'  # alternative: '2017-09-09'

# NOTE(review): keyword and the dates are concatenated straight into the query
# text; fine for the hard-coded notebook settings, unsafe for untrusted input.
sqlQuery = (
    "use twitter;\n"
    "select spatial_cell(get_points(place.bounding_box)[0], create_point(0.0,0.0),"
    + str(cellSize) + "," + str(cellSize) + ") as cell, count(*) as cnt"
    " from ds_tweet_" + str(keyword)
    + " WHERE ftcontains(text, ['" + str(keyword) + "'], {'mode':'any'})"
    " AND place.bounding_box is not unknown"
    " AND create_at >= datetime('" + str(startDate) + "T00:00:00')"
    " AND create_at < datetime('" + str(endDate) + "T00:00:00')group by cell;"
)
headers = {'Content-type': 'query/service'}
requestAnswer = requests.post('http://' + server + ':19002/query/service', headers=headers, data=sqlQuery)
jsonData2 = requestAnswer.json()

# number of result rows as reported by the server (one row per non-empty cell)
resultLength2 = jsonData2['metrics']['resultCount']

# total number of tweets = sum of the per-cell counts
tweets2 = sum(cellResult['cnt'] for cellResult in jsonData2['results'])

stopTime2 = timeit.default_timer()
queryTime2 = (stopTime2 - startTime2)

# single-argument print(...) behaves the same on Python 2 and Python 3
print(json.dumps(jsonData2, indent=4, sort_keys=True))

normalize data, build grid and visualize

In [None]:
# Integrated approach, step 2: copy the per-cell counts delivered by the
# database into the heatmap grid and render the result.
startTime2 = timeit.default_timer()
error = 0  # cells that fall outside the configured map extent

# grid2[row][column]; row 0 is the band at farnorth, column 0 the band at farwest
grid2 = np.zeros([yCellLength, xCellLength])

# fill grid
for cellResult in jsonData2['results']:
    # cell[0] is read as the [x, y] reference corner of the aggregated cell --
    # presumably its lower-left corner; TODO confirm against spatial_cell output
    cornerX = cellResult['cell'][0][0]
    cornerY = cellResult['cell'][0][1]

    # Signed cell offsets from the north-west corner. One cellSize is taken
    # off the y offset so that the row is derived from the cell edge one
    # cellSize north of the reference corner. The offsets must keep their
    # sign: abs() would mirror cells lying west of farwest or north of
    # farnorth back into the map.
    normalizeX = int(math.floor((cornerX - farwest) / cellSize))
    normalizeY = int(math.floor((farnorth - cornerY - cellSize) / cellSize))

    # Explicit bounds check instead of catching IndexError: numpy accepts
    # negative indices, so a row of -1 (cell just north of the map) would
    # otherwise be added to the southernmost row.
    if 0 <= normalizeX < xCellLength and 0 <= normalizeY < yCellLength:
        grid2[normalizeY][normalizeX] += cellResult['cnt']
    else:
        error += 1

print(pd.DataFrame(grid2))

# smallest and largest cell count define the colour range
largestCNT = grid2.max()
smallestCNT = grid2.min()

print(largestCNT)

# create colormap
cmap = 'hot'

norm = mpl.colors.Normalize(vmin=smallestCNT, vmax=largestCNT)
fig, ax = plt.subplots()

ax.imshow(grid2, cmap=cmap, norm=norm, interpolation='spline36')

fig.canvas.set_window_title("grid")

# draw gridlines
# NOTE(review): axis='none' is not one of the documented values
# ('both', 'x', 'y'); confirm it is accepted by the installed matplotlib.
ax.grid(which='major', axis='none', linestyle='', color='k', linewidth=1)
# roughly four ticks per axis; the step is kept >= 1 so that grids with fewer
# than four cells do not produce a zero step
ax.set_xticks(np.arange(0, xCellLength, max(1, xCellLength // 4)))
ax.set_yticks(np.arange(0, yCellLength, max(1, yCellLength // 4)))

stopTime2 = timeit.default_timer()
gridTime2 = (stopTime2 - startTime2)

print("\n\t\tintegrated approach")
plt.show()

## Compare Time and Heat Map Results

In [None]:
print('similarity\n')


# interpolate tweets
# The two approaches return different numbers of tweets, so the integrated
# grid is rescaled to the tweet total of the decoupled one before comparing.
# With no tweets in the integrated result there is nothing to scale (grid2 is
# all zeros), so a factor of 0.0 is used instead of dividing by zero.
scaleFactorTweets = float(tweets1) / float(tweets2) if tweets2 else 0.0
grid2Scaled = np.multiply(grid2, scaleFactorTweets)

# flatten both grids to 1-D vectors for the distance measures
grid1Reshape = grid1.reshape(-1)
grid2ScaledReshape = grid2Scaled.reshape(-1)




# Euclidean
def euclidean_distance(x, y):
    """Return the Euclidean distance between the sequences x and y.

    Elements are paired with zip(), so surplus elements of the longer
    sequence are ignored; two empty sequences give 0.0.
    """
    squaredSum = 0
    for left, right in zip(x, y):
        squaredSum += math.pow(left - right, 2)
    return math.sqrt(squaredSum)

# Euclidean distance between the two flattened grids and a similarity score
# derived from it (neither value is printed below).
euclidean = euclidean_distance(grid1Reshape, grid2ScaledReshape)
similarity = (1 / (1 + euclidean)) * 10

# EMD (earth mover's distance)
# Ground distance between grid cells: 1 between any two different cells and
# 0 from a cell to itself. The diagonal has to be zero, otherwise leaving
# mass where it already is would cost as much as moving it elsewhere.
distance_matrixLength = xCellLength * yCellLength
distance_matrix = np.ones([distance_matrixLength, distance_matrixLength])
np.fill_diagonal(distance_matrix, 0)

# upper bound used to normalise the EMD: every tweet has to be moved
maximumDistance = tweets1
emdValue = emd(grid1Reshape, grid2ScaledReshape, distance_matrix)

# Similarity as a real percentage (0..100); undefined (nan) when there are no
# tweets to normalise by.
if maximumDistance:
    emdSimilarityPercent = (1 - (emdValue / maximumDistance)) * 100
else:
    emdSimilarityPercent = float('nan')

# Each print receives one pre-formatted string so that the output is the same
# on Python 2 and Python 3.
print('EMD:  %s' % emdValue)
print('EMD similiarity in percent:  %s %%' % emdSimilarityPercent)

print('\n\nnumber of tweets\n')

print('decoupled approach:  %s' % tweets1)
print('integrated approach:  %s' % tweets2)

print('\n\nelapsed time')

print('\ndecoupled approach:  %s  seconds' % (queryTime1 + gridTime1))
print('integrated approach:  %s  seconds' % (queryTime2 + gridTime2))
