### Creating Description2Vector for Geohashes

This notebook creates a description-to-vector representation for each geohash location R.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime,timedelta
import pytz
import pygeohash as gh
from haversine import haversine
import time
import cPickle
import glob
import json
import re

# Precision handed to `gh.encode` when event coordinates are mapped to geohashes.
geohash_prec = 5

In [2]:
# Bounding box per city. The filter cell indexes each list as
# [min latitude, max latitude, min longitude, max longitude].
cities = {
    'LosAngeles': [33.700615, 34.353627, -118.683511, -118.074559],
    'Houston': [29.497907, 30.129003, -95.797178, -94.988191],
    'Austin': [30.079327, 30.596764, -97.968881, -97.504838],
    'Dallas': [32.559567, 33.083278, -97.036586, -96.428928],
    'Charlotte': [34.970168, 35.423667, -81.060925, -80.622687],
    'Atlanta': [33.612410, 33.916999, -84.575600, -84.231911],
    'Miami': [25.664776, 25.942874, -80.386562, -80.118637],
}

# Time zone name per city (not referenced by the cells shown in this notebook).
time_zones = {
    'Houston': 'US/Central',
    'Charlotte': 'US/Eastern',
    'Miami': 'US/Eastern',
    'Dallas': 'US/Central',
    'Atlanta': 'US/Eastern',
    'Austin': 'US/Central',
    'LosAngeles': 'US/Pacific',
}

# Time interval (2017-05-01 through 2018-05-31, i.e. 13 months) used to generate the
# description-to-vector representation for each geographical region (or geohash).
# The event filter below uses `start` as the lower bound and `end` as the upper bound;
# `finish` and `begin` are not used by the cells shown here.
start = datetime(2017, 5, 1)
finish = datetime(2018, 5, 31)

begin, end = (
    datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
    for stamp in ('2017-05-01 00:00:00', '2018-05-31 23:59:59')
)

### 1: Load Past Traffic Events Data

In [3]:
# Latest version of the LSTW dataset; download it from https://smoosavi.org/datasets/lstw
lstw_csv_path = 'TrafficWeatherEvent_Aug16_June19_Publish.csv'
mq = pd.read_csv(lstw_csv_path)
# Preview the first rows as the cell output.
mq.head()

In [4]:
# Parse the event timestamps so they can be compared with the datetime bounds used by
# the per-city filter. `errors='coerce'` turns unparseable values into NaT (which never
# satisfy a date-range comparison) instead of silently leaving the whole column as
# strings, which would make the later `>= start` / `< end` comparisons invalid.
for time_col in ('StartTime(UTC)', 'EndTime(UTC)'):
    mq[time_col] = pd.to_datetime(mq[time_col], errors='coerce')

In [5]:
# For every city, keep the rows whose 'Source' is 'T', whose start time lies in
# [start, end), and whose coordinates fall strictly inside the city's bounding box,
# then write that subset to a per-city CSV consumed by the embedding step.
for c, (lat_min, lat_max, lng_min, lng_max) in cities.items():
    from_source_t = mq['Source'] == 'T'
    in_time_window = (mq['StartTime(UTC)'] >= start) & (mq['StartTime(UTC)'] < end)
    in_lat_range = (mq['LocationLat'] > lat_min) & (mq['LocationLat'] < lat_max)
    in_lng_range = (mq['LocationLng'] > lng_min) & (mq['LocationLng'] < lng_max)

    subset_all = mq[from_source_t & in_time_window & in_lat_range & in_lng_range]

    out_path = 'data/temporary_for_nlp/MQ_{}_20170501_20180531.csv'.format(c)
    subset_all.to_csv(out_path, index=False)
    

### 2: Load GloVe Word Embedding Vectors

In [6]:
# Map each GloVe token to its embedding (a list of floats).
# Assumes the GloVe model file was already downloaded; it is available from
# https://nlp.stanford.edu/projects/glove/
word2vec = {}
with open('data/glove.6B.100d.txt', 'r') as reader:
    for line in reader:
        # Each line is "<token> <float> <float> ..." separated by single spaces.
        parts = line.replace('\r', '').replace('\n', '').split(' ')
        word2vec[parts[0]] = [float(value) for value in parts[1:]]

# Parenthesised single-argument form prints the same text on Python 2 and Python 3.
print('loaded {} word vectors!'.format(len(word2vec)))


In [7]:
def return_desc2vec(input, embeddings=None):
    """Average the word vectors of the known tokens in a text description.

    The text is split on ' - ', spaces and the punctuation characters
    . \\ / ; , & ! ? ( ) [ ] { }; tokens are lower-cased, and empty tokens or
    tokens missing from the embedding table are ignored.

    Parameters
    ----------
    input : str
        The description to embed. (The name shadows the builtin but is kept
        so existing callers are unaffected.)
    embeddings : dict, optional
        Mapping from token to vector. Defaults to the notebook-level
        `word2vec` table.

    Returns
    -------
    numpy.ndarray or numpy.float64
        Element-wise mean of the matched vectors. When no token matches, the
        description is printed and NaN is returned, so callers should check
        for a non-finite result before aggregating.
    """
    if embeddings is None:
        embeddings = word2vec

    # Raw string: same pattern as before, without invalid string escapes.
    tokens = re.split(r' - | |\.|\\|/|;|,|&|!|\?|\(|\)|\[|\]|\{|\}', input)
    lowered = (token.lower() for token in tokens)
    vectors = [embeddings[token] for token in lowered if token and token in embeddings]

    if not vectors:
        # Surface descriptions that could not be embedded at all.
        print(input)
        # Explicit NaN instead of np.mean([]), which yields NaN plus a RuntimeWarning.
        return np.float64('nan')
    return np.mean(vectors, axis=0)

### 3: Use Traffic Event Data to Create Embedding Vector

In [8]:
# Load the valid geohashes: data is only generated for regions/geohashes that have
# POI data, i.e. that appear in the first column of the POI vector file.
valid_geohashes = set()
with open('data/geohash_to_poi_vec.csv', 'r') as poi_file:
    # Any line containing 'Geohash' (the header) is skipped.
    valid_geohashes.update(
        row.split(',')[0] for row in poi_file if 'Geohash' not in row
    )

In [9]:
# geohash -> list of description vectors, one per traffic event located in that geohash.
geo_to_vec = {}
start_timestamp = time.time()

for city in cities:

    # add map-quest data
    # Columns are selected by position: 9 and 10 are passed to gh.encode as
    # latitude/longitude and 17 is the text handed to return_desc2vec
    # (NOTE(review): positions follow the exported CSV layout; verify if the schema changes).
    # A real CSV parser is used because pandas quotes fields that contain commas,
    # which a plain split(',') would cut apart. Reading as str with
    # keep_default_na=False keeps empty fields as '' rather than NaN.
    city_events = pd.read_csv(
        'data/temporary_for_nlp/MQ_{}_20170501_20180531.csv'.format(city),
        usecols=[9, 10, 17], dtype=str, keep_default_na=False)

    for lat, lng, description in city_events.values:
        start_gh = gh.encode(float(lat), float(lng), precision=geohash_prec)
        if start_gh not in valid_geohashes:
            continue

        desc_vec = return_desc2vec(description)
        # return_desc2vec yields NaN when no token of the description is known;
        # skip those so they cannot turn the per-geohash mean into NaN.
        if not np.all(np.isfinite(desc_vec)):
            continue

        geo_to_vec.setdefault(start_gh, []).append(desc_vec)

    print('Done with {} in {:.1f} sec!'.format(city, time.time() - start_timestamp))
    start_timestamp = time.time()
            

### 4: Create and Dump Textual Feature Vector for each Geohash

In [10]:
# Write one line per geohash: "<geohash>,<space-separated mean description vector>".
# The `with` block closes (and flushes) the file even if a write fails part-way.
with open('data/geohash_to_text_vec.csv', 'w') as writer:
    writer.write('Geohash,vec\n')

    for g in geo_to_vec:
        # Element-wise mean over all description vectors collected for this geohash.
        vec = np.mean(geo_to_vec[g], axis=0)
        writer.write(g + ',' + ' '.join(str(value) for value in vec) + '\n')