In [105]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import zipfile
import json
import datetime
from datetime import datetime
import math
import os
import random
import tensorflow as tf
import collections

# Pre-Processing

In [106]:
# Read data
zf = zipfile.ZipFile('train.csv.zip')
df = pd.read_csv(zf.open('train.csv'), nrows = 10000, converters = {'POLYLINE': lambda x: json.loads(x)})
df = df.drop(['MISSING_DATA', 'TRIP_ID', 'DAY_TYPE'], 1)
print df.shape
df.head()

(10000, 6)


Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,POLYLINE
0,C,,,20000589,1372636858,"[[-8.618643, 41.141412], [-8.618499, 41.141376..."
1,B,,7.0,20000596,1372637303,"[[-8.639847, 41.159826], [-8.640351, 41.159871..."
2,C,,,20000320,1372636951,"[[-8.612964, 41.140359], [-8.613378, 41.14035]..."
3,C,,,20000520,1372636854,"[[-8.574678, 41.151951], [-8.574705, 41.151942..."
4,C,,,20000337,1372637091,"[[-8.645994, 41.18049], [-8.645949, 41.180517]..."


In [107]:
# Change unix timestamp to form "%m-%d-%H"
for i in range(0, df.shape[0]):
    if isinstance(df['TIMESTAMP'][i], int):
        df.TIMESTAMP = df.TIMESTAMP.replace(df['TIMESTAMP'][i],
                                            datetime.fromtimestamp(df['TIMESTAMP'][i]).strftime('%m-%d %H'))

df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,POLYLINE
0,C,,,20000589,07-01 07,"[[-8.618643, 41.141412], [-8.618499, 41.141376..."
1,B,,7.0,20000596,07-01 07,"[[-8.639847, 41.159826], [-8.640351, 41.159871..."
2,C,,,20000320,07-01 07,"[[-8.612964, 41.140359], [-8.613378, 41.14035]..."
3,C,,,20000520,07-01 07,"[[-8.574678, 41.151951], [-8.574705, 41.151942..."
4,C,,,20000337,07-01 07,"[[-8.645994, 41.18049], [-8.645949, 41.180517]..."


In [108]:
# Read meta stand data
zf = zipfile.ZipFile('metaData_taxistandsID.csv.zip')
meta_df = pd.read_csv(zf.open('metaData_taxistandsID_name_GPSlocation.csv'))
meta_df.head()

Unnamed: 0,ID,Descricao,Latitude,Longitude
0,1,Agra,41.1771457135,-8.60967
1,2,Alameda,41.15618964,-8.591064
2,3,Aldoar,41.1705249231,-8.665876
3,4,Alfândega,41.1437639911,-8.621803
4,5,Amial,41.1835097223,-8.612726


In [109]:
# Replace ORIGIN_STAND from a numeric to the name of a city.
df.ORIGIN_STAND = df.ORIGIN_STAND.replace(np.array(meta_df.ID), np.array(meta_df.Descricao))
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,POLYLINE
0,C,,,20000589,07-01 07,"[[-8.618643, 41.141412], [-8.618499, 41.141376..."
1,B,,Av. Boavista,20000596,07-01 07,"[[-8.639847, 41.159826], [-8.640351, 41.159871..."
2,C,,,20000320,07-01 07,"[[-8.612964, 41.140359], [-8.613378, 41.14035]..."
3,C,,,20000520,07-01 07,"[[-8.574678, 41.151951], [-8.574705, 41.151942..."
4,C,,,20000337,07-01 07,"[[-8.645994, 41.18049], [-8.645949, 41.180517]..."


In [110]:
# Replace POLYLINE from a GPS location to a grid chain
trajectory_list = list()
for cnt in range(0, df.shape[0]):
    trajectory = np.array(df['POLYLINE'][cnt])
    trajectory = np.ceil(trajectory*100)
    i = 0
    while i < len(trajectory)-1:
        if np.array_equal(trajectory[i], trajectory[i+1]):
            trajectory = np.delete(trajectory, i+1, 0)
        else:
            i = i + 1
    trajectory = np.int64(trajectory)
    trajectory_list.append(trajectory)

df = df.drop(['POLYLINE'], 1)
df['POLYLINE'] = trajectory_list
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,POLYLINE
0,C,,,20000589,07-01 07,"[[-861, 4115], [-862, 4115], [-863, 4115], [-8..."
1,B,,Av. Boavista,20000596,07-01 07,"[[-863, 4116], [-864, 4116], [-864, 4117], [-8..."
2,C,,,20000320,07-01 07,"[[-861, 4115], [-862, 4115], [-863, 4115], [-8..."
3,C,,,20000520,07-01 07,"[[-857, 4116], [-857, 4115], [-858, 4115], [-8..."
4,C,,,20000337,07-01 07,"[[-864, 4119], [-864, 4118], [-865, 4118], [-8..."


In [111]:
df.to_csv('new_train.csv', index=False)

In [112]:
words = list()
pos_list = list()
cnt = 0
for i in range(df.shape[0]):
    pos_list.append(cnt)
    for name in df.columns:
        if name != 'POLYLINE':
            words.append(str(df[name][i]))
            cnt = cnt + 1
        else:
            for li in df[name][i]:
                words.append(str(li))
                cnt = cnt + 1

print len(words)

133005


# CBOW
predicting the word given its context

In [113]:
vocabulary_size = 5000

def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
print(dictionary['A'])
print(dictionary['B'])
print(dictionary['C'])

('Most common words (+UNK)', [['UNK', 0], ('nan', 12576), ('B', 5157), ('[-861 4115]', 4322), ('[-860 4115]', 3105)])
('Sample data', [8, 1, 1, 449, 167, 3, 13, 30, 5, 9])
12
2
8


In [114]:
start_index = 0

def generate_cbow_batch(batch_size, context_size):
    batch_size = batch_size * context_size
    global start_index
  
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size / context_size, 1), dtype=np.int32)

    for i in range(batch_size / context_size):
        labels[i, 0] = data[pos_list[i+1]-1]
        for j in range(context_size):
            batch[i*context_size + j] = data[pos_list[i]+j]
            
    return batch, labels

b_size = 24
c_size = 6
batch, labels = generate_cbow_batch(batch_size=b_size, context_size=c_size)
for i in range(b_size / (c_size)):
    print "CONTEXT FOR %s:" % reverse_dictionary[labels[i,0]]
    for j in range(c_size):
        print reverse_dictionary[batch[i*(c_size) + j]]
    print

CONTEXT FOR [-863 4116]:
C
nan
nan
20000589
07-01 07
[-861 4115]

CONTEXT FOR [-866 4118]:
B
nan
Av. Boavista
20000596
07-01 07
[-863 4116]

CONTEXT FOR [-861 4115]:
C
nan
nan
20000320
07-01 07
[-861 4115]

CONTEXT FOR [-860 4115]:
C
nan
nan
20000520
07-01 07
[-857 4116]



In [115]:
### initialization of the graph ###

batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
context_window = 6 # How many words to consider left and right
# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. 
valid_size = 64 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(xrange(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

    # Input data.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size * context_window])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
    # Variables.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    segments = tf.constant([x / context_window for x in range(batch_size * context_window)])
    softmax_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    compressed_embeddings = tf.segment_sum(embed, segments) # merging couple of embeded words into one input
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, compressed_embeddings,
                                   train_labels, num_sampled, vocabulary_size))

    # Optimizer.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  
    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
    
    input = [dictionary['C'], dictionary['nan'], dictionary['Av. Boavista'], dictionary['20000320'],
             dictionary['07-01 11'], dictionary['[-861 4115]']] * batch_size
    logits = tf.matmul(compressed_embeddings, tf.transpose(softmax_weights)) + softmax_biases
    y_ = tf.nn.softmax(logits)

In [118]:
num_steps = 7000

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_cbow_batch(
        batch_size, context_window)
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # note that this is expensive (~20% slowdown if computed every 500 steps)
    
    final_embeddings = normalized_embeddings.eval()
    # feeds the algorithm a context and gets the most probable autocompletion/autocorrection
    feed_dict = {train_dataset : input}
    _, y = session.run([logits, y_], feed_dict=feed_dict)
 
    top_k = 4 # number of nearest neighbors
    for i in xrange(1):
        print [reverse_dictionary[x] for x in (-y[i, :]).argsort()[1:top_k+1]]

Initialized
Average loss at step 0: 8.135115
Average loss at step 2000: 4.175498
Average loss at step 4000: 0.127605
Average loss at step 6000: 0.083688
['[-871 4126]', '[-854 4113]', '[-791 4063]', '[-861 4116]']


In [134]:
s = '[12 34]'
int(s.split()[0].split('[')[1])

12