In [27]:
"""

This iPython file processes training data we need to predict demand of 
ride-sharing business given certain time and location.

Part 1: We save each district in the city by its hash value and index. 
There are 66 districts.

Part 2: For each order, we save its order_id, driver_id, passenger_id, 
start_district, dest_district in hash values. We also have the corresponding 
price and time stamp.

Part 3: For each district, we have information about position of interest (POI).
These are count of various kinds of region in one district. For each key-value pair, 
we define key as a tuple representing the hierarchy of each region in this district, 
and the corresponding value would be the count of this region. 

"""

import numpy as np
import pandas as pd
import os
import glob as gb
import matplotlib.pyplot as plt
import csv
import re
%matplotlib inline

# 1: Import training cluster_map

In [2]:
path = 'data/training_data/cluster_map/cluster_map.txt'
cluster_map = {}

In [3]:
with open(path, 'r') as map_file:
    for line in map_file:
        hash_value, num = line.split()
        cluster_map[hash_value] = int(num)

In [4]:
df = pd.DataFrame.from_dict(cluster_map, orient="index")
df.to_csv('data/cluster_map.csv')

In [5]:
for hash_value in cluster_map:
    print(hash_value, cluster_map[hash_value])

('f47f35242ed40655814bc086d7514046', 53)
('38d5ad2d22b61109fd8e7b43cd0e8901', 24)
('08f5b445ec6b29deba62e6fd8b0325a6', 43)
('364bf755f9b270f0f9141d1a61de43ee', 21)
('49ac89aa860c27e26c0836cb8dab2df2', 60)
('8bb37d24db1ad665e706c2655d9c4c72', 34)
('8316146a6f78cc6d9f113f0390859417', 44)
('08232402614a9b48895cc3d0aeb0e9f2', 50)
('b702e920dcd2765e624dc1ce3a770512', 9)
('52d7b69796362a8ed1691a6cc02ddde4', 33)
('62afaf3288e236b389af9cfdc5206415', 48)
('bf44d327f0232325c6d5280926d7b37d', 64)
('445ff793ebd3477d4a2e0b36b2db9271', 55)
('b05379ac3f9b7d99370d443cfd5dcc28', 37)
('a814069db8d32f0fa6e188f41059c6e1', 17)
('c9f855e3e13480aad0af64b418e810c3', 45)
('0a5fef95db34383403d11cb6af937309', 63)
('ca064c2682ca48c6a21de012e87c0df5', 42)
('693a21b16653871bbd455403da5412b4', 39)
('f9280c5dab6910ed44e518248048b9fe', 41)
('825c426141df01d38c1b9e9c5330bdac', 30)
('a5609739c6b5c2719a3752327c5e33a7', 19)
('90c5a34f06ac86aee0fd70e2adce7d8a', 1)
('de092beab9305613aca8f79d7d7224e7', 61)
('3a43dcdff3c0b66b

# 2: Import training order_data

In [6]:
list_of_files = gb.glob('data/training_data/order_data/*.txt')

In [7]:
order_info = []
for f in list_of_files:
    order_info_single_day = pd.read_csv(f, delimiter = '\t', header = None)
    order_info.append(order_info_single_day)

In [8]:
order_info = pd.concat(order_info)

In [9]:
order_info[6] = pd.to_datetime(order_info[6])

In [10]:
order_info.columns = ['order_id', 'driver_id', 'passenger_id', \
                      'start_district_hash', 'dest_district_hash', 'price', 'time_stamp']
order_info.head()

Unnamed: 0,order_id,driver_id,passenger_id,start_district_hash,dest_district_hash,price,time_stamp
0,97ebd0c6680f7c0535dbfdead6e51b4b,dd65fa250fca2833a3a8c16d2cf0457c,ed180d7daf639d936f1aeae4f7fb482f,4725c39a5e5f4c188d382da3910b3f3f,3e12208dd0be281c92a6ab57d9a6fb32,24,2016-01-01 13:37:23
1,92c3ac9251cc9b5aab90b114a1e363be,c077e0297639edcb1df6189e8cda2c3d,191a180f0a262aff3267775c4fac8972,82cc4851f9e4faa4e54309f8bb73fd7c,b05379ac3f9b7d99370d443cfd5dcc28,2,2016-01-01 09:47:54
2,abeefc3e2aec952468e2fd42a1649640,86dbc1b68de435957c61b5a523854b69,7029e813bb3de8cc73a8615e2785070c,fff4e8465d1e12621bc361276b6217cf,fff4e8465d1e12621bc361276b6217cf,9,2016-01-01 18:24:02
3,cb31d0be64cda3cc66b46617bf49a05c,4fadfa6eeaa694742de036dddf02b0c4,21dc133ac68e4c07803d1c2f48988a83,4b7f6f4e2bf237b6cc58f57142bea5c0,4b7f6f4e2bf237b6cc58f57142bea5c0,11,2016-01-01 22:13:27
4,139d492189ae5a933122c098f63252b3,,26963cc76da2d8450d8f23fc357db987,fc34648599753c9e74ab238e9a4a07ad,87285a66236346350541b8815c5fae94,4,2016-01-01 17:00:06


In [11]:
order_info.to_csv('data/training_order_data.csv')

In [12]:
order_id = order_info['order_id']
driver_id = order_info['driver_id']
passenger_id = order_info['passenger_id']
start_district_hash = order_info['start_district_hash']
dest_district_hash = order_info['dest_district_hash']
price = order_info['price']
time_stamp = order_info['time_stamp']

In [13]:
order_id_num = len(order_id)
order_id_num_unique = len(order_id.unique())
print 'total order_id:', order_id_num
print 'unique order_id:', order_id_num_unique

total order_id: 823571
unique order_id: 819575


In [14]:
driver_id_null = pd.isnull(driver_id)
driver_id_num = len(driver_id)
driver_id_num_unique = len(driver_id.unique())
driver_id_num_null = len(driver_id[driver_id_null])
print 'total driver_id:', driver_id_num
print 'unique driver_id:', driver_id_num_unique
print 'null driver_id:', driver_id_num_null
print 'fraction of missed orders:', float(driver_id_num_null) / float(driver_id_num)
print 'fraction of received orders:', 1.0 - float(driver_id_num_null) / float(driver_id_num)

total driver_id: 823571
unique driver_id: 44228
null driver_id: 211336
fraction of missed orders: 0.256609326943
fraction of received orders: 0.743390673057


In [15]:
passenger_id_num = len(passenger_id)
passenger_id_num_unique = len(passenger_id.unique())
print 'total passenger_id:', passenger_id_num
print 'unique passenger_id:', passenger_id_num_unique
print 'orders per passenger:', float(passenger_id_num) / float(passenger_id_num_unique)

total passenger_id: 823571
unique passenger_id: 368899
orders per passenger: 2.23251079564


In [16]:
start_district_hash_num = len(start_district_hash)
start_district_hash_num_unique = len(start_district_hash.unique())
print 'total start_district_hash:', start_district_hash_num
print 'unique start_district_hash:', start_district_hash_num_unique

total start_district_hash: 823571
unique start_district_hash: 66


In [17]:
dest_district_hash_num = len(dest_district_hash)
dest_district_hash_num_unique = len(dest_district_hash.unique())
print 'total dest_district_hash:', dest_district_hash_num
print 'unique dest_district_hash:', dest_district_hash_num_unique

total dest_district_hash: 823571
unique dest_district_hash: 433


In [18]:
price_num = len(price)
price_num_unique = len(price.unique())
price_max = price.max()
price_min = price.min()
print 'total price:', price_num
print 'unique price:', price_num_unique
print 'max price:', price_max
print 'min price:', price_min

total price: 823571
unique price: 1297
max price: 499.0
min price: 0.0


In [19]:
time_stamp_num = len(time_stamp)
time_stamp_num_unique = len(time_stamp.unique())
print 'total time_stamp:', time_stamp_num
print 'unique time_stamp:', time_stamp_num_unique

total time_stamp: 823571
unique time_stamp: 154988


# 3: Import training poi_data

In [20]:
path = 'data/training_data/poi_data/poi_data.txt'

In [22]:
poi_map = {}
poi_info_list = []
with open(path, 'r') as poi_file:
    for line in poi_file:
        poi_list = line.split()
        hash_value = poi_list[0]
        poi_map[hash_value] = {}
        print 'hash_value:', hash_value
        for i in range(1, len(poi_list)):
            poi_info_list = re.split('#|:', poi_list[i])
            if (len(poi_info_list) == 2):
                [level, num] = poi_info_list
                poi_map[hash_value][(int(level), )] = int(num)
            elif(len(poi_info_list) == 3):
                [level, sublevel, num] = poi_info_list
                poi_map[hash_value][(int(level), int(sublevel))] = int(num)
            elif(len(poi_info_list) == 4):
                [level, sublevel1, sublevel2, num] = poi_info_list
                poi_map[hash_value][(int(level), int(sublevel1), int(sublevel2))] = int(num)
        print poi_map[hash_value]

hash_value: 74c1c25f4b283fa74a5514307b0d0278
{(23,): 913, (13, 4): 19173, (19, 4): 581, (20, 7): 11952, (1, 6): 83, (2, 5): 1079, (24,): 1245, (8, 5): 83, (15,): 6059, (11, 5): 2822, (16, 3): 332, (17, 2): 12201, (16, 9): 332, (16,): 7636, (19, 3): 27722, (7,): 6640, (20, 4): 11703, (1, 1): 415, (22, 3): 83, (4, 10): 4067, (3, 2): 166, (2, 6): 1245, (8, 2): 8798, (4, 5): 4814, (8,): 166, (4, 16): 664, (14, 2): 83, (25,): 3652, (3, 1): 3652, (2, 11): 1826, (17,): 2988, (14, 8): 83, (25, 5): 249, (2, 1): 83, (23, 3): 498, (4, 12): 83, (2, 12): 4648, (5, 1): 4731, (1, 11): 2241, (16, 10): 15521, (1,): 4482, (24, 3): 166, (1, 5): 12367, (2, 2): 9213, (1, 10): 249, (23, 6): 249, (4, 1): 664, (6, 4): 830, (5, 4): 83, (11, 4): 30544, (16, 4): 7968, (17, 1): 249, (14, 6): 1909, (19, 2): 1245, (15, 7): 1411, (20, 5): 5561, (25, 3): 581, (4, 11): 3569, (2, 7): 1826, (8, 3): 5229, (23, 5): 996, (4, 6): 913, (6, 1): 1826, (4, 17): 4150, (11, 3): 6640, (16, 1): 83, (14, 3): 1245, (19, 1): 1826, (19

In [23]:
np.save('data/training_poi_map.npy', poi_map)
# dictionary with multiple lengths of keys might be complicated to save in csv file
# we use npy file instead