In [2]:
import numpy as np
import pandas as pd
import random, copy, os
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from multiprocessing import Pool
import multiprocessing
%run algorithm_functions.ipynb

from cvxopt.base import matrix as m
from cvxopt import solvers
from cvxopt.modeling import op, dot, variable, max, min

## Prepare Data

In [703]:
# Load the trip records CSV (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('./data/tripData2013/trip_data_1_filtered.csv')
df.head()

Unnamed: 0,medallion,hack_license,pickup_datetime,dropoff_datetime,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,driver_race,driver_gender,requests_gender,requests_race,pickup_lat_bin,pickup_long_bin,dropoff_lat_bin,dropoff_long_bin
0,89D227B655E5C82AECF13C3F540D4CF4,BA96DE419E711691B9445D6A6307C170,2013-01-01 15:11:48,2013-01-01 15:18:10,382,1.0,-73.978165,40.757977,-73.989838,40.751171,black,m,f,white,7,20,7,19
1,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,2013-01-06 00:18:35,2013-01-06 00:22:54,259,1.5,-74.006683,40.731781,-73.994499,40.75066,black,m,f,white,6,19,7,19
2,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,2013-01-05 18:49:41,2013-01-05 18:54:23,282,1.1,-74.004707,40.73777,-74.009834,40.726002,black,m,f,white,6,19,6,19
3,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,2013-01-07 23:54:15,2013-01-07 23:58:20,244,0.7,-73.974602,40.759945,-73.984734,40.759388,black,m,m,white,7,20,7,19
4,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,2013-01-07 23:25:03,2013-01-07 23:34:24,560,2.1,-73.97625,40.748528,-74.002586,40.747868,black,m,m,white,6,20,6,19


In [704]:
# Parse both timestamp columns so later cells can use the `.dt` accessor on them
# (e.g. `.dt.hour` / `.dt.day` when selecting the peak hour).
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

In [705]:
s = MinMaxScaler() # scaling distances such that all values are [0,1]
# reshape(-1, 1) presents the column to the scaler as a single-feature 2-D array.
# NOTE: 'trip_distance' is overwritten in place, so every later cell sees the scaled
# value rather than the raw value read from the CSV.
df['trip_distance'] = s.fit_transform(df['trip_distance'].values.reshape(-1, 1))

In [706]:
# Bounding box over all pickup and dropoff coordinates.
# The import cell rebinds the bare names `min` / `max` to cvxopt.modeling's versions,
# so pandas reductions are used here instead; the result no longer depends on how
# those overloads treat plain floats. Missing values are skipped, matching what
# np.min / np.max do when handed a Series.
min_latitude = df[['pickup_latitude', 'dropoff_latitude']].min().min()
max_latitude = df[['pickup_latitude', 'dropoff_latitude']].max().max()
min_longitude = df[['pickup_longitude', 'dropoff_longitude']].min().min()
max_longitude = df[['pickup_longitude', 'dropoff_longitude']].max().max()

In [707]:
# Display the coordinate bounds computed in the previous cell.
min_longitude, max_longitude, min_latitude, max_latitude

(-74.98333000000002, -73.011703, 40.400002, 40.999813)

In [708]:
# Columns whose combination of values identifies a request type
# (used as groupby keys and for row matching in later cells).
requests_features = ['pickup_lat_bin', 'pickup_long_bin', 'dropoff_lat_bin', 'dropoff_long_bin', 
                     'requests_gender', 'requests_race']
# Columns whose combination of values identifies a driver type.
driver_features = ['driver_race', 'driver_gender', 'pickup_lat_bin', 'pickup_long_bin']

###### Total number of drivers and types of drivers

In [709]:
# Ride count per distinct driver_features combination; its length is the number of driver types.
all_driver_types = df.groupby(driver_features)['driver_gender'].agg('count')
# Individual drivers are counted as distinct hack_license values.
print ("Total number of drivers: {}, total driver types: {}".format(len(set(list(df['hack_license']))), 
                                                                    len(all_driver_types)))

Total number of drivers: 32092, total driver types: 716


###### Total number and types of requests

In [710]:
# Ride count per request type: one group per distinct requests_features combination.
# The sum of the group sizes is reported next to len(df) so the two can be compared.
all_request_types = df.groupby(requests_features)['requests_gender'].agg('count')
print ("Total requests: {}, types of requests: {}, total rides: {}".format(
    all_request_types.sum(), len(all_request_types), len(df)))

Total requests: 14486242, types of requests: 8676, total rides: 14486242


In [711]:
# Hour of day and day of month that select the "peak hour" slice in the next cell
# (compared against pickup_datetime's .dt.hour and .dt.day respectively).
peak_hour = 19
peak_day = 31

In [712]:
# Rides whose pickup falls in hour `peak_hour` on day-of-month `peak_day`.
peak_hour_entries = df[(df['pickup_datetime'].dt.hour == peak_hour)
                       & (df['pickup_datetime'].dt.day == peak_day)]
peak_hour_entries.head()

Unnamed: 0,medallion,hack_license,pickup_datetime,dropoff_datetime,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,driver_race,driver_gender,requests_gender,requests_race,pickup_lat_bin,pickup_long_bin,dropoff_lat_bin,dropoff_long_bin
7853828,6B53BBA9567F90BD8A4130FEE57F703A,A21EF85E38B13F676E69C40A0127654B,2013-01-31 19:21:00,2013-01-31 19:32:00,660,0.044,-74.004677,40.711876,-73.972374,40.742809,white,m,m,white,6,19,6,20
7873451,E565EF3C83240654D540CBAA0D7C96EE,4269094FA5A341993B6707B5E1200AAA,2013-01-31 19:20:55,2013-01-31 19:49:11,1694,0.059,-73.976387,40.748592,-73.965439,40.678673,black,m,f,white,6,20,5,20
7877649,79584E2DA4A76EA45C482EA2B1C1B220,3B1803457E6CD3936B2C32CB856AD272,2013-01-31 19:17:16,2013-01-31 19:26:00,523,0.012,-73.983322,40.750206,-73.971024,40.762066,black,m,m,white,7,20,7,20
7879766,171BF6AD2C32D927735A7A5ADB516512,6CB8F868A9A20D3FAB2B3307C841C00B,2013-01-31 19:54:32,2013-01-31 20:14:15,1183,0.11,-73.872978,40.774025,-73.975807,40.676292,black,m,m,white,7,22,5,20
7880711,FB30B64440B4A7B8DBA9903C7598AD90,A4897540F0711AE1BC2C76A348883100,2013-01-31 19:56:28,2013-01-31 20:06:14,586,0.011,-73.977638,40.7589,-73.990395,40.76712,black,m,m,black,7,20,7,19


In [713]:
# Number of rides in the peak-hour slice.
len(peak_hour_entries)

35109

In [714]:
# All rides not picked up on day-of-month `peak_day`. The filter is on the day only,
# so the whole peak day is excluded, not just the peak hour.
historical_data = df[df['pickup_datetime'].dt.day != peak_day]
print (len(historical_data))
historical_data.head()

13965626


Unnamed: 0,medallion,hack_license,pickup_datetime,dropoff_datetime,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,driver_race,driver_gender,requests_gender,requests_race,pickup_lat_bin,pickup_long_bin,dropoff_lat_bin,dropoff_long_bin
0,89D227B655E5C82AECF13C3F540D4CF4,BA96DE419E711691B9445D6A6307C170,2013-01-01 15:11:48,2013-01-01 15:18:10,382,0.01,-73.978165,40.757977,-73.989838,40.751171,black,m,f,white,7,20,7,19
1,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,2013-01-06 00:18:35,2013-01-06 00:22:54,259,0.015,-74.006683,40.731781,-73.994499,40.75066,black,m,f,white,6,19,7,19
2,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,2013-01-05 18:49:41,2013-01-05 18:54:23,282,0.011,-74.004707,40.73777,-74.009834,40.726002,black,m,f,white,6,19,6,19
3,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,2013-01-07 23:54:15,2013-01-07 23:58:20,244,0.007,-73.974602,40.759945,-73.984734,40.759388,black,m,m,white,7,20,7,19
4,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,2013-01-07 23:25:03,2013-01-07 23:34:24,560,0.021,-73.97625,40.748528,-74.002586,40.747868,black,m,m,white,6,20,6,19


## Get Peak Hour Driver and Requests

##### Number of drivers in peak hour

In [715]:
# Distinct driver_features combinations among the peak-hour rides
# (the groupby index holds one tuple per driver type).
peak_hour_driver_types = list(peak_hour_entries.groupby(driver_features)['hack_license'].agg('count').index)
# "Total Drivers" counts distinct hack_license values in the peak-hour slice.
print ("Driver Types: {}, Total Drivers: {}".format(len(peak_hour_driver_types), 
                                                    len(set(list(peak_hour_entries['hack_license'])))))
# NOTE(review): the disabled check below refers to `peak_hour_drivers`, which no visible cell defines.
# assert len(set(list(peak_hour_drivers.values))) == len(peak_hour_drivers)

Driver Types: 82, Total Drivers: 10814


###### Number of request types in the peak hour and number of requests

In [716]:
# Distinct requests_features combinations among the peak-hour rides
# (the groupby index holds one tuple per request type).
peak_hour_requests = list(peak_hour_entries.groupby(requests_features)['requests_gender'].agg('count').index)
# Every peak-hour ride is counted as one request.
print ("Request Types: {}, Total requests: {}".format(len(peak_hour_requests), 
                                                      len(peak_hour_entries)))

Request Types: 800, Total requests: 35109


In [13]:
def calculate_probability(request, driver, kappa=0.5):
    """Return the probability assigned to a (request, driver) pair.

    A race-dependent coefficient is chosen (0.3 when both sides are black,
    0.6 when both are white, 0.1 otherwise) and then mapped into
    ``[kappa, 1]`` through ``kappa + (1 - kappa) * coefficient``.

    Race labels are accepted either spelled out ('black' / 'white') or as the
    single-letter codes ('b' / 'w'). The original comparison expected the
    request to use the full word and the driver the single letter, so a driver
    carrying the full label silently fell into the 0.1 branch.

    Args:
        request: object exposing a ``race`` attribute.
        driver: object exposing a ``race`` attribute.
        kappa: lower bound of the returned probability.

    Returns:
        The probability, a float in ``[kappa, 1]``.

    Raises:
        AssertionError: if the result falls outside ``[kappa, 1]``
            (for example when ``kappa`` is greater than 1).
    """
    race_aliases = {'b': 'black', 'black': 'black', 'w': 'white', 'white': 'white'}
    # Unknown labels are passed through unchanged and end up in the default branch.
    request_race = race_aliases.get(request.race, request.race)
    driver_race = race_aliases.get(driver.race, driver.race)

    if request_race == 'black' and driver_race == 'black':
        request_driver_coeff = 0.3
    elif request_race == 'white' and driver_race == 'white':
        request_driver_coeff = 0.6
    else:
        request_driver_coeff = 0.1

    # A distance-weighted variant (0.6 * request.distance + 0.4 * coefficient) was
    # disabled in the notebook; only the race coefficient is used.
    p = kappa + (1 - kappa) * request_driver_coeff
    assert p <= 1 and p >= kappa
    return p

###### Subsample drivers for peak hour: assign up to k (=10) drivers for each sampled driver type (types with fewer than 10 peak-hour entries are skipped)

In [717]:
def get_drivers(driver_quota):
    """Build the list of drivers (set U) from a seeded subsample of peak-hour driver types.

    Steps:
      1. Sample half of ``peak_hour_driver_types``, then keep at most 20 of those.
      2. For each kept type, find the matching rows in ``peak_hour_entries``;
         types with fewer than 10 matching rows are skipped.
      3. Sample up to ``k`` hack_license values from those rows and create one
         ``Driver`` per value, skipping instances already in the list.
      4. Assign each new driver a random race code and call ``set_unique_ids``.

    Both Python's ``random`` module and the NumPy generator used for the race
    assignment are seeded, so repeated calls give the same drivers with the
    same race codes. (Previously the race draw used the unseeded global NumPy
    RNG and differed between calls.)

    Args:
        driver_quota: value forwarded to every ``Driver`` as ``quota``.

    Returns:
        list of ``Driver`` instances.
    """
    subsample_fraction = 0.5
    k = 10                      # maximum number of ids drawn per driver type
    max_driver_types = 20       # cap on the number of driver types kept
    min_entries_per_type = 10   # driver types with fewer matching rows are skipped

    random.seed(42)
    # random.seed() does not seed NumPy, so the race draw gets its own seeded
    # generator; this also leaves the global NumPy RNG state untouched.
    race_rng = np.random.RandomState(42)

    sampled_peak_hour_drivers = random.sample(
        peak_hour_driver_types,
        k=int(subsample_fraction * len(peak_hour_driver_types)))

    # Further downsampling to at most `max_driver_types` driver types. The size is
    # capped because random.sample raises ValueError when asked for more items
    # than the population holds.
    sampled_peak_hour_drivers = random.sample(
        sampled_peak_hour_drivers,
        k=min(max_driver_types, len(sampled_peak_hour_drivers)))

    print ("Chosen driver types: {}".format(len(sampled_peak_hour_drivers)))

    drivers = [] # Equivalent to set U
    for driver_type in sampled_peak_hour_drivers:
        # Rows whose driver_features columns all equal this driver type.
        mask = (peak_hour_entries[driver_features] == driver_type).all(axis=1).values
        n_matching = np.count_nonzero(mask)

        if n_matching < min_entries_per_type:
            continue
        print (n_matching)

        # The id list can contain the same hack_license several times (one entry
        # per ride); duplicates are filtered by the `not in drivers` check below.
        candidate_ids = list(peak_hour_entries[mask]['hack_license'])
        sampled_ids = random.sample(candidate_ids, k=min(k, n_matching))
        for d_id in sampled_ids:
            driver_instance = Driver(**dict(zip(driver_features, driver_type)), d_id=d_id, quota=driver_quota)
            if driver_instance not in drivers:
                # `[0]` keeps only the first letter of the drawn label ('b' / 'w');
                # calculate_probability in this notebook matches on these codes.
                driver_instance.race = race_rng.choice(['black', 'white'], p=[0.75, 0.25])[0]
                print (driver_instance.race)
                drivers.append(driver_instance)

    set_unique_ids(drivers)

    print ("Total number of drivers: {}, Size of set U: {}".format(len(drivers), len(set(drivers))))
    return drivers

Chosen driver types: 10
44
123
Total number of drivers: 19, Size of set U: 19


###### Subsample requests for the peak hour: multiply arrival rate of requests by k from each request type

In [718]:
def count_total_requests(all_entries, sampled_request_types):
    """Count the rows of ``all_entries`` that belong to any of the given request types.

    Args:
        all_entries: DataFrame containing the ``requests_features`` columns.
        sampled_request_types: iterable of value tuples, ordered like
            ``requests_features``.

    Returns:
        int: sum, over the request types, of the number of rows whose
        ``requests_features`` values all equal that type.
    """
    feature_frame = all_entries[requests_features]

    def _rows_matching(request_vals):
        # Column-wise equality against the type, AND-ed across all feature columns.
        hits = feature_frame == request_vals
        combined = None
        for column in requests_features:
            column_hits = np.array(hits[column])
            combined = column_hits if combined is None else combined & column_hits
        return np.count_nonzero(combined)

    return sum(_rows_matching(request_vals) for request_vals in sampled_request_types)

In [719]:
def get_requests():
    """Build the list of request types (set V) from a seeded 10% sample of peak-hour request types.

    Each sampled type becomes one ``Request`` whose ``arrival_rate`` is the
    number of matching peak-hour rows and whose ``distance`` is the mean
    'trip_distance' of those rows. ``set_unique_ids`` is then applied to the list.

    Returns:
        tuple: ``(requests, total)`` where ``total`` is the sum of the
        ``arrival_rate`` values of the created requests.
    """
    subsample_fraction = 0.1
    random.seed(42)
    n_sampled_types = int(subsample_fraction * len(peak_hour_requests))
    sampled_request_types = random.sample(peak_hour_requests, k=n_sampled_types)
    print ("Chosen types of requests: {}".format(len(sampled_request_types)))

    total_sampled_requests = count_total_requests(peak_hour_entries, sampled_request_types)
    if subsample_fraction == 1:
        # Sampling every type must account for every peak-hour row.
        assert total_sampled_requests == len(peak_hour_entries)
    print ("Total Sampled Requests: {}".format(total_sampled_requests))

    def _make_request(request_vals):
        # Rows whose requests_features columns all equal this request type.
        hits = peak_hour_entries[requests_features] == request_vals
        row_mask = None
        for column in requests_features:
            column_hits = np.array(hits[column])
            row_mask = column_hits if row_mask is None else row_mask & column_hits
        # Mean of the (already scaled) trip_distance column for this type.
        return Request(**dict(zip(requests_features, request_vals)),
                       arrival_rate=float(np.count_nonzero(row_mask)),
                       distance=np.mean(peak_hour_entries[row_mask]['trip_distance']))

    requests = [_make_request(request_vals) for request_vals in sampled_request_types] # Equivalent to set V

    set_unique_ids(requests)

    total_arrival_rate = np.sum([r.arrival_rate for r in requests])
    print ("Initial Number of Requests: {}, size of set V: {}".format(
        total_arrival_rate, len(set(requests))))
    return requests, total_arrival_rate

Chosen types of requests: 80
Total Sampled Requests: 2731
Number of Requests: 2731.0, size of set V: 80


In [720]:
def edge_existence_bool(driver, request):
    """Tell whether an edge exists between ``driver`` and ``request``.

    An edge exists when the driver's latitude and longitude both equal the
    request's start latitude and start longitude.

    Returns:
        bool: True if both coordinates match, False otherwise.
    """
    # `and` short-circuits: the longitude is only compared when the latitude matches.
    return bool(driver.latitude == request.start_latitude
                and driver.longitude == request.start_longitude)

def draw_probability_edges(drivers, requests):
    """Build the driver-by-request probability matrix.

    Args:
        drivers: iterable of drivers; each ``u_id`` is used as a row index.
        requests: iterable of requests; each ``u_id`` is used as a column index.

    Returns:
        Float array of shape ``(len(drivers), len(requests))``. An entry holds
        ``calculate_probability(request, driver)`` where
        ``edge_existence_bool`` reports an edge, and -1 everywhere else.
    """
    # -1 marks "no edge".
    edge_matrix = np.full((len(drivers), len(requests)), -1.0)
    for driver in drivers:
        for request in requests:
            if not edge_existence_bool(driver, request):
                continue
            edge_matrix[driver.u_id, request.u_id] = calculate_probability(request, driver)
    return edge_matrix

In [721]:
def filter_out_requests(drivers, requests, total_requests,
                        driver_threshold=5, rate_mean=15, rate_std=1, seed=42):
    """Drop poorly connected request types and redraw the arrival rates of the rest.

    A request type is kept only when it has an edge (an entry other than -1 in
    the matrix from ``draw_probability_edges``) to more than
    ``driver_threshold`` drivers. Each kept request then gets a new arrival
    rate: a draw from a normal distribution, rounded to the nearest integer.

    Side effects:
        * ``arrival_rate`` of the kept request objects is overwritten in place.
        * NumPy's global RNG is reseeded with ``seed``.

    Args:
        drivers: drivers, indexed by ``u_id`` along the matrix rows.
        requests: request types, indexed by ``u_id`` along the matrix columns.
        total_requests: running total of arrival rates before filtering.
        driver_threshold: a request must be connected to strictly more than
            this many drivers to be kept.
        rate_mean: mean of the normal distribution for the new arrival rates.
        rate_std: standard deviation of that distribution.
        seed: seed applied to NumPy's global RNG before drawing.

    Returns:
        tuple: ``(kept_requests, new_total)`` where ``new_total`` is
        ``total_requests`` minus every original arrival rate (dropped or
        replaced) plus the newly drawn rates, as an int.

    Raises:
        AssertionError: if the adjusted total is not a whole number.
    """
    # The previous version also computed total_requests / len(requests) without
    # using it, which raised ZeroDivisionError for an empty request list.
    probability_matrix = draw_probability_edges(drivers, requests)
    new_requests, new_total = [], total_requests

    for request in requests:
        n_connected = np.count_nonzero(probability_matrix[:, request.u_id] != -1)
        if n_connected > driver_threshold:
            new_requests.append(request)
        else:
            new_total -= request.arrival_rate

    np.random.seed(seed)
    new_arrival_rates = np.random.normal(loc=rate_mean, scale=rate_std, size=len(new_requests))
    for request, drawn_rate in zip(new_requests, new_arrival_rates):
        # Swap the old rate for the new one in the running total.
        new_total -= request.arrival_rate
        request.arrival_rate = round(drawn_rate)
        new_total += request.arrival_rate

    assert int(new_total) == new_total
    return new_requests, int(new_total)

In [722]:
def get_drivers_and_requests(driver_quota):
    """Produce the drivers and the filtered request types for one experiment.

    Args:
        driver_quota: forwarded to ``get_drivers``.

    Returns:
        tuple: ``(drivers, requests)`` where ``requests`` are the request types
        returned by ``filter_out_requests``.
    """
    drivers = get_drivers(driver_quota)
    sampled_requests, sampled_total = get_requests()
    kept_requests, kept_total = filter_out_requests(drivers, sampled_requests, sampled_total)
    # set_unique_ids is applied again on the filtered list; presumably it renumbers
    # u_id so the ids stay contiguous (helper defined outside this view, TODO confirm).
    set_unique_ids(kept_requests)
    arrival_rate_sum = np.sum([req.arrival_rate for req in kept_requests])
    print ("Final Number of Requests: {} {}, size of set V: {}".format(
        arrival_rate_sum, kept_total, len(set(kept_requests))))
    return drivers, kept_requests

(19, 80)