In [14]:
import numpy as np
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D
from uszipcode import ZipcodeSearchEngine
from geopy.geocoders import Nominatim

import pandas as pd
import seaborn.apionly as sns
from datetime import date, datetime
from haversine import haversine

# statistics package
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats

# packages for mapping
from mpl_toolkits.basemap import Basemap

# packages for interactive graphs
from ipywidgets import widgets, interact
from IPython.display import display
from copy import deepcopy as copy
%matplotlib inline

In [28]:
# Load the raw trip/fare records. The two timestamp columns are parsed to
# datetime64 on load because later cells read .day / .hour / .weekday() from
# them; left as plain strings those accessors raise AttributeError.
historical_data = pd.read_csv(
    'train.csv',
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)

In [29]:
# Preview the first rows to check that the columns loaded as expected.
historical_data.head()

Unnamed: 0.1,Unnamed: 0,medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,...,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount,payment_type,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,0,89D227B655E5C82AECF13C3F540D4CF4,BA96DE419E711691B9445D6A6307C170,CMT,1,N,2013-01-01 15:11:48,2013-01-01 15:18:10,4,382,...,40.757977,-73.989838,40.751171,6.5,CSH,0.0,0.5,0.0,0.0,7.0
1,1,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-06 00:18:35,2013-01-06 00:22:54,1,259,...,40.731781,-73.994499,40.75066,6.0,CSH,0.5,0.5,0.0,0.0,7.0
2,2,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-05 18:49:41,2013-01-05 18:54:23,1,282,...,40.73777,-74.009834,40.726002,5.5,CSH,1.0,0.5,0.0,0.0,7.0
3,3,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:54:15,2013-01-07 23:58:20,2,244,...,40.759945,-73.984734,40.759388,5.0,CSH,0.5,0.5,0.0,0.0,6.0
4,4,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:25:03,2013-01-07 23:34:24,1,560,...,40.748528,-74.002586,40.747868,9.5,CSH,0.5,0.5,0.0,0.0,10.5


## Data Preprocessing

In [33]:
# Work on an independent deep copy so that preprocessing never mutates the
# frame that was read from disk.
process_train_data = historical_data.copy(deep=True)

## Train Data

In [39]:
# Take the leading slice of the preprocessed frame as the training set.
# .loc slices by label and includes BOTH endpoints, so labels 0..100000 give
# 100001 rows on the default integer index.
train_data = process_train_data.loc[0:100000, :].copy(deep=True)
# Drop the 'Unnamed: 0' column (presumably a leftover index column from a CSV
# round-trip; verify against how train.csv was written).
train_data = train_data.drop(columns=['Unnamed: 0'])
train_data.head()

Unnamed: 0,medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,...,mta_tax,tip_amount,tolls_amount,total_amount,pickup_day,pickup_weekday,pickup_hour,pickup_minute,pickup_time,dropoff_hour
0,89D227B655E5C82AECF13C3F540D4CF4,BA96DE419E711691B9445D6A6307C170,CMT,1,N,2013-01-01 15:11:48,2013-01-01 15:18:10,4,382,1.0,...,0.5,0.0,0.0,7.0,1,1,15,11,15.183333,15
1,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-06 00:18:35,2013-01-06 00:22:54,1,259,1.5,...,0.5,0.0,0.0,7.0,6,6,0,18,0.3,0
2,0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-01-05 18:49:41,2013-01-05 18:54:23,1,282,1.1,...,0.5,0.0,0.0,7.0,5,5,18,49,18.816667,18
3,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:54:15,2013-01-07 23:58:20,2,244,0.7,...,0.5,0.0,0.0,6.0,7,0,23,54,23.9,23
4,DFD2202EE08F7A8DC9A57B02ACB81FE2,51EE87E3205C985EF8431D850C786310,CMT,1,N,2013-01-07 23:25:03,2013-01-07 23:34:24,1,560,2.1,...,0.5,0.0,0.0,10.5,7,0,23,25,23.416667,23


In [40]:
# Derive calendar / time-of-day features from the trip timestamps.
# pd.to_datetime leaves datetime64 columns unchanged and converts string
# columns, so the .dt accessors below work however the CSV was loaded.
train_data["pickup_datetime"] = pd.to_datetime(train_data["pickup_datetime"])
train_data["dropoff_datetime"] = pd.to_datetime(train_data["dropoff_datetime"])

train_data["pickup_day"] = train_data["pickup_datetime"].dt.day
train_data["pickup_weekday"] = train_data["pickup_datetime"].dt.weekday  # Monday=0 ... Sunday=6
train_data["pickup_hour"] = train_data["pickup_datetime"].dt.hour
train_data["pickup_minute"] = train_data["pickup_datetime"].dt.minute
# Fractional hour of the day. The minutes must come from train_data itself;
# the previous version read them from an undefined frame named `df`.
train_data["pickup_time"] = train_data["pickup_hour"] + (train_data["pickup_minute"] / 60)
train_data["dropoff_hour"] = train_data["dropoff_datetime"].dt.hour

In [45]:
# Calendar date (time of day dropped) of each pickup and dropoff.
# The vectorised .dt accessor replaces a comprehension whose loop variable was
# called `date`, the same name as the class imported from `datetime`, and the
# pd.to_datetime wrapper keeps the cell working when the columns are strings.
train_data['pickup_date'] = pd.to_datetime(train_data['pickup_datetime']).dt.date
train_data['dropoff_date'] = pd.to_datetime(train_data['dropoff_datetime']).dt.date

### The trip distance is the Manhattan distance between pickup and dropoff, in kilometers

In [41]:
def distance(lat1, lon1, lat2, lon2):
    """Return the Manhattan (taxicab) distance between two coordinates.

    The route is split into two legs, each measured with ``haversine``:
    first from ``(lat1, lon1)`` to ``(lat2, lon1)`` (latitude changes,
    longitude fixed), then from ``(lat2, lon1)`` to ``(lat2, lon2)``
    (longitude changes, latitude fixed).

    Parameters
    ----------
    lat1: float
        latitude of the first point

    lon1: float
        longitude of the first point

    lat2: float
        latitude of the second point

    lon2: float
        longitude of the second point

    Returns
    -------
    d: float
        Sum of the two leg lengths, i.e. the Manhattan distance between
        the two points in kilometers

    """
    start = (lat1, lon1)
    corner = (lat2, lon1)   # shares the longitude of start and the latitude of end
    end = (lat2, lon2)

    latitude_leg = haversine(start, corner)
    longitude_leg = haversine(corner, end)
    return latitude_leg + longitude_leg

In [42]:
# Manhattan distance of every trip: the four coordinate columns are selected
# in the argument order of distance(lat1, lon1, lat2, lon2) and each row is
# unpacked straight into the call.
train_data["distance"] = (
    train_data[["pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude"]]
    .apply(lambda trip: distance(*trip), axis=1)
)

### The speed is calculated in km/h

In [43]:
# Average speed: distance divided by the trip duration converted from seconds
# to hours.
# NOTE(review): rows with trip_time_in_secs == 0 are not filtered here and
# would divide by zero - confirm whether they need to be removed first.
train_data["speed"] = train_data["distance"].div(train_data["trip_time_in_secs"].div(3600))

In [55]:
# Bucket each pickup into one of four six-hour parts of the day:
#   0 -> [06, 12)   1 -> [12, 18)   2 -> [18, 24)   3 -> [00, 06)
# Computed on the whole hour column at once. The previous per-row loop looked
# rows up by label (pickup_datetime[i]), which raises KeyError as soon as the
# index is not exactly 0..n-1, and it needed datetime objects in the column.
pickup_datetime = pd.to_datetime(train_data['pickup_datetime'])
pickup_hours = pickup_datetime.dt.hour
day_interval = np.select(
    [
        (pickup_hours >= 6) & (pickup_hours < 12),
        (pickup_hours >= 12) & (pickup_hours < 18),
        (pickup_hours >= 18) & (pickup_hours < 24),
    ],
    [0, 1, 2],
    default=3,  # everything else, i.e. the night hours before 06:00
)
train_data['day_interval'] = day_interval

In [56]:
# List the column labels to confirm which engineered features are present.
train_data.keys()

Index(['medallion', 'hack_license', 'vendor_id', 'rate_code',
       'store_and_fwd_flag', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'trip_time_in_secs', 'trip_distance',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'fare_amount', 'payment_type', 'surcharge',
       'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount', 'pickup_day',
       'pickup_weekday', 'pickup_hour', 'pickup_minute', 'pickup_time',
       'dropoff_hour', 'distance', 'speed', 'pickup_date', 'dropoff_date',
       'time_inteval', 'day_interval'],
      dtype='object')

## Reinforcement Learning Model

In [None]:
class TaxiWorld():
    """Tabular taxi environment keyed by pickup / dropoff zip-code pairs.

    ``initialize`` folds historical trips into a running average of
    ``total_amount`` per "<pickup zip>-<dropoff zip>" pair; ``getReward``
    looks that average up again.
    """

    def __init__(self):
        # Average total_amount per "<pickup zip>-<dropoff zip>" key.
        self.reward = dict()
        # Number of trips folded into each average stored in self.reward.
        self.count = dict()
        # Current state; None until setState() is called.
        self.s = None
        self.actions = [0,1]
        self.a = 0
        # Distinct pickup zip codes, filled by initialize().
        self.zip_codes = []
        # Per-state action values, filled lazily by check_Q_table().
        # It has to exist up front: check_Q_table reads it before writing.
        self.Q = dict()

    @staticmethod
    def _pair_key(zip_code, dropoff_zip_code):
        """Build the dictionary key for one pickup/dropoff pair."""
        # format() also accepts numeric zip codes, where `+ '-' +` raises.
        return '{}-{}'.format(zip_code, dropoff_zip_code)

    def initialize(self, data):
        """Accumulate the average fare of every zip-code pair in ``data``.

        Parameters
        ----------
        data: pandas.DataFrame
            Must provide the columns 'zip_code', 'dropoff_zip_codes' and
            'total_amount'.

        Raises
        ------
        KeyError
            If one of those columns is missing.
        """
        # Use the frame that was passed in, not a notebook-level global.
        self.zip_codes = data['zip_code'].unique()
        trips = zip(data['zip_code'], data['dropoff_zip_codes'], data['total_amount'])
        for pickup_zip, dropoff_zip, amount in trips:
            key = self._pair_key(pickup_zip, dropoff_zip)
            if key not in self.reward:
                self.reward[key] = amount
                self.count[key] = 1
            else:
                # Incremental mean: rebuild the old sum, add the new fare.
                seen = self.count[key]
                self.reward[key] = ((self.reward[key] * seen) + amount) / (seen + 1)
                self.count[key] = seen + 1

    def getReward(self, zip_code, dropoff_zip_code):
        """Return the average fare for the pair, or 0 if it was never seen."""
        key = self._pair_key(zip_code, dropoff_zip_code)
        if key not in self.reward:
            return 0
        return self.reward[key]

    def getState(self):
        """Print a marker line and return the current state."""
        print('Current State')
        return self.s

    def setState(self, s):
        """Store ``s`` as the current state."""
        self.s = s

    def getStateSize(self, trip_and_fare=None):
        """Return the number of distinct pickup zip codes.

        Parameters
        ----------
        trip_and_fare: pandas.DataFrame, optional
            When given, the count is taken from its 'zip_code' column.
            When omitted, the zip codes recorded by initialize() are used,
            so the method can be called without arguments.
        """
        if trip_and_fare is None:
            return len(self.zip_codes)
        return len(trip_and_fare['zip_code'].unique())

    def getZipCodes(self, trip_and_fare=None):
        """Return the zip codes recorded by initialize().

        ``trip_and_fare`` is accepted for backward compatibility and ignored.
        """
        return self.zip_codes

    def getActionSize(self):
        """Return the number of available actions."""
        return len(self.actions)

    def nextAction(self, s):
        """Placeholder: only prints a message, no action is computed yet."""
        print('Calculate Next Action based on state')

    def check_Q_table(self, s):
        """Make sure state ``s`` has a Q-table row, zero-initialised per action."""
        if s not in self.Q:
            self.Q[s] = dict((action, 0.0) for action in self.actions)


In [None]:
# Create the environment instance that is handed to the agent.
env = TaxiWorld()

In [79]:
class TaxiRevenue:
    """Tabular agent that learns the average revenue of each dropoff per state.

    A state is "<pickup lat>_<pickup lon>_<weekday>_<day interval>", an action
    is "<dropoff lat>_<dropoff lon>". ``Q`` maps
    state key -> {action key -> [average total_amount, trip count]}.
    """

    # Columns that make up the state key and the action key, in key order.
    _STATE_COLUMNS = ('pickup_latitude', 'pickup_longitude', 'pickup_weekday', 'day_interval')
    _ACTION_COLUMNS = ('dropoff_latitude', 'dropoff_longitude')

    def __init__(self, env):
        self.env = env
        # Number of distinct states in self.Q; updated by initialize().
        # It is not taken from env.getStateSize() because that call needs the
        # trip data, which is not available at construction time.
        self.size = 0
        self.actionsize = env.getActionSize()
        self.Q = dict()

    @staticmethod
    def _state_key(s):
        """Accept a state key string, or a sequence whose first item is the key."""
        return s if isinstance(s, str) else s[0]

    def initialize(self, data):
        """Fold every trip in ``data`` into the running averages in ``self.Q``.

        Parameters
        ----------
        data: pandas.DataFrame
            Must provide the pickup/dropoff coordinate columns,
            'pickup_weekday', 'day_interval' and 'total_amount'.

        Raises
        ------
        KeyError
            If one of those columns is missing.
        """
        columns = self._STATE_COLUMNS + self._ACTION_COLUMNS + ('total_amount',)
        n_state = len(self._STATE_COLUMNS)
        for values in zip(*(data[name] for name in columns)):
            # str() is required: the parts are numbers and cannot be
            # concatenated with '_' directly.
            key = '_'.join(str(part) for part in values[:n_state])
            action_key = '_'.join(str(part) for part in values[n_state:-1])
            amount = values[-1]

            actions = self.Q.setdefault(key, dict())
            if action_key not in actions:
                # Add the new action next to the ones already recorded for
                # this state instead of replacing the whole entry.
                actions[action_key] = [amount, 1]
            else:
                average_revenue, count = actions[action_key]
                # Incremental mean; the trip count lives at index 1.
                actions[action_key] = [
                    ((average_revenue * count) + amount) / (count + 1),
                    count + 1,
                ]
        self.size = len(self.Q)

    def greedy(self, s):
        """Return the action key with the highest average revenue in state ``s``.

        Raises
        ------
        KeyError
            If the state has never been observed.
        """
        actions = self.Q[self._state_key(s)]
        return max(actions, key=lambda action_key: actions[action_key][0])

    def epsilon_greed(self, epsilon, s):
        """Explore with probability ``epsilon``, otherwise act greedily.

        Exploration draws uniformly from the actions recorded for ``s``, so
        both branches return the same kind of value (an action key).
        """
        if np.random.rand() < epsilon:
            candidates = list(self.Q[self._state_key(s)])
            return candidates[np.random.randint(len(candidates))]
        else:
            return self.greedy(s)

    def train(self, trip_and_fare):
        """Build the Q-table from historical trips."""
        self.initialize(trip_and_fare)

    def test(self):
        """Return the environment's current state."""
        return self.env.getState()

In [58]:
# (rows, columns) of the training frame after feature engineering.
train_data.shape

(100001, 33)

In [71]:
# Scratch check of the nested layout used for the Q-table:
# {state key: {action key: [average revenue, count]}}.
_key, _action_key = "abc", "xyz"
dic2 = {_action_key: [123, 1]}
_dic = {_key: dic2}

In [78]:
# Membership test on the inner dict: prints only when the action key is absent.
if not ('xyz' in _dic['abc']):
    print('notfound')