In [None]:
import numpy as np
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D
from uszipcode import ZipcodeSearchEngine
from geopy.geocoders import Nominatim

import pandas as pd
import seaborn.apionly as sns
from datetime import date, datetime
from haversine import haversine

# statistics package
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats

# packages for mapping
from mpl_toolkits.basemap import Basemap

# packages for interactive graphs
from ipywidgets import widgets, interact
from IPython.display import display
from copy import deepcopy as copy
import time
from geopy.geocoders import Nominatim
from time import sleep
import requests
import json
%matplotlib inline

In [None]:
findzip = ZipcodeSearchEngine()

In [None]:
historical_data = pd.read_csv('train.csv')

In [None]:
weather_dictionary = dict()
interval_to_hour = {0:6, 1:12, 2:18, 3:1}

In [None]:
historical_data.shape

In [None]:
historical_data.head()

## Data Preprocessing

In [None]:
process_train_data = copy(historical_data)

## Train Data

In [None]:
train_data = copy(process_train_data.loc[:100000,])
del train_data['Unnamed: 0']
train_data.head()

In [None]:
train_data["pickup_datetime"] = pd.to_datetime(train_data["pickup_datetime"])
train_data["dropoff_datetime"] = pd.to_datetime(train_data["dropoff_datetime"])

In [None]:
train_data["pickup_day"] = train_data["pickup_datetime"].apply(lambda x: x.day)
train_data["pickup_weekday"] = train_data["pickup_datetime"].apply(lambda x: x.weekday())
train_data["pickup_hour"] = train_data["pickup_datetime"].apply(lambda x: x.hour)
train_data["pickup_minute"] = train_data["pickup_datetime"].apply(lambda x: x.minute)
train_data["pickup_time"] = train_data["pickup_hour"] + (train_data["pickup_minute"] / 60)
train_data["dropoff_hour"] = train_data["dropoff_datetime"].apply(lambda x: x.hour)

In [None]:
train_data['pickup_date'] = [date.date() for date in train_data['pickup_datetime']]
train_data['dropoff_date'] = [date.date() for date in train_data['dropoff_datetime']]

### The distance is calculated in kilometers

In [None]:
def distance(lat1, lon1, lat2, lon2):
    """calculates the Manhattan distance between 2 points
        using their coordinates
    
    Parameters
    ----------
    lat1: float
        latitude of first point
        
    lon1: float
        longitude of first point
        
    lat2: float
        latitude of second point
    
    lon2: float
        longitude of second point
        
    Returns
    -------
    d: float
        The Manhattan distance between the two points in kilometers
        
    """
    
    d = haversine((lat1, lon1), (lat2, lon1)) + haversine((lat2, lon1), (lat2, lon2))
    return d

In [None]:
train_data["distance"] = train_data.apply(lambda row: distance(row["pickup_latitude"], 
                                               row["pickup_longitude"], 
                                               row["dropoff_latitude"], 
                                               row["dropoff_longitude"]), axis=1)

### The speed is calculated in km/h

In [None]:
train_data["speed"] = train_data["distance"] / (train_data["trip_time_in_secs"] / 3600)

In [None]:
pickup_datetime = train_data['pickup_datetime']
day_interval = []
for pickup_date in pickup_datetime:
    hour = pickup_date.hour
    if hour>=6 and hour<12:
        day_interval.append(0)
    elif hour>=12 and hour<18:
        day_interval.append(1)
    elif hour>=18 and hour<24:
        day_interval.append(2)
    else:
        day_interval.append(3)
train_data['day_interval'] = day_interval

In [None]:
def add_waiting_time_and_penalty(train_data):
    sorted_train_data = copy(train_data.sort_values(by=['hack_license', 'pickup_datetime'], ascending=True))
    sorted_train_data['waiting_time'] = 0.0
    sorted_train_data['waiting_penalty'] = 0.0
    previous_license = None
    previous_drpoff_datatime = 0
    previous_date = None
    previous_row = None
    wait_time_list = []
    wait_penalty_list = []
    average_speed = sorted_train_data['speed'].mean()
    for index,row in sorted_train_data.T.iteritems():
        if row['hack_license'] == previous_license and row['pickup_date'] == previous_date:
            diff =  row['pickup_datetime'] - previous_row['dropoff_datetime']
            _distance = distance(row["pickup_latitude"], row["pickup_longitude"], previous_row["dropoff_latitude"], previous_row["dropoff_longitude"])
            speed = row['speed']
            if(speed <= 0):
                speed = average_speed
            time_take_to_reach = (_distance/speed)*3600
            if (time_take_to_reach>diff.total_seconds()):
                wait_time_list.append(0.0)
                wait_penalty_list.append(0.0)
            else:
                wait_time_list.append(diff.total_seconds()-time_take_to_reach)
                wait_penalty_list.append((diff.total_seconds()-time_take_to_reach)/3600*5)
        else:
            wait_time_list.append(0.0)
            previous_license = row['hack_license'] 
            previous_date = row['pickup_date']
            previous_row = row
            wait_penalty_list.append(0.0)
    sorted_train_data['waiting_time'] = wait_time_list
    sorted_train_data['waiting_penalty'] = wait_penalty_list
    return sorted_train_data

In [None]:
sorted_train_data = add_waiting_time_and_penalty(train_data)
sorted_train_data.head()

## Introduce Zip Code in the Historical Data

In [None]:
trip_data_with_zip_code = copy(sorted_train_data)

In [241]:
def get_weather(lat, lon, pickup_datetime):
    base_url = 'http://api.openweathermap.org/data/2.5/forecast?'
    payload = {
                    'lat': lat,
                    'lon': lon,
                    'start': pickup_datetime,
                    'cnt': 1,
                    'APPID': 'e42277154fa7b3c31cb3b98fae71c220'
                    }
    r = requests.get(base_url, params=payload)
    if 'list' in r.json() and 'weather' in r.json()['list'][0] and 'main' in r.json()['list'][0]['weather'][0]:
        return r.json()['list'][0]['weather'][0]['main']
    else:
        return 'Clear'

In [341]:
# documentation: https://pythonhosted.org/uszipcode/#by-coordinate
def add_zip_code_and_weather(trip_data):
    pickup_zipcode_list = []
    dropoff_zipcode_list = []
    trip_data['pickup_zipcode'] = None
    trip_data['dropoff_zipcode']  = None
    ## load previous weathers
    with open('weather_data.json', 'r') as fp:
        weather_dictionary = json.load(fp)
    weather_list = []
    count = 0
    for index,row in trip_data.T.iteritems():
        pickup_res = findzip.by_coordinate(row['pickup_latitude'], row['pickup_longitude'], radius=10, returns=1)
        if(len(pickup_res) > 0):
            zipcode = pickup_res[0]["Zipcode"]
        else:
            zipcode = 'NaN'
        pickup_zipcode_list.append(zipcode)

        dropoff_res = findzip.by_coordinate(row['dropoff_latitude'], row['dropoff_longitude'], radius=10, returns=1)
        if(len(dropoff_res) > 0):
            zipcode = dropoff_res[0]["Zipcode"]
        else:
            zipcode = 'NaN'
        dropoff_zipcode_list.append(zipcode)
        
        ## Add historical weather
        key = str(row['pickup_zipcode']) + '_' + str(row['pickup_date']) + '_' + str(row['day_interval'])
        if key not in weather_dictionary:
            _date = datetime.strptime(str(row['pickup_date']), '%Y-%m-%d')
            _date = _date.replace(hour=interval_to_hour[row['day_interval']])
            weather_dictionary[key] = get_weather(row['pickup_latitude'], row['pickup_longitude'], time.mktime(_date.timetuple()))
            count += 1
            if count >= 60:
                count = 0
                sleep(60)
        weather_list.append(weather_dictionary[key])
    trip_data['pickup_zipcode'] = pickup_zipcode_list
    trip_data['dropoff_zipcode'] = dropoff_zipcode_list
    trip_data['weather'] = weather_list
    ## store weathers 
    with open('weather_data.json', 'w') as fp:
        json.dump(weather_dictionary, fp)


In [None]:
add_zip_code_and_weather(trip_data_with_zip_code)

In [None]:
trip_data_with_zip_code = copy(trip_data_with_zip_code[(str(trip_data_with_zip_code['pickup_zipcode']) != 'NaN') & (str(trip_data_with_zip_code['dropoff_zipcode']) != 'NaN')])

In [None]:
trip_data_with_zip_code.head()

## Store Data

In [192]:
trip_data_with_zip_code.to_csv('clean_data6.csv', index=False )

## Check Data

In [239]:
check_data = pd.read_csv('clean_data_6.csv')

In [240]:
check_data.head()

Unnamed: 0,medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,...,pickup_date,dropoff_date,distance,speed,day_interval,waiting_time,waiting_penalty,pickup_zipcode,dropoff_zipcode,weather
0,BE530E79CB7E459DEF5BBDF2F319EDC1,0002555BBE359440D6CEB34B699D3932,CMT,1,N,2013-01-01 19:00:00,2013-01-01 19:16:33,1,992,7.9,...,2013-01-01,2013-01-01,11.499047,41.730414,2,0.0,0.0,11371,10016,Rain
1,BE530E79CB7E459DEF5BBDF2F319EDC1,0002555BBE359440D6CEB34B699D3932,CMT,1,N,2013-01-01 19:52:35,2013-01-01 19:58:07,2,331,1.5,...,2013-01-01,2013-01-01,3.178577,34.570622,2,2016.523623,2.800727,10174,10009,Rain
2,BE530E79CB7E459DEF5BBDF2F319EDC1,0002555BBE359440D6CEB34B699D3932,CMT,1,N,2013-01-01 21:23:24,2013-01-01 21:40:06,1,1001,9.2,...,2013-01-01,2013-01-01,13.149924,47.292434,2,6736.02451,9.35559,11371,10009,Rain
3,BE530E79CB7E459DEF5BBDF2F319EDC1,0002555BBE359440D6CEB34B699D3932,CMT,1,N,2013-01-01 23:01:07,2013-01-01 23:05:41,2,273,1.7,...,2013-01-01,2013-01-01,0.475662,6.27246,2,6067.954056,8.427714,11369,11369,Rain
4,BE530E79CB7E459DEF5BBDF2F319EDC1,0002555BBE359440D6CEB34B699D3932,CMT,1,N,2013-01-01 23:40:33,2013-01-01 23:53:54,1,801,7.5,...,2013-01-01,2013-01-01,6.674613,29.998262,2,14321.0859,19.890397,11371,10128,Rain


## Build Weather Dictionary

### Read Weather Data from File

In [262]:
with open('weather_data.json', 'r') as fp:
    weather_dictionary = json.load(fp)

In [None]:
trip_data_with_weather = copy(trip_data_with_zip_code)

In [205]:
def build_weather_dictionary(trip_data):
    count = 0
    weather_list = []
    for index,row in trip_data.T.iteritems():
        key = str(row['pickup_zipcode']) + '_' + str(row['pickup_date']) + '_' + str(row['day_interval'])
        if key not in weather_dictionary:
            _date = datetime.strptime(row['pickup_date'], '%Y-%m-%d')
            _date = _date.replace(hour=interval_to_hour[row['day_interval']])
            weather_dictionary[key] = get_weather(row['pickup_latitude'], row['pickup_longitude'], time.mktime(_date.timetuple()))
            count += 1
            if count >= 60:
                count = 0
                sleep(60)
        else:
            pass
        weather_list.append(weather_dictionary[key])
    trip_data['weather'] = weather_list


In [None]:
build_weather_dictionary(trip_data_with_weather)

## Store Weather Dictionary

In [261]:
with open('weather_data.json', 'w') as fp:
    json.dump(weather_dictionary, fp)