In [1]:
import sys, os
sys.path.insert(0,"../code")

import pandas as pd
import numpy as np
import geopandas as gpd
import folium 

%load_ext autoreload
%autoreload 2

# Load Data 

In [2]:
def load_data(path_data="../data"): 
    
    if not os.path.exists(path_data): 
        os.mkdir(path)
    
    if not os.path.path.exists(os.path.join(path_data,"nyc-taxi-trip-duration")): 
        pass
        #!curl -o FremontBridge.csv https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD
        #download file 
        #unzip file 

In [3]:
PATH_DATA = os.path.join("..\data","nyc-taxi-trip-duration","train","train.csv")

df_rides = pd.read_csv(PATH_DATA,index_col=0,parse_dates=[2,3],dtype={'store_and_fwd_flag':'category','vendor_id':'category','passenger_count':'int8',})
print(df_rides.info())
df_rides.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1458644 entries, id2875421 to id1209952
Data columns (total 10 columns):
vendor_id             1458644 non-null category
pickup_datetime       1458644 non-null datetime64[ns]
dropoff_datetime      1458644 non-null datetime64[ns]
passenger_count       1458644 non-null int8
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null category
trip_duration         1458644 non-null int64
dtypes: category(2), datetime64[ns](2), float64(4), int64(1), int8(1)
memory usage: 93.2+ MB
None


Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [4]:
pd.isna(df_rides).sum()

vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [5]:
assert(df_rides.index.unique().shape[0]==df_rides.shape[0])
assert(np.all(df_rides.dropoff_datetime>df_rides.pickup_datetime))
assert(np.all(df_rides.passenger_count>=0))
#assert(np.allclose((df_rides.dropoff_datetime-df_rides.pickup_datetime).dt.seconds.values,df_rides.trip_duration,))

In [6]:
from shapely.geometry import Point

def transform_to_geodf(df_rides,set_geometry_col='pickup'): 
        df_rides['pickup_geom'] = list(zip(df_rides.pickup_longitude,df_rides.pickup_latitude))
        df_rides['pickup_geom'] = df_rides['pickup_geom'].apply(Point)
        df_rides['dropoff_geom'] = list(zip(df_rides.dropoff_longitude,df_rides.dropoff_latitude))
        df_rides['dropoff_geom'] = df_rides['dropoff_geom'].apply(Point)
        df_rides = gpd.GeoDataFrame(df_rides,geometry=f'{set_geometry_col}_geom',crs={"init":"epsg:4326"})
        
        return df_rides

In [7]:
df_rides = transform_to_geodf(df_rides)
df_rides.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_geom,dropoff_geom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,POINT (-73.98215484619139 40.76793670654297),POINT (-73.96463012695313 40.76560211181641)
id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,POINT (-73.98041534423827 40.73856353759766),POINT (-73.99948120117188 40.73115158081055)
id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,POINT (-73.97902679443358 40.76393890380859),POINT (-74.00533294677734 40.71008682250977)
id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,POINT (-74.01004028320313 40.719970703125),POINT (-74.01226806640625 40.70671844482422)
id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,POINT (-73.97305297851563 40.79320907592773),POINT (-73.97292327880859 40.78252029418945)


In [8]:
# Correct utm-zone: {'init': 'epsg:32618'}

In [9]:
# Correct utm-zone: {'init': 'epsg:32618'}
#df_rides.to_crs()
import pyproj
def convert_projection_to_utm(df,col_x_source,col_y_source,
                              col_x_dest = 'x_utm', col_y_dest = 'y_utm',
                              projection_source=pyproj.Proj("+init=EPSG:4326"),
                              projection_dest=pyproj.Proj("+init=EPSG:32618")):
    x,y = pyproj.transform(projection_source, projection_dest,df[col_x_source].values,df[col_y_source].values)
    return df.assign(**{col_x_dest:x,col_y_dest:y})

def calc_distance(df,col_x1_utm,col_x2_utm,col_y1_utm,col_y2_utm,type_='Manhattan'):
    if type_ =='beeline':
        distance = np.sqrt((df[col_x1_utm]-df[col_x2_utm])**2+(df[col_y1_utm]-df[col_y2_utm])**2)
    elif type_ =='Manhattan':
        distance = np.abs((df[col_x1_utm]-df[col_x2_utm]))+np.abs((df[col_y1_utm]-df[col_y2_utm]))
    return df.assign(**{f"distance_{type_}": distance})

df_rides = convert_projection_to_utm(df_rides,col_x_source='pickup_longitude',col_y_source='pickup_latitude',col_x_dest="pickup_x_utm",col_y_dest='pickup_y_utm')
df_rides = convert_projection_to_utm(df_rides,col_x_source='dropoff_longitude',col_y_source='dropoff_latitude',col_x_dest="dropoff_x_utm",col_y_dest='dropoff_y_utm')
df_rides.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_geom,dropoff_geom,pickup_x_utm,pickup_y_utm,dropoff_x_utm,dropoff_y_utm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,POINT (-73.98215484619139 40.76793670654297),POINT (-73.96463012695313 40.76560211181641),585902.544347,4513495.0,587384.651075,4513253.0
id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,POINT (-73.98041534423827 40.73856353759766),POINT (-73.99948120117188 40.73115158081055),586087.236357,4510236.0,584486.788662,4509394.0
id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,POINT (-73.97902679443358 40.76393890380859),POINT (-74.00533294677734 40.71008682250977),586171.709289,4513054.0,584019.127278,4507050.0
id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,POINT (-74.01004028320313 40.719970703125),POINT (-74.01226806640625 40.70671844482422),583609.127585,4508143.0,583437.516201,4506670.0
id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,POINT (-73.97305297851563 40.79320907592773),POINT (-73.97292327880859 40.78252029418945),586637.875245,4516309.0,586662.713668,4515123.0


In [10]:
#df['distances']=calc_distance(df_rides,df_rides['pickup_x_utm'],df_rides['pickup_y_utm'],df_rides['dropoff_x_utm'],df_rides['dropoff_y_utm'])
df_rides=calc_distance(df_rides,'pickup_x_utm','dropoff_x_utm','pickup_y_utm','dropoff_y_utm')

In [11]:
df_rides.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_geom,dropoff_geom,pickup_x_utm,pickup_y_utm,dropoff_x_utm,dropoff_y_utm,distance_Manhattan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,POINT (-73.98215484619139 40.76793670654297),POINT (-73.96463012695313 40.76560211181641),585902.544347,4513495.0,587384.651075,4513253.0,1723.957684
id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,POINT (-73.98041534423827 40.73856353759766),POINT (-73.99948120117188 40.73115158081055),586087.236357,4510236.0,584486.788662,4509394.0,2441.748555
id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,POINT (-73.97902679443358 40.76393890380859),POINT (-74.00533294677734 40.71008682250977),586171.709289,4513054.0,584019.127278,4507050.0,8156.039197
id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,POINT (-74.01004028320313 40.719970703125),POINT (-74.01226806640625 40.70671844482422),583609.127585,4508143.0,583437.516201,4506670.0,1644.816109
id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,POINT (-73.97305297851563 40.79320907592773),POINT (-73.97292327880859 40.78252029418945),586637.875245,4516309.0,586662.713668,4515123.0,1211.250987


In [12]:
df_rides['pickup_datetime'][1].day

12

In [13]:
import holidays
us_holidays = holidays.US()
def Zeiten(df):
    times=pd.DatetimeIndex(df['pickup_datetime'])
    dayofweek = times.dayofweek
    dayofyear = times.dayofyear
    season = times.quarter
    hour = times.hour
    day = times.day
    year = times.year
    month= times.month
    holiday = [holiday.append(x) for dayofweek in us_holidays]
    return df.assign(**{"Season": season,'DayOfYear':dayofyear,"DayOfWeek":dayofweek,"hour":hour,"day":day,'month':month,'year':year})


#dict_test = {0:'winter',1:'spring',2:'summer',3:'fall'}
#df_rides.replace({"season": dict_test}).head()

In [14]:
df_rides=Zeiten(df_rides)
df_rides.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,...,dropoff_x_utm,dropoff_y_utm,distance_Manhattan,Season,DayOfYear,DayOfWeek,hour,day,month,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,...,587384.651075,4513253.0,1723.957684,1,74,0,17,14,3,2016
id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,...,584486.788662,4509394.0,2441.748555,2,164,6,0,12,6,2016
id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,...,584019.127278,4507050.0,8156.039197,1,19,1,11,19,1,2016
id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,...,583437.516201,4506670.0,1644.816109,2,97,2,19,6,4,2016
id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,...,586662.713668,4515123.0,1211.250987,1,86,5,13,26,3,2016


In [15]:
df_rides['pickup_datetime'][1].month
df_rides['pickup_datetime'][1]

Timestamp('2016-06-12 00:43:35')

In [16]:
df_rides.shape

(1458644, 24)

In [17]:
import datetime
from suntime import Sun, SunTimeException

def Sonnenstunden(df):
    Sonnenstunden=[]
    for i in range(df.shape[0]):
        lat = df_rides['pickup_latitude'][i]
        long = df_rides['pickup_longitude'][i]
        sun = Sun(lat,long)
        date = datetime.date(df_rides['pickup_datetime'][i].year, df_rides['pickup_datetime'][i].month, df_rides['pickup_datetime'][i].day)
        sunrise= sun.get_sunrise_time(date)
        sunset = sun.get_sunset_time(date)
        hours = sunset-sunrise
        c = sunset - sunrise
        datetime.timedelta(0, 8, 562000)
        h=(divmod(c.days * 86400 + c.seconds, 60))
        stunden=h[0]/60
        #print(stunden)
        #print(hours)
        Sonnenstunden.append(hours)
    
    return df.assign(**{"Sonnensrunden": Sonnenstunden})


In [18]:
df_rides=Sonnenstunden(df_rides)

In [19]:
df_rides

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,...,dropoff_y_utm,distance_Manhattan,Season,DayOfYear,DayOfWeek,hour,day,month,year,Sonnensrunden
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.964630,40.765602,N,455,...,4.513253e+06,1723.957684,1,74,0,17,14,3,2016,11:54:00
id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,...,4.509394e+06,2441.748555,2,164,6,0,12,6,2016,-1 days +15:04:00
id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,...,4.507050e+06,8156.039197,1,19,1,11,19,1,2016,09:40:00
id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.010040,40.719971,-74.012268,40.706718,N,429,...,4.506670e+06,1644.816109,2,97,2,19,6,4,2016,12:57:00
id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.782520,N,435,...,4.515123e+06,1211.250987,1,86,5,13,26,3,2016,12:27:00
id0801584,2,2016-01-30 22:01:40,2016-01-30 22:09:03,6,-73.982857,40.742195,-73.992081,40.749184,N,443,...,4.511403e+06,1554.481852,1,30,5,22,30,1,2016,10:03:00
id1813257,1,2016-06-17 22:34:59,2016-06-17 22:40:40,4,-73.969017,40.757839,-73.957405,40.765896,N,341,...,4.513293e+06,1875.473917,2,169,4,22,17,6,2016,-1 days +15:06:00
id1324603,2,2016-05-21 07:54:58,2016-05-21 08:20:49,1,-73.969276,40.797779,-73.922470,40.760559,N,1551,...,4.512736e+06,8083.599186,2,142,5,7,21,5,2016,-1 days +14:39:00
id1301050,1,2016-05-27 23:12:23,2016-05-27 23:16:38,1,-73.999481,40.738400,-73.985786,40.732815,N,255,...,4.509592e+06,1770.154709,2,148,4,23,27,5,2016,-1 days +14:49:00
id0012891,2,2016-03-10 21:45:01,2016-03-10 22:05:26,1,-73.981049,40.744339,-73.973000,40.789989,N,1225,...,4.515952e+06,5695.712689,1,70,3,21,10,3,2016,11:43:00


In [20]:
a  = [1,2,3]
b = [4,5,6]
c = Sun(a, b)

In [47]:
lat = df_rides['pickup_latitude']
long = df_rides['pickup_longitude']
sun = Sun(lat,long)
date = datetime.date(pd.DatetimeIndex(df_rides['pickup_datetime']).year.values, pd.DatetimeIndex(df_rides['pickup_datetime']).month.values, pd.DatetimeIndex(df_rides['pickup_datetime']).day.values)
sunrise= sun.get_sunrise_time(date)
sunset = sun.get_sunset_time(date)
hours = sunset-sunrise
c = sunset - sunrise
datetime.timedelta(0, 8, 562000)
h=(divmod(c.days * 86400 + c.seconds, 60))
stunden=h[0]/60
#print(stunden)
#print(hours)
Sonnenstunden.append(hours)

TypeError: only size-1 arrays can be converted to Python scalars

In [29]:
df_rides['pickup_datetime'][1].year
df_rides['pickup_datetime'][1]

Timestamp('2016-06-12 00:43:35')

In [39]:
a= pd.DatetimeIndex(df_rides['pickup_datetime']).year

np.dtype(a[1])

dtype('int64')

In [49]:
pd.DatetimeIndex(df_rides['pickup_datetime']).year.values

array([2016, 2016, 2016, ..., 2016, 2016, 2016], dtype=int64)

In [56]:
a=pd.DatetimeIndex(df_rides['pickup_datetime']).year.values

In [63]:
a?

[1;31mType:[0m            ndarray
[1;31mString form:[0m     [2016 2016 2016 ... 2016 2016 2016]
[1;31mLength:[0m          1458644
[1;31mFile:[0m            c:\users\simon\appdata\local\conda\conda\envs\tensorflow\lib\site-packages\numpy\__init__.py
[1;31mDocstring:[0m       <no docstring>
[1;31mClass docstring:[0m
ndarray(shape, dtype=float, buffer=None, offset=0,
        strides=None, order=None)

An array object represents a multidimensional, homogeneous array
of fixed-size items.  An associated data-type object describes the
format of each element in the array (its byte-order, how many bytes it
occupies in memory, whether it is an integer, a floating point number,
or something else, etc.)

Arrays should be constructed using `array`, `zeros` or `empty` (refer
to the See Also section below).  The parameters given here refer to
a low-level method (`ndarray(...)`) for instantiating an array.

For more information, refer to the `numpy` module and examine the
methods and attri