In [1]:
import sys, os
sys.path.insert(0,"../code")

import pandas as pd
import numpy as np
import geopandas as gpd
import folium 
import math

%load_ext autoreload
%autoreload 2

# Load Data 

In [2]:
def load_data(path_data="../data"):
    """Make sure the local data directory exists.

    Creates ``path_data`` (including missing parent directories) when it is
    absent. Fetching the dataset itself is not implemented yet: when the
    ``nyc-taxi-trip-duration`` sub-folder is missing, nothing further happens.

    Parameters
    ----------
    path_data : str
        Directory expected to contain the ``nyc-taxi-trip-duration`` folder.

    Returns
    -------
    None
    """
    if not os.path.exists(path_data):
        # makedirs also creates missing parents; exist_ok guards against the
        # directory appearing between the check and the call.
        os.makedirs(path_data, exist_ok=True)

    if not os.path.exists(os.path.join(path_data, "nyc-taxi-trip-duration")):
        # TODO: download and unzip the dataset into path_data.
        # The curl line below is a leftover example and points at a different dataset.
        #!curl -o FremontBridge.csv https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD
        #download file
        #unzip file
        pass

In [3]:
#PATH_DATA = os.path.join("..\data","nyc-taxi-trip-duration","train","train.csv")
# NOTE(review): absolute, machine-specific path with no file extension. The saved
# output of this cell is "OSError: Initializing from file failed", so this
# presumably points at a folder rather than the CSV file - confirm and switch
# to a path relative to the data directory.
PATH_DATA= r'C:\Users\Jan\sciebo\NYC_TaxiTraficDuration\NYC_TaxiTripDuration'
# Column 0 becomes the index; columns 2 and 3 are parsed as datetimes.
df_rides = pd.read_csv(
    PATH_DATA,
    index_col=0,
    parse_dates=[2,3],
    dtype={'store_and_fwd_flag':'category','vendor_id':'category','passenger_count':'int8',},
)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df_rides.info()
df_rides.head()

OSError: Initializing from file failed

In [None]:
pd.isna(df_rides).sum()

In [None]:
# Sanity checks on the loaded rides table.
# Every row must have its own index label.
assert df_rides.index.is_unique
# Each drop-off has to be strictly later than its pick-up.
assert (df_rides.dropoff_datetime > df_rides.pickup_datetime).all()
# Passenger counts must not be negative.
assert (df_rides.passenger_count >= 0).all()
#assert(np.allclose((df_rides.dropoff_datetime-df_rides.pickup_datetime).dt.seconds.values,df_rides.trip_duration,))

In [None]:
from shapely.geometry import Point

def transform_to_geodf(df_rides,set_geometry_col='pickup'):
    """Return a GeoDataFrame with point geometries for pick-up and drop-off.

    Two columns are added, ``pickup_geom`` and ``dropoff_geom``, each holding
    a shapely ``Point`` built from the matching ``*_longitude`` /
    ``*_latitude`` columns (longitude first). The input frame is left
    untouched; the points are attached to a copy.

    Parameters
    ----------
    df_rides : pandas.DataFrame
        Must provide ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude``.
    set_geometry_col : {'pickup', 'dropoff'}
        Which of the two point columns becomes the active geometry.

    Returns
    -------
    geopandas.GeoDataFrame
        Frame with CRS ``epsg:4326`` and ``<set_geometry_col>_geom`` as geometry.

    Raises
    ------
    ValueError
        If ``set_geometry_col`` is neither ``'pickup'`` nor ``'dropoff'``.
    """
    if set_geometry_col not in ('pickup', 'dropoff'):
        raise ValueError(
            f"set_geometry_col must be 'pickup' or 'dropoff', got {set_geometry_col!r}"
        )

    # assign() works on a copy, so the caller's DataFrame does not silently
    # gain the two geometry columns.
    rides_with_points = df_rides.assign(
        pickup_geom=[
            Point(lon_lat)
            for lon_lat in zip(df_rides.pickup_longitude, df_rides.pickup_latitude)
        ],
        dropoff_geom=[
            Point(lon_lat)
            for lon_lat in zip(df_rides.dropoff_longitude, df_rides.dropoff_latitude)
        ],
    )

    # NOTE(review): the {"init": ...} CRS form is kept for compatibility with the
    # rest of the notebook; newer geopandas/pyproj releases presumably warn about it.
    return gpd.GeoDataFrame(
        rides_with_points,
        geometry=f'{set_geometry_col}_geom',
        crs={"init":"epsg:4326"},
    )

In [None]:
df_rides = transform_to_geodf(df_rides)
df_rides.head()

In [None]:
df_rides.shape

In [None]:
# Reproject the GeoDataFrame to EPSG:32618, the UTM zone used throughout this notebook.
# NOTE(review): to_crs presumably only reprojects the active geometry column, so the
# other point column would stay in lon/lat - verify if both are needed in UTM.
df_rides = df_rides.to_crs({'init':'epsg:32618'})

In [None]:
df_rides.head()


In [None]:
import pandas as pd 
import numpy as np
import pyproj

# Correct utm-zone: {'init': 'epsg:32618'}
#df_rides.to_crs()

def convert_projection_to_utm(df,col_x_source,col_y_source,
                              col_x_dest = 'x_utm', col_y_dest = 'y_utm',
                              projection_source=pyproj.Proj("+init=EPSG:4326"),
                              projection_dest=pyproj.Proj("+init=EPSG:32618")):
    """Project a pair of coordinate columns and append the result as new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source coordinates.
    col_x_source, col_y_source : str
        Names of the source x and y columns.
    col_x_dest, col_y_dest : str
        Names of the columns that receive the projected x and y values.
    projection_source, projection_dest : pyproj.Proj
        Source and target projections; the defaults go from EPSG:4326 to
        EPSG:32618. Both default objects are created once, when the function
        is defined.

    Returns
    -------
    pandas.DataFrame
        New frame (via ``assign``) with the two destination columns added.
    """
    # NOTE(review): pyproj.transform is deprecated in newer pyproj releases in
    # favour of pyproj.Transformer - confirm the installed version before upgrading.
    source_x = df[col_x_source].values
    source_y = df[col_y_source].values
    dest_x, dest_y = pyproj.transform(projection_source, projection_dest, source_x, source_y)
    projected_columns = {col_x_dest: dest_x, col_y_dest: dest_y}
    return df.assign(**projected_columns)

def calc_distance(df,col_x1_utm,col_x2_utm,col_y1_utm,col_y2_utm,type_='beeline'):
    """Add a distance column computed from two pairs of planar coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coordinate columns.
    col_x1_utm, col_x2_utm : str
        Names of the x columns of the first and second point.
    col_y1_utm, col_y2_utm : str
        Names of the y columns of the first and second point.
    type_ : {'beeline', 'Manhattan'}
        ``'beeline'`` is the straight-line (Euclidean) distance;
        ``'Manhattan'`` is ``|dx| + |dy|`` along the coordinate axes.

    Returns
    -------
    pandas.DataFrame
        New frame (via ``assign``) with the column ``distance_<type_>`` added,
        in the same unit as the input coordinates.

    Raises
    ------
    ValueError
        If ``type_`` is not one of the supported distance types.
    """
    # Fail early with a clear message; previously an unknown type_ surfaced as
    # an UnboundLocalError on the return line.
    if type_ not in ('beeline', 'Manhattan'):
        raise ValueError(
            f"type_ must be 'beeline' or 'Manhattan', got {type_!r}"
        )

    delta_x = df[col_x1_utm] - df[col_x2_utm]
    delta_y = df[col_y1_utm] - df[col_y2_utm]

    if type_ == 'beeline':
        distance = np.sqrt(delta_x**2 + delta_y**2)
    else:
        distance = np.abs(delta_x) + np.abs(delta_y)
    return df.assign(**{f"distance_{type_}": distance})

# Project pick-up and drop-off lon/lat into UTM coordinates (function defaults:
# EPSG:4326 -> EPSG:32618) and store them as *_x_utm / *_y_utm columns.
df_rides = convert_projection_to_utm(df_rides,col_x_source='pickup_longitude',col_y_source='pickup_latitude',col_x_dest="pickup_x_utm",col_y_dest='pickup_y_utm')
df_rides = convert_projection_to_utm(df_rides,col_x_source='dropoff_longitude',col_y_source='dropoff_latitude',col_x_dest="dropoff_x_utm",col_y_dest='dropoff_y_utm')
df_rides.head()

In [None]:
df_rides= calc_distance(df_rides,'pickup_x_utm','dropoff_x_utm','pickup_y_utm','dropoff_y_utm')

In [None]:
df_rides= calc_distance(df_rides,'pickup_x_utm','dropoff_x_utm','pickup_y_utm','dropoff_y_utm',type_='Manhattan')

In [None]:
df_rides.head()

In [None]:
def calc_time(df,pickup_datetime,dropoff_datetime):
    """Derive calendar features from the pick-up timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime column named by ``pickup_datetime``.
    pickup_datetime : str
        Name of the pick-up timestamp column.
    dropoff_datetime : str
        Accepted for interface symmetry; not used by the computation.

    Returns
    -------
    pandas.DataFrame
        New frame (via ``assign``) with the columns ``weekday``, ``hour``,
        ``quarter``, ``month`` and ``day_of_year`` added.
    """
    pickup = df[pickup_datetime].dt
    return df.assign(
        weekday=pickup.dayofweek,
        hour=pickup.hour,
        quarter=pickup.quarter,
        month=pickup.month,
        day_of_year=pickup.dayofyear,
    )


In [None]:

    
#     dr = pd.date_range(start=df[pickup_datetime].min(), end=df[dropoff_datetime].max())
#     df_times' = pd.DataFrame()
#     df_times['Date'] = dr

#     cal = calendar()
#     holidays = cal.holidays(start=dr.min(), end=dr.max())
#     holidays = df_times['Date'].isin(holidays)
#     test = df.join(holidays, how='outer')

In [None]:
df_rides = calc_time(df_rides,'pickup_datetime','dropoff_datetime')

In [None]:
df_rides.head()

In [None]:
#df_rides[df_rides['holidays']==True]

In [None]:
def calc_daylight(df, pickup_latitude,day_of_year):
    """Add the number of daylight hours for each row's latitude and day of year.

    The constants match the CBM day-length model (Forsythe et al., 1995):

        theta = 0.2163108 + 2 * atan(0.9671396 * tan(0.00860 * (J - 186)))
        phi   = asin(0.39795 * cos(theta))
        D     = 24 - (24 / pi) * acos((sin(0.8333 deg) + sin(L) * sin(phi))
                                      / (cos(L) * cos(phi)))

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the latitude and day-of-year columns.
    pickup_latitude : str
        Name of the latitude column, in degrees.
    day_of_year : str
        Name of the day-of-year column; values are cast to ``int``.

    Returns
    -------
    pandas.DataFrame
        New frame (via ``assign``) with the column ``daylighthours`` added.
    """
    day = df[day_of_year].astype(int)
    latitude_rad = df[pickup_latitude] * np.pi / 180

    # Declination-like angle phi of the model.
    declination = np.arcsin(
        0.39795 * np.cos(0.2163108 + 2 * np.arctan(0.9671396 * np.tan(.00860 * (day - 186))))
    )

    # The 0.8333 degree term is added as its own sine. The previous version
    # wrapped the whole numerator in a single sin(), which skewed the result.
    hour_angle_cos = (
        np.sin(0.8333 * np.pi / 180) + np.sin(latitude_rad) * np.sin(declination)
    ) / (np.cos(latitude_rad) * np.cos(declination))

    # Outside [-1, 1] the sun never sets / never rises; clipping yields 24 h or
    # 0 h instead of NaN from arccos.
    hour_angle_cos = hour_angle_cos.clip(-1, 1)

    daylightamount = 24 - (24 / np.pi) * np.arccos(hour_angle_cos)
    return df.assign(**{"daylighthours": daylightamount})

In [None]:
%%time
df_rides = calc_daylight(df_rides,'pickup_latitude','day_of_year')

In [None]:
df_rides.head()

In [None]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
def holidays(df,pickup_datetime,dropoff_datetime):
    """Flag rides whose pick-up date is a US federal holiday.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime column named by ``pickup_datetime``.
    pickup_datetime : str
        Name of the pick-up timestamp column.
    dropoff_datetime : str
        Accepted for interface symmetry; not used by the computation.

    Returns
    -------
    pandas.DataFrame
        New frame (via ``assign``) with two identical boolean columns,
        ``Holiday`` and ``Is_on_Holiday``. Both names are kept because earlier
        versions produced both. The input frame is not modified.
    """
    # Compare on calendar days: normalize() drops the time of day while
    # keeping a proper datetime dtype.
    pickup_days = df[pickup_datetime].dt.normalize()

    if pickup_days.notna().any():
        # Only the span actually covered by the data is needed.
        federal_holidays = calendar().holidays(
            start=pickup_days.min(), end=pickup_days.max()
        )
        on_holiday = pickup_days.isin(federal_holidays)
    else:
        # Empty frame or only missing timestamps: nothing can be a holiday.
        on_holiday = pd.Series(False, index=df.index, dtype=bool)

    return df.assign(**{"Holiday": on_holiday, "Is_on_Holiday": on_holiday})

In [None]:
df_rides = holidays(df_rides,'pickup_datetime','dropoff_datetime')

In [None]:
df_rides.head()

In [None]:
df_rides['Is_on_Holiday'].value_counts()

In [None]:
# Correct utm-zone: {'init': 'epsg:32618'}


Jahreszeit 

Wochentag

Tageszeit

Tageslänge - Phil berechnet den Azimutwinkel

Feiertag

Aufteilung in 100x100 Meter Quadrate: 
- Gruppieren Mittelwert aller

dict_test = {0:'winter',1:'spring',2:'summer',3:'fall'}
df_rides.replace({"season": dict_test}).head()