### Import functions

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import radians, cos, sin, asin, sqrt
from sklearn.svm import OneClassSVM
#from functionsPredictions import *
from sklearn import preprocessing
import matplotlib.pyplot as plt
import category_encoders as ce
from datetime import timedelta
from scipy import stats
import seaborn as sns
import xgboost as xgb
import datetime as dt
import pandas as pd
import numpy as np
import warnings
import glob

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(df_holiday2)

In [2]:
#!pip install category_encoders
#!pip install xgboost

In [3]:
warnings.filterwarnings(action='once')
plt.style.use('seaborn-poster')
sns.set_context("poster") 

### Preprocess, merge and clean functions

In [4]:
# variabelen/lijsten aanmaken
# Center points of all the parks (https://www.latlong.net/)
all_parks = ['vondelpark_west','vondelpark_oost_3','vondelpark_oost_2',
             'vondelpark_oost_1' 'oosterpark', 'sarphatipark',
            'westerpark_west','westerpark_oost','westerpark_centrum', 
             'westergasfabriek','rembrandtpark_noord', 'rembrandtpark_zuid', 
             'erasmuspark']

vondelpark_west = [{'lat': 52.356496, 'lng': 4.861447}]
vondelpark_oost_3 = [{'lng': 4.869217, 'lat': 52.358252}]
vondelpark_oost_2 = [{'lng': 4.874692, 'lat': 52.359798}]
vondelpark_oost_1 = [{'lng': 4.879652, 'lat': 52.360991}]
oosterpark = [{'lng': 4.920558, 'lat': 52.360098}]
sarphatipark = [{'lng': 4.896375, 'lat': 52.354364}]
westerpark_west = [{'lng': 4.867128, 'lat': 52.387099}]
westerpark_centrum = [{'lng': 4.873268, 'lat': 52.387374}]
westerpark_oost = [{'lng': 4.878379, 'lat': 52.386379}]
westergasfabriek = [{'lng': 4.869769, 'lat': 52.385920}]
rembrandtpark_noord = [{'lng': 4.846573, 'lat': 52.366664}]
rembrandtpark_zuid = [{'lng': 4.846932, 'lat': 52.361161}]
erasmuspark = [{'lng': 4.851909, 'lat': 52.374808}]
amstelpark = [{'lng': 4.894404, 'lat': 52.330409}]
park_frankendael = [{'lng': 4.929839, 'lat': 52.350703}]
beatrixpark = [{'lng': 4.881352, 'lat': 52.342471}]
flevopark = [{'lng': 4.947881, 'lat': 52.360087}]
gaasperpark = [{'lng': 4.992192, 'lat': 52.310420}]
nelson_mandelapark = [{'lng': 4.963691, 'lat': 52.312204}]
noorderpark = [{'lng': 4.919606, 'lat': 52.392651}]
sloterpark = [{'lng': 4.811894, 'lat': 52.366219}]
wh_vliegenbos = [{'lng': 4.931495, 'lat': 52.388802}]

# Create empty list that needs to be filled later
vondelpark_west_stations = []
vondelpark_oost_3_stations = []
vondelpark_oost_2_stations = []
vondelpark_oost_1_stations = []
oosterpark_stations = []
sarphatipark_stations = []
westerpark_west_stations = []
westerpark_centrum_stations = []
westerpark_oost_stations = []
westergasfabriek_stations = []
rembrandtpark_noord_stations = []
rembrandtpark_zuid_stations = []
erasmuspark_stations = []

amstelpark_stations = []
park_frankendael_stations= []
beatrixpark_stations = []
flevopark_stations = []
gaasperpark_stations = []
nelson_mandelapark_stations = []
noorderpark_stations = []
sloterpark_stations = []
wh_vliegenbos_stations = []

In [5]:
def preprocessGVB(path_url):
    files = glob.glob(path_url)
    gvb_data = pd.concat( (pd.read_csv(file, sep=";") for file in files), ignore_index = True)
    # Set dates to datetime
    # Only select data from 10-2020 till 12-2021
    # Drop if destination is unknown
    # Remove NaN and [[ Onbekend ]] values
    gvb_data['Datum'] = pd.to_datetime(gvb_data['Datum'])
    gvb_data = gvb_data.sort_values(by=['Datum', 'UurgroepOmschrijving (van aankomst)'])
    gvb_data_range = gvb_data[(gvb_data['Datum'] >= '2020-10-1') & (gvb_data['Datum'] <= '2021-12-31')]
    gvb_data_range_cleaned = gvb_data_range[gvb_data_range['AankomstHalteCode'].notnull()]
    gvb_data_range_cleaned = gvb_data_range_cleaned[gvb_data_range_cleaned['AankomstHalteNaam'] != "[[ Onbekend ]]"]

    # Replace missing data with one week before
    gvb_data_range_cleaned_without_9_november = gvb_data_range_cleaned[gvb_data_range_cleaned['Datum'] != "2020-11-09"]
    gvb_week46 = gvb_data_range_cleaned[(gvb_data_range_cleaned['Datum'] >= '2020-11-02') & (gvb_data_range_cleaned['Datum'] <= '2020-11-08')]
    gvb_week46['Datum'] = gvb_week46["Datum"] + dt.timedelta(days=7)
    frames = [gvb_data_range_cleaned_without_9_november, gvb_week46]
    gvb_data_range_very_cleaned = pd.concat(frames)
    gvb_data_range_very_cleaned.sort_values(by="Datum", inplace=True)
    
    # Still a lot of values are missing, make sure every data point gets added and interpolated
    # take last hour from column UurgroepOmschrijving and convert to datetime
    # add one minute to get hour, so 17:00 means 16:00 - 16:59
    # cmobine date and hour to make index unique
    gvb_data_range_travels = gvb_data_range_very_cleaned.copy()

    gvb_data_range_travels['hour'] = gvb_data_range_travels['UurgroepOmschrijving (van aankomst)'].str[:5]
    gvb_data_range_travels['hour'] = pd.to_datetime(gvb_data_range_travels['hour'], format='%H:%M').dt.time
    gvb_data_range_travels['date'] = gvb_data_range_travels.apply(lambda r : pd.datetime.combine(r['Datum'],r['hour']),1)
    gvb_data_range_travels = gvb_data_range_travels.drop(columns=['Datum', 'UurgroepOmschrijving (van aankomst)',
                                                                  'AankomstHalteCode','hour'])
    
    # Create DF with all stations with their lon and lat
    stations_lon_lat = gvb_data_range_travels.drop_duplicates(subset=['AankomstHalteNaam'])[['AankomstHalteNaam', 'AankomstLon', 'AankomstLat']]
    stations_lon_lat = stations_lon_lat.set_index('AankomstHalteNaam')
    stations_lon_lat.rename(columns={"AankomstLat": "lng", "AankomstLon": "lat"}, inplace=True)  
    
    gvb_data_range_travels = gvb_data_range_travels.drop(columns=['AankomstLat', 'AankomstLon'])    
    
    return gvb_data_range_travels, stations_lon_lat
def preprocessWeather(path_url):
    '''
    Reads in and preprocesses the weather data
    
    :path_url: The path_url to the weather data
    
    Returns a preprocessed Dataframe
    '''

    df = pd.read_csv(path_url)
    df.columns = df.columns.str.replace(' ', '')
    df[['FH', 'T', 'RH']] = df[['FH', 'T', 'RH']] / 10
    df['YYYYMMDD'] = pd.to_datetime(df['YYYYMMDD'], format='%Y%m%d')
    df['date'] = df['YYYYMMDD'] +  pd.to_timedelta(df['HH'], unit='h')
    df.drop(columns = ['#STN', 'DD', 'FF', 'FX', 'T10N', 'TD', 'Q', 
                       'P', 'VV', 'U', 'WW', 'IX', 'HH', 'YYYYMMDD'], inplace=True)
    df.set_index('date', inplace=True)
    return df

def fill_missing_values_dataframe(df, park):
    '''
    Add missing dates and give value from day before or if not available interpolate
    
    :df: the dataframe of the park
    
    :park: give as input about which park it is   
    '''     
    idx = pd.date_range(df['Datetime'].min(), df['Datetime'].max(), freq="15min")
    df['Datetime'] = df['Datetime']
    df_without_missing = df.set_index('Datetime').reindex(idx)
    df_without_missing[df_without_missing['Visits'] < 25] = np.NaN
    df_without_missing['Location'] = park
    df_without_missing['Date'] = df_without_missing.index.date
    df_without_missing['Time'] = df_without_missing.index.time
    df_without_missing = df_without_missing.groupby(df_without_missing.index.hour).ffill()
    df_without_missing = df_without_missing.interpolate()
    df_without_missing = df_without_missing.reset_index().rename(columns={"index": "Datetime"})
    return df_without_missing.dropna()

def fill_missing_values_dataframe(df, park):
    '''
    Add missing dates and give value from day before or if not available interpolate
    
    :df: the dataframe of the park
    
    :park: give as input about which park it is   
    '''     
    idx = pd.date_range(df.index.min(), df.index.max(), freq="15min")
    df_without_missing = df.reindex(idx)
    df_without_missing[df_without_missing['Visits'] < 25] = np.NaN
    df_without_missing['Location'] = park
    df_without_missing['Date'] = df_without_missing.index.date
    df_without_missing['Time'] = df_without_missing.index.time
    df_without_missing = df_without_missing.groupby(df_without_missing.index.hour).ffill()
    df_without_missing = df_without_missing.interpolate()
    df_without_missing = df_without_missing.backfill()
    df_without_missing = df_without_missing.reset_index().rename(columns={"index": "Datetime"})
    return df_without_missing

def preprocessResono(path_url):
    '''
    Reads in and preprocesses the resono data
    
    :path_url: The path_url to the resono data
    
    Returns a preprocessed Dataframe
    ''' 
    df = pd.read_csv(path_url)
    df = df.drop(columns = ["Unnamed: 0"])
    
    df['End'] = pd.to_datetime(df['End'])
    df['End'] = pd.to_datetime(df['End'].dt.strftime("%Y-%m-%d %H:%M:%S"))
    
    df = df.rename(columns = {'End' : 'Datetime',
                              'End_Dates' : 'Date',
                              'End_Time' : 'Time'})
    df = df.set_index('Datetime')
    df = df.loc['2020-10':]
    
    df = df[df.Location != 'Vondelpark Oost']
    df = df[df.Location != 'Westerpark']
    df = df[df.Location != 'Rembrandtpark Noord']
    df = df[df.Location != 'Rembrandtpark Zuid']

    df_no_missing = pd.DataFrame(columns=['Datetime','Location','Visits','Date','Time'])

    for park in df["Location"].unique():
        result = fill_missing_values_dataframe(df[df['Location'] == park], park)
        df_no_missing = df_no_missing.append(result)    
    
    return df_no_missing.set_index("Datetime")


def preprocessHoliday(path_url):
    holiday = pd.read_csv(path_url)
    holiday = holiday.drop(['Unnamed: 0'], axis = 1)
    holiday = holiday.drop([0,28,120,122, 128, 150,219,221,227],axis=0)
    holiday['Holiday_Name'] = holiday['Holiday_Name'].str.replace('Boxing Day', 'Christmas Day')
    return holiday

def mergeGVBdata(gvb, resono):
    return gvb

def mergeWeatherFiles(df_Weather2020, df_Weather2021):
    '''
    Merges the weather data
    
    :df_Weather2020: Weather data from 2020
    :df_Weather2021: Weather data from 2021
    
    Returns a merged weather Dataframe
    '''
    
    df_weather = pd.concat([df_Weather2020, df_Weather2021], axis=0)
    df_weather = df_weather.loc['2020-10':]

    cols_int = ['SQ', 'DR', 'N', 'M', 'R', 'S', 'O', 'Y']
    cols_float = ['FH', 'T']

    df_weather[cols_float] = df_weather[cols_float].apply(pd.to_numeric, errors='coerce', axis=1)
    df_weather[cols_int] = df_weather[cols_int].apply(pd.to_numeric, errors='coerce', axis=1)
    df_weather['RH'] = df_weather['RH'].apply(lambda x: 0.05 if x==-0.1 else x)
    
    df_weather_resample = pd.concat([df_weather[['FH', 'T', 'N']].resample('15T').interpolate(method='linear'),
                    df_weather[['RH', 'DR', 'SQ', 'M', 'R', 'S', 'O', 'Y']].resample('15T').bfill()],
                   axis=1)
    
    df_weather_resample[['DR', 'SQ']] = df_weather_resample[['DR', 'SQ']] * 1.5
    df_weather_resample['RH'] = df_weather_resample['RH'] / 4
    
    return df_weather_resample 

def mergeWeatherResonoHoliday(df_resono, df_weather, df_holiday):
    '''
    Merges the resono and weather data
    
    :df_resono: All resono data
    :df_weather: All weather data
    :df_holiday: All holiday data
    
    Returns a merged weather Dataframe
    '''
    
    
    merge_resono_weather = pd.merge(df_resono, df_weather, left_index=True, right_index=True, how='left')
    merge_resono_weather = merge_resono_weather.rename({'T': 'Temperature', 'N': 'Clouds', 'FH': 'Windspeed',
                                                    'RH': 'Rain amount', 'DR': 'Rain duration' , 'SQ': 'Sun duration',
                                                    'M': 'Fog', 'R': 'Rain', 'S': 'Snow', 'O': 'Thunder', 'Y': 'Ice'},
                                                   axis=1) 
    
    all_merged = pd.merge(merge_resono_weather, df_holiday, how='left', right_on = 'End_Dates', left_on='Date')
    all_merged = all_merged.drop(['End_Dates'], axis=1)
    return all_merged

def Target_OneHotEncoding(Resono_Holi):
    #fill the blank of Holiday count, year, month, day
    Resono_Holi['Holiday_Count'] = Resono_Holi['Holiday_Count'].replace(np.nan, 0)
    Resono_Holi['Year'] = pd.to_datetime(Resono_Holi['Date']).dt.year
    Resono_Holi['Month'] = pd.to_datetime(Resono_Holi['Date']).dt.month
    Resono_Holi['Day'] = pd.to_datetime(Resono_Holi['Date']).dt.day
    
    Resono_Holi['Holiday_Name'] = Resono_Holi['Holiday_Name'].replace(
                             ['Christmas Day', 'New year', 'Boxing Day', 'Holiday_Name_New year', 'Christmas holiday', 'Holiday_Name_Boxing Day'] ,'Winter holiday')

    Resono_Holi['Holiday_Name'] = Resono_Holi['Holiday_Name'].replace(
                                 ["King's day"] ,'Kings day')

    Resono_Holi['Holiday_Name'] = Resono_Holi['Holiday_Name'].replace(
                                 ['Easter Monday', 'Easter Sunday'] ,'Easter')

    Resono_Holi['Holiday_Name'] = Resono_Holi['Holiday_Name'].replace(
                                 ['Whit Monday', 'Whit Sunday'] ,'Whit')

    '''
    Monday =0, Tuesday=1, Wednesday=2,Thursday =3,  Friday=4 ,  Saturday =5, Sunday =6
    '''

    Resono_Holi['Date'] = Resono_Holi['Date'].astype('datetime64[ns]')

    encoder = ce.TargetEncoder(cols='Holiday_Name')
    Resono_Holi['Holiday_name'] = encoder.fit_transform(Resono_Holi['Holiday_Name'], Resono_Holi['Visits'])
    
    # Holidays
    Resono_Holi_Dummies = pd.get_dummies(Resono_Holi, columns=["Holiday_Name"])

    return Resono_Holi_Dummies

def remove_outliers(df, gamma=0.01, nu=0.03):
    '''
    Remove outliers for each location with a One-Class SVM.
    
    :df: Dataframe to perform outlier detection on
    :gamma: Value of the kernel coefficient for ‘rbf’ (default = 0.01)
    :nu: Percentage of the data to be classified as outliers (default = 0.03)
    
    Returns
    :df_detected: Dataframe with the outliers replaced by NaN
    :outlier_index: List of the indexes of the outliers (used for plotting the outliers, probably 
                                                         not necessary for final product)
    '''
    model = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu)
    df = df.reset_index()
    
    for loc in list(set(df.Location)):
        dt = df[(df.Location == loc)]
        dt_detected = dt.copy()

        scaler = preprocessing.StandardScaler()
        dt_scaled = scaler.fit_transform(dt['Visits'].values.reshape(-1,1))

        fit = model.fit(dt_scaled)
        pred = fit.predict(dt_scaled)
        outlier_index = np.where(pred == -1)
        idx = dt.iloc[outlier_index].index
        df.loc[idx, 'Visits'] = np.nan
        
    df = df.set_index('Datetime')
    return df

def interpolate_df(df, backfill=False):
    '''
    Interpolate the NaN values in the dataframe with either backfilling or linear interpolation.
    
    :df: Dataframe to be interpolated
    :backfill: Bool, if true, interpolate with backfilling, otherwise use linear interpolation (default = False)
    
    Returns a Dataframe with interpolated values
    '''
    df_int = df.copy()
    
    if backfill == True:
        df_int = df_int.backfill()
        
    else:
        for idx, loc in enumerate(df.columns):
            dt = df[loc]
            dt_int = dt.copy()
            dt_int = dt_int.interpolate()
            df_int[loc] = dt_int
        
    return df_int

def smooth_df(df, N=3):
    '''
    Smooth the data with a rolling average to remove false peaks in the data
    
    :df: Dataframe to be smoothed
    :N: Size of the moving window (default = 3)
    
    Returns a smoothed Dataframe
    '''
    df_smooth = df.copy()
    df_smooth = df_smooth.rolling(N).mean()
    
    begin_vals = df.iloc[:N-1]
    df_smooth.update(begin_vals)
        
    return df_smooth

def add_time_vars(data, onehot=True):
    '''
    Adds columns for the month and weekday, and also the one-hot encoding or the cyclical versions of those features.

    :data: Dataframe that contains the a column with the datetime
    :onehot: Use onehot encoding if true and cyclical features if false (default = True)
    
    Returns a Dataframe with either the one-hot encoding or the sine and cosine of the month, weekday and time added
    '''
    data = data.reset_index()
    if onehot == True:
        data['Year'] = pd.Categorical(data['Datetime'].dt.year)
        data['Month'] = pd.Categorical(data['Datetime'].dt.month)
        data['Weekday'] = pd.Categorical(data['Datetime'].dt.weekday)
        data['Hour'] =  pd.Categorical(data['Datetime'].dt.hour)
        data['Minute'] =  pd.Categorical(data['Datetime'].dt.minute)

        year_dummies = pd.get_dummies(data[['Month']], prefix='Year_')
        month_dummies = pd.get_dummies(data[['Month']], prefix='Month_')
        weekday_dummies = pd.get_dummies(data[['Weekday']], prefix='Weekday_')
        hour_dummies = pd.get_dummies(data[['Hour']], prefix='Hour_')
        minute_dummies = pd.get_dummies(data[['Minute']], prefix='Minute_')
        
        data = data.merge(year_dummies, left_index = True, right_index = True)
        data = data.merge(month_dummies, left_index = True, right_index = True)
        data = data.merge(weekday_dummies, left_index = True, right_index = True)
        data = data.merge(hour_dummies, left_index = True, right_index = True)
        data = data.merge(minute_dummies, left_index = True, right_index = True)
        
    else: 
        dates = data['Date'].values
        weekdays = []
        months = []
        hours = []
        minutes = []

        for d in dates:
            year, month, day = (int(x) for x in d.split('-'))
            ans = dt.date(year, month, day)
            weekdays.append(ans.isocalendar()[2])
            months.append(month)

        for t in data['Time']:
            hour, minute, second = (int(x) for x in t.split(':'))
            hours.append(hour)
            minutes.append(minute)
        
        data['Weekday'] = weekdays
        data['Month'] = months
        data['Hour'] = hours
        data['Minute'] = minutes
        data['Weekday_sin'] = np.sin(data['Weekday'] * (2 * np.pi / 7))
        data['Weekday_cos'] = np.cos(data['Weekday'] * (2 * np.pi / 7))
        data['Month_sin'] = np.sin(data['Month'] * (2 * np.pi / 12))
        data['Month_cos'] = np.cos(data['Month'] * (2 * np.pi / 12))
        data['Hour_sin'] = np.sin(data['Hour'] * (2 * np.pi / 24))
        data['Hour_cos'] = np.cos(data['Hour'] * (2 * np.pi / 24))
        data['Minute_sin'] = np.sin(data['Minute'] * (2 * np.pi / 60))
        data['Minute_cos'] = np.cos(data['Minute'] * (2 * np.pi / 60))
        
    data = data.set_index('Datetime')
    return data


def predict(data, location, pred_params, N_boost=100):
    '''
    Predict the amount of visits using XGBoost
    
    :data: Dataframe with all the data
    :location: The location of the park to make predictions for
    :pred_params: A list of the names of the predictor variables
    :N_boost: Number of boost rounds during training (default = 100)
    
    Returns nothing (yet)
    '''
    # Select data for a specific park
    data = data[data['Location'] == location]
    
    # Split the data into input and output variables
    X = data[pred_params]
    y = data['Visits']

    # Split the data into test and train sets
    train_X, test_X, train_y, test_y = train_test_split(X, y,
                          test_size = 0.3, random_state = 123)

    # Convert test and train set to DMatrix objects
    train_dmatrix = xgb.DMatrix(data = train_X, label = train_y)
    test_dmatrix = xgb.DMatrix(data = test_X, label = test_y)
    
    # Set parameters for base learner
    params = {
        'booster': 'gblinear',
#         'colsample_bynode': 0.8,
        'learning_rate': 1,
#         'max_depth': 15,
#         'num_parallel_tree': 100,
        'objective': 'reg:squarederror',
#         'subsample': 0.8,
#         'tree_method': 'gpu_hist'
    }

    # Fit the data and make predictions
    model = xgb.train(params = params, dtrain = train_dmatrix, num_boost_round = N_boost)
    pred = model.predict(test_dmatrix)
    predictions = pd.DataFrame({'Predicted visitors': pred,
                                'Actual visitors': test_y})
    predictions = predictions.clip(lower=0)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(test_y, predictions['Predicted visitors']))
    mae = mean_absolute_error(test_y, predictions['Predicted visitors'])
    print("RMSE : % f" %(rmse))
    print("MAE : % f" %(mae))
    return predictions
    
def getDataframe():
    df_Weather2020 = preprocessWeather("KNMI (Weather) 2020-2021/uurgeg_240_2011-2020.txt")
    df_Weather2021 = preprocessWeather("KNMI (Weather) 2020-2021/uurgeg_240_2021-2030.txt")
    df_resono = preprocessResono("resono_2020_2022.csv")
    df_holiday = preprocessHoliday('holidays.csv')

    df_weather = mergeWeatherFiles(df_Weather2020, df_Weather2021)
    df_resono_weather = mergeWeatherResonoHoliday(df_resono, df_weather, df_holiday)
    dataframe = Target_OneHotEncoding(df_resono_weather)
    return dataframe

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

def add_nearby_stations(radius, center_point_dict, add_to_list):
    """
    radius in km
    center_point_dict needs to be a dictionary with the lon and lat from a location
    add_to_list specify to which list this needs to be added (for example vondelpark)
    """
    latpark = center_point_dict[0]['lat']
    lonpark = center_point_dict[0]['lng']

    # check for every station if it is within 1 km distance of the park
    for station in range(len(long_lat)):
        name_station = long_lat.iloc[station].name
        latstation = long_lat.iloc[station].lat
        lonstation = long_lat.iloc[station].lng

        a = haversine(lonpark, latpark, lonstation, latstation)
        
        if a <= radius:
            add_to_list.append(name_station)
    

### Reading in filepaths

In [6]:
cleaned_gvb, long_lat = preprocessGVB(r'GVB/*.csv')



In [8]:
df_Weather2020 = preprocessWeather("KNMI (Weather) 2020-2021/uurgeg_240_2011-2020.txt")
df_Weather2021 = preprocessWeather("KNMI (Weather) 2020-2021/uurgeg_240_2021-2030.txt")
resono = preprocessResono("resono_2020_2022.csv")



In [77]:
df_holiday = preprocessHoliday('holidays.csv')
df_weather = mergeWeatherFiles(df_Weather2020, df_Weather2021)
df_resono_weather = mergeWeatherResonoHoliday(min15_all_resono_park, df_weather, df_holiday)
dataframe = Target_OneHotEncoding(df_resono_weather)

In [18]:
# calculate all stations within 1 km from the park
add_nearby_stations(1,vondelpark_west, vondelpark_west_stations)
add_nearby_stations(1,vondelpark_oost_3, vondelpark_oost_3_stations)
add_nearby_stations(1,vondelpark_oost_2, vondelpark_oost_2_stations)
add_nearby_stations(1,vondelpark_oost_1, vondelpark_oost_1_stations)
add_nearby_stations(1,oosterpark,oosterpark_stations)
add_nearby_stations(1,sarphatipark,sarphatipark_stations)
add_nearby_stations(1,westergasfabriek, westergasfabriek_stations)
add_nearby_stations(1,westerpark_west, westerpark_west_stations)
add_nearby_stations(1,westerpark_centrum, westerpark_centrum_stations)
add_nearby_stations(1,westerpark_oost, westerpark_oost_stations)
add_nearby_stations(1,rembrandtpark_noord,rembrandtpark_noord_stations)
add_nearby_stations(1,rembrandtpark_zuid,rembrandtpark_zuid_stations)
add_nearby_stations(1,erasmuspark,erasmuspark_stations)

add_nearby_stations(1,amstelpark,amstelpark_stations)
add_nearby_stations(1,park_frankendael,park_frankendael_stations)
add_nearby_stations(1,beatrixpark,beatrixpark_stations)
add_nearby_stations(1,flevopark,flevopark_stations)
add_nearby_stations(1,gaasperpark,gaasperpark_stations)
add_nearby_stations(1,nelson_mandelapark,nelson_mandelapark_stations)
add_nearby_stations(1,noorderpark,noorderpark_stations)
add_nearby_stations(1,sloterpark,sloterpark_stations)
add_nearby_stations(1,wh_vliegenbos,wh_vliegenbos_stations)

In [20]:
cleaned_gvb = df_data.copy()

In [31]:
vondelpark_west_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(vondelpark_west_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
vondelpark_oost1_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(vondelpark_oost_1_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
vondelpark_oost2_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(vondelpark_oost_2_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
vondelpark_oost3_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(vondelpark_oost_3_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
oosterpark_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(oosterpark_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
sarphatipark_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(sarphatipark_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
westerpark_west_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(westerpark_west_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
westerpark_centrum_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(westerpark_centrum_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
westerpark_oost_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(westerpark_oost_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
westergasfabriek_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(westergasfabriek_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
rembrandtpark_noord_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(rembrandtpark_noord_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
rembrandtpark_zuid_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(rembrandtpark_zuid_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
erasmuspark_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(erasmuspark_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()

amstelpark_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(amstelpark_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
park_frankendael_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(park_frankendael_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
beatrixpark_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(beatrixpark_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
flevopark_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(flevopark_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
gaasperpark_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(gaasperpark_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
nelson_mandelapark_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(nelson_mandelapark_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
noorderpark_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(noorderpark_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
sloterpark_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(sloterpark_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()
wh_vliegenbos_journeys = cleaned_gvb[cleaned_gvb['AankomstHalteNaam'].isin(wh_vliegenbos_stations)].drop(columns=['AankomstHalteNaam']).groupby('date').sum()

# concatenate all dataframes into one, for later usage
vondelpark_west_journeys["park"] = 'vondelpark_west'
vondelpark_oost1_journeys["park"] = 'vondelpark_oost_1'
vondelpark_oost2_journeys["park"] = 'vondelpark_oost_2'
vondelpark_oost3_journeys["park"] = 'vondelpark_oost_3'
oosterpark_journeys["park"] = "oosterpark"
sarphatipark_journeys["park"] = "sarphatipark"
westerpark_west_journeys["park"] = "westerpark_west"
westerpark_oost_journeys["park"] = "westerpark_oost"
westerpark_centrum_journeys["park"] = "westerpark_centrum"
westergasfabriek_journeys["park"] = "westergasfabriek"
rembrandtpark_noord_journeys["park"] = "rembrandtpark_noord"
rembrandtpark_zuid_journeys["park"] = "rembrandtpark_zuid"
erasmuspark_journeys["park"] = "erasmuspark"

amstelpark_journeys["park"] = "amstelpark"
park_frankendael_journeys["park"] = "park_frankendael"
beatrixpark_journeys["park"] = "beatrixpark"
flevopark_journeys["park"] = "flevopark"
gaasperpark_journeys["park"] = "gaasperpark"
nelson_mandelapark_journeys["park"] = "nelson_mandelapark"
noorderpark_journeys["park"] = "noorderpark"
sloterpark_journeys["park"] = "sloterpark"
wh_vliegenbos_journeys["park"] = "wh_vliegenbos"

frames = [vondelpark_west_journeys, vondelpark_oost1_journeys, vondelpark_oost2_journeys, 
          vondelpark_oost3_journeys,oosterpark_journeys, sarphatipark_journeys, westerpark_west_journeys,
         westerpark_centrum_journeys, westerpark_oost_journeys, westergasfabriek_journeys,
         rembrandtpark_noord_journeys, rembrandtpark_zuid_journeys, erasmuspark_journeys,
         amstelpark_journeys, park_frankendael_journeys, beatrixpark_journeys, flevopark_journeys,
         gaasperpark_journeys, nelson_mandelapark_journeys, noorderpark_journeys, sloterpark_journeys, wh_vliegenbos_journeys]

all_parks_journeys = pd.concat(frames)


In [22]:
#interpolation

# Make all GVB data 15 min
vondelpark_west_journeys_15min = vondelpark_west_journeys.resample('15T').pad()
vondelpark_west_journeys_15min['AantalReizen'] = vondelpark_west_journeys_15min['AantalReizen'] / 4

vondelpark_oost1_journeys_15min = vondelpark_oost1_journeys.resample('15T').pad()
vondelpark_oost1_journeys_15min['AantalReizen'] = vondelpark_oost1_journeys_15min['AantalReizen'] / 4

vondelpark_oost2_journeys_15min = vondelpark_oost2_journeys.resample('15T').pad()
vondelpark_oost2_journeys_15min['AantalReizen'] = vondelpark_oost2_journeys_15min['AantalReizen'] / 4

vondelpark_oost3_journeys_15min = vondelpark_oost3_journeys.resample('15T').pad()
vondelpark_oost3_journeys_15min['AantalReizen'] = vondelpark_oost3_journeys_15min['AantalReizen'] / 4

oosterpark_journeys_15min = oosterpark_journeys.resample('15T').pad()
oosterpark_journeys_15min['AantalReizen'] = oosterpark_journeys_15min['AantalReizen'] / 4

sarphatipark_journeys_15min = sarphatipark_journeys.resample('15T').pad()
sarphatipark_journeys_15min['AantalReizen'] = sarphatipark_journeys_15min['AantalReizen'] / 4

rembrandtpark_noord_journeys_15min = rembrandtpark_noord_journeys.resample('15T').pad()
rembrandtpark_noord_journeys_15min['AantalReizen'] = rembrandtpark_noord_journeys_15min['AantalReizen'] / 4

rembrandtpark_zuid_journeys_15min = rembrandtpark_zuid_journeys.resample('15T').pad()
rembrandtpark_zuid_journeys_15min['AantalReizen'] = rembrandtpark_zuid_journeys_15min['AantalReizen'] / 4

westerpark_centrum_journeys_15min = westerpark_centrum_journeys.resample('15T').pad()
westerpark_centrum_journeys_15min['AantalReizen'] = westerpark_centrum_journeys_15min['AantalReizen'] / 4

westerpark_oost_journeys_15min = westerpark_oost_journeys.resample('15T').pad()
westerpark_oost_journeys_15min['AantalReizen'] = westerpark_oost_journeys_15min['AantalReizen'] / 4

westerpark_west_journeys_15min = westerpark_west_journeys.resample('15T').pad()
westerpark_west_journeys_15min['AantalReizen'] = westerpark_west_journeys_15min['AantalReizen'] / 4

westergasfabriek_journeys_15min = westergasfabriek_journeys.resample('15T').pad()
westergasfabriek_journeys_15min['AantalReizen'] = westergasfabriek_journeys_15min['AantalReizen'] / 4

erasmuspark_journeys_15min = erasmuspark_journeys.resample('15T').pad()
erasmuspark_journeys_15min['AantalReizen'] = erasmuspark_journeys_15min['AantalReizen'] / 4

amstelpark_journeys_15min= amstelpark_journeys.resample('15T').pad()
amstelpark_journeys_15min['AantalReizen'] = amstelpark_journeys_15min['AantalReizen'] / 4

park_frankendael_journeys_15min = park_frankendael_journeys.resample('15T').pad()
park_frankendael_journeys_15min['AantalReizen'] = park_frankendael_journeys_15min['AantalReizen'] / 4

beatrixpark_journeys_15min = beatrixpark_journeys.resample('15T').pad()
beatrixpark_journeys_15min['AantalReizen'] = beatrixpark_journeys_15min['AantalReizen'] / 4

flevopark_journeys_15min = flevopark_journeys.resample('15T').pad()
flevopark_journeys_15min['AantalReizen'] = flevopark_journeys_15min['AantalReizen'] / 4

gaasperpark_journeys_15min = gaasperpark_journeys.resample('15T').pad()
gaasperpark_journeys_15min['AantalReizen'] = gaasperpark_journeys_15min['AantalReizen'] / 4

nelson_mandelapark_journeys_15min = nelson_mandelapark_journeys.resample('15T').pad()
nelson_mandelapark_journeys_15min['AantalReizen'] = nelson_mandelapark_journeys_15min['AantalReizen'] / 4

noorderpark_journeys_15min = noorderpark_journeys.resample('15T').pad()
noorderpark_journeys_15min['AantalReizen'] = noorderpark_journeys_15min['AantalReizen'] / 4

sloterpark_journeys_15min = sloterpark_journeys.resample('15T').pad()
sloterpark_journeys_15min['AantalReizen'] = sloterpark_journeys_15min['AantalReizen'] / 4

In [81]:
# create list of unique locations and empty list with potential resono x park lists
locations = resono["Location"].unique().tolist()
resono_park_list_15min = []

for location in locations:   
    # create dynamic name (https://www.delftstack.com/howto/python/python-dynamic-variable-name/)
    name = f"min15_resono_{location.lower()}"
    name = "_".join(name.split())
    resono_park_list_15min.append(name)
    
    # prepare resono data
    resono_park = resono[resono['Location'] == location]
    #resono_park = resono_park.set_index('End')
    resono_park.index = pd.to_datetime(resono_park.index, utc=True)
    resono_park.index = resono_park.index.tz_convert(None)
    resono_park.index = resono_park.index.tz_localize('utc') 
    
    parknaam = "_".join(location.lower().split())
    
    # prepare gvb data of certain park 
    gvb_park = all_parks_journeys[all_parks_journeys["park"] == parknaam]
    gvb_park.index = gvb_park.index.tz_localize('utc')

    # merge gvb and resono to new dynamic df
    globals()[name] = resono_park.loc["2020-10":].join(gvb_park) 
    globals()[name].index = globals()[name].index.tz_convert(None)

frames = [min15_resono_amstelpark, min15_resono_beatrixpark, min15_resono_erasmuspark,
         min15_resono_flevopark, min15_resono_gaasperpark, min15_resono_nelson_mandelapark,
         min15_resono_noorderpark, min15_resono_oosterpark, min15_resono_park_frankendael,
         min15_resono_sarphatipark, min15_resono_vondelpark_oost_1,
         min15_resono_vondelpark_oost_2, min15_resono_vondelpark_oost_3, min15_resono_vondelpark_west,
         min15_resono_westergasfabriek, min15_resono_westerpark_centrum,
         min15_resono_westerpark_oost, min15_resono_westerpark_west]
min15_all_resono_park = pd.concat(frames)

In [82]:
min15_all_resono_park['Journeys'] = min15_all_resono_park['AantalReizen'].interpolate(method='linear')
min15_all_resono_park = min15_all_resono_park.drop(columns=['AantalReizen', 'park']).dropna()
min15_all_resono_park

Unnamed: 0_level_0,Location,Visits,Date,Time,Journeys
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-12 06:00:00,Amstelpark,71,2020-11-12,06:00:00,59.0
2020-11-12 06:15:00,Amstelpark,93,2020-11-12,06:15:00,82.0
2020-11-12 06:30:00,Amstelpark,92,2020-11-12,06:30:00,105.0
2020-11-12 06:45:00,Amstelpark,386,2020-11-12,06:45:00,128.0
2020-11-12 07:00:00,Amstelpark,463,2020-11-12,07:00:00,151.0
...,...,...,...,...,...
2022-01-06 13:45:00,Westerpark West,116,2022-01-06,13:45:00,10.0
2022-01-06 14:00:00,Westerpark West,107,2022-01-06,14:00:00,10.0
2022-01-06 14:15:00,Westerpark West,112,2022-01-06,14:15:00,10.0
2022-01-06 14:30:00,Westerpark West,114,2022-01-06,14:30:00,10.0


In [83]:
min15_all_resono_park['Journeys'].loc[min15_all_resono_park.between_time('01:01:00', '06:30:00')['Journeys'].index] = 0 
min15_all_resono_park

Unnamed: 0_level_0,Location,Visits,Date,Time,Journeys
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-12 06:00:00,Amstelpark,71,2020-11-12,06:00:00,0.0
2020-11-12 06:15:00,Amstelpark,93,2020-11-12,06:15:00,0.0
2020-11-12 06:30:00,Amstelpark,92,2020-11-12,06:30:00,0.0
2020-11-12 06:45:00,Amstelpark,386,2020-11-12,06:45:00,128.0
2020-11-12 07:00:00,Amstelpark,463,2020-11-12,07:00:00,151.0
...,...,...,...,...,...
2022-01-06 13:45:00,Westerpark West,116,2022-01-06,13:45:00,10.0
2022-01-06 14:00:00,Westerpark West,107,2022-01-06,14:00:00,10.0
2022-01-06 14:15:00,Westerpark West,112,2022-01-06,14:15:00,10.0
2022-01-06 14:30:00,Westerpark West,114,2022-01-06,14:30:00,10.0


In [29]:
frames = [min15_resono_amstelpark, min15_resono_beatrixpark, min15_resono_erasmuspark,
         min15_resono_flevopark, min15_resono_gaasperpark, min15_resono_nelson_mandelapark,
         min15_resono_noorderpark, min15_resono_oosterpark, min15_resono_park_frankendael,
         min15_resono_rembrandtpark_noord, min15_resono_rembrandtpark_zuid, 
         min15_resono_sarphatipark, min15_resono_vondelpark_oost, min15_resono_vondelpark_oost_1,
         min15_resono_vondelpark_oost_2, min15_resono_vondelpark_oost_3, min15_resono_vondelpark_west,
         min15_resono_westergasfabriek, min15_resono_westerpark, min15_resono_westerpark_centrum,
         min15_resono_westerpark_oost, min15_resono_westerpark_west]

Unnamed: 0_level_0,Location,Visits,Date,Time
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-11-12 00:15:00,Amstelpark,15,2020-11-12,00:15:00
2020-11-12 00:30:00,Amstelpark,15,2020-11-12,00:30:00
2020-11-12 00:45:00,Amstelpark,11,2020-11-12,00:45:00
2020-11-12 01:00:00,Amstelpark,8,2020-11-12,01:00:00
2020-11-12 01:15:00,Amstelpark,3,2020-11-12,01:15:00
...,...,...,...,...
2022-01-06 13:45:00,Westerpark West,116,2022-01-06,13:45:00
2022-01-06 14:00:00,Westerpark West,107,2022-01-06,14:00:00
2022-01-06 14:15:00,Westerpark West,112,2022-01-06,14:15:00
2022-01-06 14:30:00,Westerpark West,114,2022-01-06,14:30:00


In [86]:
dataframe.Location

0              Erasmuspark
1               Oosterpark
2             Sarphatipark
3          Vondelpark West
4         Westergasfabriek
                ...       
603667     Westerpark Oost
603668     Westerpark West
603669           Flevopark
603670        Sarphatipark
603671     Vondelpark West
Name: Location, Length: 603672, dtype: object

In [33]:
# These two lines add the complete date to a new column, used for merging with weather/holidays

# dataframe["Date"] = dataframe["Date"].astype('str')
# dataframe['Datetime']=pd.to_datetime(dataframe.Date + ' ' + dataframe.Time, format='%Y/%m/%d %H:%M:%S')

In [102]:
dataframe["Date"] = dataframe["Date"].astype('str')
dataframe['Datetime']=pd.to_datetime(dataframe.Date + ' ' + dataframe.Time, format='%Y/%m/%d %H:%M:%S')
dataframe = dataframe.set_index('Datetime')
dataframe.head()

Unnamed: 0_level_0,Location,Visits,Date,Time,Journeys_normal,Windspeed,Temperature,Clouds,Rain amount,Rain duration,...,Holiday_Name_Easter,Holiday_Name_Fall holiday,Holiday_Name_Good Friday,Holiday_Name_Kings day,Holiday_Name_Liberation Day,Holiday_Name_May holiday,Holiday_Name_Spring holiday,Holiday_Name_Summer holiday,Holiday_Name_Whit,Holiday_Name_Winter holiday
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-01 00:15:00,Erasmuspark,243,2020-10-01,00:15:00,85.861869,5.0,15.575,8.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Oosterpark,53,2020-10-01,00:15:00,185.207097,5.0,15.575,8.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Sarphatipark,96,2020-10-01,00:15:00,145.621873,5.0,15.575,8.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Vondelpark West,18,2020-10-01,00:15:00,11.334497,5.0,15.575,8.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Westergasfabriek,12,2020-10-01,00:15:00,11.040116,5.0,15.575,8.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [103]:
df_no_outliers = remove_outliers(dataframe)
df_no_outliers_int = interpolate_df(df_no_outliers, backfill=False)

df_resono_no_outliers = dataframe.copy()
df_resono_no_outliers['Visits'] = df_no_outliers_int['Visits']

In [110]:
df_resono_no_outliers

Unnamed: 0_level_0,Location,Visits,Date,Time,Journeys_normal,Windspeed,Temperature,Clouds,Rain amount,Rain duration,...,Holiday_Name_Easter,Holiday_Name_Fall holiday,Holiday_Name_Good Friday,Holiday_Name_Kings day,Holiday_Name_Liberation Day,Holiday_Name_May holiday,Holiday_Name_Spring holiday,Holiday_Name_Summer holiday,Holiday_Name_Whit,Holiday_Name_Winter holiday
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-01 00:15:00,Erasmuspark,243.0,2020-10-01,00:15:00,85.861869,5.0,15.575,8.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Oosterpark,53.0,2020-10-01,00:15:00,185.207097,5.0,15.575,8.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Sarphatipark,96.0,2020-10-01,00:15:00,145.621873,5.0,15.575,8.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Vondelpark West,54.0,2020-10-01,00:15:00,11.334497,5.0,15.575,8.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Westergasfabriek,12.0,2020-10-01,00:15:00,11.040116,5.0,15.575,8.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-06 14:45:00,Westerpark Oost,131.0,2022-01-06,14:45:00,74.106210,5.0,5.400,3.25,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2022-01-06 14:45:00,Westerpark West,91.0,2022-01-06,14:45:00,10.000000,5.0,5.400,3.25,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2022-01-06 15:00:00,Flevopark,165.0,2022-01-06,15:00:00,18.870370,5.0,5.200,4.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2022-01-06 15:00:00,Sarphatipark,776.0,2022-01-06,15:00:00,115.074419,5.0,5.200,4.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


### Make predictions

In [4]:
df = pd.read_csv('Merged_algo_tester_gvb.csv')
df.drop('Unnamed: 0', inplace=True, axis=1)

df["Date"] = df["Date"].astype('str')
df['Datetime']=pd.to_datetime(df.Date + ' ' + df.Time, format='%Y/%m/%d %H:%M:%S')

In [111]:
data_aug = df_resono_no_outliers.copy()

In [112]:
data_aug = add_time_vars(data_aug, onehot=True)
data_aug.drop(['Year', 'Month', 'Day', 'Weekday', "Holiday_name", 'Hour', 'Minute'], inplace=True, axis=1)

In [77]:
predictor_cols = data_aug.columns.to_list()[5:]
predictions = predict(data_aug, 'Westergasfabriek', predictor_cols, 1000)

RMSE :  135.620048
MAE :  73.029249


In [120]:
predictor_cols = data_aug.columns.to_list()[4:]
predictions = predict(data_aug, 'Westergasfabriek', predictor_cols, 1000)

RMSE :  140.506190
MAE :  69.065391


In [123]:
df_resono_no_outliers.to_csv('Merged_algo_tester_gvb.csv')

In [124]:
df_resono_no_outliers

Unnamed: 0_level_0,Location,Visits,Date,Time,Journeys_normal,Windspeed,Temperature,Clouds,Rain amount,Rain duration,...,Holiday_Name_Easter,Holiday_Name_Fall holiday,Holiday_Name_Good Friday,Holiday_Name_Kings day,Holiday_Name_Liberation Day,Holiday_Name_May holiday,Holiday_Name_Spring holiday,Holiday_Name_Summer holiday,Holiday_Name_Whit,Holiday_Name_Winter holiday
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-01 00:15:00,Erasmuspark,243.0,2020-10-01,00:15:00,85.861869,5.0,15.575,8.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Oosterpark,53.0,2020-10-01,00:15:00,185.207097,5.0,15.575,8.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Sarphatipark,96.0,2020-10-01,00:15:00,145.621873,5.0,15.575,8.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Vondelpark West,54.0,2020-10-01,00:15:00,11.334497,5.0,15.575,8.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2020-10-01 00:15:00,Westergasfabriek,12.0,2020-10-01,00:15:00,11.040116,5.0,15.575,8.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-06 14:45:00,Westerpark Oost,131.0,2022-01-06,14:45:00,74.106210,5.0,5.400,3.25,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2022-01-06 14:45:00,Westerpark West,91.0,2022-01-06,14:45:00,10.000000,5.0,5.400,3.25,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2022-01-06 15:00:00,Flevopark,165.0,2022-01-06,15:00:00,18.870370,5.0,5.200,4.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2022-01-06 15:00:00,Sarphatipark,776.0,2022-01-06,15:00:00,115.074419,5.0,5.200,4.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
