In [1]:
import os 
from typing import List, Dict

import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.preprocessing import MinMaxScaler

In [2]:
base_path = '../dataset_nyc_taxi_samples/parquet/'

In [3]:
def get_csv_dict(names: List[str], csv_root: str) -> Dict[str, pd.DataFrame]:
    csv_dict = {}

    for name in names:
        csv_dict[name] = {}
        csv_dict[name]["train"] = pd.read_csv(os.path.join(csv_root, f"{name}_train_1M.csv"))

        csv_dict[name]["valid"] = pd.read_csv(os.path.join(csv_root, f"{name}_valid_500k.csv"))
    
    return csv_dict

In [4]:
names = ["fhv", "green", "yellow"]

csv_dict = get_csv_dict(names, "../dataset_nyc_taxi_samples/csv")

  csv_dict[name]["train"] = pd.read_csv(os.path.join(csv_root, f"{name}_train_1M.csv"))
  csv_dict[name]["valid"] = pd.read_csv(os.path.join(csv_root, f"{name}_valid_500k.csv"))


In [5]:
def update_columns(df: pd.DataFrame, taxi_type: str) -> pd.DataFrame:

    if taxi_type == "green":
        df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
        df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

        df['store_and_fwd_flag'] = df['store_and_fwd_flag'] == 'Y'
        df['store_and_fwd_flag'] = df['store_and_fwd_flag'].astype(int)

        df['trip_time'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
        df['trip_time'] = df['trip_time'].values.astype(float)//10**9

        df['month'] = df['lpep_pickup_datetime'].dt.month
        df['weekday'] = df['lpep_pickup_datetime'].dt.weekday
        df['hour'] = df['lpep_pickup_datetime'].dt.hour

        df = df.drop(columns=['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'ehail_fee', 'total_amount'])


    if taxi_type == "yellow":
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

        df['store_and_fwd_flag'] = df['store_and_fwd_flag'] == 'Y'
        df['store_and_fwd_flag'] = df['store_and_fwd_flag'].astype(int)

        df['trip_time'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
        df['trip_time'] = df['trip_time'].values.astype(float)//10**9

        df['month'] = df['tpep_pickup_datetime'].dt.month
        df['weekday'] = df['tpep_pickup_datetime'].dt.weekday
        df['hour'] = df['tpep_pickup_datetime'].dt.hour

        df = df.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'airport_fee', 'total_amount'])

    if taxi_type == "fhv":
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
        df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
        df['request_datetime'] = pd.to_datetime(df['request_datetime'])

        for flag in ['shared_request_flag', 'shared_match_flag', 'access_a_ride_flag', 'wav_request_flag', 'wav_match_flag']:
            df[flag] = df[flag] == 'Y'
            df[flag] = df[flag].astype(int)

        df['trip_time'] = df['dropoff_datetime'] - df['pickup_datetime']
        df['trip_time'] = df['trip_time'].values.astype(float)//10**9

        df['wait_time'] = df['pickup_datetime'] - df['request_datetime']
        df['wait_time'] = df['wait_time'].values.astype(float)//10**9

        df['month'] = df['pickup_datetime'].dt.month
        df['weekday'] = df['pickup_datetime'].dt.weekday
        df['hour'] = df['pickup_datetime'].dt.hour

        df = df.drop(columns=['hvfhs_license_num', 'dispatching_base_num', 'originating_base_num', 'airport_fee',
                                        'request_datetime', 'on_scene_datetime', 'pickup_datetime', 'dropoff_datetime',])
    
    df = df.dropna()

    return df

In [6]:
shapefile = gpd.read_file("../taxi_zones")
location_to_borough_mapping = dict(zip(shapefile['LocationID'], shapefile['borough']))
borough_to_borough_idx_mappng = {'Manhattan': 0, 'Queens': 1, 'Brooklyn': 2, 'Bronx': 3, 'EWR': 4, 'Staten Island': 5}

location_to_borough_idx_mapping = dict(zip(shapefile['LocationID'], shapefile['borough'].map(borough_to_borough_idx_mappng)))
shapefile['borough_id'] = shapefile['borough'].map(borough_to_borough_idx_mappng)


In [7]:
for name in names:
    # change id, map borough to id
    csv_dict[name]["train"]["PUBoroughID"] = (csv_dict[name]["train"]["PULocationID"]-1).map(location_to_borough_idx_mapping)
    csv_dict[name]["train"]["DOBoroughID"] = (csv_dict[name]["train"]["DOLocationID"]-1).map(location_to_borough_idx_mapping)
    csv_dict[name]["valid"]["PUBoroughID"] = (csv_dict[name]["valid"]["PULocationID"]-1).map(location_to_borough_idx_mapping)
    csv_dict[name]["valid"]["DOBoroughID"] = (csv_dict[name]["valid"]["DOLocationID"]-1).map(location_to_borough_idx_mapping)

    # drop nans on PUBoroughID, DOBoroughID
    len_train_before = len(csv_dict[name]["train"])
    len_valid_before = len(csv_dict[name]["valid"])

    csv_dict[name]["train"].dropna(subset=["PUBoroughID", "DOBoroughID"], inplace=True)
    csv_dict[name]["valid"].dropna(subset=["PUBoroughID", "DOBoroughID"], inplace=True)

    csv_dict[name]["train"] = update_columns(csv_dict[name]["train"], name)
    csv_dict[name]["valid"] = update_columns(csv_dict[name]["valid"], name)

    len_train_after = len(csv_dict[name]["train"])
    len_valid_after = len(csv_dict[name]["valid"])

    print(f"{name} - train reduced: {(len_train_before-len_train_after)/len_train_before}%, num samples: {len_train_before-len_train_after}")
    print(f"{name} - valid reduced: {(len_valid_before-len_valid_after)/len_valid_before}%, num samples: {len_valid_before-len_valid_after}")



fhv - train reduced: 0.04675354286691461%, num samples: 46752
fhv - valid reduced: 0.046842594722158334%, num samples: 23422
green - train reduced: 0.009437283118493555%, num samples: 9437
green - valid reduced: 0.00951161953521859%, num samples: 4756
yellow - train reduced: 0.005849181324621063%, num samples: 5849
yellow - valid reduced: 0.0057577927194620996%, num samples: 2879


# All Data

In [36]:
from typing import Callable, List
from sklearn.metrics import mean_squared_error, mean_absolute_error


def train_per_dataset(
    experiment_name: str,
    csv_dict: Dict, 
    names: List[str], 
    transform_data: Callable, 
    fit_predict: Callable
    ) -> pd.DataFrame:
    
    df = pd.DataFrame([])
    for name in names:
        scaler = MinMaxScaler()
        tip_label = [label for label in csv_dict[name]["train"].columns if "tip" in label][0]

        csv_dict_transformed = {}
        csv_dict_transformed[name] = transform_data(csv_dict[name].copy(), name)
        
        y_train = csv_dict_transformed[name]["train"].copy()[tip_label].to_numpy()
        y_val = csv_dict_transformed[name]["valid"].copy()[tip_label].to_numpy()

        x_train = csv_dict_transformed[name]["train"].copy().drop(columns=[tip_label]).to_numpy()
        x_val = csv_dict_transformed[name]["valid"].copy().drop(columns=[tip_label]).to_numpy()

        x_train = scaler.fit_transform(x_train)
        x_val = scaler.transform(x_val)

        y_pred = fit_predict(x_train, y_train, x_val)
        
        mse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        print(f"""
        ============================================================
        Dataset: {name}

        Train num: {len(y_train)}
        Val num: {len(y_val)}
        Train + Val num: {len(np.concatenate([y_train, y_val]))}

        Train Mean tip: {np.mean(y_train)}
        Val Mean tip: {np.mean(y_val)}
        Train + Val Mean tip: {np.mean(np.concatenate([y_train, y_val]))}
        
        Mean Squared Error: {mse}
        Mean Absolute Error: {mae}
        ============================================================
        """
        )
        
        row = pd.DataFrame.from_dict(
            {
                "experiment_name": [experiment_name],
                "name": [name],
                "mse": [mse],
                "mae": [mae],
            }
            )
        df = pd.concat([df, row])
        
    return df

In [9]:
csv_dict[name]["train"].columns

Index(['VendorID', 'passenger_count', 'trip_distance', 'RatecodeID',
       'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type',
       'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'congestion_surcharge', 'retail_price',
       'temperature_2m (°C)', 'relative_humidity_2m (%)', 'dew_point_2m (°C)',
       'apparent_temperature (°C)', 'precipitation (mm)', 'rain (mm)',
       'snowfall (cm)', 'snow_depth (m)', 'surface_pressure (hPa)',
       'cloud_cover (%)', 'wind_speed_10m (km/h)', 'is_day ()', 'PUBoroughID',
       'DOBoroughID', 'trip_time', 'month', 'weekday', 'hour'],
      dtype='object')

In [42]:
results = pd.DataFrame([])

## All Data - Regression model

In [43]:
from sklearn.linear_model import LinearRegression

def fit_predict(x_train, y_train, x_val):
    model = LinearRegression()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    return y_pred

def transform_data(csv_dict, name):
    return csv_dict

df = train_per_dataset(
    experiment_name="all data - regression model",
    csv_dict=csv_dict, 
    names=names, 
    transform_data=transform_data, 
    fit_predict=fit_predict
    )

results = pd.concat([results, df])


        Dataset: fhv

        Train num: 953215
        Val num: 476593
        Train + Val num: 1429808

        Train Mean tip: 0.762667058323673
        Val Mean tip: 0.7605058823776264
        Train + Val Mean tip: 0.7619466809529671
        
        Mean Squared Error: 4.634459390855498
        Mean Absolute Error: 1.2076065665555733
        

        Dataset: green

        Train num: 990533
        Val num: 495264
        Train + Val num: 1485797

        Train Mean tip: 2.2083399139655118
        Val Mean tip: 2.2135968089746076
        Train + Val Mean tip: 2.210092206405047
        
        Mean Squared Error: 6.683244293988039
        Mean Absolute Error: 1.3983482675412024
        

        Dataset: yellow

        Train num: 994120
        Val num: 497139
        Train + Val num: 1491259

        Train Mean tip: 3.308909900213255
        Val Mean tip: 3.313971867023106
        Train + Val Mean tip: 3.310597401256256
        
        Mean Squared Error: 3.9543684977364024


## All Data - Experiment models

In [None]:
# from sklearn.svm import SVR
# from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

def fit_predict(x_train, y_train, x_val):
    # model = SVR(verbose=True)
    # model = MLPRegressor(hidden_layer_sizes=(512, 1024), verbose=True)
    model = RandomForestRegressor(n_estimators=100, n_jobs=-1, verbose=1)
    
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    return y_pred

def transform_data(csv_dict, name):
    return csv_dict

df = train_per_dataset(
    experiment_name="all data - x model",
    csv_dict=csv_dict, 
    names=names, 
    transform_data=transform_data, 
    fit_predict=fit_predict
    )

results = pd.concat([results, df])


## Apriori Data - Regression model

In [21]:
[(name, csv_dict[name]["train"].columns) for name in names]

[('fhv',
  Index(['PULocationID', 'DOLocationID', 'trip_miles', 'trip_time',
         'base_passenger_fare', 'tolls', 'bcf', 'sales_tax',
         'congestion_surcharge', 'tips', 'driver_pay', 'shared_request_flag',
         'shared_match_flag', 'access_a_ride_flag', 'wav_request_flag',
         'wav_match_flag', 'retail_price', 'temperature_2m (°C)',
         'relative_humidity_2m (%)', 'dew_point_2m (°C)',
         'apparent_temperature (°C)', 'precipitation (mm)', 'rain (mm)',
         'snowfall (cm)', 'snow_depth (m)', 'surface_pressure (hPa)',
         'cloud_cover (%)', 'wind_speed_10m (km/h)', 'is_day ()', 'PUBoroughID',
         'DOBoroughID', 'wait_time', 'month', 'weekday', 'hour'],
        dtype='object')),
 ('green',
  Index(['VendorID', 'store_and_fwd_flag', 'RatecodeID', 'PULocationID',
         'DOLocationID', 'passenger_count', 'trip_distance', 'fare_amount',
         'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
         'improvement_surcharge', 'payment_type', 'tr

In [44]:
from sklearn.linear_model import LinearRegression

def fit_predict(x_train, y_train, x_val):
    model = LinearRegression()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    return y_pred

def transform_data(csv_dict, name):
    
    
    if name == 'fhv':
        labels_drop = [
            # 'PULocationID', 
            # 'DOLocationID', 
            # 'trip_miles', 
            'trip_time',
            'base_passenger_fare', 
            'tolls', 
            'bcf', 
            'sales_tax',
            'congestion_surcharge', 
            # 'tips', 
            'driver_pay', 
            # 'shared_request_flag',
            # 'shared_match_flag', 
            # 'access_a_ride_flag', 
            'wav_request_flag',
            'wav_match_flag', 
            # 'retail_price', 
            # 'temperature_2m (°C)',
            # 'relative_humidity_2m (%)', 
            # 'dew_point_2m (°C)',
            # 'apparent_temperature (°C)', 
            # 'precipitation (mm)', 
            # 'rain (mm)',
            # 'snowfall (cm)', 
            # 'snow_depth (m)', 
            # 'surface_pressure (hPa)',
            # 'cloud_cover (%)', 
            # 'wind_speed_10m (km/h)', 
            # 'is_day ()', 
            # 'PUBoroughID',
            # 'DOBoroughID', 
            # 'wait_time', 
            # 'month', 
            # 'weekday', 
            # 'hour'
            ]
    
    if name == 'green':
        labels_drop = [
            # 'VendorID', 
            # 'store_and_fwd_flag', 
            'RatecodeID', 
            # 'PULocationID',
            # 'DOLocationID', 
            # 'passenger_count', 
            # 'trip_distance', 
            'fare_amount',
            'extra', 
            'mta_tax', 
            # 'tip_amount', 
            'tolls_amount',
            'improvement_surcharge', 
            'payment_type', 
            # 'trip_type',
            'congestion_surcharge', 
            # 'retail_price', 
            # 'temperature_2m (°C)',
            # 'relative_humidity_2m (%)', 
            # 'dew_point_2m (°C)',
            # 'apparent_temperature (°C)', 
            # 'precipitation (mm)', 
            # 'rain (mm)',
            # 'snowfall (cm)', 
            # 'snow_depth (m)', 
            # 'surface_pressure (hPa)',
            # 'cloud_cover (%)', 
            # 'wind_speed_10m (km/h)', 
            # 'is_day ()', 
            # 'PUBoroughID',
            # 'DOBoroughID', 
            # 'trip_time', 
            # 'month', 
            # 'weekday', 
            # 'hour'
            ]
           
    if name == 'yellow':
        labels_drop = [
            # 'VendorID', 
            # 'passenger_count', 
            # 'trip_distance', 
            'RatecodeID',
            # 'store_and_fwd_flag', 
            # 'PULocationID', 
            # 'DOLocationID', 
            # 'payment_type',
            'fare_amount', 
            'extra', 
            'mta_tax', 
            # 'tip_amount', 
            'tolls_amount',
            'improvement_surcharge', 
            'congestion_surcharge', 
            # 'retail_price',
            # 'temperature_2m (°C)', 
            # 'relative_humidity_2m (%)', 
            # 'dew_point_2m (°C)',
            # 'apparent_temperature (°C)', 
            # 'precipitation (mm)', 
            # 'rain (mm)',
            # 'snowfall (cm)', 
            # 'snow_depth (m)', 
            # 'surface_pressure (hPa)',
            # 'cloud_cover (%)', 
            # 'wind_speed_10m (km/h)', 
            # 'is_day ()', 
            # 'PUBoroughID',
            # 'DOBoroughID', 
            # 'trip_time', 
            # 'month',
            # 'weekday', 
            # 'hour'
            ]

    
    csv_dict['train'] = csv_dict["train"].drop(columns=labels_drop)
    csv_dict['valid'] = csv_dict["valid"].drop(columns=labels_drop)
     
    return csv_dict

df = train_per_dataset(
    experiment_name="apriori data - regression model",
    csv_dict=csv_dict, 
    names=names, 
    transform_data=transform_data, 
    fit_predict=fit_predict
    )

results = pd.concat([results, df])


        Dataset: fhv

        Train num: 953215
        Val num: 476593
        Train + Val num: 1429808

        Train Mean tip: 0.762667058323673
        Val Mean tip: 0.7605058823776264
        Train + Val Mean tip: 0.7619466809529671
        
        Mean Squared Error: 4.8963749112554815
        Mean Absolute Error: 1.244448960581793
        

        Dataset: green

        Train num: 990533
        Val num: 495264
        Train + Val num: 1485797

        Train Mean tip: 2.2083399139655118
        Val Mean tip: 2.2135968089746076
        Train + Val Mean tip: 2.210092206405047
        
        Mean Squared Error: 7.627464099730798
        Mean Absolute Error: 1.5543121338027404
        

        Dataset: yellow

        Train num: 994120
        Val num: 497139
        Train + Val num: 1491259

        Train Mean tip: 3.308909900213255
        Val Mean tip: 3.313971867023106
        Train + Val Mean tip: 3.310597401256256
        
        Mean Squared Error: 7.705462275243002
 

## No Additional Data - Regression model

In [45]:
from sklearn.linear_model import LinearRegression

def fit_predict(x_train, y_train, x_val):
    model = LinearRegression()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    return y_pred

def transform_data(csv_dict, name):
    
    
    if name == 'fhv':
        labels_drop = [
            # 'PULocationID', 
            # 'DOLocationID', 
            # 'trip_miles', 
            # 'trip_time',
            # 'base_passenger_fare', 
            # 'tolls', 
            # 'bcf', 
            # 'sales_tax',
            # 'congestion_surcharge', 
            # 'tips', 
            # 'driver_pay', 
            # 'shared_request_flag',
            # 'shared_match_flag', 
            # 'access_a_ride_flag', 
            # 'wav_request_flag',
            # 'wav_match_flag', 
            'retail_price', 
            'temperature_2m (°C)',
            'relative_humidity_2m (%)', 
            'dew_point_2m (°C)',
            'apparent_temperature (°C)', 
            'precipitation (mm)', 
            'rain (mm)',
            'snowfall (cm)', 
            'snow_depth (m)', 
            'surface_pressure (hPa)',
            'cloud_cover (%)', 
            'wind_speed_10m (km/h)', 
            # 'is_day ()', 
            # 'PUBoroughID',
            # 'DOBoroughID', 
            # 'wait_time', 
            # 'month', 
            # 'weekday', 
            # 'hour'
            ]
    
    if name == 'green':
        labels_drop = [
            # 'VendorID', 
            # 'store_and_fwd_flag', 
            # 'RatecodeID', 
            # 'PULocationID',
            # 'DOLocationID', 
            # 'passenger_count', 
            # 'trip_distance', 
            # 'fare_amount',
            # 'extra', 
            # 'mta_tax', 
            # 'tip_amount', 
            # 'tolls_amount',
            # 'improvement_surcharge', 
            # 'payment_type', 
            # 'trip_type',
            # 'congestion_surcharge', 
            'retail_price', 
            'temperature_2m (°C)',
            'relative_humidity_2m (%)', 
            'dew_point_2m (°C)',
            'apparent_temperature (°C)', 
            'precipitation (mm)', 
            'rain (mm)',
            'snowfall (cm)', 
            'snow_depth (m)', 
            'surface_pressure (hPa)',
            'cloud_cover (%)', 
            'wind_speed_10m (km/h)', 
            # 'is_day ()', 
            # 'PUBoroughID',
            # 'DOBoroughID', 
            # 'trip_time', 
            # 'month', 
            # 'weekday', 
            # 'hour'
            ]
           
    if name == 'yellow':
        labels_drop = [
            # 'VendorID', 
            # 'passenger_count', 
            # 'trip_distance', 
            # 'RatecodeID',
            # 'store_and_fwd_flag', 
            # 'PULocationID', 
            # 'DOLocationID', 
            # 'payment_type',
            # 'fare_amount', 
            # 'extra', 
            # 'mta_tax', 
            # 'tip_amount', 
            # 'tolls_amount',
            # 'improvement_surcharge', 
            # 'congestion_surcharge', 
            'retail_price',
            'temperature_2m (°C)', 
            'relative_humidity_2m (%)', 
            'dew_point_2m (°C)',
            'apparent_temperature (°C)', 
            'precipitation (mm)', 
            'rain (mm)',
            'snowfall (cm)', 
            'snow_depth (m)', 
            'surface_pressure (hPa)',
            'cloud_cover (%)', 
            'wind_speed_10m (km/h)', 
            # 'is_day ()', 
            # 'PUBoroughID',
            # 'DOBoroughID', 
            # 'trip_time', 
            # 'month',
            # 'weekday', 
            # 'hour'
            ]

    
    csv_dict['train'] = csv_dict["train"].drop(columns=labels_drop)
    csv_dict['valid'] = csv_dict["valid"].drop(columns=labels_drop)
     
    return csv_dict

df = train_per_dataset(
    experiment_name="no additional data - regression model",
    csv_dict=csv_dict, 
    names=names, 
    transform_data=transform_data, 
    fit_predict=fit_predict
    )

results = pd.concat([results, df])


        Dataset: fhv

        Train num: 953215
        Val num: 476593
        Train + Val num: 1429808

        Train Mean tip: 0.762667058323673
        Val Mean tip: 0.7605058823776264
        Train + Val Mean tip: 0.7619466809529671
        
        Mean Squared Error: 4.637112855270725
        Mean Absolute Error: 1.2075998834158606
        

        Dataset: green

        Train num: 990533
        Val num: 495264
        Train + Val num: 1485797

        Train Mean tip: 2.2083399139655118
        Val Mean tip: 2.2135968089746076
        Train + Val Mean tip: 2.210092206405047
        
        Mean Squared Error: 6.70013156118884
        Mean Absolute Error: 1.4006858458696765
        

        Dataset: yellow

        Train num: 994120
        Val num: 497139
        Train + Val num: 1491259

        Train Mean tip: 3.308909900213255
        Val Mean tip: 3.313971867023106
        Train + Val Mean tip: 3.310597401256256
        
        Mean Squared Error: 3.9560576062100137
 

In [46]:
results

Unnamed: 0,experiment_name,name,mse,mae
0,all data - regression model,fhv,4.634459,1.207607
0,all data - regression model,green,6.683244,1.398348
0,all data - regression model,yellow,3.954368,0.959758
0,apriori data - regression model,fhv,4.896375,1.244449
0,apriori data - regression model,green,7.627464,1.554312
0,apriori data - regression model,yellow,7.705462,1.658316
0,no additional data - regression model,fhv,4.637113,1.2076
0,no additional data - regression model,green,6.700132,1.400686
0,no additional data - regression model,yellow,3.956058,0.959167


In [47]:
results.sort_values(by="mae")

Unnamed: 0,experiment_name,name,mse,mae
0,no additional data - regression model,yellow,3.956058,0.959167
0,all data - regression model,yellow,3.954368,0.959758
0,no additional data - regression model,fhv,4.637113,1.2076
0,all data - regression model,fhv,4.634459,1.207607
0,apriori data - regression model,fhv,4.896375,1.244449
0,all data - regression model,green,6.683244,1.398348
0,no additional data - regression model,green,6.700132,1.400686
0,apriori data - regression model,green,7.627464,1.554312
0,apriori data - regression model,yellow,7.705462,1.658316


- wszystki dane
- dane które są znane przed wyjazdem taksówkarza
- bez danych paliwowych i pogodowych, żeby zobaczyć czy coś daje