In [195]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Geospatial libraries
from h3 import h3 
import geopandas as gp
from shapely.geometry.polygon import Polygon

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from joblib import dump, load
import itertools
import csv

In [196]:
df = pd.read_parquet('data/prepped/weather_taxi_merged_df.parquet')

In [197]:
# create necessary features
df['week'] = df['trip_start_timestamp'].dt.isocalendar().week
df['2_hour_window'] = df['trip_start_timestamp'].dt.floor('2h').dt.hour

In [208]:
def create_prediction_dataset(df, temporal_resolution, spatial_resolution):
    
    if temporal_resolution < 24: 
        
        features = [f'h3_res{spatial_resolution}_pickup', 'week', 'weekday', f'{temporal_resolution}_hour_window',
                    'Temperature','Humidity','Precip.','Wind Speed', 'trip_id']
        df_pred = df.groupby([f'h3_res{spatial_resolution}_pickup','week','weekday',f'{temporal_resolution}_hour_window']).agg({
            'Temperature':'mean',
            'Humidity':'mean',
            'Precip.':'mean',
            'Wind Speed':'mean',
            'trip_id':'count',
        }).reset_index()
    else:
        
        features = [f'h3_res{spatial_resolution}_pickup', 'week', 'weekday','Temperature','Humidity','Precip.',
                    'Wind Speed', 'trip_id']
        df_pred = df.groupby([f'h3_res{spatial_resolution}_pickup','week','weekday']).agg({
            'Temperature':'mean',
            'Humidity':'mean',
            'Precip.':'mean',
            'Wind Speed':'mean',
            'trip_id':'count',
        }).reset_index()
    
    df_pred.rename(columns={'trip_id':'demand'}, inplace=True)
    df_dummy = pd.get_dummies(df_pred[f'h3_res{spatial_resolution}_pickup'],prefix=f'h3_res{spatial_resolution}')
    df_encoded = pd.concat([df_pred,df_dummy], axis=1)
    df_encoded = df_encoded.drop(f'h3_res{spatial_resolution}_pickup', axis=1)
    
    print(f'dataset temporal: {temporal_resolution} spatial: {spatial_resolution}')
    
    return df_encoded

def split_scale_data(df):
    
    X = df.drop('demand', axis=1)
    y = df[['demand']]
    
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X)
    y_train_scaled = scaler.fit_transform(y).ravel()
    X_test_scaled = scaler.fit_transform(X_test)
    y_test_scaled = scaler.fit_transform(y_test).ravel()
    
    return X_train_scaled,y_train_scaled,X_test_scaled,y_test_scaled,scaler #evtl scaler für inverse scaling

def gridsearch(X_train_scaled,y_train_scaled, temporal_resolution, spatial_resolution):
    
#     hyperparameters = {
#         'kernel': ['linear', 'poly', 'rbf'],
#         'epsilon': [0.1, 0.01, 0.001],
#         'C': [1, 10],
#         'degree': [2, 3],  # Dies wird ignoriert, es sei denn, der Kernel ist 'poly'
#     }
    hyperparameters = [
    {
        'kernel': ['linear'],
        'epsilon': [0.1, 0.001],
        'C': [1,10]  # Fixed 'C' value for 'linear' kernel
    },
    {
        'kernel': ['rbf'],
        'epsilon': [0.1, 0.001],
        'C': [1,10]  # Fixed 'C' value for 'rbf' kernel
    },
    {
        'kernel': ['poly'],
        'epsilon': [0.1, 0.001],
        'C': [1, 10],  # Varying 'C' values for 'poly' kernel
        'degree': [2, 3]  # 'degree' only applies to 'poly' kernel
    }
]
    model = SVR()
    
    print('start gridsearch')
    grid_search = GridSearchCV(estimator=model, param_grid=hyperparameters, scoring='neg_root_mean_squared_error', verbose=3)
    grid_search.fit(X_train_scaled, y_train_scaled)
    
    best_parameters = grid_search.best_params_
    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_
    
    dump(best_model, f"data/models/model_spatial_{spatial_resolution}_temporal_{temporal_resolution}.joblib")
    
    with open(f"data/models/parameter_spatial_{spatial_resolution}_temporal_{temporal_resolution}.csv", "w", newline="") as f:
        w = csv.DictWriter(f, best_parameters.keys())
        w.writeheader()
        w.writerow(best_parameters)
    
    print(f'best parameters: {best_parameters}')
    
    return best_model
    
def model_evaluation(model, scaler, X_test_scaled, y_test_scaled, temporal_resolution, spatial_resolution):
    
    y_pred = model.predict(X_test_scaled)
    
    evaluation_dict = {
        'mse' : mean_squared_error(scaler.inverse_transform(y_test_scaled.reshape(-1, 1)),scaler.inverse_transform(y_pred.reshape(-1, 1))),
        'rmse' : np.sqrt(mse),
        'r2' : r2_score(scaler.inverse_transform(y_test_scaled.reshape(-1, 1)),scaler.inverse_transform(y_pred.reshape(-1, 1))),
    }
    with open(f"data/models/metrics_spatial_{spatial_resolution}_temporal_{temporal_resolution}.csv", "w", newline="") as f:
        w = csv.DictWriter(f, evaluation_dict.keys())
        w.writeheader()
        w.writerow(evaluation_dict)
    print(f'metrics: {evaluation_dict}')
    

In [None]:
spatial_resolutions = [7,8]
temporal_resolutions = [1,2,6,24]

for spatial_resolution, temporal_resolution in itertools.product(spatial_resolutions, temporal_resolutions):
    
    df_pred = create_prediction_dataset(df, temporal_resolution, spatial_resolution)
    
    df_pred = df_pred.sample(frac=0.01)
    
    X_train_scaled,y_train_scaled,X_test_scaled,y_test_scaled,scaler = split_scale_data(df_pred)
    
    model = gridsearch(X_train_scaled,y_train_scaled, temporal_resolution, spatial_resolution)
    
    model_evaluation(model, scaler, X_test_scaled, y_test_scaled, temporal_resolution, spatial_resolution)

dataset temporal: 1 spatial: 7
start gridsearch
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END ..C=1, epsilon=0.1, kernel=linear;, score=-0.577 total time=   2.1s
[CV 2/5] END ..C=1, epsilon=0.1, kernel=linear;, score=-0.726 total time=   1.9s
[CV 3/5] END ..C=1, epsilon=0.1, kernel=linear;, score=-0.823 total time=   1.8s
[CV 4/5] END ..C=1, epsilon=0.1, kernel=linear;, score=-0.577 total time=   1.8s
[CV 5/5] END ..C=1, epsilon=0.1, kernel=linear;, score=-0.726 total time=   2.0s
[CV 1/5] END .C=1, epsilon=0.01, kernel=linear;, score=-0.586 total time=   8.9s
[CV 2/5] END .C=1, epsilon=0.01, kernel=linear;, score=-0.729 total time=   8.8s
[CV 3/5] END .C=1, epsilon=0.01, kernel=linear;, score=-0.830 total time=   8.4s
[CV 4/5] END .C=1, epsilon=0.01, kernel=linear;, score=-0.582 total time=   8.9s
[CV 5/5] END .C=1, epsilon=0.01, kernel=linear;, score=-0.735 total time=   9.1s
[CV 1/5] END C=1, epsilon=0.001, kernel=linear;, score=-0.586 total time=  11.1s

[CV 3/5] END C=10, degree=2, epsilon=0.01, kernel=poly;, score=-0.750 total time=   3.9s
[CV 4/5] END C=10, degree=2, epsilon=0.01, kernel=poly;, score=-0.559 total time=   3.7s
[CV 5/5] END C=10, degree=2, epsilon=0.01, kernel=poly;, score=-0.583 total time=   4.3s
[CV 1/5] END C=10, degree=2, epsilon=0.001, kernel=poly;, score=-0.510 total time=   3.7s
[CV 2/5] END C=10, degree=2, epsilon=0.001, kernel=poly;, score=-0.638 total time=   3.8s
[CV 3/5] END C=10, degree=2, epsilon=0.001, kernel=poly;, score=-0.753 total time=   5.3s
[CV 4/5] END C=10, degree=2, epsilon=0.001, kernel=poly;, score=-0.561 total time=   4.4s
[CV 5/5] END C=10, degree=2, epsilon=0.001, kernel=poly;, score=-0.585 total time=   4.5s
[CV 1/5] END C=10, degree=3, epsilon=0.1, kernel=poly;, score=-0.415 total time=   0.5s
[CV 2/5] END C=10, degree=3, epsilon=0.1, kernel=poly;, score=-0.538 total time=   0.3s
[CV 3/5] END C=10, degree=3, epsilon=0.1, kernel=poly;, score=-0.658 total time=   0.4s
[CV 4/5] END C=10, 

In [184]:
X = df_sample.drop('demand', axis=1)
y = df_sample[['demand']]

In [185]:
y.shape

(9560, 1)

In [186]:
X = df_sample.drop('demand', axis=1)
y = df_sample[['demand']]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [187]:
X = df_sample.drop('demand', axis=1)
y = df_sample[['demand']]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
y_train_scaled = scaler.fit_transform(y_train).ravel()
X_test_scaled = scaler.fit_transform(X_test)
y_test_scaled = scaler.fit_transform(y_test).ravel()

In [188]:
y_train_scaled.shape

(7648,)

In [189]:
model = SVR(kernel='linear')
model.fit(X_train_scaled,y_train_scaled)
y_pred = model.predict(X_test_scaled)

In [190]:
y_pred = model.predict(X_test_scaled)

In [191]:
mse = mean_squared_error(scaler.inverse_transform(y_test_scaled.reshape(-1, 1)),scaler.inverse_transform(y_pred.reshape(-1, 1)))
display(mse)
display(np.sqrt(mse))
display(r2_score(scaler.inverse_transform(y_test_scaled.reshape(-1, 1)),scaler.inverse_transform(y_pred.reshape(-1, 1))))

9665.631222347347

98.31394215647823

0.7368093225613284