In [2]:
import os
import uuid
import pickle

import pandas as pd

import mlflow

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

In [13]:
# Scoring-run parameters: which month of which taxi dataset to process.
year = 2021
month = 2
taxi_type = "green"

# NOTE(review): input_file is an empty string (an f-string with no
# placeholders), so read_dataframe(input_file) has nothing to load.
# Presumably the source parquet path/URL built from taxi_type/year/month
# was meant to go here — TODO confirm and fill in.
input_file = f""
# NOTE(review): this is a path string, whereas apply_model() calls
# .predict() on its `model` argument — presumably the artifact has to be
# loaded into an object before being passed in; TODO confirm how.
model = "../../notebooks/models/lin_reg.bin"
# Destination parquet path; evaluates to "output/green/2021-02.parquet"
# for the values above.
output_file = f"output/{taxi_type}/{year:04d}-{month:02d}.parquet"

In [14]:
def generate_uuids(n):
    """Return a list of ``n`` random UUID4 values rendered as strings.

    Args:
        n: Number of identifiers to generate; ``0`` yields an empty list.

    Returns:
        list[str]: ``n`` strings, each the canonical text form of a
        ``uuid.uuid4()`` value.
    """
    fresh_ids = (uuid.uuid4() for _ in range(n))
    return list(map(str, fresh_ids))

def read_dataframe(filename: str):
    """Load a trip parquet file and keep only rides lasting 1 to 60 minutes.

    A ``duration`` column (float, minutes) is added, computed as
    ``lpep_dropoff_datetime - lpep_pickup_datetime``.

    Args:
        filename: Location handed to ``pandas.read_parquet``.

    Returns:
        pandas.DataFrame: An independent frame holding only the rows whose
        duration lies in the closed interval [1, 60] minutes. Rows with a
        missing duration are dropped because the comparison is False for
        them.
    """
    df = pd.read_parquet(filename)

    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    # Bracket assignment instead of attribute assignment (df.duration = ...),
    # which only works when the column already exists.
    df['duration'] = trip_time.dt.total_seconds() / 60

    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    # Return an explicit copy: callers add and overwrite columns on the
    # result, and doing that on a boolean-mask slice raises
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    return df[in_range].copy()

def prepare_dictionaries(df: pd.DataFrame):
    """Turn a trips frame into a list of per-ride feature dicts.

    Note that ``df`` is modified in place: ``PULocationID`` and
    ``DOLocationID`` are cast to ``str`` and a combined ``PU_DO`` column
    ("<pickup>_<dropoff>") is added.

    Args:
        df: Frame with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns.

    Returns:
        list[dict]: One record per row with the keys ``PU_DO`` and
        ``trip_distance``.
    """
    location_cols = ['PULocationID', 'DOLocationID']
    df[location_cols] = df[location_cols].astype(str)

    pickup = df['PULocationID']
    dropoff = df['DOLocationID']
    df['PU_DO'] = pickup + '_' + dropoff

    feature_cols = ['PU_DO', 'trip_distance']
    return df[feature_cols].to_dict(orient='records')

In [12]:
def apply_model(input_file, model, output_file):
    """Score one file of trips with ``model`` and write the results to parquet.

    Args:
        input_file: Location of the trips parquet file; passed to
            ``read_dataframe``.
        model: Object whose ``predict`` method accepts the list of feature
            dicts produced by ``prepare_dictionaries`` and returns one
            prediction per ride. It must be an already-loaded object, not a
            path string.
        output_file: Destination handed to ``DataFrame.to_parquet``. For a
            local path, missing parent directories are created.

    The written file has the columns ``ride_id``, ``lpep_pickup_datetime``,
    ``PULocationID``, ``DOLocationID``, ``actual_duration``,
    ``predicted_duration``, ``diff`` (actual minus predicted) and
    ``model_version``; the frame index is not written.
    """
    df = read_dataframe(input_file)

    # Nothing upstream creates a 'ride_id' column, so reading it
    # unconditionally raised KeyError unless the input already had one.
    # Generate ids only when they are absent so existing ids are kept.
    if 'ride_id' not in df.columns:
        df = df.assign(ride_id=generate_uuids(len(df)))

    # prepare_dictionaries casts the location id columns to str in place;
    # those string values are what ends up in the result below.
    dicts = prepare_dictionaries(df)
    y_pred = model.predict(dicts)

    df_result = df[
        ['ride_id', 'lpep_pickup_datetime', 'PULocationID', 'DOLocationID']
    ].copy()
    df_result['actual_duration'] = df['duration']
    df_result['predicted_duration'] = y_pred
    df_result['diff'] = (
        df_result['actual_duration'] - df_result['predicted_duration']
    )
    df_result['model_version'] = 'Version:1'

    # to_parquet does not create missing directories. Do it here for local
    # paths only; URLs such as "s3://..." and file-like objects are skipped.
    if isinstance(output_file, (str, os.PathLike)) and '://' not in str(output_file):
        parent_dir = os.path.dirname(str(output_file))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df_result.to_parquet(output_file, index=False)

In [9]:
# Smoke check of the id generator: build 1000 ids and print the first 100.
ride_ids = generate_uuids(n=1000)
print(ride_ids[:100])

['004e27de-a3fe-40eb-808a-279d7c2ac1f1', 'cbda0903-f12a-482a-b762-e3bfeac40ef5', 'c286ad94-d838-480a-8bdf-45e5929bd4cc', 'c627611d-f5d4-4fa0-bbfd-ab047102fc58', '8a1fe1ce-151b-4df6-a885-484350bd2bcc', 'c0aeb13f-1606-4bb1-8718-d8d02a56af77', 'dc693e13-6f25-4552-8264-d6f91fe9430b', '80d6a397-72ec-4c5a-a294-e8da5b55198e', '527a5f02-d7c2-4099-b260-94ff9e719159', '4148dd97-da9a-4c09-b556-fc4d726e2dd4', 'd0a3bb06-81e5-4db4-9e60-0e3151da8d0e', '00b777a4-988e-4c70-aef1-30240d6c29e3', 'd8c9473a-301a-4f72-9480-a1f77c821518', '2abc53df-46dd-4e3b-b696-bb6807d2468f', '14c811d5-c016-4d89-a8f4-7639e6ecb971', '0a38a1ce-fc62-477d-9d18-bf28a09d1e41', '60bc11d0-8887-423f-9e2e-c38fd0aefffa', '12f2c82a-4424-4e01-8729-1b6d5ae616ac', '3fda276e-9e5a-4aa4-be63-4cdf173006b0', 'c0e463da-6fee-4fd6-8969-ef7daddf9ac4', '5487dcb4-c8f6-4445-a494-eef5a7c82677', '31df5fcd-9231-4473-a045-c84a27913c7d', '257f853b-7d2b-4c6b-8b5e-9b408e36d8a9', '26a48dcb-1d80-42df-a6e0-297039c00026', '15c355b5-34f5-4271-9a20-fedfe68c0900',