# Mileage prediction
Predict the increase in mileage that a vehicle will show at its next inspection. Because the prediction is produced by a regression model, it can come out negative; such predictions are discarded.

In [2]:
import datetime
import os
import random

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 6)
font = {'family' : 'sans-serif',
        'weight' : 'normal',
        'size'   : 14}

import matplotlib
matplotlib.rc('font', **font)

from sqlalchemy import create_engine, text
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100
import statsmodels.api as sm
import tsplot as ts

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid

from catboost import (CatBoostRegressor, Pool, sum_models,)

## Data preparation

In [3]:
def connect():
    """Open a SQLAlchemy connection to the PostgreSQL database.

    The user, password, host and database name are read from the
    ``POSTGRES_USER``, ``POSTGRES_PASSWORD``, ``POSTGRES_HOST`` and
    ``POSTGRES_DB`` environment variables; a missing variable raises
    ``KeyError``.

    Returns:
        The connection object returned by ``Engine.connect()``.
    """
    env = os.environ
    user = env["POSTGRES_USER"]
    secret = env["POSTGRES_PASSWORD"]
    host = env["POSTGRES_HOST"]
    database = env["POSTGRES_DB"]
    url = f"postgresql+psycopg2://{user}:{secret}@{host}/{database}"
    return create_engine(url).connect()

In [4]:
def load_data(conn, offset, limit):
    """Load one page of rows from ``vehicle_inspections_at_least_two``.

    Rows are ordered by ``vin`` and ``date`` so that consecutive pages walk
    through the table in a stable order.

    Args:
        conn: Open SQLAlchemy connection.
        offset: Number of rows to skip.
        limit: Maximum number of rows to return.

    Returns:
        A DataFrame with one column per key of ``column_dtypes`` below, each
        cast to the listed dtype.
    """
    # Column name -> dtype, in the order the view returns its columns.
    # Keeping both in one mapping prevents the names and the dtypes from
    # drifting apart.
    column_dtypes = {
        "vin": "object",
        "make": "category",
        "model_primary": "category",
        "motor_power": np.float32,
        "motor_volume": np.float32,
        "drive_type": "object",
        "station_id": "category",
        "date": "datetime64[ns]",
        "result": "category",
        "mileage": np.float32,
        "vehicle_age": np.float32,
        "defects_a": np.float32,
        "defects_b": np.float32,
        "defects_c": np.float32,
        "defects_0": np.float32,
        "defects_1": np.float32,
        "defects_2": np.float32,
        "defects_3": np.float32,
        "defects_4": np.float32,
        "defects_5": np.float32,
        "defects_6": np.float32,
        "defects_7": np.float32,
        "defects_8": np.float32,
        "defects_9": np.float32,
    }

    # LIMIT/OFFSET travel as bound parameters instead of being formatted into
    # the SQL text. int() also turns NumPy integers into plain Python ints.
    query = text(
        """SELECT * FROM vehicle_inspections_at_least_two
             ORDER BY vin, date ASC
             LIMIT :limit OFFSET :offset"""
    )
    records = conn.execute(
        query, {"limit": int(limit), "offset": int(offset)}
    ).all()

    # The reshape pins the column count so the names below always line up.
    data = pd.DataFrame(np.array(records).reshape((-1, len(column_dtypes))))
    data.columns = list(column_dtypes)

    for col, dt in column_dtypes.items():
        data[col] = data[col].astype(dt)

    return data

### Add drive type features
Add a flag for each drive type group (True when the vehicle has a gas/diesel/electric/... engine).

In [5]:
# Create grouped drive type flags
def add_drive_type_features(df: pd.DataFrame):
    """Add boolean fuel-group columns derived from ``drive_type``.

    The input frame is modified in place and also returned. Four columns are
    added, each True when the textual ``drive_type`` contains one of the
    group's labels:

    * ``benzin`` - "Benzin"
    * ``nafta`` - "Nafta"
    * ``elektropohon`` - "Elektropohon"
    * ``plyn`` - any of "LPG", "CNG", "LNG", "BIO Metan", "Vodík"

    The lookup is vectorized (no per-row Python callback), which also makes
    the function work on an empty frame.

    Args:
        df: Frame with a ``drive_type`` column.

    Returns:
        The same frame with the four flag columns added.
    """
    # Stringify first so missing values take part in the substring search as
    # ordinary text that matches none of the labels.
    drive_type = df["drive_type"].astype(str)

    def mentions(*labels):
        """Return a boolean Series: True where any label is a substring."""
        hit = pd.Series(False, index=drive_type.index)
        for label in labels:
            # regex=False: labels are literal text, not patterns.
            hit = hit | drive_type.str.contains(label, regex=False, na=False)
        return hit.astype(bool)

    df["benzin"] = mentions("Benzin")
    df["nafta"] = mentions("Nafta")
    df["elektropohon"] = mentions("Elektropohon")
    df["plyn"] = mentions("LPG", "CNG", "LNG", "BIO Metan", "Vodík")

    return df

### Impute missing values
Impute motor power and volume with the mean of the same make and model.
The mean is taken from a big chunk of the overall data, so it should be trustworthy.
If no non-null value is found in the given make+model group, the row will be dropped and no prediction will be available for such vehicles.

In [6]:
def impute_missing_values(df):
    """Fill gaps in motor power and volume with the make+model group mean.

    For each of ``motor_power`` and ``motor_volume`` the rows are grouped by
    ``make`` and ``model_primary`` and missing values inside a group are
    replaced by that group's mean. A group without any value stays missing.
    The frame is modified in place and returned.
    """
    group_keys = ["make", "model_primary"]

    def fill_with_group_mean(values):
        return values.fillna(np.mean(values))

    for column in ("motor_power", "motor_volume"):
        per_group = df.groupby(group_keys, observed=False)[column]
        df[column] = per_group.transform(fill_with_group_mean)

    return df

## Create training dataset

### Add mileage increase (explained variable)

In [7]:
def add_mileage_increase(df):
    """Add ``mileage_increase``: mileage gained until the next inspection.

    The frame is expected to be ordered so that inspections of one vehicle
    are adjacent and chronological. For every row whose following row has the
    same ``vin`` the value is ``next mileage - current mileage``; for the
    last inspection of each vehicle (and the last row of the frame) it is
    NaN.

    Columns are addressed by name (``vin``, ``mileage``) and the next row is
    obtained with a vectorized ``shift(-1)``, so the result does not depend
    on column positions and no per-row Python loop is needed.

    Args:
        df: Frame with ``vin`` and ``mileage`` columns. Modified in place.

    Returns:
        The same frame with a float64 ``mileage_increase`` column.
    """
    vin = df["vin"]
    mileage = df["mileage"]

    # shift(-1) lines every row up with its successor; the last row is paired
    # with a missing value, which never compares equal.
    same_vehicle = (vin == vin.shift(-1)).to_numpy()
    increase = (mileage.shift(-1) - mileage).to_numpy(dtype="float64")

    # Plain arrays are assigned positionally, independent of the frame index.
    df["mileage_increase"] = np.where(same_vehicle, increase, np.nan)

    return df

### Remove columns and rows, create split arrays

In [8]:
def clean_df(df: pd.DataFrame):
    """Drop non-feature columns and rows without a target value.

    Removes ``vin``, ``drive_type``, ``station_id`` and ``date``, discards
    rows whose ``mileage_increase`` is missing, and renumbers the remaining
    rows from zero.

    Args:
        df: Frame containing the four columns above and ``mileage_increase``.

    Returns:
        A new frame; the input is not modified.
    """
    df = df.drop(columns=["vin", "drive_type", "station_id", "date"])
    df = df.dropna(how="any", subset=["mileage_increase"])
    # drop=True discards the old index directly, so this also works when the
    # index carries a name (no reliance on a temporary "index" column).
    return df.reset_index(drop=True)

## Train the model in batches, find optimal parameters

In [39]:
def train(X, y, prev_model):
    """Fit a CatBoost regressor on one batch of data.

    Args:
        X: Feature frame; must contain the categorical columns listed below.
        y: Target values for ``X``.
        prev_model: Model fitted on the previous batch, or a falsy value for
            the first batch. When given, its predictions on this batch are
            set as the baseline of the training pool before fitting.

    Returns:
        The newly fitted ``CatBoostRegressor``.
    """
    categorical = [
        "make",
        "model_primary",
        "result",
        "benzin",
        "nafta",
        "elektropohon",
        "plyn",
    ]
    batch = Pool(X, label=y, cat_features=categorical)
    if prev_model:
        baseline = prev_model.predict(batch)
        batch.set_baseline(baseline)

    # training parameters
    model = CatBoostRegressor(
        task_type='GPU',
        devices='0:1',
        iterations=1000,
        learning_rate=0.03,
        depth=12,
        loss_function='RMSE',
    )
    model.fit(X=batch)

    return model


## Evaluate model

In [34]:
# Train one batch and keep a small hold-out set for evaluation.
def pipeline(conn, offset, limit, prev_model):
    """Load, preprocess and train on one page of inspection data.

    Args:
        conn: Open database connection passed to ``load_data``.
        offset: Number of rows to skip.
        limit: Maximum number of rows in the batch.
        prev_model: Model from the previous batch (or None), forwarded to
            ``train``.

    Returns:
        ``(model, X_test, y_test)``: the model fitted on 95 % of the batch
        and the 5 % of rows held out from training.
    """
    batch = load_data(conn, offset, limit)

    # Feature engineering and imputation; rows whose motor data could not be
    # imputed are removed before the target is derived.
    batch = impute_missing_values(add_drive_type_features(batch))
    batch = batch.dropna(axis=0, how="any", subset=["motor_power", "motor_volume"])
    batch = clean_df(add_mileage_increase(batch))

    target = batch["mileage_increase"]
    features = batch.drop(columns=["mileage_increase"])
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.05, random_state=420
    )

    model = train(X_train, y_train, prev_model)

    return model, X_test, y_test

In [35]:
conn = connect()

# Count the rows in the training view so the loop knows when to stop.
count_query = text("""SELECT count(*) FROM vehicle_inspections_at_least_two""")
records = conn.execute(count_query).all()
total_data = np.array(records)[0,0]

# Page through the view; every batch yields one model and one hold-out set.
# Each model is handed to the next batch as prev_model.
batch_size = 1000000
processed = 0
models = []
X_tests = []
y_tests = []
prev_model = None
while processed < total_data:
    prev_model, X_test, y_test = pipeline(
        conn=conn,
        offset=processed,
        limit=batch_size,
        prev_model=prev_model,
    )

    models.append(prev_model)
    X_tests.append(X_test)
    y_tests.append(y_test)

    processed += batch_size

0:	learn: 47209.4023565	total: 98.5ms	remaining: 1m 38s
1:	learn: 47122.6059881	total: 184ms	remaining: 1m 32s
2:	learn: 47033.1415639	total: 263ms	remaining: 1m 27s
3:	learn: 46951.9405794	total: 336ms	remaining: 1m 23s
4:	learn: 46887.5047084	total: 414ms	remaining: 1m 22s
5:	learn: 46808.7403556	total: 642ms	remaining: 1m 46s
6:	learn: 46736.1679650	total: 642ms	remaining: 1m 46s
7:	learn: 46665.6576985	total: 876ms	remaining: 2m 4s
8:	learn: 46581.4353957	total: 950ms	remaining: 1m 57s
9:	learn: 46515.2622769	total: 1.02s	remaining: 1m 52s
10:	learn: 46446.4260072	total: 1.09s	remaining: 1m 48s
11:	learn: 46392.3853429	total: 1.17s	remaining: 1m 45s
12:	learn: 46328.9291751	total: 1.25s	remaining: 1m 42s
13:	learn: 46257.8249205	total: 1.32s	remaining: 1m 40s
14:	learn: 46210.2282692	total: 1.4s	remaining: 1m 38s
15:	learn: 46148.1872361	total: 1.49s	remaining: 1m 37s
16:	learn: 46097.6501206	total: 1.57s	remaining: 1m 36s
17:	learn: 46037.5921271	total: 1.65s	remaining: 1m 35s
18:

In [36]:
# Combine the per-batch models with equal weights of 1/len(models) and pool
# the per-batch hold-out sets into one test set.
equal_weight = 1.0 / len(models)
summed_model = sum_models(models, weights=[equal_weight] * len(models))
X_test = pd.concat(X_tests, axis=0)
y_test = pd.concat(y_tests)

In [49]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import mean_absolute_error

# Score the combined model on the pooled hold-out rows (mean absolute error).
pred = summed_model.predict(X_test, prediction_type='RawFormulaVal')
mae = mean_absolute_error(y_test, pred)

print('MAE:', mae)
  

MAE: 26936.856719165244


In [38]:
# Persist the combined (evaluated) model.
summed_model.save_model('models/2024-03-16-X_model')

## Train the model with all available data

In [40]:
# Train one batch on all of its rows (no hold-out split).
def pipeline(conn, offset, limit, prev_model):
    """Load, preprocess and train on one full page of inspection data.

    Args:
        conn: Open database connection passed to ``load_data``.
        offset: Number of rows to skip.
        limit: Maximum number of rows in the batch.
        prev_model: Model from the previous batch (or None), forwarded to
            ``train``.

    Returns:
        The model fitted on every usable row of the batch.
    """
    batch = load_data(conn, offset, limit)

    # Feature engineering and imputation; rows whose motor data could not be
    # imputed are removed before the target is derived.
    batch = impute_missing_values(add_drive_type_features(batch))
    batch = batch.dropna(axis=0, how="any", subset=["motor_power", "motor_volume"])
    batch = clean_df(add_mileage_increase(batch))

    # Separate the target from the features.
    y_train = batch["mileage_increase"]
    X_train = batch.drop(columns=["mileage_increase"])

    return train(X_train, y_train, prev_model)

In [41]:
conn = connect()

# Prepare data
# prepare_view(conn)

# Count the rows in the training view so the loop knows when to stop.
count_query = text("""SELECT count(*) FROM vehicle_inspections_at_least_two""")
records = conn.execute(count_query).all()
total_data = np.array(records)[0,0]

# Page through the view; every batch yields one model, which is handed to
# the next batch as prev_model.
batch_size = 1000000
processed = 0
models = []
prev_model = None
while processed < total_data:
    prev_model = pipeline(
        conn=conn,
        offset=processed,
        limit=batch_size,
        prev_model=prev_model,
    )

    models.append(prev_model)

    processed += batch_size

0:	learn: 47520.8315357
1:	learn: 47424.7028850	total: 82ms	remaining: 1m 21s
2:	learn: 47336.3608868	total: 313ms	remaining: 2m 36s
3:	learn: 47254.3363165	total: 390ms	remaining: 2m 9s
4:	learn: 47164.7328451	total: 467ms	remaining: 1m 56s
5:	learn: 47089.2617510	total: 554ms	remaining: 1m 50s
6:	learn: 47005.7703922	total: 631ms	remaining: 1m 44s
7:	learn: 46937.3342409	total: 714ms	remaining: 1m 41s
8:	learn: 46863.7125026	total: 785ms	remaining: 1m 37s
9:	learn: 46803.2570156	total: 873ms	remaining: 1m 36s
10:	learn: 46739.6441547	total: 969ms	remaining: 1m 35s
11:	learn: 46673.9846287	total: 1.05s	remaining: 1m 34s
12:	learn: 46608.0181621	total: 1.13s	remaining: 1m 32s
13:	learn: 46549.4409134	total: 1.21s	remaining: 1m 31s
14:	learn: 46488.4426823	total: 1.28s	remaining: 1m 30s
15:	learn: 46442.2338045	total: 1.36s	remaining: 1m 29s
16:	learn: 46397.6160911	total: 1.44s	remaining: 1m 28s
17:	learn: 46349.9036291	total: 1.52s	remaining: 1m 28s
18:	learn: 46297.0677502	total: 1.6

In [42]:
# Combine the per-batch models with equal weights of 1/len(models).
equal_weight = 1.0 / len(models)
summed_model = sum_models(models, weights=[equal_weight] * len(models))

In [43]:
# Persist the model trained on all available data.
summed_model.save_model('models/all-data-2024-03-16_model')

## Use the model for final prediction

In [50]:
def load_data(conn, offset, limit):
    """Load one page of rows from ``common_vehicle_last_inspections``.

    Rows are ordered by ``vin`` so that consecutive pages walk through the
    view in a stable order.

    Args:
        conn: Open SQLAlchemy connection.
        offset: Number of rows to skip.
        limit: Maximum number of rows to return.

    Returns:
        A DataFrame with one column per key of ``column_dtypes`` below, each
        cast to the listed dtype.
    """
    # Column name -> dtype, in the order the view returns its columns.
    # Keeping both in one mapping prevents the names and the dtypes from
    # drifting apart.
    column_dtypes = {
        "vin": "object",
        "make": "category",
        "model_primary": "category",
        "motor_power": np.float32,
        "motor_volume": np.float32,
        "drive_type": "object",
        "result": "category",
        "mileage": np.float32,
        "vehicle_age": np.float32,
        "defects_a": np.float32,
        "defects_b": np.float32,
        "defects_c": np.float32,
        "defects_0": np.float32,
        "defects_1": np.float32,
        "defects_2": np.float32,
        "defects_3": np.float32,
        "defects_4": np.float32,
        "defects_5": np.float32,
        "defects_6": np.float32,
        "defects_7": np.float32,
        "defects_8": np.float32,
        "defects_9": np.float32,
    }

    # LIMIT/OFFSET travel as bound parameters instead of being formatted into
    # the SQL text. int() also turns NumPy integers into plain Python ints.
    query = text(
        """SELECT * FROM common_vehicle_last_inspections
             ORDER BY vin
             LIMIT :limit OFFSET :offset"""
    )
    records = conn.execute(
        query, {"limit": int(limit), "offset": int(offset)}
    ).all()

    # The reshape pins the column count so the names below always line up.
    data = pd.DataFrame(np.array(records).reshape((-1, len(column_dtypes))))
    data.columns = list(column_dtypes)

    for col, dt in column_dtypes.items():
        data[col] = data[col].astype(dt)

    return data

In [51]:
def predict(X, model_path):
    """Load a saved CatBoost regressor and predict for ``X``.

    Args:
        X: Feature data accepted by ``CatBoostRegressor.predict``.
        model_path: Path of the saved model file.

    Returns:
        The model's predictions (``prediction_type='RawFormulaVal'``).
    """
    regressor = CatBoostRegressor()
    regressor.load_model(model_path)
    return regressor.predict(X, prediction_type='RawFormulaVal')

In [60]:
def pipeline(conn, offset, limit, model_path):
    """Predict future mileage for one page of vehicles and store the result.

    Loads a batch, preprocesses it the same way as for training, predicts the
    mileage increase and appends ``vin`` / ``future_mileage`` rows to the
    ``vehicles_mileage_prediction`` table, then commits.

    Vehicles are left out of the output when their motor data could not be
    imputed, when the predicted increase is not positive, or when the
    resulting future mileage is not above the current mileage.

    Args:
        conn: Open database connection.
        offset: Number of rows to skip.
        limit: Maximum number of rows in the batch.
        model_path: Path of the saved model used for prediction.
    """
    print("  load")
    # Load batch.
    df = load_data(conn, offset, limit)

    print("  preprocess")
    # Preprocess data.
    df = add_drive_type_features(df)
    df = impute_missing_values(df)
    df = df.dropna(axis=0, how="any", subset=["motor_power", "motor_volume"])
    vin = df["vin"]
    current_mileage = df["mileage"]
    X = df.drop(columns=["vin", "drive_type"])

    print("  predict")
    # The predictions come back as a plain array in row order. Give them X's
    # index: after the dropna above the index has gaps, and a default
    # 0..n-1 index would make the addition below pair predictions with the
    # wrong vehicles.
    predicted_mileage_increase = pd.Series(predict(X, model_path), index=X.index)
    # mask() returns a new Series, so the result must be kept: non-positive
    # increases become NaN and are dropped below.
    predicted_mileage_increase = predicted_mileage_increase.mask(
        predicted_mileage_increase <= 0
    )

    # Assemble dataframe with predictions and write to DB.
    print("  assemble")
    future_mileage = current_mileage + predicted_mileage_increase
    result = pd.DataFrame({"vin": vin, "future_mileage": future_mileage})
    # Keep only rows whose mileage actually grows; comparisons with NaN are
    # False, so rows with a missing current or future mileage go as well.
    result = result[future_mileage > current_mileage].dropna()

    print("  save")
    result.to_sql(con=conn, name="vehicles_mileage_prediction", if_exists='append', index=False)
    print("  commit")
    conn.commit()
  

In [61]:
conn = connect()
# prepare_data(conn)
# conn.commit()

# Saved model used for every batch.
model_path = "models/all-data-2024-03-16_model"

# Remove the output table left by a previous run; the batches append to it.
conn.execute(text("DROP TABLE IF EXISTS vehicles_mileage_prediction"))

# Count the rows to predict for so the loop knows when to stop.
count_query = text("SELECT count(*) FROM common_vehicle_last_inspections")
records = conn.execute(count_query).all()
total_data = np.array(records)[0,0]

# Predict page by page.
batch_size = 1000000
processed = 0
while processed < total_data:
    print(f"batch {processed}")
    pipeline(
        conn=conn,
        offset=processed,
        limit=batch_size,
        model_path=model_path,
    )
    processed += batch_size

batch 0
  load
  preprocess
  predict
  assemble
  save
  commit
batch 1000000
  load
  preprocess
  predict
  assemble
  save
  commit
batch 2000000
  load
  preprocess
  predict
  assemble
  save
  commit
batch 3000000
  load
  preprocess
  predict
  assemble
  save
  commit
batch 4000000
  load
  preprocess
  predict
  assemble
  save
  commit
batch 5000000
  load
  preprocess
  predict
  assemble
  save
  commit


The pretrained model should then be moved to `data/precomputed/mileage_predition`, where the pipeline expects to find it.