# Setup

In [None]:
import copy
import random

import numpy as np
import pandas as pd

# Classes

In [None]:
# Nominal tire pressure: tires are re-inflated to this value, and the speed
# gain and fuel consumption of a step are scaled relative to it.
TARGET_TIRE_PRESSURE = 2.5

class Car:
    """A car's fixed specification together with its mutable driving state."""

    def __init__(
        self,
        model: str,
        make: str,
        year: int,
        number_seats: int,
        weight_in_kg: int,
        price_in_euro: int,
        fuel_level_in_percent: "float | None",
    ):
        # Specification, stored exactly as passed in
        self.model = model
        self.make = make
        self.year = year
        self.number_seats = number_seats
        self.weight_in_kg = weight_in_kg
        self.price_in_euro = price_in_euro

        # Fuel level as a fraction in the range 0 - 1, or None
        self.fuel_level_in_percent = fuel_level_in_percent

        # A new car is at rest
        self.speed: int = 0
        self.acceleration: float = 0

        # Cruise control starts switched off, hence no stored target speed
        self.cruise_control: bool = False
        self.cruise_control_speed: int | None = None

        # Initial engine and tire readings
        self.engine_temperature: float = 20
        self.engine_rpm: int = 0
        self.engine_oil_pressure: float = 2
        self.tire_pressure: float = 2.5

In [None]:
class Environment:
    """Surroundings of a single drive; advances a car's state one step at a time.

    Attributes:
        ambient_light_level_in_lux: Current ambient light; shrinks on every step.
        current_timestamp: Number of steps taken so far (seconds).
        ended: True once the drive is over; ``step`` refuses to run afterwards.
        chance_of_drive_end_arbitrary: Per-step probability that the drive ends
            independently of the light level.
    """

    def __init__(self, ambient_light_level_in_lux: int, chance_of_drive_end_arbitrary: float = 0.01):
        self.ambient_light_level_in_lux: int = ambient_light_level_in_lux
        self.current_timestamp = 0  # seconds
        self.ended = False

        self.chance_of_drive_end_arbitrary = chance_of_drive_end_arbitrary

    def step(self, car: "Car", next_acceleration: float):
        """Advance the simulation by one second and update ``car`` in place.

        On the step that ends the drive the car is still updated; only the
        following call is rejected.

        Args:
            car: Car whose state is mutated.
            next_acceleration: Acceleration applied during this step. It is
                ignored (treated as 0) while cruise control is active.

        Raises:
            RuntimeError: If the drive has already ended.
        """
        if self.ended:
            raise RuntimeError("The drive has already ended")

        self.current_timestamp += 1

        # NOTE: the order of the random draws below determines the output for
        # a given seed, so keep it stable when editing.

        # Ambient light only ever decreases: by int(100 * U[0, 1)) per step
        self.ambient_light_level_in_lux -= int(100 * random.random())

        # The drive ends for certain once the light level drops below 300,
        # and with probability chance_of_drive_end_arbitrary otherwise.
        # The random draw happens on every step, even when it is already too dark.
        is_too_dark_to_drive = self.ambient_light_level_in_lux < 300
        end_drive_arbitrary = random.random() < self.chance_of_drive_end_arbitrary

        if is_too_dark_to_drive or end_drive_arbitrary:
            self.ended = True

        # 2% chance that the driver toggles cruise control
        if random.random() < 0.02:
            car.cruise_control = not car.cruise_control

        # 2% chance that the tire pressure drops by U[0.1, 0.2], floored at 1.5.
        # Once the pressure is at (or below) 1.5 the tires are re-inflated to
        # the target tire pressure.
        if random.random() < 0.02:
            tire_pressure_change = random.uniform(0.1, 0.2)
            car.tire_pressure = max(1.5, car.tire_pressure - tire_pressure_change)
        if car.tire_pressure <= 1.5:
            car.tire_pressure = TARGET_TIRE_PRESSURE

        if car.cruise_control:
            # Cruise control holds the current speed and records it as target
            car.acceleration = 0
            car.cruise_control_speed = car.speed
        else:
            # Manual driving: apply the acceleration, damped by under-inflated
            # tires, and clear the cruise control target
            car.acceleration = next_acceleration
            car.speed = int(next_acceleration * (car.tire_pressure / TARGET_TIRE_PRESSURE) + car.speed)
            car.cruise_control_speed = None

            # Clamp speed to [0, 300]
            car.speed = max(0, min(300, car.speed))

        # Engine temperature: add N(100, 5) * 0.7 * acceleration, clamp to [85, 115].
        # NOTE(review): the change is roughly 70 x acceleration, so the value
        # tends to sit on one of the two bounds - confirm this is intended.
        engine_temperature_change = random.normalvariate(100, 5) * 0.7 * car.acceleration
        car.engine_temperature = max(85, min(115, car.engine_temperature + engine_temperature_change))

        # Engine rpm: add int(N(4000, 500) * 0.9 * acceleration), clamp to [0, 6000]
        engine_rpm_change = random.normalvariate(4000, 500) * 0.9 * car.acceleration
        car.engine_rpm = max(0, min(6000, car.engine_rpm + int(engine_rpm_change)))

        # Oil pressure: add N(100, 7) * 0.8 * engine_rpm, clamp to [0, 200].
        # NOTE(review): for any rpm > 0 the change is far larger than the
        # domain, so the value jumps to the upper bound - confirm this is intended.
        engine_oil_pressure_change = random.normalvariate(100, 7) * 0.8 * car.engine_rpm
        car.engine_oil_pressure = max(0, min(200, car.engine_oil_pressure + engine_oil_pressure_change))

        # Fuel consumption: a base of 0.005 per step, scaled up as the tire
        # pressure falls below target, plus 0.01 while accelerating.
        # Braking does not change the fuel level.
        fuel_level_change = -0.005 * (TARGET_TIRE_PRESSURE / car.tire_pressure)
        if car.acceleration > 0:
            fuel_level_change -= 0.01

        # Refuel probability is -ln(level) / 20, so refuelling becomes more
        # likely as the tank empties; an empty tank is always refilled and an
        # unknown level (None) never is.
        fuel_level = car.fuel_level_in_percent
        if fuel_level is None:
            probability_of_refuel = 0
        elif fuel_level == 0:
            probability_of_refuel = 1
        else:
            probability_of_refuel = -np.log(fuel_level) / 20

        # Drawn unconditionally so the random stream does not depend on the level
        if random.random() < probability_of_refuel:
            fuel_level_change += 1

        if fuel_level is not None:
            car.fuel_level_in_percent = max(0, min(1, fuel_level + fuel_level_change))


        

# Preconfigured cars

In [None]:
# Sample cars: one template per model, all written in the same keyword layout
# (identity on the first line, specs and fuel level on the second).
audi_a7 = Car(
    model="A7", make="Audi", year=2020, number_seats=5,
    weight_in_kg=1645, price_in_euro=72400, fuel_level_in_percent=0.7,
)
bmw_3_series = Car(
    model="3 Series", make="BMW", year=2021, number_seats=5,
    weight_in_kg=1570, price_in_euro=50200, fuel_level_in_percent=0.8,
)
mercedes_e_class = Car(
    model="E-Class", make="Mercedes-Benz", year=2019, number_seats=5,
    weight_in_kg=1780, price_in_euro=64500, fuel_level_in_percent=0.6,
)
ford_mustang = Car(
    model="Mustang", make="Ford", year=2022, number_seats=4,
    weight_in_kg=1685, price_in_euro=56500, fuel_level_in_percent=0.7,
)
toyota_camry = Car(
    model="Camry", make="Toyota", year=2020, number_seats=5,
    weight_in_kg=1525, price_in_euro=38900, fuel_level_in_percent=0.9,
)
volkswagen_golf = Car(
    model="Golf", make="Volkswagen", year=2021, number_seats=5,
    weight_in_kg=1265, price_in_euro=27800, fuel_level_in_percent=0.8,
)
tesla_model_s = Car(
    model="Model S", make="Tesla", year=2022, number_seats=5,
    weight_in_kg=2100, price_in_euro=79900, fuel_level_in_percent=0.85,
)
honda_civic = Car(
    model="Civic", make="Honda", year=2019, number_seats=5,
    weight_in_kg=1295, price_in_euro=25500, fuel_level_in_percent=0.75,
)
jaguar_f_type = Car(
    model="F-Type", make="Jaguar", year=2023, number_seats=2,
    weight_in_kg=1665, price_in_euro=81200, fuel_level_in_percent=0.7,
)

# Pool of sample cars that generate_car() picks from
car_list = [
    audi_a7,
    bmw_3_series,
    mercedes_e_class,
    ford_mustang,
    toyota_camry,
    volkswagen_golf,
    tesla_model_s,
    honda_civic,
    jaguar_f_type,
]

# Generators

In [None]:
def generate_car() -> Car:
    """Return a new randomised car based on one of the sample cars.

    A sample car is picked at random and copied; the copy then receives a
    random build year (1990-2024), a price derived from a per-year random walk
    and a random fuel level in [0, 1).

    Returns:
        An independent ``Car`` instance; the entries of ``car_list`` are left
        untouched.
    """
    # Work on a copy: the entries of car_list are shared templates. Mutating
    # them in place would compound the depreciation on every call and carry
    # speed, cruise control and tire state over into the next ride.
    car: Car = copy.copy(random.choice(car_list))

    car.year = random.randint(1990, 2024)

    # One random price step in [-700, 100] per year of age, floored at 20,000
    price: int = car.price_in_euro
    for _ in range(2024 - car.year):
        price += random.randint(-700, 100)
    car.price_in_euro = max(20_000, price)

    car.fuel_level_in_percent = random.random()

    return car


def generate_env(chance_of_drive_end_arbitrary: float) -> Environment:
    """Create an environment with a randomly drawn starting light level.

    The initial ambient light is a random integer between 5000 and 15000 lux
    (both ends included); the drive-end probability is passed through as is.
    """
    return Environment(
        ambient_light_level_in_lux=random.randint(5000, 15000),
        chance_of_drive_end_arbitrary=chance_of_drive_end_arbitrary,
    )


def create_row(car: Car, env: Environment, ride_number: int) -> pd.DataFrame:
    """Snapshot the current car and environment state as a one-row DataFrame.

    The insertion order below defines the column order of the result.
    """
    # Car specification columns come first
    record = {
        name: getattr(car, name)
        for name in ("model", "make", "year", "number_seats", "weight_in_kg", "price_in_euro")
    }

    # Per-step readings, mixing environment and car values
    record["current_timestamp"] = env.current_timestamp
    record["fuel_level_in_percent"] = car.fuel_level_in_percent
    record["ambient_light_level_in_lux"] = env.ambient_light_level_in_lux
    record["speed"] = car.speed
    record["acceleration"] = car.acceleration
    record["ride"] = ride_number

    # Remaining car sensors, read straight from the attributes of the same name
    for name in (
        "cruise_control",
        "cruise_control_speed",
        "engine_temperature",
        "engine_rpm",
        "engine_oil_pressure",
        "tire_pressure",
    ):
        record[name] = getattr(car, name)

    return pd.DataFrame([record])

# Dataset generator

In [None]:
def generate_dataset(
    number_of_rows: int,
    custom_outlier_percentage_engine_temperature: float,
    custom_outlier_percentage_engine_rpm: float,
    number_of_order_inversions: int,
    chance_of_drive_end_arbitrary: float,
) -> pd.DataFrame:
    """Simulate consecutive rides and return the deliberately flawed recording.

    One row is recorded per simulated step. When a ride ends, a new car and
    environment are generated and the ride number is incremented. Afterwards
    data quality problems are injected: missing fuel readings, swapped
    neighbouring rows, outliers and randomly missing values.

    Args:
        number_of_rows: Number of rows (simulation steps) to record.
        custom_outlier_percentage_engine_temperature: Per-row probability
            (0-1) that the engine temperature is replaced by its square.
        custom_outlier_percentage_engine_rpm: Per-row probability (0-1) that
            the engine rpm is replaced by a draw from N(rpm, 1000).
        number_of_order_inversions: How many times two neighbouring rows are
            swapped. Skipped when fewer than two rows exist.
        chance_of_drive_end_arbitrary: Per-step probability that a ride ends
            for no particular reason.

    Returns:
        The generated DataFrame; an empty DataFrame if ``number_of_rows`` is 0.
    """
    car: Car = generate_car()
    env: Environment = generate_env(chance_of_drive_end_arbitrary=chance_of_drive_end_arbitrary)

    ride_number = 0
    next_acceleration = random.normalvariate(0, 5) + 0.2

    # Collect the rows and concatenate once at the end; growing the frame with
    # pd.concat inside the loop copies all previous rows on every iteration.
    rows: list[pd.DataFrame] = []
    for _ in range(number_of_rows):
        # Start a new ride with a new car once the previous one has ended
        if env.ended:
            car = generate_car()
            env = generate_env(chance_of_drive_end_arbitrary=chance_of_drive_end_arbitrary)
            ride_number += 1

        rows.append(create_row(car=car, env=env, ride_number=ride_number))

        # Acceleration follows a random walk with a slight upward drift,
        # clamped to [-10, 10]
        next_acceleration_diff = random.normalvariate(0.5, 5)
        next_acceleration = max(-10, min(10, next_acceleration + next_acceleration_diff))

        env.step(car=car, next_acceleration=next_acceleration)

    if not rows:
        return pd.DataFrame()

    output_df: pd.DataFrame = pd.concat(rows, ignore_index=True)

    # Fuel sensor is broken between 0.85 and 0.9 (exclusive): no reading there
    output_df["fuel_level_in_percent"] = output_df["fuel_level_in_percent"].map(
        lambda x: None if 0.85 < x < 0.9 else x
    )

    # Time inaccuracy: swap randomly chosen rows with their successor.
    # random.randint includes both bounds, so the last valid start index is
    # len - 2; otherwise index + 1 would fall outside the frame.
    last_swappable_index = len(output_df) - 2
    if last_swappable_index >= 0:
        for _ in range(number_of_order_inversions):
            index = random.randint(0, last_swappable_index)

            # Copy the row explicitly so the saved values cannot be affected
            # by the overwrite on the next line.
            tmp = output_df.iloc[index].copy()
            output_df.iloc[index] = output_df.iloc[index + 1]
            output_df.iloc[index + 1] = tmp

    # Univariate outliers
    output_df["engine_temperature"] = output_df["engine_temperature"].map(
        lambda x: x**2 if (random.random() < custom_outlier_percentage_engine_temperature) else x
    )
    output_df["engine_rpm"] = output_df["engine_rpm"].map(
        lambda x: int(random.normalvariate(x, 1000)) if (random.random() < custom_outlier_percentage_engine_rpm) else x
    )

    # Missing completely at random: 0.5% of temperatures, 1% of oil pressures
    output_df["engine_temperature"] = output_df["engine_temperature"].map(
        lambda x: None if (random.random() < 0.005) else x
    )
    output_df["engine_oil_pressure"] = output_df["engine_oil_pressure"].map(
        lambda x: None if (random.random() < 0.01) else x
    )

    return output_df

In [None]:
# Generate the sample dataset: 20,000 rows, a per-row outlier probability of
# 0.005 for both engine columns, and 20 swapped row pairs.
df = generate_dataset(
    number_of_rows=20_000,
    custom_outlier_percentage_engine_temperature=0.005,
    custom_outlier_percentage_engine_rpm=0.005,
    number_of_order_inversions=20,
    chance_of_drive_end_arbitrary=0.01,
)

In [None]:
# Bare expression: shows the generated dataset as the output of this cell
df