# Data Generator Using Faker

In [19]:
!pip install faker
from faker import Faker
import random
import time
from datetime import datetime
import uuid



The following function generates a record that simulates a passenger requesting a ride. The record contains the pickup and destination coordinates, the kind of ride requested, the current status of the request, who the passenger is, the estimated price and distance, the demand level, and the time at which the request was generated (as an ISO-format timestamp).

In [17]:
# Shared Faker instance that supplies the random coordinates used below.
fake = Faker()


def generate_ride_request():
    """Build one simulated ride-request event backed by Faker.

    Returns:
        dict: a record holding random UUID strings for ``request_id`` and
        ``passenger_id``, Faker-generated pickup and destination
        coordinates, randomly chosen ``ride_type``, ``status`` and
        ``demand_level`` values, a price drawn from 5-50 and a distance
        drawn from 1-20 (both rounded to 2 decimals), and the current
        time as an ISO-format string under ``datetime``.
    """
    ride = {}

    # Identifiers
    ride["request_id"] = str(uuid.uuid4())
    ride["passenger_id"] = str(uuid.uuid4())

    # Trip endpoints
    ride["pickup_lat"] = fake.latitude()
    ride["pickup_lng"] = fake.longitude()
    ride["destination_lat"] = fake.latitude()
    ride["destination_lng"] = fake.longitude()

    # Categorical attributes
    ride["ride_type"] = random.choice(["standard", "premium", "pool"])
    ride["status"] = random.choice(["requested", "cancelled", "accepted", "completed"])

    # Random price between $5 and $50
    ride["estimate_price"] = round(random.uniform(5, 50), 2)
    ride["demand_level"] = random.choice(["high", "low"])

    # Distance in km
    ride["estimated_distance_km"] = round(random.uniform(1, 20), 2)

    # Current timestamp in ISO format
    ride["datetime"] = datetime.now().isoformat()

    return ride

# Example usage: build a single ride request and print the resulting dict
print(generate_ride_request())

{'request_id': 'c842bcf3-fc93-48f7-a412-71efce516cba', 'passenger_id': 'e3387121-0b80-4278-b7bf-b5582946ab11', 'pickup_lat': Decimal('-52.6492425'), 'pickup_lng': Decimal('35.388465'), 'destination_lat': Decimal('-79.204484'), 'destination_lng': Decimal('175.389551'), 'ride_type': 'premium', 'status': 'cancelled', 'estimate_price': 42.03, 'demand_level': 'high', 'estimated_distance_km': 3.07, 'datetime': '2025-03-15T20:45:45.289243'}


The following function generates a record that simulates a driver's current status: their current location, their availability, the kind of vehicle they are driving, and the time at which the status was captured (as an ISO-format timestamp).

In [18]:
def generate_driver_status():
    """Build one simulated driver-status event backed by Faker.

    Returns:
        dict: a record holding a random UUID string as ``driver_id``,
        Faker-generated ``current_lat`` / ``current_lng`` coordinates,
        randomly chosen ``status`` and ``vehicle_type`` values, and the
        current time as an ISO-format string under ``timestamp``.
    """
    driver = {}

    driver["driver_id"] = str(uuid.uuid4())

    # Last known position
    driver["current_lat"] = fake.latitude()
    driver["current_lng"] = fake.longitude()

    # Availability and vehicle category
    driver["status"] = random.choice(["available", "en_route", "on_trip", "offline"])
    driver["vehicle_type"] = random.choice(["sedan", "SUV", "motorbike", "uber", "lyft", "cabify"])

    # Moment the status was captured
    driver["timestamp"] = datetime.now().isoformat()

    return driver

# Example usage: build a single driver status update and print the resulting dict
print(generate_driver_status())


{'driver_id': '2eda4535-612d-4018-af38-190d6dda2593', 'current_lat': Decimal('68.1452835'), 'current_lng': Decimal('32.209480'), 'status': 'on_trip', 'vehicle_type': 'uber', 'timestamp': '2025-03-15T20:45:47.543798'}


Here we generate 10 passenger requests and 10 driver status updates:

In [None]:
# Build a batch of 10 simulated ride requests
ride_requests = [generate_ride_request() for _sample in range(10)]

# Build a batch of 10 simulated driver status updates
driver_status_updates = [generate_driver_status() for _sample in range(10)]

# Show both batches, one list per line
print(ride_requests, driver_status_updates, sep="\n")


[{'request_id': '23ab40b6-f343-44a9-baea-1b2a2fb7b8f1', 'passenger_id': '06dac076-96cb-4f22-90fa-2b48bd5671bd', 'pickup_lat': Decimal('55.276083'), 'pickup_lng': Decimal('150.346797'), 'destination_lat': Decimal('76.306982'), 'destination_lng': Decimal('-19.793847'), 'ride_type': 'standard', 'status': 'cancelled', 'estimate_price': 34.97, 'demand_level': 'low', 'estimated_distance_km': 8.08, 'datetime': '2025-03-12T15:57:15.419533'}, {'request_id': 'de30656c-07e7-4254-9fb5-bae989dbe553', 'passenger_id': 'ac17399d-fc09-420b-83a7-fd08fda1bec5', 'pickup_lat': Decimal('-83.6442335'), 'pickup_lng': Decimal('-137.210119'), 'destination_lat': Decimal('-9.405712'), 'destination_lng': Decimal('-13.547156'), 'ride_type': 'premium', 'status': 'accepted', 'estimate_price': 20.95, 'demand_level': 'low', 'estimated_distance_km': 4.39, 'datetime': '2025-03-12T15:57:15.419623'}, {'request_id': '3be3ad74-aff3-49da-a6f7-b5d6914aa395', 'passenger_id': '28baf15c-c173-4bcf-92b5-ce07a4698cf4', 'pickup_lat':

# Data Generator Using Mimesis

In [7]:
!pip install mimesis
from mimesis import Person, Datetime, Address, Transport
import random
import time
from datetime import datetime

Collecting mimesis
  Downloading mimesis-18.0.0-py3-none-any.whl.metadata (5.7 kB)
Downloading mimesis-18.0.0-py3-none-any.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mimesis
Successfully installed mimesis-18.0.0


In [8]:
# Mimesis providers. Only `address` is called by the generator functions
# visible in this notebook (latitude()/longitude()); the other providers are
# instantiated but not referenced by the visible cells.
person = Person("en")
datetime_provider = Datetime()
address = Address()
transport = Transport()

def generate_ride_request():
    """Build one simulated ride-request event using the Mimesis ``address`` provider.

    Returns:
        dict: a record with UUID-string identifiers, pickup and destination
        coordinates from ``address``, randomly chosen ``ride_type``,
        ``status`` and ``demand_level`` values, a price drawn from 5-50 and
        a distance drawn from 1-20 (both rounded to 2 decimals), and the
        current time as an ISO-format string under ``datetime``.
    """
    # Draw every value first, in the same order as the keys of the record.
    request_id = str(uuid.uuid4())
    passenger_id = str(uuid.uuid4())
    pickup_lat, pickup_lng = address.latitude(), address.longitude()
    destination_lat, destination_lng = address.latitude(), address.longitude()
    ride_type = random.choice(["standard", "premium", "pool"])
    status = random.choice(["requested", "cancelled", "accepted", "completed"])
    estimate_price = round(random.uniform(5, 50), 2)  # Price in USD
    demand_level = random.choice(["high", "low"])
    estimated_distance_km = round(random.uniform(1, 20), 2)  # Distance in km
    created_at = datetime.now().isoformat()

    return {
        "request_id": request_id,
        "passenger_id": passenger_id,
        "pickup_lat": pickup_lat,
        "pickup_lng": pickup_lng,
        "destination_lat": destination_lat,
        "destination_lng": destination_lng,
        "ride_type": ride_type,
        "status": status,
        "estimate_price": estimate_price,
        "demand_level": demand_level,
        "estimated_distance_km": estimated_distance_km,
        "datetime": created_at,
    }

In [9]:
def generate_driver_status():
    """Build one simulated driver-status event using the Mimesis ``address`` provider.

    Returns:
        dict: a record with a UUID-string ``driver_id``, the driver's
        ``current_lat`` / ``current_lng`` from ``address``, randomly chosen
        ``status`` and ``vehicle_type`` values, and the current time as an
        ISO-format string under ``timestamp``.
    """
    # Draw every value first, in the same order as the keys of the record.
    driver_id = str(uuid.uuid4())
    current_lat, current_lng = address.latitude(), address.longitude()
    availability = random.choice(["available", "en_route", "on_trip", "offline"])
    vehicle_type = random.choice(["sedan", "SUV", "motorbike", "uber", "lyft", "cabify"])
    captured_at = datetime.now().isoformat()

    return {
        "driver_id": driver_id,
        "current_lat": current_lat,
        "current_lng": current_lng,
        "status": availability,
        "vehicle_type": vehicle_type,
        "timestamp": captured_at,
    }

In [10]:
# Smoke test: print one freshly generated sample of each event type
print(generate_ride_request())
print(generate_driver_status())

{'request_id': '8a5f1cb3-42b1-42e2-9124-59390164486f', 'passenger_id': '32a6be2e-5071-46c1-a81d-4a413189faee', 'pickup_lat': 29.191269, 'pickup_lng': -176.427245, 'destination_lat': -77.644042, 'destination_lng': 127.843473, 'ride_type': 'pool', 'status': 'cancelled', 'estimate_price': 44.49, 'demand_level': 'low', 'estimated_distance_km': 11.9, 'datetime': '2025-03-15T20:43:35.880240'}
{'driver_id': '071cf0aa-fd02-4865-a240-4d8c98ecd462', 'current_lat': -7.792137, 'current_lng': 22.000094, 'status': 'available', 'vehicle_type': 'motorbike', 'timestamp': '2025-03-15T20:43:35.880632'}


# Comparison between Faker and Mimesis:
## Faker:

*   Slower for large datasets.
*   Supports multiple localizations.
*   Specialized in general data types.

## Mimesis:

*   Optimized for performance.
*   Better language & region support.
*   More specialized data providers (transport, geolocation, etc).

# Saving the event data in JSON and AVRO formats

In [11]:
!pip install fastavro
from mimesis import Person, Datetime, Address
import json
import fastavro
import random
import uuid
from datetime import datetime

# Mimesis is the generator used from here on: this notebook favors it for its
# better performance and its more specialized data providers.

person = Person("en")
datetime_provider = Datetime()
address = Address()

Collecting fastavro
  Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastavro
Successfully installed fastavro-1.10.0


## Ride Request Schema & Driver Update Schema

In [12]:
# Avro record schema describing one ride-request event. Each (name, type)
# pair below becomes one entry of the record's "fields" list, in order.
ride_request_schema = {
    "type": "record",
    "name": "RideRequest",
    "namespace": "com.ridehailing",
    "fields": [
        {"name": field_name, "type": field_type}
        for field_name, field_type in (
            # Identifiers
            ("request_id", "string"),
            ("passenger_id", "string"),
            # Trip endpoints
            ("pickup_lat", "double"),
            ("pickup_lng", "double"),
            ("destination_lat", "double"),
            ("destination_lng", "double"),
            # Categorical attributes are declared as Avro enums
            ("ride_type", {"type": "enum", "name": "RideType", "symbols": ["standard", "premium", "pool"]}),
            ("status", {"type": "enum", "name": "RideStatus", "symbols": ["requested", "cancelled", "accepted", "completed"]}),
            ("estimate_price", "double"),
            ("demand_level", {"type": "enum", "name": "DemandLevel", "symbols": ["high", "low"]}),
            ("estimated_distance_km", "double"),
            # Timestamp is stored as a plain string
            ("datetime", "string"),
        )
    ],
}

# Avro record schema describing one driver-status event. Each (name, type)
# pair below becomes one entry of the record's "fields" list, in order.
driver_status_schema = {
    "type": "record",
    "name": "DriverStatus",
    "namespace": "com.ridehailing",
    "fields": [
        {"name": field_name, "type": field_type}
        for field_name, field_type in (
            ("driver_id", "string"),
            # Last known position
            ("current_lat", "double"),
            ("current_lng", "double"),
            # Categorical attributes are declared as Avro enums
            ("status", {"type": "enum", "name": "DriverStatusEnum", "symbols": ["available", "en_route", "on_trip", "offline"]}),
            ("vehicle_type", {"type": "enum", "name": "VehicleType", "symbols": ["sedan", "SUV", "motorbike", "uber", "lyft", "cabify"]}),
            # Timestamp is stored as a plain string
            ("timestamp", "string"),
        )
    ],
}

## Ride Request Code & Driver Update Code

In [13]:
def generate_ride_request():
    """Build one simulated ride-request event for export.

    Returns:
        dict: a record with UUID-string identifiers, pickup and destination
        coordinates from the ``address`` provider, randomly chosen
        ``ride_type``, ``status`` and ``demand_level`` values, a price drawn
        from 5-50 and a distance drawn from 1-20 (both rounded to 2
        decimals), and the current time as an ISO-format string under
        ``datetime``.
    """
    event = {}

    # Identifiers
    event["request_id"] = str(uuid.uuid4())
    event["passenger_id"] = str(uuid.uuid4())

    # Trip endpoints
    event["pickup_lat"] = address.latitude()
    event["pickup_lng"] = address.longitude()
    event["destination_lat"] = address.latitude()
    event["destination_lng"] = address.longitude()

    # Categorical attributes and numeric estimates
    event["ride_type"] = random.choice(["standard", "premium", "pool"])
    event["status"] = random.choice(["requested", "cancelled", "accepted", "completed"])
    event["estimate_price"] = round(random.uniform(5, 50), 2)
    event["demand_level"] = random.choice(["high", "low"])
    event["estimated_distance_km"] = round(random.uniform(1, 20), 2)

    # Moment the request was generated
    event["datetime"] = datetime.now().isoformat()

    return event

def generate_driver_status():
    """Build one simulated driver-status event for export.

    Returns:
        dict: a record with a UUID-string ``driver_id``, the driver's
        ``current_lat`` / ``current_lng`` from the ``address`` provider,
        randomly chosen ``status`` and ``vehicle_type`` values, and the
        current time as an ISO-format string under ``timestamp``.
    """
    event = {}

    event["driver_id"] = str(uuid.uuid4())

    # Last known position
    event["current_lat"] = address.latitude()
    event["current_lng"] = address.longitude()

    # Availability and vehicle category
    event["status"] = random.choice(["available", "en_route", "on_trip", "offline"])
    event["vehicle_type"] = random.choice(["sedan", "SUV", "motorbike", "uber", "lyft", "cabify"])

    # Moment the status was captured
    event["timestamp"] = datetime.now().isoformat()

    return event

## Saving the generated data into the AVRO files.

In [14]:
def write_to_avro(file_name, schema, data):
    """Write ``data`` to ``file_name`` as an Avro file described by ``schema``.

    Args:
        file_name: path of the file to create; it is opened in binary write
            mode, so an existing file at that path is overwritten.
        schema: Avro schema handed to ``fastavro.writer``.
        data: the records handed to ``fastavro.writer``.
    """
    with open(file_name, "wb") as avro_sink:
        fastavro.writer(avro_sink, schema, data)

# Build 100 ride requests and persist them as Avro
ride_requests = [generate_ride_request() for _sample in range(100)]
write_to_avro("ride_requests.avro", ride_request_schema, ride_requests)

# Build 100 driver status updates and persist them as Avro
driver_updates = [generate_driver_status() for _sample in range(100)]
write_to_avro("driver_status.avro", driver_status_schema, driver_updates)

# Confirmation messages, one per output file
print(
    "✅ Ride requests saved to ride_requests.avro",
    "✅ Driver status updates saved to driver_status.avro",
    sep="\n",
)

✅ Ride requests saved to ride_requests.avro
✅ Driver status updates saved to driver_status.avro


Verifying that the data was written correctly

In [15]:
def read_avro(file_name, max_records=1):
    """Print the leading records of an Avro file as a quick sanity check.

    Args:
        file_name: path of the Avro file to open (read in binary mode).
        max_records: how many records to print, counted from the start of
            the file. Defaults to 1, i.e. only the first record is printed.
            Pass ``None`` to print every record; 0 prints nothing.

    Raises:
        ValueError: if ``max_records`` is negative (raised by ``islice``).
    """
    # Local import so the function does not depend on another cell's imports.
    from itertools import islice

    with open(file_name, "rb") as avro_file:
        reader = fastavro.reader(avro_file)
        # islice stops pulling from the reader once the limit is reached,
        # so no record beyond the requested ones is decoded.
        for record in islice(reader, max_records):
            print(record)

# Spot-check the ride request file by printing a record read back from it
read_avro("ride_requests.avro")

# Spot-check the driver status file by printing a record read back from it
read_avro("driver_status.avro")


{'request_id': '2f5f3823-e242-4123-bb74-5a3dfe76ce56', 'passenger_id': '2951ab24-8c59-4f2b-b1ac-bcad32506593', 'pickup_lat': -78.283998, 'pickup_lng': 127.482541, 'destination_lat': 82.529723, 'destination_lng': 172.109099, 'ride_type': 'pool', 'status': 'requested', 'estimate_price': 14.03, 'demand_level': 'high', 'estimated_distance_km': 10.91, 'datetime': '2025-03-15T20:43:49.267750'}
{'driver_id': 'e3a4ff92-3851-4292-ac27-b81c723215da', 'current_lat': 82.453372, 'current_lng': 82.577779, 'status': 'offline', 'vehicle_type': 'cabify', 'timestamp': '2025-03-15T20:43:49.271462'}


## Saving the generated data into the JSON files.

In [16]:
import json

# Function to Write Data to JSON File
def write_to_json(file_name, data):
    """Serialize ``data`` to ``file_name`` as JSON indented by 4 spaces.

    Args:
        file_name: path of the file to create; it is opened in text write
            mode, so an existing file at that path is overwritten.
        data: any object ``json.dump`` can serialize (e.g. a list of dicts).

    Raises:
        TypeError: if ``data`` contains a value ``json`` cannot serialize,
            such as a ``Decimal``.
    """
    # Explicit UTF-8 keeps the output independent of the platform's default
    # text encoding.
    with open(file_name, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4)

# Persist the ride requests as JSON
write_to_json("ride_requests.json", ride_requests)

# Persist the driver status updates as JSON
write_to_json("driver_status.json", driver_updates)

# Confirmation messages, one per output file
print(
    "✅ Ride requests saved to ride_requests.json",
    "✅ Driver status updates saved to driver_status.json",
    sep="\n",
)


✅ Ride requests saved to ride_requests.json
✅ Driver status updates saved to driver_status.json
