# 01 (01): Generate Data

In this notebook, we generate two fake medical datasets to work with. The first is fake patient data, containing personal details such as name and address.

The other dataset contains information about the appointments those patients have attended, which we aggregate later.

In [None]:
# Install the faker package (not included with Fabric by default)
!pip install faker

In [None]:
# Import packages
from datetime import datetime
from faker import Faker

import pandas as pd
import numpy as np
import random

In [None]:
# Number of patients to generate and the seed shared by every random generator
SEED = 42
num_records = 20000

# Faker instance using the UK (en_GB) locale
fake = Faker('en_GB')

# Seed NumPy, Python's random module and the Faker instance with the same value
# so that their random draws are repeatable from one run to the next
np.random.seed(SEED)
random.seed(SEED)
fake.seed_instance(SEED)

## Generate fake patient data

In [None]:
# Generate a synthetic patient dataset with intentional data-quality errors:
# a few impossible (future) dates of birth, blank addresses and 'N/A' phone numbers.
FUTURE_DOB_RATE = 0.005    # probability that a patient gets a future date of birth
MISSING_VALUE_RATE = 0.01  # probability that an address / phone number is missing

# Anchor the impossible birth years to the current year so they always lie in
# the future; hard-coded years silently become plausible as time passes.
# The range keeps the original width of six years.
current_year = datetime.now().year

data = []
for patient_id in range(1, num_records + 1):
    # Start from a plausible date of birth (age 0 to 90) ...
    dob = fake.date_of_birth(minimum_age=0, maximum_age=90)
    # ... and occasionally replace it with an unrealistic one
    if random.random() < FUTURE_DOB_RATE:
        # Days stop at 28 so every year/month/day combination is a valid date
        dob = datetime(
            random.randint(current_year + 1, current_year + 6),
            random.randint(1, 12),
            random.randint(1, 28),
        ).date()
    # Store the value as a pandas Timestamp rather than a plain date
    dob = pd.to_datetime(dob)

    # Address and phone number are randomly replaced by '' and 'N/A' respectively
    data.append({
        'patient_id': patient_id,
        'name': fake.name(),
        'date_of_birth': dob,
        'address': fake.address().replace('\n', ', ') if random.random() > MISSING_VALUE_RATE else '',
        'phone_number': fake.phone_number() if random.random() > MISSING_VALUE_RATE else 'N/A',
        'is_public_patient': random.choice([1, 0])
    })

# Build the data frame in one go from the list of records and preview it
df = pd.DataFrame(data)
df.head(2)

## Generate fake appointment data

In [None]:
# Each patient attends between min_appointments and max_appointments appointments
min_appointments, max_appointments = 0, 5

# Build the appointment history, one record per appointment
appointment_data = []
for patient_id in range(1, num_records + 1):
    num_appointments = random.randint(min_appointments, max_appointments)

    # Every patient has a personal pool of 1 to 3 doctors
    doctors = [fake.name() for _ in range(random.randint(1, 3))]

    # Each appointment falls within the last two years and is with one of
    # the doctors from the patient's pool
    appointment_data.extend(
        {
            'patient_id': patient_id,
            'appointment_date': fake.date_between(start_date='-2y', end_date='today'),
            'doctor_seen': random.choice(doctors),
        }
        for _ in range(num_appointments)
    )

# Collect the records in a data frame and preview it
appointments_df = pd.DataFrame(appointment_data)
appointments_df.head(2)

## Upload data to the lakehouse

We will discuss this later. **For now, change the lakehouse path to your own.**

To extract the lakehouse path:

* navigate to your lakehouse,
* click on the three dots next to 'Tables',
* copy the ABFS path.

In [None]:
# ABFS path of the lakehouse 'Tables' folder -- replace with your own
abfs_path = 'abfss://490a35a8-ffa1-4c26-8ad2-f394ba2aaefd@onelake.dfs.fabric.microsoft.com/e5d53df5-727e-4244-8a8e-eb9d9b6ac78b/Tables'

# Target table name -> pandas data frame to store under that name
tables = {
    'personal_df': df,
    'appointment_df': appointments_df
}

# Convert each pandas data frame to Spark and write it as a Delta table,
# replacing any existing table (including its schema) at the same location
for table_name, table in tables.items():
    destination = f"{abfs_path}/{table_name}"
    df_spark = spark.createDataFrame(table)
    writer = (
        df_spark.write
        .format('delta')
        .mode('overwrite')
        .option('overwriteSchema', 'true')
    )
    writer.save(destination)