# Clinic Patient and Appointment Management
In this notebook I have created a new SQL database, where there will be 4 tables: `Patients`, `Appointments`, `Doctors`, `TreatmentPlans`. 

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Patients.csv 

npatients = 1000

# Primary key: Nominal
PatientId = np.unique(np.random.randint(7330000000, 7340000000, npatients, dtype=np.int64))

# Nominal
random_name_gender = np.loadtxt('random-names/random-names-gender.csv', delimiter=',', unpack = True,  dtype=str)
fullname = np.array([])
for i in range(len(random_name_gender[0])):
    fullname = np.append(fullname, random_name_gender[0][i].strip('"') + ' ' + random_name_gender[1][i].strip('"'))

# Nominal
gender = random_name_gender[2]
# Ratio
Age = np.random.randint(18, 100, npatients)

# Nominal
ContactNumber = np.unique(np.random.randint(7900000000, 7910000000, npatients, dtype=np.int64))
ContactNumber = np.array(['0' + str(number) for number in ContactNumber])

patients_df = pd.DataFrame({
    'PatientId': PatientId,
    'Name': fullname,
    'Gender': gender,
    'Age': Age,
    'ContactNumber': ContactNumber
})
patients_df.to_csv('Patients.csv', index=False)

In [5]:
patients_df.shape

(1000, 5)

In [6]:
patients_df.head()

Unnamed: 0,PatientId,Name,Gender,Age,ContactNumber
0,7330000188,Jordan Alexander,"""Male""",58,7900000255
1,7330001100,Cherry Williams,"""Female""",83,7900039813
2,7330013040,Vanessa Taylor,"""Female""",46,7900041110
3,7330038611,Richard Johnson,"""Male""",67,7900049383
4,7330039132,Rafael Watson,"""Male""",28,7900050838


In [7]:
# Appointments.csv

from datetime import datetime


nappointments = 3000  

# Nominal attribute
patient_ids = np.random.choice(patients_df['PatientId'], nappointments, replace=True)

doctor_names = ['Dr. Smith', 'Dr. Johnson', 'Dr. Williams', 'Dr. Jones', 'Dr. Brown', 
                'Dr. Davis', 'Dr. Miller', 'Dr. Wilson', 'Dr. Moore', 'Dr. Taylor']
# Nominal attribute
appointed_doctor = np.random.choice(doctor_names, nappointments)

current_year = datetime.now().year
years = np.full(nappointments, current_year)  


months = np.random.randint(1, 13, nappointments)
days = np.random.randint(1, 29, nappointments) 

hours = np.random.randint(8, 18, nappointments) 
minutes = np.random.randint(0, 60, nappointments)  

# Interval attribute
appointment_dates = [
    f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)} {str(hour).zfill(2)}:{str(minute).zfill(2)}"
    for year, month, day, hour, minute in zip(years, months, days, hours, minutes)
]


appointments_df = pd.DataFrame({
    'PatientID': patient_ids,
    'DoctorName': appointed_doctor,
    'AppointmentDate': appointment_dates
})
appointments_df.to_csv('Appointments.csv', index=False)

In [8]:
appointments_df.head()

Unnamed: 0,PatientID,DoctorName,AppointmentDate
0,7332529926,Dr. Wilson,2024-11-16 17:20
1,7330210571,Dr. Miller,2024-05-12 12:36
2,7330616815,Dr. Moore,2024-12-23 17:56
3,7330685032,Dr. Smith,2024-03-06 17:11
4,7339463173,Dr. Taylor,2024-03-08 09:02


In [9]:
# Doctors Table

# Ordinal attribute
doctor_positions = [
    'Consultant', 'Senior Consultant', 'GP Partner', 
    'Salaried GP', 'Locum GP', 'Clinical Lead', 'Medical Director'
]

# Nominal attribute
specialties = [
    'General Medicine', 'Cardiology', 'Dermatology', 'Pediatrics', 'Neurology', 
    'Oncology', 'Orthopedics', 'Gastroenterology', 'Psychiatry', 'Emergency Medicine'
]

doctor_positions_assigned = np.random.choice(doctor_positions, len(doctor_names), replace=True)
doctor_specialties_assigned = np.random.choice(specialties, len(doctor_names), replace=True)

doctors_df = pd.DataFrame({
    'DoctorName': doctor_names,
    'Position': doctor_positions_assigned,
    'Specialty': doctor_specialties_assigned
})
doctors_df.to_csv('Doctors.csv', index=False)

In [10]:
doctors_df.head()

Unnamed: 0,DoctorName,Position,Specialty
0,Dr. Smith,Consultant,Orthopedics
1,Dr. Johnson,Consultant,Oncology
2,Dr. Williams,Locum GP,Orthopedics
3,Dr. Jones,Locum GP,Neurology
4,Dr. Brown,Medical Director,Orthopedics


In [11]:
# Treatment table
ntreatments = 3000

# Foreign key: Nominal
patient_ids = np.random.choice(patients_df['PatientId'], ntreatments, replace=True)

current_date = datetime.now()
years = np.full(nappointments, current_year)  

treatment_years = np.random.randint(2010, years[0], ntreatments)
treatment_month = np.random.randint(1, 13, ntreatments)
treatment_days = np.random.randint(1, 29, ntreatments)
treatment_dates = [f'{treatment_years[i]}-{str(treatment_month[i]).zfill(2)}-'
                     f'{str(treatment_days[i]).zfill(2)}' for i in range(ntreatments)]

# Ordinal 
statuses = ["Ongoing", "Completed", "Follow-Up"]

treatment_statuses = np.random.choice(statuses, ntreatments, replace=True)

treatment_plans_df = pd.DataFrame({
    'PatientID': patient_ids,
    'TreatmentDate': treatment_dates,
    'Status': treatment_statuses
})


In [12]:
treatment_plans_df.head()

Unnamed: 0,PatientID,TreatmentDate,Status
0,7338068010,2020-04-03,Follow-Up
1,7339071516,2020-03-18,Follow-Up
2,7338563633,2021-04-16,Follow-Up
3,7335368435,2017-11-13,Follow-Up
4,7332110182,2011-03-04,Completed


In [13]:
# Introducing deliberate missing values

n_points = 50  
random_indices = np.random.choice(treatment_plans_df.index, n_points, replace=False)

# Set selected rows' TreatmentDate and Status to NaN
treatment_plans_df.loc[random_indices, 'TreatmentDate'] = np.nan
treatment_plans_df.loc[random_indices, 'Status'] = np.nan

treatment_plans_df.to_csv('Treatments.csv', index=False)
# Check and print how many values are NaN
print(treatment_plans_df.isna().sum()) 


PatientID         0
TreatmentDate    50
Status           50
dtype: int64


In [14]:
# Introducing duplicate values to Patients table

n_points = 5
random_indices = np.random.choice(patients_df.index, n_points, replace=False)

duplicates = patients_df.loc[random_indices].copy()

patients_with_duplicates = pd.concat([patients_df, duplicates], ignore_index=True)

patients_with_duplicates = patients_with_duplicates.sample(frac=1).reset_index(drop=True)

print(random_indices)
print(duplicates)
print(patients_with_duplicates.shape)
patients_with_duplicates.tail()

patients_with_duplicates.to_csv('Patients.csv', index=False)

[421 529 245 737  28]
      PatientId             Name    Gender  Age ContactNumber
421  7334316062   Heather Barnes  "Female"   26   07904626573
529  7335380001     Jacob Warren    "Male"   74   07905746811
245  7332460045      Clark Clark    "Male"   20   07902815343
737  7337355337     Edith Howard  "Female"   30   07907444035
28   7330254145  Kristian Watson    "Male"   60   07900324965
(1005, 5)
