In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import uuid
import random
from datetime import datetime, timedelta
import string


In [2]:
task = pd.DataFrame([{
    "visit_id": "unique identifier for each visit/ w duplicate",
    "patient_id": "unique identifier for each patient",
    "nurse_id": "unique identifier for each nurse",
    "visit_start_time": "timestamp when the visit started/format",
    "visit_end_time": "timestamp when the visit ended/ w missing val/format/too short/too long",
    "service_type": "Medication Administration, Wound Care, Physical Therapy, General Check-up/typo",
    "visit_location": "North, South, East, West/typo",
    "nurse_notes": "free notes/w missing val/irrel text/sp char"
}])
task.transpose()

Unnamed: 0,0
visit_id,unique identifier for each visit/ w duplicate
patient_id,unique identifier for each patient
nurse_id,unique identifier for each nurse
visit_start_time,timestamp when the visit started/format
visit_end_time,timestamp when the visit ended/ w missing val/...
service_type,"Medication Administration, Wound Care, Physica..."
visit_location,"North, South, East, West/typo"
nurse_notes,free notes/w missing val/irrel text/sp char


In [3]:
""" create syntheic data for last 30 days """

today = datetime.today().replace(microsecond=0)
last_30_days = today - timedelta(days=30)
service_types = np.array(["Medication Administration", "Wound Care", "Physical Therapy", "General Check-up"])
visit_locations = np.array(["North", "South", "East", "West"])
# language pools for nurse notes based on service types
service_pools = {
    "Medication Administration": {
        "verbs": ["administered", "delivered", "provided"],
        "objects": ["medication", "injection", "IV line", "oral dose"]
    },
    "Wound Care": {
        "verbs": ["changed", "applied", "cleaned", "dressed"],
        "objects": ["wound dressing", "bandage", "incision site"]
    },
    "Physical Therapy": {
        "verbs": ["assisted", "guided", "monitored", "encouraged"],
        "objects": ["mobility exercise", "range of motion", "strength training", "gait practice"]
    },
    "General Check-up": {
        "verbs": ["checked", "reviewed", "monitored", "observed"],
        "objects": ["vital signs", "blood pressure", "temperature", "oxygen level", "patient condition"]
    }
}
urgent_keywords = ["Urgent", "ASAP", "Action", "Critical"]
followup_keywords = ["Follow-up",  "Review", "Ongoing assessment", "Monitoring"]
status_rules= {"in pain": urgent_keywords,"restless": urgent_keywords,
                "dizzy":followup_keywords,  "weak": followup_keywords,
                "improving": followup_keywords, "stable": [],
                "comfortable": []}
status_probs = {"stable": 0.28, "improving": 0.22, "comfortable": 0.12,"in pain":0.14,"restless":0.10,"dizzy":0.07, "weak": 0.07}
_status_keys = np.array(list(status_probs.keys()))
_status_p = np.array(list(status_probs.values()))

In [4]:
def generate_visit_ids(n):
    """Generate unique visit UUIDs."""
    return np.array([str(uuid.uuid4()) for _ in range(n)])

def build_id_pools(rng, n_patients=500, n_nurses=100):
    """Generate pools of patient and nurse IDs."""
    p_pool = np.array([f"P{pid}" for pid in rng.integers(10000, 99999, size=n_patients)])
    n_pool = np.array([f"N{nid}" for nid in rng.integers(1000,  9999,  size=n_nurses)])
    return p_pool, n_pool

def assign_ids(rng, pool, n):
    """Assign IDs to visits (allows repeats)."""
    return rng.choice(pool, size=n, replace=True)

def generate_times(rng, start_dt, days, n, min_visit_min=10, max_visit_min=120):
    """Generate start and end times for visits."""
    # assuming the service is 24/7.
    start_date = np.datetime64(start_dt)
    end_date   = start_date + np.timedelta64(days, "D")
    total_secs = (end_date - start_date).astype("timedelta64[s]").astype(int)

    start_offsets = rng.integers(0, total_secs + 1, size=n).astype("timedelta64[s]")
    start_times   = start_date + start_offsets

    durations = rng.integers(min_visit_min, max_visit_min + 1, size=n).astype("timedelta64[m]")
    end_times = start_times + durations
    return start_times, end_times

def generate_notes(rng, service_choices):
    """Generate random nurse notes."""
    notes = []
    for service in service_choices:
        verb = rng.choice(service_pools[service]["verbs"])
        obj = rng.choice(service_pools[service]["objects"])
        status = rng.choice(list(status_rules.keys()))
        note = f"The nurse {verb} the {obj}. The patient is {status}."

        # assign keywords based on status
        keywords = status_rules[status]
        if keywords:
            kw = rng.choice(keywords)
            if rng.random() < 0.5:
                note = f"{kw} is required. {note}"
            else:
                note = f"{note} {kw} is required."
        notes.append(note)
    return np.array(notes)

In [5]:
# functions to generate 1000 clearn visits for 30 days without noises
def visits(N=1000, seed=2026, days=30, min_visit_min=10, max_visit_min=120):
    # random seed
    rng = np.random.default_rng(seed)
    
    # visit IDs
    visit_ids = generate_visit_ids(N)
    # ID pools
    p_pool, n_pool = build_id_pools(rng)

    # Assign per-visit data
    patient_ids = assign_ids(rng, p_pool, N)
    nurse_ids = assign_ids(rng, n_pool, N)
    chosen_service_types = rng.choice(service_types,size=N, replace=True)
    chosen_visit_locations = rng.choice(visit_locations, size=N, replace=True)

    # Times
    start_times, end_times = generate_times(rng, last_30_days.replace(microsecond=0),
                                            days, N, min_visit_min, max_visit_min)

    # Notes
    notes = generate_notes(rng, chosen_service_types)

    # Assemble DataFrame
    df = pd.DataFrame({
        "visit_id": visit_ids,
        "patient_id": patient_ids,
        "nurse_id": nurse_ids,
        "visit_start_time": start_times,
        "visit_end_time": end_times,
        "service_type": chosen_service_types,
        "visit_location": chosen_visit_locations,
        "nurse_notes": notes,
    })

    return df

In [6]:
# generate the dataset
df = visits()
# overview of columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   visit_id          1000 non-null   object        
 1   patient_id        1000 non-null   object        
 2   nurse_id          1000 non-null   object        
 3   visit_start_time  1000 non-null   datetime64[us]
 4   visit_end_time    1000 non-null   datetime64[us]
 5   service_type      1000 non-null   object        
 6   visit_location    1000 non-null   object        
 7   nurse_notes       1000 non-null   object        
dtypes: datetime64[us](2), object(6)
memory usage: 62.6+ KB


In [7]:
# preview of the first 5 rows
df.head()

Unnamed: 0,visit_id,patient_id,nurse_id,visit_start_time,visit_end_time,service_type,visit_location,nurse_notes
0,6a4a715f-94c5-4b5d-8f51-51f07f298fb6,P59581,N1565,2025-09-02 00:08:24,2025-09-02 00:55:24,Wound Care,East,The nurse changed the wound dressing. The pati...
1,27bfc4de-9dd2-4b6b-b05f-f9560a3fd00f,P66547,N5698,2025-08-24 22:42:55,2025-08-25 00:12:55,General Check-up,East,The nurse reviewed the oxygen level. The patie...
2,9c2d0018-8f20-4789-a383-906d49042fbc,P47919,N3012,2025-09-02 11:07:33,2025-09-02 11:56:33,Physical Therapy,East,Review is required. The nurse assisted the mob...
3,bbedc08d-0ada-4c8f-bac4-3008ac279a01,P18359,N8991,2025-09-15 21:24:42,2025-09-15 22:12:42,General Check-up,North,Follow-up is required. The nurse reviewed the ...
4,155c3765-22d5-41cc-95f5-a94ef19748ae,P45602,N7948,2025-08-31 07:19:11,2025-08-31 07:47:11,General Check-up,North,ASAP is required. The nurse observed the patie...


In [8]:
"""save the clean version"""

from pathlib import Path
import csv

target_dir = Path(r"C:/lsrgc/LittleSteps/data")
target_dir.mkdir(parents=True, exist_ok=True)

out_path = target_dir / "clean_visits.csv"
df.to_csv(out_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_MINIMAL)

print("Saved to:", out_path.resolve())

Saved to: C:\lsrgc\LittleSteps\data\clean_visits.csv


In [9]:
"""error rates"""

rates = dict(
    duplicate_visit_id_rate=0.03,     # ~3% rows will reuse an existing visit_id
    missing_end_rate=0.10,            # ~10% missing visit_end_time
    missing_notes_rate=0.10,          # ~10% missing nurse_notes
    inconsistent_time_fmt_rate=0.40,  # ~40% of time fields use inconsistent formats
    outlier_short_rate=0.03,          # very short visits (< 5 minute)
    outlier_long_rate=0.03,           # very long visits (> 6 hours but < 24 hours)
    typo_service_rate=0.08,           # service_type typos
    typo_location_rate=0.05,          # visit_location typos
    irrelevant_notes_rate=0.06        # junk/special chars in nurse_notes
)
rates['duplicate_visit_id_rate']

0.03

In [10]:
n= len(df)
seed = 2026
rng = np.random.default_rng(seed)

"""Outliers in visit_start_time or visit_end_time (e.g., visits lasting unusually short or long)"""

df["duration_min"] = (df["visit_end_time"] - df["visit_start_time"]).dt.total_seconds() / 60
print(df["duration_min"].describe())


#too short < 5 min
def visit_too_short(start):
    seconds = int(rng.integers(15,10*60)) # 15sec to 5 mins
    return start + timedelta(seconds = seconds)
#too long > 6 hours <24 hours
def visit_too_long(start):
    seconds = int(rng.integers(2*3600, 24*3600)) #2 to 24 hours
    return start + timedelta(seconds = seconds)
    
k_short = int(rates["outlier_short_rate"] * n)
k_long  = int(rates["outlier_long_rate"]  * n)
perm = rng.permutation(df.index.to_numpy())
idx_short = perm[:k_short]
idx_long  = perm[k_short:k_short+k_long]

df.loc[idx_short, "visit_end_time"] = df.loc[idx_short, "visit_start_time"].apply(visit_too_short)
df.loc[idx_long,  "visit_end_time"] = df.loc[idx_long,  "visit_start_time"].apply(visit_too_long)

df["duration_min"] = (df["visit_end_time"] - df["visit_start_time"]).dt.total_seconds() / 60
print("Shortest durations:\n", df["duration_min"].nsmallest(5))
print("Longest durations:\n", df["duration_min"].nlargest(5))


"""Duplicate visit_id records"""

idx_dup_id = rng.choice(df.index, size=int(rates['duplicate_visit_id_rate'] * n), replace=False)
for i in idx_dup_id:
    donor = rng.choice(df.index)                          # pick a random existing row
    df.loc[i, "visit_id"] = df.loc[donor, "visit_id"]

print(df.loc[idx_dup_id, "visit_id"].value_counts().sort_values())
print(df["visit_id"].value_counts()[lambda x: x > 1])


count    1000.000000
mean       64.025000
std        32.231184
min        10.000000
25%        36.000000
50%        62.500000
75%        92.000000
max       120.000000
Name: duration_min, dtype: float64
Shortest durations:
 113    0.400000
182    0.533333
98     0.550000
827    1.000000
297    1.200000
Name: duration_min, dtype: float64
Longest durations:
 736    1433.716667
76     1388.550000
221    1381.383333
975    1182.800000
891    1116.116667
Name: duration_min, dtype: float64
visit_id
364104f6-cd23-44fe-80ef-5ef73b20bd26    1
56eb76c1-5cfc-4ac4-8054-1d4010533a9f    1
0c18e2d7-62ba-4ffa-8795-892590a573fe    1
de17b966-fbde-4da5-b36a-c171809f614b    1
3abd6c21-5373-428b-8635-e49409d53943    1
fe5239d8-3fbd-499b-ba02-c7b9caddc9ae    1
a84a859a-a27a-4984-8bc4-4e697756a197    1
5f7a7c8d-4cdd-477f-ba70-10443522bbef    1
51f2210c-843a-4d28-964d-5e5662bf2e21    1
9f3116bd-b0cf-406c-bda5-049c30981013    1
9ef7d767-e4a6-4f94-8c9d-9f8ca5c0c346    1
31e6d04c-02cc-4adf-bd9b-94d47c0077bc    

In [11]:
#Typographical errors in service_type or visit_location

# typo pool. Improvement: function to minic more realisitic typos.
service_type_typo = {
    "Medicatn Adminstratino": "Medication Administration",
    "Wound Cae": "Wound Care",
    "Pyhcisal Therapy": "Physical Therapy",
    "General Chek-up": "General Check-up"
}

visit_location_typo = {
    "Notrh": "North",
    "Soutth": "South",
    "Easst": "East",
    "Wsst": "West"
}

idx_service  = rng.choice(df.index, size=int(rates["typo_service_rate"] *n), replace=False) 
idx_location = rng.choice(df.index, size=int(rates["typo_location_rate"]*n), replace=False) 

df.loc[idx_service, "service_type"] = rng.choice(list(service_type_typo.keys()), size=len(idx_service))
df.loc[idx_location, "visit_location"] = rng.choice(list(visit_location_typo.keys()), size=len(idx_location))

print(df.loc[idx_service, "service_type"].head(10))
print(df.loc[idx_location, "visit_location"].head(10))



266    Medicatn Adminstratino
974          Pyhcisal Therapy
890           General Chek-up
703          Pyhcisal Therapy
754                 Wound Cae
854          Pyhcisal Therapy
727                 Wound Cae
796          Pyhcisal Therapy
743    Medicatn Adminstratino
274           General Chek-up
Name: service_type, dtype: object
14      Notrh
550      Wsst
854    Soutth
136     Easst
142     Easst
451     Easst
457    Soutth
714      Wsst
503     Notrh
673     Easst
Name: visit_location, dtype: object


In [12]:
#Irrelevant text or special characters in nurse_notes
noise_snippets = [
    "##--##", "###$$$%%%^^^", "~@#`", "***CONFIDENTIAL***",
    ">>>???<<<", "~~~", "xyz123", "!!!", "N/A", "///\\\\"]
def inject_noise_each_sentence(note):
    """add a noise snippet after each sentence."""
    sentences = [s.strip() for s in note.split(".") if s.strip()]
    noisy_sentences = []
    for s in sentences:
        noise = rng.choice(noise_snippets)  
        noisy_sentences.append(f"{s}. {noise}")
    return " ".join(noisy_sentences)
    
idx_notes = rng.choice(df.index, size=int(rates['irrelevant_notes_rate'] * n), replace=False)
df.loc[idx_notes, "nurse_notes"] = df.loc[idx_notes, "nurse_notes"].apply(inject_noise_each_sentence)
print(df.loc[idx_notes, "nurse_notes"].head(10))



610    The nurse monitored the patient condition. ///...
857    The nurse changed the bandage. ###$$$%%%^^^ Th...
296    The nurse delivered the oral dose. ***CONFIDEN...
227    Review is required. ///\\ The nurse dressed th...
501    The nurse changed the wound dressing. ###$$$%%...
930    ASAP is required. !!! The nurse provided the m...
699    The nurse observed the vital signs. N/A The pa...
344    The nurse reviewed the patient condition. >>>?...
824    The nurse guided the gait practice. ~~~ The pa...
95     The nurse delivered the IV line. N/A The patie...
Name: nurse_notes, dtype: object


In [13]:
# #Missing values (e.g., visit_end_time for some records, nurse_notes for others)

missing_idx_end = rng.choice(df.index, size = int(rates['missing_end_rate'] * n), replace = False)
missing_idx_notes = rng.choice(df.index, size = int(rates['missing_notes_rate'] * n), replace = False)
df.loc[missing_idx_end, "visit_end_time"] = np.nan
df.loc[missing_idx_notes, "nurse_notes"] = np.nan

print(df.loc[missing_idx_end, "visit_end_time"].head(10))
print(df.loc[missing_idx_notes, "nurse_notes"].head(10))



629   NaT
438   NaT
407   NaT
138   NaT
81    NaT
713   NaT
789   NaT
169   NaT
613   NaT
658   NaT
Name: visit_end_time, dtype: datetime64[us]
867    NaN
182    NaN
461    NaN
484    NaN
260    NaN
885    NaN
722    NaN
308    NaN
244    NaN
143    NaN
Name: nurse_notes, dtype: object


In [14]:
#Inconsistent date/time formats
df["visit_start_time"] = df["visit_start_time"].astype("object")
df["visit_end_time"]   = df["visit_end_time"].astype("object")

time_formats = [
    "%Y-%m-%d %H:%M:%S",    # 2025-09-17 14:30:59
    "%m/%d/%Y %H:%M",       # 09/17/2025 14:30
    "%B %d, %Y %I:%M%p",    # September 17, 2025 02:30PM
    "%Y/%m/%d %H:%M",       # 2025/09/17 14:30
]
def random_inconsistent_time(x):
    """Format a Timestamp (or parsable string) into a random string format."""
    if pd.isna(x):
        return x
    ts = pd.to_datetime(x, errors="coerce")
    if pd.isna(ts):
        return str(x)
    fmt = rng.choice(time_formats)
    return pd.to_datetime(x).strftime(fmt)

idx_start = rng.choice(df.index, size=int(rates['inconsistent_time_fmt_rate'] * n), replace=False)
idx_end  = rng.choice(df.index, size=int(rates['inconsistent_time_fmt_rate'] * n), replace=False)

df.loc[idx_start, "visit_start_time"] = df.loc[idx_start, "visit_start_time"].apply(random_inconsistent_time)
df.loc[idx_end, "visit_end_time"]  = df.loc[idx_end, "visit_end_time"].apply(random_inconsistent_time)

print(df.dtypes)



visit_id             object
patient_id           object
nurse_id             object
visit_start_time     object
visit_end_time       object
service_type         object
visit_location       object
nurse_notes          object
duration_min        float64
dtype: object


In [15]:
df.head()

Unnamed: 0,visit_id,patient_id,nurse_id,visit_start_time,visit_end_time,service_type,visit_location,nurse_notes,duration_min
0,6a4a715f-94c5-4b5d-8f51-51f07f298fb6,P59581,N1565,2025-09-02 00:08:24,2025-09-02 00:55:24,Wound Care,East,The nurse changed the wound dressing. The pati...,47.0
1,27bfc4de-9dd2-4b6b-b05f-f9560a3fd00f,P66547,N5698,"August 24, 2025 10:42PM","August 25, 2025 12:12AM",General Check-up,East,The nurse reviewed the oxygen level. The patie...,90.0
2,9c2d0018-8f20-4789-a383-906d49042fbc,P47919,N3012,2025-09-02 11:07:33,2025/09/02 11:56,Physical Therapy,East,Review is required. The nurse assisted the mob...,49.0
3,bbedc08d-0ada-4c8f-bac4-3008ac279a01,P18359,N8991,2025-09-15 21:24:42,2025-09-15 22:12:42,General Check-up,North,Follow-up is required. The nurse reviewed the ...,48.0
4,155c3765-22d5-41cc-95f5-a94ef19748ae,P45602,N7948,2025-08-31 07:19:11,2025-08-31 07:47:11,Pyhcisal Therapy,North,ASAP is required. The nurse observed the patie...,28.0


In [16]:
import re
def has_special_char(series):
    pattern = re.compile(r"[^a-zA-Z0-9\s\-\.:/,]") 
    return series.astype(str).apply(lambda x: bool(pattern.search(x))).any()

for col in df.columns:
    has_dups = df[col].duplicated().any()
    has_missing = df[col].isna().any()
    has_special = has_special_char(df[col]) if df[col].dtype == "object" else False

    print(f"{col}: duplicates={has_dups}, missing={has_missing}, special_chars={has_special}")
    

visit_id: duplicates=True, missing=False, special_chars=False
patient_id: duplicates=True, missing=False, special_chars=False
nurse_id: duplicates=True, missing=False, special_chars=False
visit_start_time: duplicates=False, missing=False, special_chars=False
visit_end_time: duplicates=True, missing=True, special_chars=False
service_type: duplicates=True, missing=False, special_chars=False
visit_location: duplicates=True, missing=False, special_chars=False
nurse_notes: duplicates=True, missing=True, special_chars=True
duration_min: duplicates=True, missing=False, special_chars=False


In [17]:
df = df.drop(columns = "duration_min")
df.head()

Unnamed: 0,visit_id,patient_id,nurse_id,visit_start_time,visit_end_time,service_type,visit_location,nurse_notes
0,6a4a715f-94c5-4b5d-8f51-51f07f298fb6,P59581,N1565,2025-09-02 00:08:24,2025-09-02 00:55:24,Wound Care,East,The nurse changed the wound dressing. The pati...
1,27bfc4de-9dd2-4b6b-b05f-f9560a3fd00f,P66547,N5698,"August 24, 2025 10:42PM","August 25, 2025 12:12AM",General Check-up,East,The nurse reviewed the oxygen level. The patie...
2,9c2d0018-8f20-4789-a383-906d49042fbc,P47919,N3012,2025-09-02 11:07:33,2025/09/02 11:56,Physical Therapy,East,Review is required. The nurse assisted the mob...
3,bbedc08d-0ada-4c8f-bac4-3008ac279a01,P18359,N8991,2025-09-15 21:24:42,2025-09-15 22:12:42,General Check-up,North,Follow-up is required. The nurse reviewed the ...
4,155c3765-22d5-41cc-95f5-a94ef19748ae,P45602,N7948,2025-08-31 07:19:11,2025-08-31 07:47:11,Pyhcisal Therapy,North,ASAP is required. The nurse observed the patie...


In [18]:
from pathlib import Path
import csv

target_dir = Path(r"C:/lsrgc/LittleSteps/data")
target_dir.mkdir(parents=True, exist_ok=True)

out_path = target_dir / "visits.csv"
df.to_csv(out_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_MINIMAL)

print("Saved to:", out_path.resolve())

Saved to: C:\lsrgc\LittleSteps\data\visits.csv
