In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Set random seed 
np.random.seed(42)

# Generate base data
n_records = 1000

# Set categorical data
hospitals = ['Memorial Hospital', 'City Medical Center', 'University Hospital', 'Community Healthcare']
doctors = ['Dr. Smith', 'Dr. Johnson', 'Dr. Williams', 'Dr. Brown', 'Dr. Jones', 'Dr. Garcia', 'Dr. Miller', 'Dr. Davis']
blood_types = ['A+', 'A-', 'B+', 'B-', 'O+', 'O-', 'AB+', 'AB-']
diagnoses = ['Pneumonia', 'Heart Disease', 'Diabetes', 'Hypertension', 'Appendicitis', 'Fracture', 'Stroke', 'Cancer']
treatments = ['Medication', 'Surgery', 'Physical Therapy', 'Chemotherapy', 'Radiation', 'Preventive Care']
medications = ['Amoxicillin', 'Lisinopril', 'Metformin', 'Ibuprofen', 'Omeprazole', 'Simvastatin']

# Generate dates
end_date = datetime.now()
start_date = end_date - timedelta(days=365)


admission_dates = [start_date + timedelta(days=int(x)) for x in np.random.randint(0, 365, n_records)]

# Create dataset
data = {
    'PatientID': range(1000, 1000 + n_records),
    'PatientName': [f'Patient_{i}' for i in range(n_records)],  # Anonymized names
    'Age': np.random.normal(50, 20, n_records).astype(int).clip(18, 95),
    'Gender': np.random.choice(['M', 'F'], n_records),
    'Bloodtype': np.random.choice(blood_types, n_records),
    'Hospital': np.random.choice(hospitals, n_records),
    'DoctorName': np.random.choice(doctors, n_records),
    'Diagnosis': np.random.choice(diagnoses, n_records),
    'Treatment': np.random.choice(treatments, n_records),
    'AdmissionDate': admission_dates,
    'RoomNumber': np.random.randint(100, 500, n_records),
    'DailyCost': np.random.normal(1000, 200, n_records).clip(700, 1500).round(2),
    'RecoveryRating': np.random.normal(7.5, 1.5, n_records).clip(1, 10).round(1)
}

# Calculate length of stay and discharge dates
los = np.random.lognormal(2, 0.5, n_records).astype(int).clip(1, 30)
data['DischargeDate'] = [ad + timedelta(days=int(los[i])) for i, ad in enumerate(admission_dates)]
data['LengthOfStay'] = los

# Calculate total bill
data['TotalBill'] = (los * data['DailyCost'] * (1 + np.random.normal(0, 0.1, n_records))).round(2)

# Add prescription details
data['Prescription'] = [', '.join(np.random.choice(medications, np.random.randint(1, 4))) for _ in range(n_records)]

# Create DataFrame
df = pd.DataFrame(data)

# Add realistic patterns
# 1. Older patients tend to have longer stays
df.loc[df['Age'] > 70, 'LengthOfStay'] += np.random.randint(1, 5, len(df[df['Age'] > 70]))

# 2. Certain diagnoses have higher costs
df.loc[df['Diagnosis'].isin(['Cancer', 'Heart Disease']), 'DailyCost'] *= 1.3

# 3. Recovery rating correlation with length of stay
df.loc[df['LengthOfStay'] > 15, 'RecoveryRating'] -= 1

# Round monetary values
df['DailyCost'] = df['DailyCost'].round(2)
df['TotalBill'] = df['TotalBill'].round(2)

# Export to CSV
df.to_csv('healthcare_analysis.csv', index=False)

# Print dataset info
print("Dataset Overview:")
print(f"Number of records: {len(df)}")
print("\nSample of the data:")
print(df.head())
print("\nSummary statistics:")
print(df.describe())

# Print unique values in categorical columns
categorical_columns = ['Hospital', 'DoctorName', 'Diagnosis', 'Treatment', 'Bloodtype', 'Gender']
print("\nUnique values in categorical columns:")
for col in categorical_columns:
    print(f"\n{col}:")
    print(df[col].value_counts())

Dataset Overview:
Number of records: 1000

Sample of the data:
   PatientID PatientName  Age Gender Bloodtype              Hospital  \
0       1000   Patient_0   36      F        O+  Community Healthcare   
1       1001   Patient_1   50      M        O-   University Hospital   
2       1002   Patient_2   32      F        B+  Community Healthcare   
3       1003   Patient_3   42      F       AB+   City Medical Center   
4       1004   Patient_4   70      M        B-   City Medical Center   

     DoctorName Diagnosis         Treatment              AdmissionDate  \
0    Dr. Garcia    Stroke   Preventive Care 2024-05-09 16:18:41.247950   
1    Dr. Garcia    Stroke         Radiation 2025-01-10 16:18:41.247950   
2  Dr. Williams  Fracture  Physical Therapy 2024-10-24 16:18:41.247950   
3     Dr. Brown    Stroke           Surgery 2024-05-13 16:18:41.247950   
4     Dr. Davis    Cancer        Medication 2024-04-08 16:18:41.247950   

   RoomNumber  DailyCost  RecoveryRating              Disch