In [None]:
# Lookup table used by generate_labels. Every entry carries three fields:
#   normal - either a (min, max) tuple, or a dict of (min, max) tuples keyed
#            by lowercase sex ('male' / 'female')
#   low    - condition labels attached when a value falls below min
#   high   - condition labels attached when a value rises above max
# An empty list means no label is attached for that direction.
# NOTE(review): units are not stated anywhere in this table - confirm them
# against the data source before reusing these bounds.
parameter_ranges = {
    'Hemoglobin': dict(
        normal=dict(male=(13.5, 17.5), female=(12.0, 15.5)),
        low=[
            'Anemia',
            'Blood loss',
            'Chronic disease',
            'Nutritional deficiency',
            'Bone marrow disorder',
            'Kidney disease',
        ],
        high=['Dehydration', 'Polycythemia vera', 'Lung disease', 'High altitude adaptation'],
    ),
    'RBC': dict(
        normal=dict(male=(4.5, 5.9), female=(4.0, 5.2)),
        low=[
            'Anemia',
            'Bone marrow failure',
            'Nutritional deficiency',
            'Chronic inflammation',
            'Hemolysis',
        ],
        high=['Dehydration', 'Polycythemia vera', 'Hypoxia', 'Kidney tumor'],
    ),
    'HCT': dict(
        normal=dict(male=(40, 50), female=(36, 46)),
        low=['Anemia', 'Bleeding', 'Nutritional deficiency', 'Bone marrow disorder'],
        high=['Dehydration', 'Polycythemia vera', 'Chronic lung disease'],
    ),
    'MCV': dict(
        normal=(80, 100),
        low=['Iron deficiency anemia', 'Thalassemia', 'Chronic disease'],
        high=['Vitamin B12 deficiency', 'Folate deficiency', 'Liver disease', 'Hypothyroidism'],
    ),
    'MCH': dict(
        normal=(27, 33),
        low=['Iron deficiency anemia', 'Thalassemia'],
        high=['Macrocytic anemia', 'Reticulocytosis'],
    ),
    'MCHC': dict(
        normal=(32, 36),
        low=['Iron deficiency anemia', 'Thalassemia'],
        high=['Hereditary spherocytosis', 'Hemoglobin C disease'],
    ),
    'RDW-CV': dict(
        normal=(11.5, 14.5),
        low=[],
        high=[
            'Iron deficiency anemia',
            'Vitamin B12 deficiency',
            'Hemoglobinopathy',
            'Myelodysplasia',
        ],
    ),
    'RDW-SD': dict(
        normal=(39, 46),
        low=[],
        high=['Iron deficiency anemia', 'Vitamin B12 deficiency'],  # Similar to RDW-CV
    ),
    'WBC': dict(
        normal=(4.0, 11.0),
        low=['Viral infection', 'Bone marrow disorder', 'Autoimmune disease', 'Severe infection'],
        high=['Bacterial infection', 'Leukemia', 'Inflammation', 'Stress response'],
    ),
    'NEU%': dict(
        normal=(40, 70),
        low=['Viral infection', 'Autoimmune disorder', 'Chemotherapy effect'],
        high=['Bacterial infection', 'Acute inflammation', 'Steroid use'],
    ),
    'LYM%': dict(
        normal=(20, 40),
        low=['HIV/AIDS', 'Immunosuppression', 'Radiation exposure'],
        high=['Viral infection', 'Chronic infection', 'Lymphoma'],
    ),
    'MON%': dict(
        normal=(2, 10),
        low=[],
        high=['Chronic infection', 'Autoimmune disease', 'Myeloproliferative disorder'],
    ),
    'EOS%': dict(
        normal=(0, 6),
        low=[],
        high=['Allergic disorder', 'Parasitic infection', 'Autoimmune disease'],
    ),
    'BAS%': dict(
        normal=(0, 2),
        low=[],
        high=['Allergic reaction', 'Chronic inflammation', 'Myeloproliferative disorder'],
    ),
    'LYM#': dict(
        normal=(1.0, 4.0),
        low=['HIV/AIDS', 'Immunosuppression'],
        high=['Viral infection', 'Lymphoma'],
    ),
    'GRA#': dict(
        normal=(1.8, 7.0),
        low=['Chemotherapy effect', 'Bone marrow failure'],
        high=['Bacterial infection', 'Inflammation'],
    ),
    'PLT': dict(
        normal=(150, 450),
        low=['Viral infection', 'Autoimmune disorder', 'Bone marrow disorder'],
        high=['Inflammation', 'Iron deficiency', 'Myeloproliferative disorder'],
    ),
    'ESR': dict(
        normal=dict(male=(0, 15), female=(0, 20)),
        low=[],
        high=['Inflammation', 'Infection', 'Autoimmune disease', 'Malignancy'],
    ),
}

In [None]:
def generate_labels(row, reference_ranges=None):
    """Return the condition labels suggested by out-of-range values in one row.

    Parameters
    ----------
    row : pandas.Series or other mapping-like object
        One report. It must contain a 'Sex' entry; blood parameters are
        looked up by name and skipped when absent or missing (NaN/None).
    reference_ranges : dict, optional
        Table laid out like the module-level ``parameter_ranges``: each entry
        has a 'normal' range (a ``(min, max)`` tuple, or a dict of such tuples
        keyed by lowercase sex) and optional 'low' / 'high' label lists.
        Defaults to ``parameter_ranges``.

    Returns
    -------
    list of str
        Unique labels in first-seen order (i.e. table order), or
        ``['Normal']`` when no label was collected. A value outside its range
        whose label list is empty contributes nothing.
    """
    table = parameter_ranges if reference_ranges is None else reference_ranges

    # Normalise case and surrounding whitespace so ' Male' still matches the
    # lowercase keys of the sex-specific ranges.
    sex = str(row['Sex']).strip().lower() if pd.notna(row['Sex']) else None

    conditions = []
    for param, ranges in table.items():
        if param not in row or pd.isna(row[param]):
            continue

        value = row[param]
        normal = ranges['normal']

        if isinstance(normal, dict):
            if sex not in normal:
                # Sex-specific range but sex is unknown/unlisted: cannot judge.
                continue
            normal_min, normal_max = normal[sex]
        else:
            normal_min, normal_max = normal

        if value < normal_min:
            conditions.extend(ranges.get('low', []))
        elif value > normal_max:
            conditions.extend(ranges.get('high', []))

    # dict.fromkeys de-duplicates while keeping insertion order; a plain set
    # would make the label order depend on per-process string hashing.
    return list(dict.fromkeys(conditions)) if conditions else ['Normal']

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skewnorm
import random

# Pin both the stdlib and the NumPy global generators so reruns repeat
random.seed(42)
np.random.seed(42)

def generate_synthetic_data(n_samples=1000, seed=None):
    """Generate synthetic blood test data with realistic distributions.

    Parameters
    ----------
    n_samples : int, default 1000
        Number of rows to generate. ``0`` yields an empty frame that still
        has every column.
    seed : int, optional
        When given, reseeds the global ``numpy.random`` and ``random``
        generators before drawing, so the call is reproducible on its own.
        When ``None`` the current global generator state is used.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: Age, Sex, Hemoglobin, RBC, WBC, PLT, MCV, MCH,
        MCHC, RDW-CV, NEU%, LYM%, MON%, EOS%, BAS%, LYM#, GRA#, ESR.
        Measurements are rounded per column, and 5% of the rows of every
        measurement column are then set to NaN.
    """
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    # Integer ages between 18-90 and a 50/50 sex split
    age = np.random.randint(18, 91, size=n_samples)
    sex = np.random.choice(['male', 'female'], size=n_samples, p=[0.5, 0.5])

    # Helper function for skewed normal distributions
    def skewed_normal(alpha, loc, scale, size):
        return skewnorm.rvs(alpha, loc=loc, scale=scale, size=size)

    measurement_columns = [
        'Hemoglobin', 'RBC', 'WBC', 'PLT', 'MCV', 'MCH', 'MCHC', 'RDW-CV',
        'NEU%', 'LYM%', 'MON%', 'EOS%', 'BAS%', 'LYM#', 'GRA#', 'ESR',
    ]

    # Rows are collected as plain records and turned into a frame once, which
    # avoids growing the DataFrame one cell at a time. Draws happen row by row
    # in a fixed order, so a given seed always produces the same table.
    records = []
    for current_age, current_sex in zip(age, sex):
        is_male = current_sex == 'male'

        # Hemoglobin (correlated with sex and age)
        if is_male:
            hgb = np.random.normal(15.0 - (current_age * 0.01), 1.2)
        else:
            hgb = np.random.normal(13.5 - (current_age * 0.008), 1.0)

        # RBC (correlated with the unclipped hemoglobin draw)
        rbc_factor = hgb / 15.0
        if is_male:
            rbc = np.random.normal(5.0 * rbc_factor, 0.5)
        else:
            rbc = np.random.normal(4.5 * rbc_factor, 0.4)

        # WBC (right-skewed)
        wbc = np.clip(skewed_normal(5, 7.0, 2.0, 1)[0], 2.0, 20.0)

        # Platelets: roughly 1 row in 20 comes from a high-count distribution
        if random.random() < 0.95:
            platelets = np.random.normal(250, 50)
        else:
            platelets = np.random.normal(600, 100)  # Thrombocytosis cases

        # MCV (age-dependent)
        mcv_age_factor = 1 + (current_age - 40) * 0.002
        mcv = np.random.normal(90 * mcv_age_factor, 5)

        # Anemia-related parameters: lower indices and wider RDW when the
        # unclipped hemoglobin draw is below 12
        if hgb < 12:
            mch = np.clip(np.random.normal(26, 2), 20, 32)
            mchc = np.clip(np.random.normal(31, 1.5), 28, 34)
            rdw_cv = np.clip(np.random.normal(15, 2), 11, 25)
        else:
            mch = np.clip(np.random.normal(30, 1.5), 25, 34)
            mchc = np.clip(np.random.normal(34, 1), 32, 36)
            rdw_cv = np.clip(np.random.normal(13, 1), 11, 18)

        # Differential counts: basophils take the remainder up to 100.
        # NOTE(review): each percentage is clipped independently afterwards,
        # so the stored values are not guaranteed to sum to 100.
        neu = np.random.normal(55, 10)
        lym = np.random.normal(35, 8)
        mon = np.random.normal(7, 2)
        eos = np.random.normal(2, 1)
        bas = 100 - (neu + lym + mon + eos)

        neu_pct = np.clip(neu, 40, 80)
        lym_pct = np.clip(lym, 15, 50)

        # ESR (sex- and age-dependent mean)
        if is_male:
            esr = np.random.normal(10 + current_age / 20, 5)
        else:
            esr = np.random.normal(15 + current_age / 15, 7)

        records.append({
            'Hemoglobin': np.clip(hgb, 8, 20),
            'RBC': np.clip(rbc, 3.0, 7.0),
            'WBC': wbc,
            'PLT': np.clip(platelets, 50, 800),
            'MCV': np.clip(mcv, 60, 110),
            'MCH': mch,
            'MCHC': mchc,
            'RDW-CV': rdw_cv,
            'NEU%': neu_pct,
            'LYM%': lym_pct,
            'MON%': np.clip(mon, 1, 12),
            'EOS%': np.clip(eos, 0, 8),
            'BAS%': np.clip(bas, 0, 3),
            # Absolute counts derived from the clipped percentage and WBC
            'LYM#': lym_pct / 100 * wbc,
            'GRA#': neu_pct / 100 * wbc,
            'ESR': np.clip(esr, 1, 100),
        })

    measurements = pd.DataFrame(records, columns=measurement_columns, dtype=float)
    data = pd.concat([pd.DataFrame({'Age': age, 'Sex': sex}), measurements], axis=1)

    # Proper type conversion and rounding
    data['Age'] = data['Age'].astype(int)  # Ensure Age is integer

    decimals = {
        'Hemoglobin': 1, 'RBC': 2, 'WBC': 1, 'PLT': 0,
        'MCV': 0, 'MCH': 1, 'MCHC': 1, 'RDW-CV': 1,
        'NEU%': 1, 'LYM%': 1, 'MON%': 1, 'EOS%': 1, 'BAS%': 1,
        'LYM#': 2, 'GRA#': 2, 'ESR': 0
    }
    data = data.round(decimals)

    # Add 5% missing values: an independent row sample per measurement column
    for col in measurement_columns:
        data.loc[data.sample(frac=0.05).index, col] = np.nan

    return data

# Build the 10,000-row dataset and write it to CSV (index column omitted)
synthetic_data = generate_synthetic_data(n_samples=10000)
synthetic_data.to_csv('synthetic_blood_reports.csv', index=False)

# Preview the first rows
print("First 5 samples (Age as integer):")
print(synthetic_data.head(5))

First 5 samples (Age as integer):
   Age     Sex  Hemoglobin   RBC  WBC    PLT   MCV   MCH  MCHC  RDW-CV  NEU%  \
0   69    male        14.9  4.64  6.9  214.0  93.0  28.9  34.7    12.3  46.0   
1   32    male        15.0  5.15  9.5  291.0  91.0  32.0  34.5    13.4  70.6   
2   89  female        13.3  4.18  9.8  263.0  97.0  30.5  34.8    13.0  57.2   
3   78    male        15.0  5.17  9.5  159.0  88.0  31.1  33.4    13.4  56.1   
4   38    male        13.9  4.45  7.7  296.0  89.0  31.5  35.5    13.9  60.3   

   LYM%  MON%  EOS%  BAS%  LYM#  GRA#   ESR  
0  49.1   8.9   0.1   0.0  3.39  3.17  10.0  
1  43.8   5.3   2.6   0.0  4.15  6.68   8.0  
2  46.1   7.9   2.2   0.0  4.50  5.59  14.0  
3  33.7   6.8   1.1   2.3  3.21  5.35  14.0  
4  37.6   8.7   2.1   0.0  2.91  4.66  17.0  


In [None]:
# Derive the per-row condition labels and store them in a new column
synthetic_data['Conditions'] = synthetic_data.apply(generate_labels, axis='columns')

print("\nData with Conditions:")
print(synthetic_data.loc[:, ['Age', 'Sex', 'Conditions']].head(5))

# Save the DataFrame to a CSV file
synthetic_data.to_csv('synthetic_data_with_conditions1.csv', index=False)