In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import math

In [None]:
# Load the haemoglobin measurement extract into the working DataFrame `df`,
# which the later cells read and modify.
df = pd.read_csv(
    '/mnt/mdrive/home1/akshat/CVT/data/analysis/heamoglobin.csv'
)

# Preview the first three rows as a quick check of the columns that were read.
df.head(3)

Unnamed: 0,person_id,index_visit_id,index_start,index_end,procedure_concept_id,procedure_datetime,measurement_concept_id,measurement_datetime,value_as_number,range_low,range_high,phase_label
0,942,819,2023-02-20,2023-02-20,4203153,2023-04-06 00:00:00,3005872,2023-02-20 13:43:10.996,14.3,,,Pre-Op
1,942,1204,2023-03-11,2023-04-08,4203153,2023-04-06 00:00:00,3005872,2023-03-11 12:49:47.588,14.1,,,Intra-Op
2,942,1204,2023-03-11,2023-04-08,4203153,2023-04-06 00:00:00,3005872,2023-03-29 16:29:42.337,13.7,,,Intra-Op


In [29]:
# Parse the procedure timestamps into datetimes. format='mixed' asks pandas to
# infer the format element by element, since the column may mix several layouts.
df['procedure_datetime'] = pd.to_datetime(
    df['procedure_datetime'],
    format='mixed',
)


In [30]:
# Same conversion as the preceding cell: parse 'procedure_datetime' with
# per-element format inference and write the result back to the column.
procedure_parsed = pd.to_datetime(df['procedure_datetime'], format='mixed')
df['procedure_datetime'] = procedure_parsed

In [35]:
# Normalise both timestamp columns the same way:
#   * errors='coerce' turns unparseable values into NaT instead of raising,
#   * utc=True parses everything as UTC so strings carrying offsets are handled,
#   * tz_localize(None) then drops the timezone so both columns are tz-naive
#     (this is what resolved the "DatetimeArray vs ndarray" conflict).
for column in ('procedure_datetime', 'measurement_datetime'):
    parsed_utc = pd.to_datetime(df[column], format='mixed', errors='coerce', utc=True)
    df[column] = parsed_utc.dt.tz_localize(None)

# 't' is the whole-day component of (measurement - procedure). Negative values
# mean the measurement precedes the procedure; a gap of 44 days and some hours
# before the procedure is reported as -45, as in the first output row.
elapsed = df['measurement_datetime'] - df['procedure_datetime']
df['t'] = elapsed.dt.days

# Inspect the first rows of the derived columns
print(df[['procedure_datetime', 'measurement_datetime', 't']].head())

  procedure_datetime    measurement_datetime   t
0         2023-04-06 2023-02-20 13:43:10.996 -45
1         2023-04-06 2023-03-11 12:49:47.588 -26
2         2023-04-06 2023-03-29 16:29:42.337  -8
3         2023-04-06 2023-04-06 18:11:57.090   0
4         2023-04-06 2023-04-07 00:14:28.510   1


In [32]:
import os

# Directory (relative to the current working directory) that receives the plots.
output_folder = 'heamoglobin'

# Create it up front; exist_ok=True means an already-existing folder is not an error.
os.makedirs(name=output_folder, exist_ok=True)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import os  # Required for folder operations

# --- Configuration ---
output_folder = 'heamoglobin'
batch_size = 10  # Number of patients per plot
low_threshold = 12
high_threshold = 17
arrow_offset_frac = 0.12  # Phase arrows sit this fraction of the y-range below the axis floor


def _draw_phase_arrow(ax, t_far, t_near, y, label, color):
    """Draw one horizontal phase arrow below the axes, with a centred label.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    t_far : x position of the arrow head (the batch's min or max ``t``).
    t_near : x position of the arrow tail (just either side of t=0).
    y : y position, in data coordinates, of the arrow and its label.
    label : text drawn at ``t_far / 2``.
    color : colour used for both the arrow and the text.
    """
    # annotation_clip=False keeps the arrow visible although it lies outside the axes area.
    ax.annotate('', xy=(t_far, y), xytext=(t_near, y),
                arrowprops=dict(arrowstyle="->", color=color, lw=2), annotation_clip=False)
    ax.text(t_far / 2, y, label, ha='center', va='bottom', color=color, fontweight='bold')


def _plot_patient_batch(batch_df, batch_number, total_batches, low, high):
    """Build the trajectory figure for one batch of patients and return it.

    Parameters
    ----------
    batch_df : rows of ``df`` for the patients in this batch; the columns
        't', 'value_as_number' and 'person_id' are read.
    batch_number : 1-based index of the batch, used in the title.
    total_batches : total number of batches, used in the title.
    low, high : y positions of the two horizontal threshold lines; the band
        between them is shaded.

    Returns
    -------
    The matplotlib Figure. The caller is responsible for saving and closing it.
    """
    fig, ax = plt.subplots(figsize=(12, 7))

    # One line per patient.
    # NOTE(review): by default seaborn aggregates repeated x values within a hue
    # (mean plus an error band). If a patient can have several measurements on
    # the same day `t`, consider estimator=None -- confirm against the data.
    sns.lineplot(
        data=batch_df,
        x='t',
        y='value_as_number',
        hue='person_id',
        palette='bright',
        marker='o',
        linewidth=1.5,
        ax=ax,
    )

    # A. Reference line at the procedure date (t = 0)
    ax.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Procedure Date')

    # B. Threshold lines and the shaded band between them
    ax.axhline(y=low, color='orange', linestyle='--', linewidth=2, label=f'Low ({low})')
    ax.axhline(y=high, color='orange', linestyle='--', linewidth=2, label=f'High ({high})')
    ax.axhspan(low, high, color='green', alpha=0.05)

    # C. Phase arrows, placed below the current y-limits
    y_min, y_max = ax.get_ylim()
    arrow_y_pos = y_min - (y_max - y_min) * arrow_offset_frac
    t_min, t_max = batch_df['t'].min(), batch_df['t'].max()
    if t_min < 0:
        _draw_phase_arrow(ax, t_min, -0.5, arrow_y_pos, 'Pre-Procedure', 'green')
    if t_max > 0:
        _draw_phase_arrow(ax, t_max, 0.5, arrow_y_pos, 'Post-Procedure', 'purple')

    # D. Formatting (labelpad leaves room for the arrows under the x-axis)
    ax.set_xlabel('Days relative to Procedure (t)', labelpad=25)
    ax.set_ylabel('Haemoglobin Measurement')
    ax.set_title(f'Haemoglobin Trajectories (Batch {batch_number} of {total_batches})')

    # Legend outside the plot; the right margin is widened to make space for it.
    ax.legend(title='Patient ID', bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, fontsize='small')
    fig.subplots_adjust(bottom=0.2, right=0.80)
    return fig


# 1. Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)
print(f"Output folder ready: {output_folder}/")

# Ensure person_id is string so it plots as discrete categories.
# (The column is required by everything below, so it is accessed directly.)
df['person_id'] = df['person_id'].astype(str)

# 2. Setup batching logic
unique_patients = df['person_id'].unique()
num_batches = math.ceil(len(unique_patients) / batch_size)

print(f"Total patients: {len(unique_patients)}")
print(f"Generating {num_batches} plots...")

# Apply the theme once, before any figure is created, so every batch
# (including the first) is rendered under the same settings.
sns.set_theme(style="whitegrid")

# 3. Iterate through batches: one PNG per group of `batch_size` patients
for i in range(num_batches):
    batch_patients = unique_patients[i * batch_size:(i + 1) * batch_size]
    batch_df = df[df['person_id'].isin(batch_patients)]

    fig = _plot_patient_batch(batch_df, i + 1, num_batches, low_threshold, high_threshold)

    # E. Save and close
    save_path = os.path.join(output_folder, f"heamoglobin_batch_{i+1}.png")
    fig.savefig(save_path)
    print(f"Saved {save_path}")
    plt.close(fig)  # Important to free memory between loops

Output folder ready: heamoglobin/
Total patients: 1018
Generating 102 plots...
Saved heamoglobin/heamoglobin_batch_1.png
Saved heamoglobin/heamoglobin_batch_2.png
Saved heamoglobin/heamoglobin_batch_3.png
Saved heamoglobin/heamoglobin_batch_4.png
Saved heamoglobin/heamoglobin_batch_5.png
Saved heamoglobin/heamoglobin_batch_6.png
Saved heamoglobin/heamoglobin_batch_7.png
Saved heamoglobin/heamoglobin_batch_8.png
Saved heamoglobin/heamoglobin_batch_9.png
Saved heamoglobin/heamoglobin_batch_10.png
Saved heamoglobin/heamoglobin_batch_11.png
Saved heamoglobin/heamoglobin_batch_12.png
Saved heamoglobin/heamoglobin_batch_13.png
Saved heamoglobin/heamoglobin_batch_14.png
Saved heamoglobin/heamoglobin_batch_15.png
Saved heamoglobin/heamoglobin_batch_16.png
Saved heamoglobin/heamoglobin_batch_17.png
Saved heamoglobin/heamoglobin_batch_18.png
Saved heamoglobin/heamoglobin_batch_19.png
Saved heamoglobin/heamoglobin_batch_20.png
Saved heamoglobin/heamoglobin_batch_21.png
Saved heamoglobin/heamoglob

In [None]:
# --- Data preparation ---
# Cast person_id to str so seaborn assigns one discrete colour per patient
# instead of treating the ids as a numeric gradient.
df['person_id'] = df['person_id'].astype(str)

# --- Figure setup ---
fig = plt.figure(figsize=(12, 7))
sns.set_theme(style="whitegrid")  # Optional: lighter grid background

# --- Line plot: one trajectory per patient (hue distinguishes patients) ---
ax = sns.lineplot(
    data=df,
    x='t',
    y='value_as_number',
    hue='person_id',
    palette='bright',  # vibrant colour palette
    marker='o',        # dot at every data point
    linewidth=2,
)

# --- Procedure reference line at t = 0 ---
ax.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Procedure Date')

# --- Phase arrows, drawn below the current y-limits ---
y_floor, y_ceiling = ax.get_ylim()
arrow_y = y_floor - ((y_ceiling - y_floor) * 0.12)
earliest_t, latest_t = df['t'].min(), df['t'].max()

# Each entry: (draw it?, arrow-head x, arrow-tail x, caption, colour)
phase_arrows = (
    (earliest_t < 0, earliest_t, -0.5, 'Pre-Procedure', 'green'),
    (latest_t > 0, latest_t, 0.5, 'Post-Procedure', 'purple'),
)
for is_present, head_x, tail_x, caption, colour in phase_arrows:
    if not is_present:
        continue
    # annotation_clip=False keeps the arrow visible outside the axes area
    ax.annotate('', xy=(head_x, arrow_y), xytext=(tail_x, arrow_y),
                arrowprops=dict(arrowstyle="->", color=colour, lw=2), annotation_clip=False)
    ax.text(head_x / 2, arrow_y, caption, ha='center', va='bottom', color=colour, fontweight='bold')

# --- Labels, title and legend ---
ax.set_xlabel('Days relative to Procedure (t)', labelpad=25)
ax.set_ylabel('Measurement Value')
ax.set_title('Measurement Trajectories by Patient')
ax.legend(title='Patient ID', bbox_to_anchor=(1.05, 1), loc='upper left')

# Widen the bottom and right margins so the arrows and the legend fit
fig.subplots_adjust(bottom=0.2, right=0.85)

fig.savefig('seaborn_patient_lines.png')