<a href="https://colab.research.google.com/github/nmansour67/skills-introduction-to-github/blob/main/Data_Generator_Prompt1_Dataset_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================================================
# ROBOTIC SURGERY DATASET GENERATOR
# Phase 1: Synthetic Data Generation with BMI Interaction Effect
# ============================================================================
#
# Purpose: Generate 500 surgical cases comparing robotic vs. standard surgery
# Key Feature: Robotic surgery performance VARIES by patient BMI
# Output: robot_surgery_data.csv ready for Phase 2 analysis
# ============================================================================

# Announce the generator and summarise the relationship the dataset encodes.
# The emoji and the ">=" sign are written as real Unicode characters so the
# banner prints as intended instead of as mis-decoded byte sequences.
print("=" * 80)
print("🤖 ROBOTIC SURGERY DATASET GENERATOR")
print("=" * 80)
print("""
GENERATING SYNTHETIC DATASET:
500 surgical cases with BMI-dependent interaction effect

KEY RELATIONSHIP TO EMBED:
- Low BMI (<35): Robotic surgery is FASTER
- High BMI (≥35): Robotic surgery is SLOWER
- Standard surgery: Relatively consistent across BMI range

This dataset will be downloaded for analysis in Phase 2.
""")

# ============================================================================
# SECTION 1: INSTALL LIBRARIES
# ============================================================================

print("\n📦 SECTION 1: Installing Required Libraries")
print("=" * 80)

import subprocess
import sys

# Install both packages with one pip invocation: pip accepts several
# requirement names per call, so a second subprocess start-up is unnecessary.
# check_call raises CalledProcessError if pip exits with a non-zero status.
print("Installing pandas and numpy...")
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-q", "pandas", "numpy"]
)

# Imported only after the install step so a fresh runtime can resolve them.
import pandas as pd
import numpy as np

# Seed NumPy's global random state so every run produces the same dataset.
np.random.seed(42)

print("✅ Libraries installed successfully!\n")

# ============================================================================
# SECTION 2: CONFIGURATION PARAMETERS
# ============================================================================

print("⚙️ SECTION 2: Configuration Parameters")
print("=" * 80)

# Number of surgical cases per arm. The total is derived from the two arms so
# the printed figure can never drift out of sync with what is generated.
N_STANDARD = 250
N_ROBOTIC = 250
N_TOTAL = N_STANDARD + N_ROBOTIC

print(f"Total Cases: {N_TOTAL}")
print(f"  • Standard Surgery: {N_STANDARD} cases")
print(f"  • Robotic Surgery:  {N_ROBOTIC} cases")

# BMI distribution parameters (kg/m²): mean and standard deviation of the
# normal draw, plus the bounds the drawn values are clipped to.
BMI_MEAN = 30
BMI_STD = 6
BMI_MIN = 18
BMI_MAX = 50

print("\nBMI Distribution:")
print(f"  • Mean: {BMI_MEAN} kg/m²")
print(f"  • Standard Deviation: {BMI_STD} kg/m²")
print(f"  • Range: {BMI_MIN}-{BMI_MAX} kg/m²")

# BMI value at which the robotic duration model switches from its "efficient"
# branch to its "inefficient" branch.
BMI_THRESHOLD = 35

print(f"\n⚠️ CRITICAL THRESHOLD: BMI = {BMI_THRESHOLD} kg/m²")
print(f"  • Below {BMI_THRESHOLD}: Robot EFFICIENT (faster)")
print(f"  • Above {BMI_THRESHOLD}: Robot INEFFICIENT (slower)")

print("\n✅ Configuration complete\n")

# ============================================================================
# SECTION 3: DURATION CALCULATION FUNCTIONS
# ============================================================================

# Section banner; the microscope emoji is written as real Unicode so it
# prints as intended.
print("🔬 SECTION 3: Defining Surgical Duration Models")
print("=" * 80)

def calculate_standard_surgery_duration(bmi):
    """
    Sample one duration, in minutes, for a standard (open/laparoscopic) case.

    Model:
    - Expected duration is 120 minutes for BMI at or below 30
    - Each BMI point above 30 adds 0.5 minutes to the expected duration
    - Noise is normal with a standard deviation of 15 minutes
    - The sampled value is clipped to 60-240 minutes, then rounded to 0.1

    Consumes one draw from NumPy's global random state.
    """
    # Only BMI in excess of 30 lengthens the procedure; lower BMI adds nothing.
    excess_bmi = bmi - 30
    bmi_penalty = excess_bmi * 0.5 if excess_bmi > 0 else 0
    expected_minutes = 120 + bmi_penalty

    sampled_minutes = np.random.normal(expected_minutes, 15)

    # Keep the sample inside the plausible 60-240 minute window.
    bounded_minutes = np.clip(sampled_minutes, 60, 240)
    return round(bounded_minutes, 1)


def calculate_robotic_surgery_duration(bmi):
    """
    Sample one duration, in minutes, for a robotic case.

    INTERACTION EFFECT (the model switches at BMI_THRESHOLD):
    - BMI < BMI_THRESHOLD: mean = 90 + 0.2 * (BMI - 25), SD 12 minutes
      (the mean falls below 90 when BMI is under 25; there is no floor)
    - otherwise: mean = 150 + 1.5 * (BMI - BMI_THRESHOLD), SD 20 minutes

    The sampled value is clipped to 60-300 minutes, then rounded to 0.1.
    Consumes one draw from NumPy's global random state.
    """
    if bmi < BMI_THRESHOLD:
        # Efficient regime: shorter baseline, shallow BMI slope, tighter spread.
        baseline, pivot_bmi, slope, spread = 90, 25, 0.2, 12
    else:
        # Inefficient regime: longer baseline, steep BMI slope, wider spread.
        baseline, pivot_bmi, slope, spread = 150, BMI_THRESHOLD, 1.5, 20

    expected_minutes = baseline + (bmi - pivot_bmi) * slope
    sampled_minutes = np.random.normal(expected_minutes, spread)

    # Keep the sample inside the plausible 60-300 minute window.
    return round(np.clip(sampled_minutes, 60, 300), 1)


# Confirm both duration models are defined. The check mark and bullets are
# written as real Unicode so they print as intended.
print("✅ Duration models defined:")
print("  • Standard surgery: calculate_standard_surgery_duration()")
print("  • Robotic surgery:  calculate_robotic_surgery_duration()")
print("\n")

# ============================================================================
# SECTION 4: GENERATE STANDARD SURGERY CASES
# ============================================================================

print("🔧 SECTION 4: Generating Standard Surgery Cases")
print("=" * 80)

standard_cases = []

for i in range(N_STANDARD):
    # Patient IDs are 1-based and zero-padded to four digits (STD-0001, ...).
    patient_id = f"STD-{i+1:04d}"

    # Draw BMI from a normal distribution and clip it into [BMI_MIN, BMI_MAX].
    # This is clipping, not truncation: out-of-range draws are moved onto the
    # nearest bound rather than being redrawn.
    bmi = np.random.normal(BMI_MEAN, BMI_STD)
    bmi = np.clip(bmi, BMI_MIN, BMI_MAX)
    bmi = round(bmi, 1)

    # The duration draw follows the BMI draw for the same patient; this order
    # determines which values the seeded random stream produces.
    duration = calculate_standard_surgery_duration(bmi)

    # Store case
    standard_cases.append({
        'Patient_ID': patient_id,
        'BMI': bmi,
        'Surgery_Type': 'Standard',
        'Duration_Minutes': duration
    })

print(f"✅ Generated {len(standard_cases)} Standard Surgery cases")

# ============================================================================
# SECTION 5: GENERATE ROBOTIC SURGERY CASES
# ============================================================================

print("\n🤖 SECTION 5: Generating Robotic Surgery Cases")
print("=" * 80)

robotic_cases = []

for i in range(N_ROBOTIC):
    # Patient IDs are 1-based and zero-padded to four digits (ROB-0001, ...).
    patient_id = f"ROB-{i+1:04d}"

    # Draw BMI from a normal distribution and clip it into [BMI_MIN, BMI_MAX].
    # This is clipping, not truncation: out-of-range draws are moved onto the
    # nearest bound rather than being redrawn.
    bmi = np.random.normal(BMI_MEAN, BMI_STD)
    bmi = np.clip(bmi, BMI_MIN, BMI_MAX)
    bmi = round(bmi, 1)

    # Duration comes from the robotic model, whose parameters switch at
    # BMI_THRESHOLD (this is where the interaction effect enters the data).
    duration = calculate_robotic_surgery_duration(bmi)

    # Store case; the label 'Robot' is the value later filters match on.
    robotic_cases.append({
        'Patient_ID': patient_id,
        'BMI': bmi,
        'Surgery_Type': 'Robot',
        'Duration_Minutes': duration
    })

print(f"✅ Generated {len(robotic_cases)} Robotic Surgery cases")

# ============================================================================
# SECTION 6: CREATE DATAFRAME
# ============================================================================

print("\n\n📊 SECTION 6: Creating Combined Dataset")
print("=" * 80)

# Concatenate the two arms (standard first, then robotic) into one table.
all_cases = standard_cases + robotic_cases
df = pd.DataFrame(all_cases)

# Shuffle rows so the surgery types are interleaved; the fixed random_state
# keeps the row order reproducible, and reset_index discards the old index.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"✅ Dataset created: {len(df)} total cases")
print(f"\nDataFrame shape: {df.shape[0]} rows × {df.shape[1]} columns")
print(f"Columns: {list(df.columns)}")

# ============================================================================
# SECTION 7: DISPLAY SUMMARY STATISTICS
# ============================================================================

print("\n\n📈 SECTION 7: Summary Statistics (Demonstrating Interaction)")
print("=" * 80)

print("\n📋 OVERALL STATISTICS:")
print(df.describe())

print("\n\n🔍 BY SURGERY TYPE:")
for surgery_type in ['Standard', 'Robot']:
    subset = df[df['Surgery_Type'] == surgery_type]
    print(f"\n{surgery_type} Surgery ({len(subset)} cases):")
    print(f"  BMI: {subset['BMI'].mean():.1f} ± {subset['BMI'].std():.1f} kg/m²")
    print(f"  Duration: {subset['Duration_Minutes'].mean():.1f} ± {subset['Duration_Minutes'].std():.1f} min")

# Row masks shared by the comparisons below. The BMI split uses BMI_THRESHOLD
# (not a literal) so the strata always match the labels printed next to them
# and the cut-off used by the robotic duration model.
is_standard = df['Surgery_Type'] == 'Standard'
is_robot = df['Surgery_Type'] == 'Robot'
is_low_bmi = df['BMI'] < BMI_THRESHOLD
is_high_bmi = df['BMI'] >= BMI_THRESHOLD

# Calculate overall averages
avg_standard = df.loc[is_standard, 'Duration_Minutes'].mean()
avg_robot = df.loc[is_robot, 'Duration_Minutes'].mean()

print("\n\n📊 OVERALL COMPARISON:")
print(f"  Standard: {avg_standard:.1f} min")
print(f"  Robot:    {avg_robot:.1f} min")
print(f"  Difference: {avg_robot - avg_standard:+.1f} min")

# BMI-stratified analysis (revealing interaction)
print("\n\n💡 BMI-STRATIFIED ANALYSIS (The Interaction Effect):")

# Low BMI group (below the threshold)
low_standard = df.loc[is_standard & is_low_bmi, 'Duration_Minutes'].mean()
low_robot = df.loc[is_robot & is_low_bmi, 'Duration_Minutes'].mean()

print(f"\nBMI < {BMI_THRESHOLD}:")
print(f"  Standard: {low_standard:.1f} min")
print(f"  Robot:    {low_robot:.1f} min")
print(f"  Difference: {low_robot - low_standard:+.1f} min")
if low_robot < low_standard:
    print(f"  → Robot is {abs(low_robot - low_standard):.1f} min FASTER ✓")

# High BMI group (at or above the threshold)
high_standard = df.loc[is_standard & is_high_bmi, 'Duration_Minutes'].mean()
high_robot = df.loc[is_robot & is_high_bmi, 'Duration_Minutes'].mean()

print(f"\nBMI ≥ {BMI_THRESHOLD}:")
print(f"  Standard: {high_standard:.1f} min")
print(f"  Robot:    {high_robot:.1f} min")
print(f"  Difference: {high_robot - high_standard:+.1f} min")
if high_robot > high_standard:
    print(f"  → Robot is {abs(high_robot - high_standard):.1f} min SLOWER ✗")

# Only claim the interaction when the stratified means actually show it:
# robot faster below the threshold AND slower at/above it. An empty stratum
# yields NaN means, which fail both comparisons and land in the else branch.
if low_robot < low_standard and high_robot > high_standard:
    print("\n⚠️ INTERACTION EFFECT DETECTED:")
    print("  Robot performance DEPENDS on patient BMI!")
    print(f"  Crossover occurs around BMI = {BMI_THRESHOLD} kg/m²")
else:
    print("\n⚠️ EXPECTED INTERACTION EFFECT NOT OBSERVED:")
    print(f"  Robot was not both faster below and slower at/above BMI = {BMI_THRESHOLD} kg/m²")

# ============================================================================
# SECTION 8: DISPLAY SAMPLE DATA
# ============================================================================

print("\n\n📋 SECTION 8: Sample Data Preview")
print("=" * 80)

# to_string(index=False) prints the rows without the DataFrame index column.
print("\nFirst 10 rows:")
print(df.head(10).to_string(index=False))

print("\n\nLast 10 rows:")
print(df.tail(10).to_string(index=False))

# ============================================================================
# SECTION 9: SAVE TO CSV
# ============================================================================

print("\n\n💾 SECTION 9: Saving Dataset to CSV")
print("=" * 80)

import os
import tempfile

filename = 'robot_surgery_data.csv'
# Write into the platform's temporary directory instead of a hard-coded
# "/tmp/" so the script also runs where that path does not exist.
# NOTE(review): on a default Linux/Colab runtime this should still resolve to
# /tmp unless TMPDIR/TEMP/TMP is set - confirm if the exact path matters.
filepath = os.path.join(tempfile.gettempdir(), filename)

# index=False keeps the DataFrame's positional index out of the CSV.
df.to_csv(filepath, index=False)

print("✅ Dataset saved successfully!")
print(f"   Filename: {filename}")
print(f"   Location: {filepath}")
print(f"   Size: {len(df)} rows × {len(df.columns)} columns")

# ============================================================================
# SECTION 10: DOWNLOAD FILE
# ============================================================================

print("\n\n📥 SECTION 10: Downloading File to Your Computer")
print("=" * 80)

# google.colab is only importable inside a Colab runtime. Guard the import so
# running elsewhere does not abort the script after the dataset has already
# been generated and saved.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    print(f"\n🔽 Downloading {filename}...\n")

    files.download(filepath)

    print("=" * 80)
    print("✅ DOWNLOAD COMPLETE!")
    print("=" * 80)
else:
    print("\n⚠️ google.colab is not available; skipping the browser download.")
    print(f"   The dataset remains at: {filepath}\n")
    print("=" * 80)

# ============================================================================
# FINAL SUMMARY
# ============================================================================

# Describe each stratum by the actual sign of its difference, so the summary
# cannot say "faster"/"slower" when the data shows the opposite. A negative
# gap means the robotic mean is shorter than the standard mean.
low_gap = low_robot - low_standard
high_gap = high_robot - high_standard
low_direction = "faster" if low_gap < 0 else "slower"
high_direction = "faster" if high_gap < 0 else "slower"

print(f"""
📦 YOU NOW HAVE: {filename}

DATASET CONTENTS:
  • {len(df)} surgical cases
  • {len(df.columns)} columns: {', '.join(df.columns)}
  • Surgery types: {df['Surgery_Type'].value_counts().to_dict()}
  • BMI range: {df['BMI'].min():.1f} - {df['BMI'].max():.1f} kg/m²

EMBEDDED INTERACTION EFFECT:
  ✓ Low BMI (<{BMI_THRESHOLD}): Robot ~{abs(low_gap):.0f} min {low_direction}
  ✓ High BMI (≥{BMI_THRESHOLD}): Robot ~{abs(high_gap):.0f} min {high_direction}
  ✓ Crossover threshold: BMI ≈ {BMI_THRESHOLD} kg/m²

NEXT STEPS:
  1. Save {filename} to a known location on your computer
  2. Open a NEW Google Colab notebook
  3. Run PHASE 2 analysis code (upload this CSV when prompted)
  4. Discover the interaction effect through statistical analysis!

The dataset is ready for Phase 2 analysis. 📊
""")

print("=" * 80)
print("🎯 DATA GENERATION COMPLETE - PROCEED TO PHASE 2")
print("=" * 80)

🤖 ROBOTIC SURGERY DATASET GENERATOR

GENERATING SYNTHETIC DATASET:
500 surgical cases with BMI-dependent interaction effect

KEY RELATIONSHIP TO EMBED:
- Low BMI (<35): Robotic surgery is FASTER
- High BMI (≥35): Robotic surgery is SLOWER
- Standard surgery: Relatively consistent across BMI range

This dataset will be downloaded for analysis in Phase 2.


📦 SECTION 1: Installing Required Libraries
Installing pandas and numpy...
✅ Libraries installed successfully!

⚙️ SECTION 2: Configuration Parameters
Total Cases: 500
  • Standard Surgery: 250 cases
  • Robotic Surgery:  250 cases

BMI Distribution:
  • Mean: 30 kg/m²
  • Standard Deviation: 6 kg/m²
  • Range: 18-50 kg/m²

⚠️ CRITICAL THRESHOLD: BMI = 35 kg/m²
  • Below 35: Robot EFFICIENT (faster)
  • Above 35: Robot INEFFICIENT (slower)

✅ Configuration complete

🔬 SECTION 3: Defining Surgical Duration Models
✅ Duration models defined:
  • Standard surgery: calculate_standard_surgery_dura

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ DOWNLOAD COMPLETE!

📦 YOU NOW HAVE: robot_surgery_data.csv

DATASET CONTENTS:
  • 500 surgical cases
  • 4 columns: Patient_ID, BMI, Surgery_Type, Duration_Minutes
  • Surgery types: {'Robot': 250, 'Standard': 250}
  • BMI range: 18.0 - 48.5 kg/m²

EMBEDDED INTERACTION EFFECT:
  ✓ Low BMI (<35): Robot ~29 min faster
  ✓ High BMI (≥35): Robot ~29 min slower
  ✓ Crossover threshold: BMI ≈ 35 kg/m²

NEXT STEPS:
  1. Save robot_surgery_data.csv to a known location on your computer
  2. Open a NEW Google Colab notebook
  3. Run PHASE 2 analysis code (upload this CSV when prompted)
  4. Discover the interaction effect through statistical analysis!

The dataset is ready for Phase 2 analysis. 📊

🎯 DATA GENERATION COMPLETE - PROCEED TO PHASE 2
