# Import Libraries

In [2]:
import pandas as pd
import numpy as np

## Set random seed for reproducibility

In [4]:
# Seed NumPy's global (legacy) generator so every np.random.* draw in this
# notebook is repeatable from a fresh kernel.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Number of simulated batches

In [6]:
# One simulated batch per DataFrame row; this value is passed as the size of
# every random draw used to build the dataset.
n_samples: int = 100

# Create basic protocol inputs

In [8]:
# Zero-padded, 1-based batch labels: Batch_001, Batch_002, ...
batch_ids = [f'Batch_{idx:03}' for idx in range(1, n_samples + 1)]

# Columns are filled in a fixed order because each call advances the global
# RNG; reordering them would change the generated values.
protocol_inputs = {'Batch_ID': batch_ids}

# Cytokine levels in ng/mL, each drawn uniformly from [0, 100).
for cytokine in ('BMP4', 'ActivinA', 'FGF2', 'Wnt3a'):
    protocol_inputs[cytokine] = np.random.uniform(0, 100, n_samples)

# % O2 concentration, picked from three discrete settings.
protocol_inputs['O2_Level'] = np.random.choice([5, 10, 20], n_samples)
protocol_inputs['Seeding_Density'] = np.random.uniform(5e4, 2e5, n_samples)
# randint excludes the upper bound, so passages range over 5..29.
protocol_inputs['Passage_Number'] = np.random.randint(5, 30, n_samples)

df = pd.DataFrame(protocol_inputs)

In [9]:
# Save the protocol inputs only (marker genes and outcomes are added in later
# cells) to the working directory; index=False leaves the row index out.
df.to_csv("ipsc_differentiation_data.csv", index=False)

In [10]:
# Preview the first rows of the protocol-input frame (head() defaults to 5).
df.head()

Unnamed: 0,Batch_ID,BMP4,ActivinA,FGF2,Wnt3a,O2_Level,Seeding_Density,Passage_Number
0,Batch_001,37.454012,3.142919,64.203165,5.168172,5,85625.970694,6
1,Batch_002,95.071431,63.641041,8.413996,53.135463,10,110033.433931,10
2,Batch_003,73.199394,31.435598,16.162871,54.063512,5,121657.233632,22
3,Batch_004,59.865848,50.857069,89.855419,63.74299,5,62433.698254,6
4,Batch_005,15.601864,90.756647,60.642906,72.609133,10,129255.518195,22


In [11]:
# Marker-gene expression features: one column per gene, each an independent
# normal draw on a log scale centred at ~5 with standard deviation 1.5.
marker_genes = ['POU5F1', 'NANOG', 'SOX2', 'CDX2', 'NEUROD1']

expression_draw = dict(loc=5, scale=1.5, size=n_samples)
for gene in marker_genes:
    # A separate draw per gene, in list order, keeps the RNG stream stable.
    df[gene] = np.random.normal(**expression_draw)

In [12]:
# Draw the three noise vectors up front, in the same order the outcomes are
# built (Purity, Viability, Yield), so the global RNG stream is consumed in
# that order.
purity_noise = np.random.normal(0, 5, n_samples)
viability_noise = np.random.normal(0, 5, n_samples)
yield_noise = np.random.normal(0, 10, n_samples)

# Purity: weighted sum of BMP4, ActivinA and SOX2 plus noise, divided by 2.5.
# NOTE(review): with BMP4/ActivinA in [0, 100) and SOX2 around 5, this tops out
# near ~21 before noise -- confirm the /2.5 scaling is intended.
purity_signal = 0.3 * df['BMP4'] + 0.2 * df['ActivinA'] + 0.4 * df['SOX2']
df['Purity'] = (purity_signal + purity_noise) / 2.5

# Viability: starts at 100 and is reduced by passage number and by how far the
# O2 level sits below 20%.
passage_penalty = 0.8 * df['Passage_Number']
low_o2_penalty = 0.5 * (20 - df['O2_Level'])
df['Viability'] = 100 - passage_penalty - low_o2_penalty + viability_noise

# Yield: driven by seeding density and NANOG expression.
yield_signal = 0.0005 * df['Seeding_Density'] + 5 * df['NANOG']
df['Yield'] = yield_signal + yield_noise

In [13]:
# Clamp each outcome metric to its allowed [lower, upper] range.
outcome_bounds = {
    'Purity': (0, 100),
    'Viability': (0, 100),
    'Yield': (0, 500),
}
for outcome, (lower, upper) in outcome_bounds.items():
    df[outcome] = df[outcome].clip(lower, upper)

In [None]:
# Persist the full simulated dataset (protocol inputs, marker genes and outcome
# metrics) at the path the next cell reads back with pd.read_csv. While this
# line was commented out, nothing in the notebook produced that file, so a
# fresh Restart & Run All depended on a copy already sitting on disk.
df.to_csv("../data/ipsc_differentiation_data_with_genes.csv", index=False)

In [6]:
# Reload the dataset from disk; this rebinds df to whatever the CSV contains.
df = pd.read_csv("../data/ipsc_differentiation_data_with_genes.csv")

# Feature matrix: everything except the three outcome columns and the batch label.
non_feature_columns = ["Purity", "Viability", "Yield", "Batch_ID"]
x = df.drop(columns=non_feature_columns)

In [7]:
# Write the feature-only table next to the source data, without the row index.
x.to_csv("../data/features_only.csv", index=False)