In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from datetime import datetime, timedelta

# Creating Data for RNA manufacturing

I picked the features that are essential for production: sensor outputs from the different machines.

1) Temperature
2) pH
3) Pressure
4) Dissolved Oxygen (%) - precise oxygenation levels to support enzymatic activity
5) Conductivity (mS/cm) - ionic strength for RNA stability
6) UV absorbance (260nm) - to monitor RNA concentration in real-time.
7) RNA Folding Efficiency - Circular Dichroism Spectroscopy
8) Nanopore RNA Integrity Score - detecting intact and degraded RNA molecules
9) Enzyme Concentration - T7 RNA polymerase

I created a system that measures all of these qualities in 3 batches simultaneously, every 15 minutes.

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Parameters for simulation
SEED = 42  # Fixed seed so Restart & Run All regenerates the same dataset
num_rows_per_batch = 100  # Number of data points per batch
batch_ids = ["Batch_001", "Batch_002", "Batch_003"]
start_time = datetime(2024, 12, 8, 8, 0)  # Starting timestamp
time_step = timedelta(minutes=15)  # Interval between readings

# Seed NumPy's legacy global generator. Every np.random.* call in this notebook
# draws from it, so seeding here makes the simulated readings reproducible.
np.random.seed(SEED)

# One zero-argument sampler per sensor column. The dict order fixes both the
# column order of `data` and the order in which values are drawn per reading.
samplers = {
    "Temperature (°C)": lambda: np.random.normal(37.5, 0.5),  # Mean 37.5, SD 0.5
    "pH": lambda: np.random.normal(7.4, 0.1),
    "Pressure (kPa)": lambda: np.random.normal(101.5, 1.0),
    "Quality Metric (%)": lambda: np.random.uniform(85, 95),  # Random values between 85 and 95
    "Dissolved Oxygen (%)": lambda: np.random.normal(60, 2),
    "Conductivity (mS/cm)": lambda: np.random.normal(15, 0.5),
    # NOTE(review): an SD of 1.0 around a mean of 0.8 yields negative absorbance
    # readings (some appear in the saved output) - confirm whether a smaller
    # SD such as 0.1 was intended.
    "UV Absorbance": lambda: np.random.normal(0.8, 1.0),
    "RNA Folding Efficiency (%)": lambda: np.random.uniform(90, 100),
    "Nanopore RNA Integrity (Score)": lambda: np.random.uniform(7.5, 10),
    "Enzyme Concentration (U/mL)": lambda: np.random.uniform(100, 130),
}

# Generate data: one list per column, filled batch by batch.
data = {"Timestamp": [], "Batch ID": [], **{column: [] for column in samplers}}

for batch in batch_ids:
    for i in range(num_rows_per_batch):
        # Every batch starts at `start_time`, so the batches are sampled at the same instants.
        timestamp = start_time + i * time_step
        data["Timestamp"].append(timestamp)
        data["Batch ID"].append(batch)
        for column, draw in samplers.items():
            data[column].append(draw())
                                            




In [3]:
# Assemble the simulated readings into one table: each key of `data` becomes a column.
df = pd.DataFrame.from_dict(data)
# Leave the frame as the cell's final expression so the notebook renders it.
df

Unnamed: 0,Timestamp,Batch ID,Temperature (°C),pH,Pressure (kPa),Quality Metric (%),Dissolved Oxygen (%),Conductivity (mS/cm),UV Absorbance,RNA Folding Efficiency (%),Nanopore RNA Integrity (Score),Enzyme Concentration (U/mL)
0,2024-12-08 08:00:00,Batch_001,38.107275,7.321717,100.490226,86.708730,59.391629,14.729690,0.301743,90.025372,8.419629,110.592979
1,2024-12-08 08:15:00,Batch_001,37.968837,7.415947,100.941397,87.822100,61.032041,13.279366,1.657361,93.871107,8.813845,105.625916
2,2024-12-08 08:30:00,Batch_001,37.408535,7.439668,100.695474,92.763296,60.392487,15.221677,1.859735,94.537502,8.345944,123.800014
3,2024-12-08 08:45:00,Batch_001,36.808186,7.460966,101.250553,94.101104,60.629405,16.039229,1.413965,96.031035,8.585404,103.887323
4,2024-12-08 09:00:00,Batch_001,37.626759,7.401403,101.310707,92.208867,61.116176,15.273418,-0.123650,92.789293,8.613030,107.240410
...,...,...,...,...,...,...,...,...,...,...,...,...
295,2024-12-09 07:45:00,Batch_003,37.681209,7.515834,101.378554,90.561076,59.244882,14.434512,0.720013,93.046872,7.809654,122.071441
296,2024-12-09 08:00:00,Batch_003,38.399242,7.357286,101.116423,86.182746,58.981271,15.037926,0.816031,90.793619,8.184243,129.071439
297,2024-12-09 08:15:00,Batch_003,37.529792,7.348742,102.700948,85.639828,57.848483,13.891028,1.108147,93.570071,7.795571,102.805039
298,2024-12-09 08:30:00,Batch_003,36.711263,7.399176,101.509033,90.214193,63.947165,15.745305,1.949443,99.425983,9.618155,119.397999


In [4]:
# Save the simulated readings to CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="bioreactor_data_normal.csv", index=False)

In [5]:
# Batch_002 is the batch chosen to carry the problematic readings.
problematic_batch = "Batch_002"
# The anomaly window opens at 10:00 on 2024-12-08 and closes two hours later, at 12:00.
start_anomaly = datetime(year=2024, month=12, day=8, hour=10)
end_anomaly = start_anomaly + timedelta(hours=2)



In [6]:
# Rows to overwrite: readings of the problematic batch whose timestamp falls
# inside the anomaly window (both endpoints included). The mask is built once
# and reused for the row count and the assignment.
anomaly_mask = (
    (df["Batch ID"] == problematic_batch)
    & (df["Timestamp"] >= start_anomaly)
    & (df["Timestamp"] <= end_anomaly)
)
n_anomalous = int(anomaly_mask.sum())

# Uniform (low, high) range drawn for each overwritten column. The dict order
# is both the column order of the assignment and the order of the random draws.
anomaly_ranges = {
    "pH": (6.5, 6.9),
    "Quality Metric (%)": (60, 80),
    "RNA Folding Efficiency (%)": (80, 85),
    "Nanopore RNA Integrity (Score)": (5.5, 7.0),
}

# NOTE: this overwrites `df` in place, so re-running the cell draws fresh values.
df.loc[anomaly_mask, list(anomaly_ranges)] = np.array(
    [np.random.uniform(low, high, n_anomalous) for low, high in anomaly_ranges.values()]
).T  # Transpose to (rows, columns) so each drawn array lines up with its column


In [7]:
# Keep only the rows that belong to the batch stored in `problematic_batch`.
is_problematic = df["Batch ID"].eq(problematic_batch)
df_problematic = df.loc[is_problematic]

In [8]:
# Show index labels 105 through 130 with all columns; .loc slices by label and includes both ends.
df_problematic.loc[105:130, :]

Unnamed: 0,Timestamp,Batch ID,Temperature (°C),pH,Pressure (kPa),Quality Metric (%),Dissolved Oxygen (%),Conductivity (mS/cm),UV Absorbance,RNA Folding Efficiency (%),Nanopore RNA Integrity (Score),Enzyme Concentration (U/mL)
105,2024-12-08 09:15:00,Batch_002,37.818196,7.394577,102.14777,91.51732,58.263361,13.86483,2.181796,95.398945,9.761256,100.545088
106,2024-12-08 09:30:00,Batch_002,37.821812,7.403121,99.858721,88.983202,56.608158,14.942868,0.855046,96.954392,9.483685,109.609576
107,2024-12-08 09:45:00,Batch_002,36.990602,7.372321,102.209753,90.861361,58.822875,15.154895,1.429215,90.4466,9.067296,127.273485
108,2024-12-08 10:00:00,Batch_002,36.847717,6.682774,99.6715,66.571545,60.680213,15.240626,0.544767,83.06833,6.917928,118.049638
109,2024-12-08 10:15:00,Batch_002,37.392697,6.604184,101.080858,73.548643,59.675513,15.170567,0.946382,84.928646,5.925278,100.595827
110,2024-12-08 10:30:00,Batch_002,37.922665,6.643476,99.983823,70.775812,56.962588,15.345374,0.378192,82.550709,6.118317,105.74176
111,2024-12-08 10:45:00,Batch_002,37.924641,6.543264,100.142268,77.57993,60.354121,14.402486,0.037496,82.022336,6.133653,110.579078
112,2024-12-08 11:00:00,Batch_002,37.108474,6.566848,101.373971,64.835331,59.318107,15.595452,-0.339056,83.736663,5.746956,123.641816
113,2024-12-08 11:15:00,Batch_002,37.768181,6.762166,101.749383,79.207261,60.12578,14.685551,0.536678,84.166427,6.443538,125.758324
114,2024-12-08 11:30:00,Batch_002,37.673954,6.858759,101.694263,72.21906,61.869699,14.012573,0.644152,81.501742,6.13183,126.302199


In [9]:
# Save `df` again under a second file name; by this cell the selected Batch_002
# rows have been overwritten in place. index=False leaves the row index out.
df.to_csv(path_or_buf="bioreactor_data_problematic.csv", index=False)