# Modelos datathon

## Acceso a datos externos a las fases de producción

In [23]:
import pickle
import pandas as pd

In [24]:
from pathlib import Path
# NOTE(review): `dir` shadows the built-in of the same name; the name is kept
# in case other cells rely on it.
dir = Path('.')

# Build the structured `info` dictionary once and cache it as info.pkl.
# `exists` has to be *called*: the bare bound method is always truthy, which
# made the rebuild branch unreachable and crashed whenever info.pkl was missing.
# NOTE: pickle files can execute arbitrary code on load; only read trusted files.
if (dir / 'info.pkl').exists():
    info = pd.read_pickle(dir / 'info.pkl')
    print('Leyendo datos de info.pkl')
else:
    print('Creando info.pkl')
    datos = pd.read_pickle(dir / "dict_dataframes.pkl")
    info = {}
    info['biorreactores'] = {}
    info['centrifugas'] = {}

    biorreactores = info['biorreactores']
    centrifugas = info['centrifugas']
    for clave, tabla in datos.items():
        # Keep only the text before the first '/' and before the first '.'.
        nombre = clave.split('/')[0].split('.')[0]
        if 'Bio' in clave:
            # Keyed by the last space-separated token of the cleaned name.
            biorreactores[nombre.split(' ')[-1]] = tabla
        elif 'Cen' in clave:
            centrifugas[nombre.split(' ')[-1]] = tabla
        else:
            info[nombre] = tabla
    with open(dir / 'info.pkl', 'wb') as file:
        pickle.dump(info, file)

Leyendo datos de info.pkl


Ahora los datos tienen una forma más estructurada y fácil de acceder.

In [25]:
from datetime import datetime
def encontrar_subdataset_entre_fechas(df: pd.DataFrame, fecha_inicio: str, fecha_fin: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``DateTime`` falls within a closed interval.

    Args:
        df: Frame with a ``DateTime`` column holding either timestamp strings
            (e.g. ``'2023-03-17 23:00:00.000'``) or already-parsed datetimes.
        fecha_inicio: Inclusive lower bound, formatted ``'%Y-%m-%d %H:%M:%S'``.
        fecha_fin: Inclusive upper bound, formatted ``'%Y-%m-%d %H:%M:%S'``.

    Returns:
        The matching rows of ``df``, with their original index and columns.

    Raises:
        ValueError: If a bound does not match the expected format.
    """
    formato = '%Y-%m-%d %H:%M:%S'
    inicio = pd.to_datetime(fecha_inicio, format=formato)
    fin = pd.to_datetime(fecha_fin, format=formato)
    # Parse the whole column at once and drop sub-second precision, so the
    # comparison is done at one-second resolution as before.
    marcas = pd.to_datetime(df['DateTime']).dt.floor('s')
    return df[(marcas >= inicio) & (marcas <= fin)]

# Example: rows of the table stored under biorreactores['13169'] inside the given window
encontrar_subdataset_entre_fechas(
    info['biorreactores']['13169'],
    fecha_inicio='2023-03-17 23:00:00',
    fecha_fin='2023-03-18 02:00:00',
)

Unnamed: 0,DateTime,13169_FERM0101.Agitation_PV,13169_FERM0101.Air_Sparge_PV,13169_FERM0101.Biocontainer_Pressure_PV,13169_FERM0101.DO_1_PV,13169_FERM0101.DO_2_PV,13169_FERM0101.Gas_Overlay_PV,13169_FERM0101.Load_Cell_Net_PV,13169_FERM0101.pH_1_PV,13169_FERM0101.pH_2_PV,13169_FERM0101.PUMP_1_PV,13169_FERM0101.PUMP_1_TOTAL,13169_FERM0101.PUMP_2_PV,13169_FERM0101.PUMP_2_TOTAL,13169_FERM0101.Single_Use_DO_PV,13169_FERM0101.Single_Use_pH_PV,13169_FERM0101.Temperatura_PV
284,2023-03-17 23:00:00.000,0.0,0.0,480.0,0.0,,0.0,-19.599628,1.56356,-0.011332,0.0,27.280002,0.0,1486.330176,655.892432,799.623975,16.146649
285,2023-03-17 23:15:00.000,0.0,0.0,480.0,0.0,,0.0,-19.496784,1.56356,-0.011332,0.0,27.280002,0.0,1486.330176,655.892432,799.623975,16.788062
286,2023-03-17 23:30:00.000,0.0,0.0,480.0,0.0,0.0,0.0,-19.504235,1.56356,-0.011332,0.0,27.280002,0.0,1486.330176,655.892432,799.623975,17.027268
287,2023-03-17 23:45:00.000,0.0,0.0,480.0,0.0,,0.0,-19.27078,1.56356,-0.011332,0.0,27.280002,0.0,1486.330176,655.892432,799.623975,17.838418
288,2023-03-18 00:00:00.000,0.0,0.0,480.0,0.0,,0.0,-19.2,1.56356,-0.011332,0.0,27.280002,0.0,1486.330176,655.892432,799.623975,18.057414
289,2023-03-18 00:15:00.000,0.0,0.0,480.0,0.0,,0.0,-19.2,1.56356,-0.011332,0.0,27.280002,0.0,1486.330176,655.892432,799.623975,18.07987
290,2023-03-18 00:30:00.000,0.0,0.0,480.0,0.0,,0.0,-19.2,1.56356,-0.011332,0.0,8.894705,0.0,1486.330176,655.892432,799.623975,18.098775
291,2023-03-18 00:45:00.000,0.0,0.0,480.0,0.0,,0.0,-19.2,1.56356,-0.011332,0.0,27.280002,0.0,1486.330176,655.892432,799.623975,18.094757
292,2023-03-18 01:00:00.000,0.0,0.0,480.0,0.0,,0.0,-19.2,1.56356,-0.011332,0.0,27.280002,0.0,249.033884,655.892432,799.623975,18.012873
293,2023-03-18 01:15:00.000,0.0,0.0,480.0,0.0,,0.0,-19.2,1.56356,-0.011332,0.0,1.429191,0.0,1486.330176,655.892432,799.623975,18.07546


## Obtención de datos

En algunos datasets el ID del lote es `Lote` y en otros es `Orden`. Comprobamos que ambas columnas tienen el mismo número de valores únicos:

In [26]:
# Compare how many distinct values the `Lote` and `Orden` columns hold
(
    len(info['OF 123456 v02']['Lote'].unique())
    == len(info['OF 123456 v02']['Orden'].unique())
)

True

Creamos un traductor de IDs

In [27]:
from utils import limpiar_string_lote

# Two-way lookup between `Orden` values and cleaned `Lote` values.
lote_a_orden = {}
orden_a_lote = {}

# Walk the two columns in lockstep instead of using DataFrame.iterrows(), which
# builds a Series per row and may upcast values to a common dtype.
# If a value repeats, the last row seen wins.
for orden, lote_crudo in zip(info['OF 123456 v02']['Orden'], info['OF 123456 v02']['Lote']):
    lote = limpiar_string_lote(lote_crudo)
    lote_a_orden[lote] = orden
    orden_a_lote[orden] = lote
orden_a_lote

{200178572: 23019,
 200179217: 23020,
 200181620: 23021,
 200182428: 23022,
 200182429: 23023,
 200182430: 23024,
 200182431: 23025,
 200182432: 23026,
 200182433: 23027,
 200182434: 23028,
 200182435: 23029,
 200182436: 23030,
 200182437: 23031,
 200182440: 23032,
 200182441: 23033,
 200182442: 23034,
 200182443: 23035,
 200182444: 23036,
 200185569: 23038,
 200185570: 23039,
 200182445: 23040,
 200184533: 23041,
 200184534: 23042,
 200184603: 23043,
 200184604: 23044,
 200184605: 23045,
 200184606: 23046,
 200184607: 23047,
 200184609: 23048,
 200184610: 23049,
 200184611: 23050,
 200184612: 23051,
 200187023: 23053,
 200187024: 23054,
 200187025: 23055,
 200182448: 23057,
 200187026: 23056,
 200182446: 23061,
 200187027: 23060,
 200187028: 23063,
 200187029: 23064,
 200187030: 23065,
 10005176: 23273,
 200187032: 23067,
 200187033: 23068,
 200188840: 23069,
 200188841: 23070,
 200188842: 23071,
 200188843: 23072,
 200188844: 23073,
 200188846: 23075,
 200188847: 23076,
 200188848: 2

En el módulo `utils.py` se han creado las funciones necesarias para que, dado un lote y las horas de entrada y salida de un biorreactor, se obtengan todos los datos del biorreactor mientras el lote estaba en él, rellenando con ceros al final para que todos los datos de biorreactores de un lote tengan las mismas dimensiones. Es decir, solo el lote que más tiempo ha estado en un biorreactor no tendrá ceros al final.

In [28]:
from utils import buscar_registros_biorreactor

# n_registros must be the maximum number of records a bioreactor can have
buscar_registros_biorreactor(
    info=info,
    biorreactor=13169,
    fecha_inicio='2023-03-17 23:00:00',
    fecha_fin='2023-03-18 02:00:00',
    n_registros=15,
)

[[0.0,
  0.0,
  480.0,
  0.0,
  nan,
  0.0,
  -19.59962766133024,
  1.5635604858398402,
  -0.011331558227539042,
  0.0,
  27.28000183105472,
  0.0,
  1486.330175781248,
  655.8924316406249,
  799.6239746093752,
  16.146648745372882],
 [0.0,
  0.0,
  480.0,
  0.0,
  nan,
  0.0,
  -19.49678390348008,
  1.5635604858398402,
  -0.011331558227539042,
  0.0,
  27.28000183105472,
  0.0,
  1486.330175781248,
  655.8924316406249,
  799.6239746093752,
  16.78806189345856],
 [0.0,
  0.0,
  480.0,
  0.0,
  0.0,
  0.0,
  -19.5042352985844,
  1.5635604858398402,
  -0.011331558227539042,
  0.0,
  27.28000183105472,
  0.0,
  1486.330175781248,
  655.8924316406249,
  799.6239746093752,
  17.02726750796056],
 [0.0,
  0.0,
  480.0,
  0.0,
  nan,
  0.0,
  -19.270779786220643,
  1.5635604858398402,
  -0.011331558227539042,
  0.0,
  27.28000183105472,
  0.0,
  1486.330175781248,
  655.8924316406249,
  799.6239746093752,
  17.838417824518398],
 [0.0,
  0.0,
  480.0,
  0.0,
  nan,
  0.0,
  -19.200000000000003,

In [29]:
from utils import buscar_registros_centrifuga

# n_registros must be the maximum number of records a unit can have (the original
# note said "bioreactor"; presumably the centrifuge is meant here — TODO confirm)
buscar_registros_centrifuga(
    info,
    12912,
    '2023-03-17 23:00:00',
    '2023-03-18 02:00:00',
    n_registros=15,
)

[[0.0, 0.0, 100.0, 0.0, 0.004108810424804688, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.004108100093574144, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.004108810424804688, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.004108810424804688, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.004108810424804688, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.004412584755555072, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.00384588877107956, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.004125701377253896, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.004964436911059817, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.004395243181169152, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.004116180744324904, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.004398155212402344, nan, 0.0],
 [0.0, 0.0, 100.0, 0.0, 0.004108810424804688, nan, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]

In [30]:
from utils import buscar_registros_temperaturas
# n_registros must be the maximum number of records a unit can have (the original
# note said "bioreactor"; presumably the temperature data is meant here — TODO confirm)
buscar_registros_temperaturas(
    info,
    '2023-03-17 23:00:00',
    '2023-03-18 02:00:00',
    n_registros=15,
)

[[21.0785815413964,
  44.8971456303654,
  23.0277404785156,
  39.3779528938923,
  20.3486709594727,
  36.2485542297363,
  20.2690963745117,
  47.6236953735352],
 [20.9725369506967,
  43.1119141711372,
  21.6715902671927,
  40.6776146469457,
  20.2112274169922,
  35.8470764160156,
  20.1388893127441,
  47.8949661254883],
 [23.922718792525,
  37.0043127275235,
  22.8135163502505,
  37.5125481087746,
  20.0665512084961,
  35.4470996427859,
  20.0267639160156,
  47.8985824584961],
 [23.6734716392996,
  36.6790166993381,
  23.3329257965088,
  35.9509451023306,
  20.088249206543,
  33.6460747908853,
  20.0267639160156,
  47.6273155212402],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.

## Creación datos input

Ahora vamos a iterar sobre los lotes que existen extrayendo sus datos de cada una de las fases, concatenando los distintos datos en un vector.

In [31]:
# Distinct values of the `Lote` column, each passed through limpiar_string_lote
lotes = [
    limpiar_string_lote(lote_crudo)
    for lote_crudo in info['OF 123456 v02']['Lote'].unique()
]
lotes

[23019,
 23020,
 23021,
 23022,
 23023,
 23024,
 23025,
 23026,
 23027,
 23028,
 23029,
 23030,
 23031,
 23032,
 23033,
 23034,
 23035,
 23036,
 23038,
 23039,
 23040,
 23041,
 23042,
 23043,
 23044,
 23045,
 23046,
 23047,
 23048,
 23049,
 23050,
 23051,
 23053,
 23054,
 23055,
 23057,
 23056,
 23061,
 23060,
 23063,
 23064,
 23065,
 23273,
 23067,
 23068,
 23069,
 23070,
 23071,
 23072,
 23073,
 23075,
 23076,
 23077,
 23078,
 23079,
 23080,
 23082,
 23083,
 23084,
 23081,
 23085,
 23086,
 23087,
 23088,
 23089,
 23090,
 23091,
 23092,
 23093,
 23094,
 23095,
 23096,
 23097,
 23098,
 23099,
 23100,
 23101,
 23102,
 23103,
 23106,
 23107,
 23105,
 23104,
 23108,
 23109,
 23110,
 23111,
 23112,
 23113,
 23114,
 23115,
 23116,
 23117,
 23121,
 23118,
 23119,
 23120,
 23122,
 23123,
 23126,
 23127,
 23124,
 23125,
 23130,
 23129,
 23134,
 23131,
 23135,
 23132,
 23133,
 23136,
 23137,
 23138,
 23139,
 24003,
 24004,
 24005,
 24007,
 24008,
 24010,
 24009,
 24011,
 24012,
 24014,
 24015,


Tenemos todos los id de lotes, ahora vamos a extraer la información.

## Siguientes pasos

Hay que generar un vector por cada dato de entrada que se le va a pasar a la red neuronal, por ejemplo un vector para los biorreactores, otro para los de producción y así sucesivamente. Todos tendrán la misma longitud y cada uno de los vectores será procesado por una capa de entrada, para más tarde combinarse en la red y dar una única salida, la predicción de producto 1.

In [32]:
# Show the table stored under the 'Fases producción v02' key of `info`
info['Fases producción v02']

Unnamed: 0,LOTE,Orden en el encadenado,LOTE parental,ID Bioreactor,Fecha/hora inicio,Fecha/hora fin,Volumen de inóculo utilizado,Turbidez inicio cultivo,Turbidez fin cultivo,Viabilidad final cultivo,ID Centrífuga,Centrifugación 1 turbidez,Centrifugación 2 turbidez,Producto 1,Producto 2
0,23019,1,,14615,2023-03-21 07:30:00,2023-03-23 06:30:00,82.40,17.28,91.20,184000000,17825,,,1747.920,6.00
1,23020,1,,14616,2023-03-21 07:30:00,2023-03-23 06:30:00,80.40,18.80,91.20,181600000,14246,,,1676.160,6.56
2,23021,1,,13170,2023-03-22 07:30:00,2023-03-24 06:30:00,66.40,16.16,86.40,248000000,17825,,,1928.496,8.08
3,23022,1,,14614,2023-03-22 07:30:00,2023-03-24 06:30:00,85.60,18.48,83.20,229600000,12912,,,1782.800,5.92
4,23023,1,,14615,2023-03-28 07:27:00,2023-03-30 10:00:00,77.60,17.12,74.40,132800000,17825,26.56,20.88,1861.840,2.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,24049,1,,14617,2024-03-16 09:22:00,2024-03-18 08:23:00,83.60,18.88,72.64,164800000,12912,30.56,17.00,1342.800,4.88
148,24050,1,,14614,2024-03-23 08:57:00,2024-03-25 08:28:00,84.16,17.76,67.60,152000000,6379,29.44,26.64,1422.800,3.68
149,24051,1,,13169,2024-03-23 08:57:00,2024-03-25 08:33:00,84.16,17.76,80.80,160800000,12912,33.44,19.32,1486.560,5.52
150,24052,2,24050.0,14614,2024-03-25 13:28:00,2024-03-27 08:51:00,86.40,17.28,69.04,148000000,14246,23.68,18.20,1857.280,6.00
