# Modelo predictivo

### Importar paquetes

In [1]:
import pandas as pd
import numpy as np
import os
import re

### Limpieza de los datos

Empezaremos por todas las transformaciones necesarias tras el análisis previo. El objetivo es conseguir un conjunto de entrenamiento y test para nuestros modelos.

A pesar de que el reto específica que se nos proporcionarían diferentes conjuntos de test, tan solo contamos con uno. Eso nos hizo sospechar, tras un examen manual, nos dimos cuenta que los lotes de test están mezclados con el resto. Así que lo primero será separar dichos lotes y a partir de ahi construir la tabla con los campos relevantes. 

Este proceso será iterativo, por lo que complica la entrega, ya que solo podemos entregar un archivo. Por tanto se irá documentando, las diferentes transformaciones que se harán en distintos puntos del proceso hasta conseguir un modelo con buen rendimiento.

Como primer acercamiento será obtener los valores agregados de las distintas fases para poder predecir la variable objetivo.
Para ello necesitaremos crear un `pipeline` que garantize el orden de los pasos, dado que la entrada que se nos proporciona para el test es insuficiente para realizar una predicción

In [2]:
df_cultivo_test = pd.read_excel('raw_data/Fases producción_test v02.xlsx', sheet_name='Cultivo final', engine='openpyxl')
df_cultivo_test.head(5)

Unnamed: 0,LOTE,Orden en el encadenado,LOTE parental,ID Bioreactor,Fecha/hora inicio,Fecha/hora fin,Volumen de inóculo utilizado,Turbidez inicio cultivo,Turbidez fin cultivo,Viabilidad final cultivo,ID Centrífuga,Centrifugación 1 turbidez,Centrifugación 2 turbidez,Producto 1,Producto 2
0,24054,1,,14616,2024-04-16 08:12:00,2024-04-18 07:28:00,81.6,15.44,85.6,184800000,14246.0,27.84,23.96,,
1,24055,1,,14614,2024-04-13 08:18:00,2024-04-15 08:30:00,,14.32,73.68,175200000,12912.0,30.96,23.16,,
2,24056,1,,14615,2024-04-13 08:18:00,2024-04-15 08:15:00,,14.56,82.4,168000000,14246.0,29.52,28.88,,
3,24057,1,,13170,2024-04-16 08:12:00,2024-04-18 07:41:00,82.4,17.76,78.96,180800000,12912.0,31.04,25.32,,
4,24058,2,24055.0,14614,2024-04-15 12:28:00,2024-04-17 08:14:00,87.2,18.0,82.4,144800000,12912.0,26.08,20.36,,


In [3]:
lotes_test = df_cultivo_test['LOTE'].unique()
len(lotes_test)

56

In [4]:
df_cultivo = pd.read_excel('raw_data/Fases producción v02.xlsx', sheet_name='Cultivo final', engine='openpyxl')
df_cultivo.head(5)

Unnamed: 0,LOTE,Orden en el encadenado,LOTE parental,ID Bioreactor,Fecha/hora inicio,Fecha/hora fin,Volumen de inóculo utilizado,Turbidez inicio cultivo,Turbidez fin cultivo,Viabilidad final cultivo,ID Centrífuga,Centrifugación 1 turbidez,Centrifugación 2 turbidez,Producto 1,Producto 2
0,23019,1,,14615,2023-03-21 07:30:00,2023-03-23 06:30:00,82.4,17.28,91.2,184000000,17825,,,1747.92,6.0
1,23020,1,,14616,2023-03-21 07:30:00,2023-03-23 06:30:00,80.4,18.8,91.2,181600000,14246,,,1676.16,6.56
2,23021,1,,13170,2023-03-22 07:30:00,2023-03-24 06:30:00,66.4,16.16,86.4,248000000,17825,,,1928.496,8.08
3,23022,1,,14614,2023-03-22 07:30:00,2023-03-24 06:30:00,85.6,18.48,83.2,229600000,12912,,,1782.8,5.92
4,23023,1,,14615,2023-03-28 07:27:00,2023-03-30 10:00:00,77.6,17.12,74.4,132800000,17825,26.56,20.88,1861.84,2.96


___

In [5]:
def get_orden_data():
    # Cargamos datos
    df = pd.read_excel('raw_data/OF 123456 v02.xlsx', engine='openpyxl')

    # Corregimos caracteres especiales en Lote 
    df['LOTE'] = df['Lote'].apply(lambda x: int(x.replace('/', '').replace('P', '')))

    # Renombrado de columnas
    df = df.rename(columns={'Orden': 'orden', 'Cantidad entregada': 'cantidad'})
    
    # LOTE como indice y variables escogidas
    df = df.set_index('LOTE')
    final_cols = ['orden', 'cantidad']

    return df[final_cols]

In [6]:
def get_preinoculo_data(lotes):
    # Carga de datos
    df = pd.read_excel('raw_data/Fases producción v02.xlsx', sheet_name='Preinóculo', header=[0, 1], na_values=['NA', 'N.A'], engine='openpyxl')
    # Corregir cabeceras
    new_cols = [l2 if 'Unnamed' in l1 else f"{l1}-{l2}" for l1, l2 in df.columns.to_list()]
    df.columns = new_cols

    # Filtro de lotes
    df = df[df['LOTE'].isin(lotes)]

    # Imputación valores perdidos por la media
    for c in df.select_dtypes(include='float').columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')
        df[c] = df[c].fillna(df[c].mean())
        
    # Valores de pH y turbidez seleccionados para siguiente fase
    ph_cols = [col for col in df.columns if col.startswith('pH')]
    turbidez_cols = [col for col in df.columns if col.startswith('Turbidez')]
    lineas_cols = [col for col in df.columns if col.startswith('Línea')]

    # el ph y turbidez finales las consideramos como la mezcla de las dos líneas escogidas
    df['ph'] = (
        (
            df[ph_cols].values *
            df[lineas_cols].values
        ).sum(axis=1) /
        df[lineas_cols].sum(axis=1)
    ).round(3)
    
    df['turbidez'] = (
        (
            df[turbidez_cols].values *
            df[lineas_cols].values
        ).sum(axis=1) /
        df[lineas_cols].sum(axis=1)
    ).round(2)

    # Calculo duración
    # 1. obtenemos las horas
    # 2. corregimos las fechas "al reves"
    # 3. corregimos las horas negativas
    ini_col = 'Fecha/hora inicio'
    fin_col = 'Fecha/hora fin'
    df['duracion_horas'] = (df[fin_col] - df[ini_col]).dt.total_seconds() / 3600
    df.loc[df['duracion_horas'] < 0, [ini_col, fin_col]] = df.loc[df['duracion_horas'] < 0, [fin_col, ini_col]].values
    df['duracion_horas'] = df['duracion_horas'].abs().round(2)

    # LOTE como indice y variables escogidas
    df = df.set_index('LOTE')
    final_cols = ['ph','turbidez','duracion_horas']

    return df[final_cols]


In [7]:
def get_inoculo_data(lotes, df_preinoculo):
    # Carga de datos
    df = pd.read_excel('raw_data/Fases producción v02.xlsx', sheet_name='Inóculo', engine='openpyxl')
    df['Viabilidad final cultivo'] = pd.to_numeric(df['Viabilidad final cultivo'], errors='coerce')

    # Filtrado de lotes 
    df = df[df['LOTE'].isin(lotes)]
    df = df.dropna(subset=['Fecha/hora inicio','Fecha/hora fin'])

    # Renombrado de columnas
    df = df.rename(columns={'Fecha/hora inicio': 'ts_inicio', 
                            'Fecha/hora fin': 'ts_fin', 
                            'ID bioreactor': 'id_bioreactor',
                            'Turbidez inicio cultivo': 'turbidez_ini',
                            'Turbidez final culttivo': 'turbidez_fin',
                            'Viabilidad final cultivo': 'viabilidad'}
    )

    # Imputación de valores perdidos
    df.loc[df['turbidez_ini'].isna(), ['turbidez_ini']] = (
        df[df['turbidez_ini'].isna()]
        .join(df_preinoculo, 
              on='LOTE', how='left', rsuffix='preinoculo')
        ['turbidez']
    )
    df['turbidez_dif'] = df['turbidez_fin'] - df['turbidez_ini']

    # > Si con la anterior imputación volvemos a tener un NaN en la resta, imputamos por la media
    df['turbidez_dif'] = df['turbidez_dif'].fillna(df['turbidez_dif'].mean())
    
    # Calculo duracion de la fase en horas
    ini_col = 'ts_inicio'
    fin_col = 'ts_fin'
    df['duracion_horas'] = (df[fin_col] - df[ini_col]).dt.total_seconds() / 3600
    df.loc[df['duracion_horas'] < 0, [ini_col, fin_col]] = df.loc[df['duracion_horas'] < 0, [fin_col, ini_col]].values
    df['duracion_horas'] = df['duracion_horas'].abs().round(2)

    # LOTE como indice y variables escogidas
    df = df.set_index('LOTE')
    final_cols = ['id_bioreactor', 'ts_inicio', 'ts_fin', 'turbidez_dif', 'viabilidad', 'duracion_horas']

    return df[final_cols]

In [8]:
def get_cultivo_data(lotes):

    # Carga de datos
    df = pd.read_excel('raw_data/Fases producción v02.xlsx', sheet_name='Cultivo final', engine='openpyxl')
    df['Viabilidad final cultivo'] = pd.to_numeric(df['Viabilidad final cultivo'], errors='coerce')

    # Filtrado de lotes 
    df = df[df['LOTE'].isin(lotes)]

    # Renombrado de columnas
    df = df.rename(columns={'Fecha/hora inicio': 'ts_inicio', 
                            'Fecha/hora fin': 'ts_fin', 
                            'ID Bioreactor': 'id_bioreactor',
                            'ID Centrífuga': 'id_centrifugadora',
                            'LOTE parental': 'lote_padre',
                            'Volumen de inóculo utilizado': 'volumen_ini',
                            'Turbidez inicio cultivo': 'turbidez_ini',
                            'Turbidez fin cultivo': 'turbidez_fin',
                            'Centrifugación 1 turbidez':'turbidez_cfg1',
                            'Centrifugación 2 turbidez':'turbidez_cfg2',
                            'Viabilidad final cultivo': 'viabilidad',
                            'Producto 1': 'producto'}
    )

    # Imputar valores perdidos
    df['lote_padre'] = df['lote_padre'].fillna(0)
    df['lote_padre'] = df['lote_padre'].astype(int)
    # La variable volumen del dataset de inoculo no tiene misma semantica, se imputa por la media (no hay demasiada desviacion)
    df['volumen_ini'] = df['volumen_ini'].fillna(df['volumen_ini'].mean())

    # # NO ME GUSTA ESTA IDEA
    # df['turbidez_cfg1'] = df['turbidez_cfg1'].fillna(df['turbidez_cfg1'].mean())
    # df['turbidez_cfg2'] = df['turbidez_cfg2'].fillna(df['turbidez_cfg2'].mean())

    # Calculos de diferencias
    df['turbidez_dif'] = (df['turbidez_fin'] - df['turbidez_ini']).round(2) # la turbidez aumenta en los bioreactores
    # df['turbidez_cfg_dif'] = (df['turbidez_cfg1'] - df['turbidez_cfg2']).round(2)  # la turbidez disminuye tras cada centrifugado
    

    ####  ME LLEVO LA OPERACIÓN AL DATASET CON INFORMACIóN DE LAS CENTRIFUGADORAS
    ####  AUNQUE DAN RESULTADOS DISTINTOS: POR EJEMPLO EL LOTE 23023, AQUI SALE 5.68 Y ABAJO SALE 6.24


    # Calculo duracion de la fase en horas
    ini_col = 'ts_inicio'
    fin_col = 'ts_fin'
    df['duracion_horas'] = (df[fin_col] - df[ini_col]).dt.total_seconds() / 3600
    df.loc[df['duracion_horas'] < 0, [ini_col, fin_col]] = df.loc[df['duracion_horas'] < 0, [fin_col, ini_col]].values
    df['duracion_horas'] = df['duracion_horas'].abs().round(2)

    # LOTE como indice y variables escogidas
    df = df.set_index('LOTE')
    final_cols = ['id_bioreactor', 'id_centrifugadora', 'lote_padre', 'ts_inicio', 'ts_fin', 
                  'volumen_ini', 'turbidez_dif', 'duracion_horas', 'producto'] # turbidez_cfg_dif

    return df[final_cols]
    

In [9]:
def concatenate_bioreactor_data():
    bioreactores_files = [f for f in os.listdir('./raw_data') if not os.path.isdir(f) and f.startswith('Biorreactor')]
    bioreactores_dfs = []
    for bio_fname in bioreactores_files:
        
        numbers = re.findall(r'\d+', bio_fname)
        id_bioreactor = int(numbers[0])

        df = pd.read_excel(f'./raw_data/{bio_fname}', sheet_name='Datos', index_col='DateTime', parse_dates=True, engine='openpyxl')
        
        clean_cols = [col.split('.')[1].lower() for col in df.columns]
        renamed_cols = {old: new for old, new in zip(df.columns, clean_cols)}
        
        df = df.rename(columns=renamed_cols)
        df['id_bioreactor'] = id_bioreactor
        bioreactores_dfs.append(df)

    df = pd.concat(bioreactores_dfs, axis=0).sort_index().set_index('id_bioreactor', append=True)

    df.to_csv('processed_data/concatenated_bioreactor_data.csv')
    return df

In [10]:
def load_concatenated_bioreactor_data():
    if os.path.exists('processed_data/concatenated_bioreactor_data.csv'):
        return pd.read_csv('./processed_data/concatenated_bioreactor_data.csv', index_col=[0,1], parse_dates=True)
    return concatenate_bioreactor_data()

In [11]:
def concatentate_cfg_data():
    centrifugadora_files = [f for f in os.listdir('./raw_data') if not os.path.isdir(f) and f.startswith('Centrífuga')]
    centrifugadora_dfs = []
    cols_name_map = {
        'CTF0101.EN_Parcial' : 'n_descargas_parciales',
        'CTF0101.EN_Total': 'n_descargas_totales',
        'D01780551.PV': 'valvula_apertura',
        'D01906041.PV': 'caudal',
        'D01916047.PV': 'contrapresion',
        'D01916503.PV': 'presion_agua',
        'D01919022.PV': 'velocidad_separacion'
    }
    for cfg_fname in centrifugadora_files:
        
        numbers = re.findall(r'\d+', cfg_fname)
        id_centrifugadora = int(numbers[0])

        df = pd.read_excel(f'./raw_data/{cfg_fname}', sheet_name='Datos', index_col='DateTime', parse_dates=True, engine='openpyxl')

        new_cols = df.columns.map(lambda x: x.replace(f'{id_centrifugadora}_',''))
        df = df.rename(columns={old_col: new_col for old_col, new_col in zip(df.columns, new_cols)})
        df = df.rename(columns=cols_name_map)
        df['id_centrifugadora'] = id_centrifugadora

        centrifugadora_dfs.append(df)

    df = pd.concat(centrifugadora_dfs, axis=0).sort_index().set_index('id_centrifugadora', append=True)

    df.to_csv('processed_data/concatentated_cfg_data.csv')
    return df

In [12]:
def load_concatentated_cfg_data():
    if os.path.exists('processed_data/concatentated_cfg_data.csv'):
        return pd.read_csv('./processed_data/concatentated_cfg_data.csv', index_col=[0,1], parse_dates=True)
    return concatentate_cfg_data()

In [13]:
def get_cfg_time_data():

    # Carga de datos
    df = pd.read_excel('raw_data/Horas inicio fin centrífugas.xlsx', parse_dates=["DATEVALUE"] , engine='openpyxl')

    df = df.rename(columns={
        'EQUIPO': 'id_centrifugadora',
        'Orden': 'orden',
        'Operación': 'operacion',
        'DATEVALUE': 'ts'
    })
    df['operacion'] = df['operacion'].map({'Centrifugació 1 ini': 'cfg_1_ini', 
                                           'Centrifugació 1 fi':'cfg_1_fin', 
                                           'Centrifugació 2 ini': 'cfg_2_ini', 
                                           'Centrifugació 2 fi': 'cfg_2_fin'
                                           }
    )

    df_pivot = df.pivot(columns='operacion', values=['ts'])
    df_pivot.columns = list(map("_".join, df_pivot.columns))

    index_cols = ['id_centrifugadora', 'orden']
    df = df[index_cols]
    df = pd.concat([df, df_pivot], axis=1)
    df = df.groupby(by=index_cols).first()

    df['duracion_cfg_1'] = ((df['ts_cfg_1_fin'] - df['ts_cfg_1_ini']).dt.total_seconds() / 3600).round(2)
    df['duracion_cfg_2'] = ((df['ts_cfg_2_fin'] - df['ts_cfg_2_ini']).dt.total_seconds() / 3600).round(2)

    return df


In [14]:
def get_ipc_data(lotes):

    # Cargar datos 
    df_cultivo = pd.read_excel('raw_data/Cinéticos IPC.xlsx', sheet_name='Cultivos finales', engine='openpyxl')
    df_cfg = pd.read_excel('raw_data/Cinéticos IPC.xlsx', sheet_name='Centrifugación', engine='openpyxl')

    df_cultivo['Turbidez'] = pd.to_numeric(df_cultivo['Turbidez'], errors='coerce')
    df_cultivo['Glucosa g/L'] = pd.to_numeric(df_cultivo['Glucosa g/L'], errors='coerce')

    df_cfg['Turbidez'] = pd.to_numeric(df_cfg['Turbidez'], errors='coerce')

    # Filtrar lotes
    df_cultivo['LOTE'] = df_cultivo['Lote'].apply(lambda l: int(l.replace('P','')) if type(l) != int else l)
    df_cfg['LOTE'] = df_cfg['Lote'].apply(lambda l: int(l.replace('P','')) if type(l) != int else l)

    df_cultivo = df_cultivo[df_cultivo['LOTE'].isin(lotes)]
    df_cfg = df_cfg[df_cfg['LOTE'].isin(lotes)]

    # Agrupación de datos
    df_cultivo = (
        df_cultivo
        .groupby(by='LOTE', as_index=False)
        .agg({'Fecha': ['count'], 'Turbidez': ['mean', np.ptp], 'Viabilidad': ['mean', np.ptp], 'Glucosa g/L': ['mean', np.ptp]})
        .set_index('LOTE')
    )
    df_cultivo.columns = [ '_'.join(str(s).lower() for s in col) for col in df_cultivo.columns]

    df_cfg = (
        df_cfg
        .groupby(by=['LOTE', 'Centrífuga', 'Centrifugada (1 o 2)'])
        .agg(turbidez_cfg_last=('Turbidez', 'last')) # mean # max_vol=('Volumen centrifugado (L)', 'max'), 
        .reset_index()
        .rename(columns={'Centrífuga': 'id_centrifugadora'})
    )

    df_pivot = df_cfg.pivot(columns='Centrifugada (1 o 2)', values=['turbidez_cfg_last']) # 'max_vol', 
    df_pivot.columns = [ '_'.join(str(s) for s in col) for col in df_pivot.columns]

    df_cfg = pd.concat([df_cfg, df_pivot], axis=1).drop(['Centrifugada (1 o 2)', 'turbidez_cfg_last'], axis=1) # 'max_vol',

    df_cfg = df_cfg.groupby(by=['LOTE', 'id_centrifugadora'], as_index=False).first().set_index('LOTE')
    df_cfg['turbidez_cfg_dif'] = (df_cfg['turbidez_cfg_last_1'] - df_cfg['turbidez_cfg_last_2']).round(2)
    
    df = df_cultivo.merge(df_cfg, on='LOTE')
    df = df.rename(columns={
        'fecha_count': 'muestras',
        'glucosa g/l_mean': 'glucosa_mean',
        'glucosa g/l_ptp': 'glucosa_ptp',
    })
    return df


In [16]:
df_info_general = get_orden_data()
lotes = df_info_general.index.unique()

df_preinoculo = get_preinoculo_data(lotes)
df_inoculo = get_inoculo_data(lotes, df_preinoculo)
df_cultivo = get_cultivo_data(lotes)

df_bioreactor = load_concatenated_bioreactor_data()
df_cfg = load_concatentated_cfg_data()

df_ipc = get_ipc_data(lotes)

Ahora que tenemos todos los variables agregadas, pasamos a la fase de cruzar "tablas" y construir los conjuntos de datos de entrenamiento y test

In [26]:
# Agregar las variables de los bioreactores en inoculo y cultivo

df_inoculo.head(5)

Unnamed: 0_level_0,id_bioreactor,ts_inicio,ts_fin,turbidez_dif,viabilidad,duracion_horas
LOTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23023,14618,2023-03-27 07:22:00,2023-03-28 07:29:00,13.44,106400000.0,24.12
23024,14618,2023-03-27 07:22:00,2023-03-28 07:29:00,13.44,106400000.0,24.12
23025,13172,2023-03-28 07:42:00,2023-03-29 06:43:00,10.64,84800000.0,23.02
23026,13172,2023-03-28 07:42:00,2023-03-29 06:43:00,10.64,84800000.0,23.02
23027,13172,2023-04-03 13:30:00,2023-04-04 11:35:00,4.64,104800000.0,22.08


In [20]:
df_bioreactor.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,agitation_pv,air_sparge_pv,biocontainer_pressure_pv,do_1_pv,do_2_pv,gas_overlay_pv,load_cell_net_pv,ph_1_pv,ph_2_pv,pump_1_pv,pump_1_total,pump_2_pv,pump_2_total,single_use_do_pv,single_use_ph_pv,temperatura_pv
DateTime,id_bioreactor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2023-03-15,14618,72.0,0.0,1.869612,15.953993,,4.000057,169.6,-527.387158,5.998478,0.0,104.159973,0.0,0.0,16.713527,6.016,29.456006
2023-03-15,14614,80.0,0.0,0.57266,0.0,-0.00553,4.000087,1576.8,-0.156925,5.888288,0.0,14.88,0.0,191.200293,799.991992,799.967969,30.216161
2023-03-15,14617,0.0,0.0,480.0,0.0,,0.0,0.0,1.707277,-1.969051,0.0,49.599991,0.0,2014.737695,799.991992,800.03999,19.319995
2023-03-15,14615,80.0,0.0,0.268969,18.73203,0.0,3.999874,1636.8,5.914182,-0.234763,0.0,22.439996,0.0,550.186572,20.719247,5.904,30.239898
2023-03-15,14616,80.0,0.0,0.715311,16.557993,0.0,3.999639,1652.8,5.929625,-389.260962,0.0,39.679996,0.0,391.860913,17.361165,5.872,29.607996


In [60]:
def filter_bioreactor_data(df_bioreactor, row):
       filter_condition = ((df_bioreactor.index.get_level_values('DateTime') >= row['ts_inicio']) & 
       (df_bioreactor.index.get_level_values('DateTime') <= row['ts_fin']) & 
       (df_bioreactor.index.get_level_values('id_bioreactor') == row['id_bioreactor']))

       df_filter = df_bioreactor[filter_condition]

       cols = ['agitation_pv', 'air_sparge_pv', 'gas_overlay_pv', 'single_use_do_pv', 'temperatura_pv']
       return pd.Series(df_filter[cols].mean().round(2))

In [76]:
agg_data = df_inoculo.apply(lambda row: filter_bioreactor_data(df_bioreactor, row), axis=1)
df_inoculo.merge(agg_data, on='LOTE') # .isna().sum() -> un registro con viabilidad nula

Unnamed: 0_level_0,id_bioreactor,ts_inicio,ts_fin,turbidez_dif,viabilidad,duracion_horas,agitation_pv,air_sparge_pv,gas_overlay_pv,single_use_do_pv,temperatura_pv
LOTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
23023,14618,2023-03-27 07:22:00,2023-03-28 07:29:00,13.44,106400000.0,24.12,72.00,2.61,4.00,23.40,29.44
23024,14618,2023-03-27 07:22:00,2023-03-28 07:29:00,13.44,106400000.0,24.12,72.00,2.61,4.00,23.40,29.44
23025,13172,2023-03-28 07:42:00,2023-03-29 06:43:00,10.64,84800000.0,23.02,70.43,1.03,3.70,23.92,29.49
23026,13172,2023-03-28 07:42:00,2023-03-29 06:43:00,10.64,84800000.0,23.02,70.43,1.03,3.70,23.92,29.49
23027,13172,2023-04-03 13:30:00,2023-04-04 11:35:00,4.64,104800000.0,22.08,69.46,1.09,3.91,42.53,29.23
...,...,...,...,...,...,...,...,...,...,...,...
24101,13171,2024-06-28 07:16:00,2024-06-29 07:06:00,11.92,91200000.0,23.83,71.98,1.53,4.00,22.97,29.46
24103,13171,2024-06-28 07:16:00,2024-06-29 07:06:00,11.92,91200000.0,23.83,71.98,1.53,4.00,22.97,29.46
24104,13172,2024-07-01 07:01:00,2024-07-02 08:01:00,11.92,82400000.0,25.00,72.00,1.40,4.00,24.69,29.46
24105,13172,2024-07-01 07:01:00,2024-07-02 08:01:00,11.92,82400000.0,25.00,72.00,1.40,4.00,24.69,29.46


In [74]:
agg_data = df_cultivo.apply(lambda row: filter_bioreactor_data(df_bioreactor, row), axis=1)
df_cultivo.merge(agg_data, on='LOTE').head(5) #.isna().sum() -> hay alguna col con un registro nulo

Unnamed: 0_level_0,id_bioreactor,id_centrifugadora,lote_padre,ts_inicio,ts_fin,volumen_ini,turbidez_dif,duracion_horas,producto,agitation_pv,air_sparge_pv,gas_overlay_pv,single_use_do_pv,temperatura_pv
LOTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
23019,14615,17825,0,2023-03-21 07:30:00,2023-03-23 06:30:00,82.4,73.92,47.0,1747.92,80.0,18.78,4.0,19.8,29.6
23020,14616,14246,0,2023-03-21 07:30:00,2023-03-23 06:30:00,80.4,72.4,47.0,1676.16,80.0,15.05,4.0,21.58,29.6
23021,13170,17825,0,2023-03-22 07:30:00,2023-03-24 06:30:00,66.4,70.24,47.0,1928.496,80.0,14.03,4.0,683.02,29.6
23022,14614,12912,0,2023-03-22 07:30:00,2023-03-24 06:30:00,85.6,64.72,47.0,1782.8,80.0,9.32,4.0,680.95,29.58
23023,14615,17825,0,2023-03-28 07:27:00,2023-03-30 10:00:00,77.6,57.28,50.55,1861.84,80.0,9.23,4.0,656.04,29.56


In [91]:
get_cfg_time_data()

Unnamed: 0_level_0,Unnamed: 1_level_0,ts_cfg_1_fin,ts_cfg_1_ini,ts_cfg_2_fin,ts_cfg_2_ini,duracion_cfg_1,duracion_cfg_2
id_centrifugadora,orden,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12912,10005236,2023-11-30 11:46:33,2023-11-30 08:38:34,2023-11-30 14:27:29,2023-11-30 12:50:22,3.13,1.62
12912,10005261,2024-02-02 11:03:18,2024-02-02 08:39:16,2024-02-02 13:30:48,2024-02-02 11:50:13,2.40,1.68
12912,200182428,2023-03-24 11:32:53,2023-03-24 08:11:53,2023-03-24 14:29:39,2023-03-24 13:09:42,3.35,1.33
12912,200182436,2023-04-20 10:35:17,2023-04-20 07:19:39,2023-04-20 17:27:43,2023-04-20 17:15:13,3.26,0.21
12912,200182442,2023-04-27 16:00:23,2023-04-27 12:53:01,2023-04-27 18:54:16,2023-04-27 17:27:08,3.12,1.45
...,...,...,...,...,...,...,...
17825,200209970,2024-07-22 10:27:37,2024-07-22 07:08:59,2024-07-22 13:04:36,2024-07-22 11:27:14,3.31,1.62
17825,200209983,2024-07-29 10:35:08,2024-07-29 07:13:21,2024-07-29 13:09:30,2024-07-29 11:24:48,3.36,1.74
17825,200209985,2024-07-31 11:05:43,2024-07-31 07:26:37,2024-07-31 13:36:41,2024-07-31 11:56:32,3.65,1.67
17825,200210971,2024-07-24 10:29:42,2024-07-24 06:47:07,2024-07-24 13:01:55,2024-07-24 11:18:34,3.71,1.72


In [93]:
df_cfg_time = get_cfg_time_data()
df_cfg_time = (
    df_info_general
    .reset_index() # no perder LOTE
    .merge(df_cfg_time.reset_index(), on='orden') # para poder cruzar con id_centrifugadora
    .set_index('LOTE')
    .drop(columns=['cantidad', 'orden'])
    )
df_cfg_time

Unnamed: 0_level_0,id_centrifugadora,ts_cfg_1_fin,ts_cfg_1_ini,ts_cfg_2_fin,ts_cfg_2_ini,duracion_cfg_1,duracion_cfg_2
LOTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
23019,17825,2023-03-23 10:29:43,2023-03-23 07:53:57,2023-03-23 13:03:38,2023-03-23 12:13:20,2.60,0.84
23020,14246,2023-03-23 10:26:25,2023-03-23 08:00:49,2023-03-23 13:34:10,2023-03-23 12:11:04,2.43,1.38
23021,17825,2023-03-24 11:53:41,2023-03-24 08:31:15,2023-03-24 14:30:45,2023-03-24 13:13:11,3.37,1.29
23022,12912,2023-03-24 11:32:53,2023-03-24 08:11:53,2023-03-24 14:29:39,2023-03-24 13:09:42,3.35,1.33
23023,17825,2023-03-30 12:26:49,2023-03-30 09:29:25,2023-03-30 14:52:53,2023-03-30 13:32:39,2.96,1.34
...,...,...,...,...,...,...,...
24108,14246,2024-07-08 11:07:41,2024-07-08 08:09:55,2024-07-08 13:32:56,2024-07-08 11:49:51,2.96,1.72
24106,12912,2024-07-03 11:00:11,2024-07-03 07:43:09,2024-07-03 13:19:16,2024-07-03 11:49:07,3.28,1.50
24107,14246,2024-07-03 11:08:40,2024-07-03 07:50:38,2024-07-03 13:33:24,2024-07-03 11:59:14,3.30,1.57
24109,12912,2024-07-05 11:13:40,2024-07-05 07:43:06,2024-07-05 13:43:20,2024-07-05 12:01:21,3.51,1.70


In [104]:
def filter_cfg_data(df_cfg, row, cfg_number):
    filter_condition = ((df_cfg.index.get_level_values('DateTime') >= row[f'ts_cfg_{cfg_number}_ini']) & 
       (df_cfg.index.get_level_values('DateTime') <= row[f'ts_cfg_{cfg_number}_fin']) & 
       (df_cfg.index.get_level_values('id_centrifugadora') == row['id_centrifugadora']))

    
    filter_data = df_cfg[filter_condition]
    filter_data.columns = [f'{col}_cfg_{cfg_number}' for col in filter_data.columns]
    return pd.Series(filter_data.mean().round(2))

In [108]:
agg_data = df_cfg_time.apply(lambda row: filter_cfg_data(df_cfg, row, 1), axis=1) # necesita ejecutarse de nuevo con '2'
# agg_data
df_cfg_time.merge(agg_data, on='LOTE')
# filter_cfg_data(df_cfg, df_cfg_time.iloc[1], 1)

Unnamed: 0_level_0,id_centrifugadora,ts_cfg_1_fin,ts_cfg_1_ini,ts_cfg_2_fin,ts_cfg_2_ini,duracion_cfg_1,duracion_cfg_2,velocidad_separacion_cfg_1,presion_agua_cfg_1,contrapresion_cfg_1,caudal_cfg_1,valvula_apertura_cfg_1,n_descargas_totales_cfg_1,n_descargas_parciales_cfg_1
LOTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
23019,17825,2023-03-23 10:29:43,2023-03-23 07:53:57,2023-03-23 13:03:38,2023-03-23 12:13:20,2.60,0.84,6679.59,,3.20,482.61,26.91,0.00,0.0
23020,14246,2023-03-23 10:26:25,2023-03-23 08:00:49,2023-03-23 13:34:10,2023-03-23 12:11:04,2.43,1.38,6677.31,,3.20,501.89,24.42,0.00,0.0
23021,17825,2023-03-24 11:53:41,2023-03-24 08:31:15,2023-03-24 14:30:45,2023-03-24 13:13:11,3.37,1.29,6680.76,,3.20,486.43,26.83,0.00,0.0
23022,12912,2023-03-24 11:32:53,2023-03-24 08:11:53,2023-03-24 14:29:39,2023-03-24 13:09:42,3.35,1.33,6678.62,,2.94,474.87,32.80,0.36,0.0
23023,17825,2023-03-30 12:26:49,2023-03-30 09:29:25,2023-03-30 14:52:53,2023-03-30 13:32:39,2.96,1.34,5549.73,,2.47,384.89,39.71,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24108,14246,2024-07-08 11:07:41,2024-07-08 08:09:55,2024-07-08 13:32:56,2024-07-08 11:49:51,2.96,1.72,5726.70,1.51,2.76,410.93,34.87,0.00,0.0
24106,12912,2024-07-03 11:00:11,2024-07-03 07:43:09,2024-07-03 13:19:16,2024-07-03 11:49:07,3.28,1.50,4890.35,1.35,2.03,329.54,42.78,0.07,0.0
24107,14246,2024-07-03 11:08:40,2024-07-03 07:50:38,2024-07-03 13:33:24,2024-07-03 11:59:14,3.30,1.57,5248.13,1.38,2.50,417.24,39.82,0.00,0.0
24109,12912,2024-07-05 11:13:40,2024-07-05 07:43:06,2024-07-05 13:43:20,2024-07-05 12:01:21,3.51,1.70,4865.15,1.34,2.32,350.62,43.07,0.00,0.0


In [18]:
df = df_preinoculo.merge(df_inoculo, on='LOTE', suffixes=('_preinoculo', '_inoculo'))
df.head(5)

Unnamed: 0_level_0,ph,turbidez,duracion_horas_preinoculo,id_bioreactor,ts_inicio,ts_fin,turbidez_dif,viabilidad,duracion_horas_inoculo
LOTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
23023,5.5,28.12,26.35,14618,2023-03-27 07:22:00,2023-03-28 07:29:00,13.44,106400000.0,24.12
23024,5.5,28.12,26.35,14618,2023-03-27 07:22:00,2023-03-28 07:29:00,13.44,106400000.0,24.12
23025,5.5,27.04,265.7,13172,2023-03-28 07:42:00,2023-03-29 06:43:00,10.64,84800000.0,23.02
23026,5.5,27.04,265.7,13172,2023-03-28 07:42:00,2023-03-29 06:43:00,10.64,84800000.0,23.02
23027,5.392,33.16,32.5,13172,2023-04-03 13:30:00,2023-04-04 11:35:00,4.64,104800000.0,22.08
