In [26]:
# Open VSCode and create a new project folder for your machine learning project.
# Open the terminal in VSCode by going to Terminal > New Terminal.
# Create a new environment using conda or pip. For example, to create a new environment with conda:
#     conda create --name myenv
#     conda activate myenv
# Install the necessary packages for your machine learning project. For example, with conda:
#     conda install pandas numpy scikit-learn flask
#     pip install -r requirements.txt    # use this instead if requirements.txt already lists the packages
# Export the dependencies of your project by running the command:
#     pip freeze > requirements.txt

<br>Table of Contents:
* [Import Libraries](#1)
* [Load Data from Excel files](#2)
* [Data Extraction](#3)


<a id="1"></a> <br>
## Import Libraries

In [27]:
# Data Analysis      
import pandas as pd          # data analysis library for handling structured data             
import numpy as np           # mathematical library for working with numerical data
import sys 
import os 
# Add the parent directory to sys.path so that the local `metrics` module can be imported
sys.path.append(os.path.abspath('../'))
from metrics import *


# Visualization
import matplotlib.pyplot as plt     # data visualization library for creating graphs and charts
%matplotlib inline
import seaborn as sns        # data visualization library based on matplotlib for creating more attractive visualizations
import missingno as msno    #visualize missing data


# Ignore warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas deprecation and dtype warnings — confirm that is intended.
import warnings
warnings.filterwarnings("ignore") 

# pandas display limits used when DataFrames are rendered in the notebook:
# at most 50 rows, at most 500 columns, and a 1000-character display width.
pd.set_option('display.max_rows', 50) 
pd.set_option('display.max_columns', 500) 
pd.set_option('display.width', 1000) 

<a id="3"></a> <br>
## Data Extraction

In [28]:
# Project root and the "titer" working folder inside it.
# NOTE(review): both are absolute paths on one specific Windows machine; the notebook
# will not run elsewhere until they are adjusted. current_wd repeats root_dir verbatim
# with "\titer" appended.
root_dir = r"C:\Users\pault\OneDrive - University of Oklahoma\GRA - Bio-Manufacturing\1. ML-Cytovance-OU-Research"
current_wd = r"C:\Users\pault\OneDrive - University of Oklahoma\GRA - Bio-Manufacturing\1. ML-Cytovance-OU-Research\titer"



# Make the project root the working directory: the relative "data/processed/..." paths
# read by get_data are resolved against it.
os.chdir(root_dir)

In [29]:
# Raw CSV header -> canonical column name. get_data applies this mapping with
# DataFrame.rename; headers that are not listed here are left unchanged.
columns_to_replace = {
    # --- input_* headers ---
    'input_Vessel Type': 'input_vessel_type',
    'input_Vessel Volume': 'input_vessel_volume',
    'input_Vessel Name': 'input_vessel_name',
    'input_Agitation (rpm)': 'input_agitation_rpm',
    'input_DO (%)': 'input_DO_%',
    'input_pH setpoint': 'input_pH_setpoint',
    'input_Gas flow': 'input_gas_flow',
    'input_Air (%)': 'input_air_%',
    'input_O2': 'input_O2',
    'input_Temp (oC)': 'input_Temp_c',
    'input_Media type': 'input_media_type',
    'input_Feed Type': 'input_feed_type',
    'input_Glucose Limit': 'input_glucose_limit',

    # --- output_* headers ---
    'output_OD600': 'output_OD600',
    'output_WCW (g/L)': 'output_WCW_gl',
    'output_Agitation': 'output_agitation',
    'output_Air %': 'output_air_%',
    'output_D0 %': 'output_D0_%',
    'output_GasFlow': 'output_gasflow',
    'output_O2.1': 'output_O2',
    'output_Ph': 'output_Ph',
    'output_Temp': 'output_Temp',
    'output_Feed %': 'output_feed_%',

    # --- titre headers ---
    # Both the "sample N" and the "(mg/ml)" spellings are renamed onto the same µgl
    # target names; only the labels change, no values are converted here. An earlier,
    # now-disabled variant of this mapping gave the (mg/ml) headers separate *_mg_ml
    # targets instead.
    'output_Titre sample 1': 'output_titre_µgl_sample_1',
    'output_Titre sample 2': 'output_titre_µgl_sample_2',
    'output_Titre (mg/ml) (Sample 1)': 'output_titre_µgl_sample_1',
    'output_Titre (mg/ml) Sample 2': 'output_titre_µgl_sample_2',
    'output_Titre (µg/µl)': 'output_titre_µgl',
    'output_Average Titre (mg/ml)': 'output_average_titre_µgl',

    # --- remaining output_* headers ---
    'output_Feed': 'output_feed',
    'output_Glycerol (g/L)': 'output_glycerol_gl',
    'output_Glucose (g/L)': 'output_glucose_gl',
    'output_Acetate (mmol/L)': 'output_acetate_mmol_l',
    'output_Phosphate (mmol/L)': 'output_phosphate_mmol_l',

    # --- alternative header spellings that share the targets above ---
    # The ".1" suffixes are presumably pandas' de-duplication of repeated headers —
    # TODO confirm against the raw files.
    'input_Agitation': 'input_agitation_rpm',
    'input_DO': 'input_DO_%',
    'input_Temp': 'input_Temp_c',
    'input_Air': 'input_air_%',
    'input_pH': 'input_pH_setpoint',
    'output_Agitation.1': 'output_agitation',
    'output_Temp.1': 'output_Temp',

    # NOTE(review): both headers below map to the same name, so a file that contains
    # both would end up with two 'output_titre_mg_ml' columns — confirm this cannot occur.
    'output_Titre (mg/ml)': 'output_titre_mg_ml',
    'output_Titre (mg/ml).1': 'output_titre_mg_ml',
}


def get_data(experiment:str, num_range, index_col=None, parse_dates=True):
    """
    Load the processed CSV files of one experiment and normalise their columns.

    Reads ``data/processed/{experiment}_{i}.csv`` for ``i = 1 .. num_range`` (relative
    to the current working directory), renames every header found in the module-level
    ``columns_to_replace`` mapping, and converts a text-typed ``input_O2`` column to
    floats.

    :param experiment: File-name prefix of the experiment, e.g. ``'exp_210623'``.
    :param num_range: Number of numbered CSV files to read (files 1 to num_range inclusive).
    :param index_col: Forwarded to ``pandas.read_csv``.
    :param parse_dates: Forwarded to ``pandas.read_csv``.
    :return: List of DataFrames, one per file, in file-number order.
    :raises FileNotFoundError: If one of the expected CSV files does not exist.
    """
    df_list = []

    for i in range(1, num_range + 1):
        df = pd.read_csv(f"data/processed/{experiment}_{i}.csv", index_col=index_col, parse_dates=parse_dates)
        df = df.rename(columns={col: columns_to_replace[col] for col in df.columns if col in columns_to_replace})

        # A text-typed input_O2 column (e.g. values carrying a unit suffix) is reduced to
        # its leading number. The whole number is extracted rather than only the first
        # character, so a value such as "10%" becomes 10.0 instead of 1.0; empty cells and
        # cells without any number become NaN instead of raising.
        if 'input_O2' in df.columns and df['input_O2'].dtype == object:
            leading_number = df['input_O2'].astype(str).str.extract(r'([-+]?\d*\.?\d+)', expand=False)
            df['input_O2'] = pd.to_numeric(leading_number, errors='coerce')

        df_list.append(df)

    return df_list

# Experiments: one list of DataFrames per experiment, one DataFrame per numbered CSV
# file (the second argument is the number of files), indexed by 'input_Timepoint (hr)'.
exp_210623 = get_data('exp_210623', 4, index_col='input_Timepoint (hr)', parse_dates=True)
exp_211013 = get_data('exp_211013', 4, index_col='input_Timepoint (hr)', parse_dates=True)
exp_211130 = get_data('exp_211130', 4, index_col='input_Timepoint (hr)', parse_dates=True)
exp_220309demo = get_data('exp_220309demo', 4, index_col='input_Timepoint (hr)', parse_dates=True)
exp_220315c1 = get_data('exp_220315c1', 6, index_col='input_Timepoint (hr)', parse_dates=True)
exp_220329c2 = get_data('exp_220329c2', 6, index_col='input_Timepoint (hr)', parse_dates=True)
exp_220822 = get_data('exp_220822', 4, index_col='input_Timepoint (hr)', parse_dates=True)

In [30]:
# exp_220315c1[3]

In [31]:
def standardize_column_order(df_list, reference_order):
    """
    Give every DataFrame in a list the same columns, in the same order.

    Each frame is reindexed on its columns: columns that are not in
    ``reference_order`` are dropped, and columns listed in ``reference_order`` but
    missing from the frame are added and filled with NaN. The input frames are not
    modified.

    :param df_list: List of pandas DataFrames to standardize.
    :param reference_order: List of column names in the desired order.
    :return: New list of reindexed DataFrames, in the same order as ``df_list``.
    """
    return [frame.reindex(columns=reference_order) for frame in df_list]


# Canonical column layout shared by every experiment DataFrame.
# NOTE(review): 'output_titre_mg_ml', one of the rename targets in columns_to_replace,
# is not listed here, so reindexing to this layout drops it — confirm that is intended.
reference_order = ['input_vessel_type', 'input_vessel_volume', 'input_vessel_name', 'input_agitation_rpm', 'input_DO_%', 'input_pH_setpoint', 
                   'input_gas_flow', 'input_air_%', 'input_O2', 'input_Temp_c', 'input_media_type', 'input_feed_type', 'input_glucose_limit', 
                   'output_OD600', 'output_WCW_gl', 'output_agitation', 'output_air_%', 'output_D0_%', 'output_gasflow', 'output_O2', 
                   'output_Ph', 'output_feed_%', 'output_feed', 'output_Temp', 'output_glycerol_gl', 'output_glucose_gl', 
                   'output_acetate_mmol_l', 'output_phosphate_mmol_l',
                   'output_titre_µgl_sample_1', 'output_titre_µgl_sample_2', 'output_average_titre_µgl', 'output_titre_µgl'
                   
                   ]

# Bring every DataFrame of every experiment to the canonical column layout.
exp_210623 = standardize_column_order(exp_210623, reference_order)
exp_211013 = standardize_column_order(exp_211013, reference_order)
exp_211130 = standardize_column_order(exp_211130, reference_order)
exp_220309demo = standardize_column_order(exp_220309demo, reference_order)
exp_220315c1 = standardize_column_order(exp_220315c1, reference_order)
exp_220329c2 = standardize_column_order(exp_220329c2, reference_order)
exp_220822 = standardize_column_order(exp_220822, reference_order)


In [32]:
exp_220315c1[3]

Unnamed: 0_level_0,input_vessel_type,input_vessel_volume,input_vessel_name,input_agitation_rpm,input_DO_%,input_pH_setpoint,input_gas_flow,input_air_%,input_O2,input_Temp_c,input_media_type,input_feed_type,input_glucose_limit,output_OD600,output_WCW_gl,output_agitation,output_air_%,output_D0_%,output_gasflow,output_O2,output_Ph,output_feed_%,output_feed,output_Temp,output_glycerol_gl,output_glucose_gl,output_acetate_mmol_l,output_phosphate_mmol_l,output_titre_µgl_sample_1,output_titre_µgl_sample_2,output_average_titre_µgl,output_titre_µgl
input_Timepoint (hr),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
0,5L Fermenter,5000ml,Ferm4,700,1,6.8,5,100,0.0,30,Media type A,Glucose,0,0.288333,,699.947998,100.0,104.460197,5.013672,0.0,6.700653,0.0,0.0,29.87915,,0.25893,9.32,31.4,,,,
2,5L Fermenter,5000ml,Ferm4,700,1,6.8,5,100,0.0,30,Media type A,Glucose,0,0.542333,,700.101074,100.0,100.281303,5.000001,0.0,6.791103,0.0,0.0,30.14337,,0.21418,10.07,28.42,,,,
4,5L Fermenter,5000ml,Ferm4,700,1,6.8,5,100,0.0,30,Media type A,Glucose,0,1.671333,,699.721313,100.0,92.700241,5.001091,0.0,6.899038,0.0,0.0,30.11269,,-0.03197,16.25,26.4,,,,
6,5L Fermenter,5000ml,Ferm4,700,1,6.8,5,100,0.0,30,Media type A,Glucose,0,7.253333,33.5,699.858276,100.0,46.38361,4.994204,0.0,6.826073,0.0,0.0,30.118931,,-0.0358,27.76,25.15,,,,
8,5L Fermenter,5000ml,Ferm4,700,1,6.8,5,100,0.0,30,Media type A,Glucose,0,14.22,,700.182678,100.0,55.87999,5.039995,0.0,6.893322,0.0,0.0,29.90379,,-0.04603,36.18,23.35,,,,
10,5L Fermenter,5000ml,Ferm4,700,1,6.8,5,100,0.0,30,Media type A,Glucose,0,25.766667,,700.176208,100.0,20.613449,5.019815,0.0,6.74033,10.39333,0.353,29.99588,,-0.0537,73.68,19.28,,,,
12,5L Fermenter,5000ml,Ferm4,700,1,6.8,5,100,0.0,30,Media type A,Glucose,0,28.5,,700.265625,100.0,10.809,4.980003,0.0,6.70267,20.243019,0.688,30.038919,,-0.05754,131.85,18.44,,,,
14,5L Fermenter,5000ml,Ferm4,700,1,6.8,5,100,0.0,30,Media type A,Glucose,0,35.333333,,700.588623,89.803253,45.338871,4.999862,10.16328,6.73495,30.18936,1.026,29.95904,,12.12191,136.88,16.18,,,,
16,5L Fermenter,5000ml,Ferm4,700,1,6.8,5,100,0.0,30,Media type A,Glucose,0,43.566667,,699.753418,88.113342,34.181629,5.065026,11.90465,6.725535,16.309999,0.554,30.038971,,23.14416,130.94,14.05,,,,
18,5L Fermenter,5000ml,Ferm4,700,1,6.8,5,100,0.0,30,Media type A,Glucose,0,45.433333,116.5,700.148315,93.571991,39.418701,4.960131,6.428413,6.707714,11.41,0.387,30.0082,,28.44429,124.01,13.43,,,,


In [33]:
def drop_col(df_list):
    """
    Reduce the titre columns of each DataFrame to a single ``output_titre_µgl`` column.

    For every frame:

    1. ``output_average_titre_µgl`` and ``output_titre_µgl_sample_2`` are removed
       (if present).
    2. If both ``output_titre_µgl_sample_1`` and ``output_titre_µgl`` exist, the one
       with fewer non-null values is removed; on a tie ``output_titre_µgl`` is removed.
       When ``output_titre_µgl_sample_1`` is the one that is kept, it is renamed to
       ``output_titre_µgl``.
    3. If no ``output_titre_µgl`` column exists afterwards, an all-NaN one is appended.

    The input frames are not modified, and calling the function again on its own
    output returns the same columns (columns that are already gone are ignored).

    :param df_list: List of pandas DataFrames to process.
    :return: New list of processed DataFrames, in the same order as ``df_list``.
    """
    redundant_cols = ['output_average_titre_µgl', 'output_titre_µgl_sample_2']
    titre_col = 'output_titre_µgl'
    sample_col = 'output_titre_µgl_sample_1'

    filtered_df = []
    for df in df_list:
        # drop() returns a new frame, so the caller's DataFrame stays untouched;
        # errors='ignore' keeps re-runs and frames lacking these columns from raising.
        cleaned = df.drop(columns=redundant_cols, errors='ignore')

        if sample_col in cleaned.columns and titre_col in cleaned.columns:
            sample_count = cleaned[sample_col].notnull().sum()
            titre_count = cleaned[titre_col].notnull().sum()

            if sample_count < titre_count:
                cleaned = cleaned.drop(columns=sample_col)
            else:
                # Sample 1 holds at least as many values: it takes over the titre name.
                cleaned = cleaned.drop(columns=titre_col).rename(columns={sample_col: titre_col})

        if titre_col not in cleaned.columns:
            cleaned[titre_col] = np.nan
        filtered_df.append(cleaned)
    return filtered_df


# Reduce the titre columns of every experiment to a single 'output_titre_µgl' column.
exp_210623 = drop_col(exp_210623)
exp_211013 = drop_col(exp_211013)
exp_211130 = drop_col(exp_211130)
exp_220309demo = drop_col(exp_220309demo)
exp_220315c1 = drop_col(exp_220315c1)
exp_220329c2 = drop_col(exp_220329c2)
exp_220822 = drop_col(exp_220822)




# adding attribute name to each dataframe

def add_attribute_name(df_list, df_name:str):
    """
    Label each DataFrame in a list with a ``name`` attribute.

    The n-th frame (counting from 1) receives ``name = "<df_name>_<n>"``. The frames
    are modified in place and the same list object is returned.

    :param df_list: List of pandas DataFrames to label.
    :param df_name: Prefix used for the ``name`` attribute of every frame.
    :return: ``df_list`` itself, with every frame labelled.
    """
    position = 0
    for frame in df_list:
        position += 1
        setattr(frame, 'name', f"{df_name}_{position}")

    return df_list

# Label every DataFrame as '<experiment>_<n>' (n starting at 1); save_dataframes uses
# this name to build the CSV file name.
exp_210623 = add_attribute_name(exp_210623, 'exp_210623')
exp_211013 = add_attribute_name(exp_211013, 'exp_211013')
exp_211130 = add_attribute_name(exp_211130, 'exp_211130')
exp_220309demo = add_attribute_name(exp_220309demo, 'exp_220309demo')
exp_220315c1 = add_attribute_name(exp_220315c1, 'exp_220315c1')
exp_220329c2 = add_attribute_name(exp_220329c2, 'exp_220329c2')
exp_220822 = add_attribute_name(exp_220822, 'exp_220822')



In [34]:
# save the cleaned dataset.

def save_dataframes(df_list, output_directory):
    """
    Write each named DataFrame in ``df_list`` to ``<output_directory>/<name>.csv``.

    The file name is derived from the ``name`` attribute of each DataFrame. Frames
    without a ``name`` attribute, or with an empty one, are skipped and a message is
    printed. The index is written as the first CSV column. The output directory is
    created first if it does not exist.

    :param df_list: List of pandas DataFrames to be saved.
    :param output_directory: The directory where CSV files will be saved. An empty
        string means the current working directory.
    """
    # Create the target directory up front; to_csv cannot write into a missing one.
    # An empty string is skipped because os.makedirs('') raises.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    for df in df_list:
        name = getattr(df, 'name', None)
        if not name:
            print("DataFrame does not have a 'name' attribute or it's empty. Skipping...")
            continue

        # os.path.join copes with a trailing separator and never turns an empty
        # directory into the filesystem root, unlike manual '/' concatenation.
        full_path = os.path.join(output_directory, f"{name}.csv")
        df.to_csv(full_path, index=True)

# Write every cleaned experiment DataFrame to <current_wd>/data as '<name>.csv'.
output_directory = r"data"  # sub-directory of current_wd that receives the CSV files




save_dataframes(exp_210623, os.path.join(current_wd, output_directory))
save_dataframes(exp_211013, os.path.join(current_wd, output_directory))
save_dataframes(exp_211130, os.path.join(current_wd, output_directory))
save_dataframes(exp_220309demo, os.path.join(current_wd, output_directory))
save_dataframes(exp_220315c1, os.path.join(current_wd, output_directory))
save_dataframes(exp_220329c2, os.path.join(current_wd, output_directory))
save_dataframes(exp_220822, os.path.join(current_wd, output_directory))

