In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
import seaborn as sns
import numpy as np
from IPython.display import display

In [2]:
# Load the raw metabolite / infection table from CSV.
# NOTE(review): this is an absolute path into one user's home directory, so the
# cell only runs on that machine — consider a configurable data directory.
df = pd.read_csv('/Users/daviddornig/Documents/Master_Thesis/Bioinf/Code/philipp-trinh/KOOPOMICS/input_data/pea_fungal/9Metabolite - Infection dataset.csv')

In [3]:
# Show the frame exactly as loaded, before any type conversion is applied.
df

Unnamed: 0,Sample_ID,Plant_ID,Treated pots,dayTreatment,Dpi,Replicate,sol.sugar nmol/mg,aa µmol/mg,flav µg/mg,phe µg/mg,Chla μg/mg,Chlb μg/mg,Caro μg/mg,pisatin µg/mg,Salicylic acid
0,1,1,PC,0PC,0,1,5616,5673,766,1649,361,133,142,216,3353
1,2,1,PC,2PC,2,1,5187,7345,1779,1849,396,154,139,317,3093
2,3,1,PC,4PC,4,1,7492,5763,2386,1565,317,140,122,224,3825
3,4,1,PC,6PC,6,1,7505,5559,2066,1809,371,142,124,196,3769
4,5,1,PC,10PC,10,1,7083,5310,2015,1581,315,140,123,188,4529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,116,20,MI,2MI,2,5,5858,8913,2305,2107,303,068,077,636,8355
116,117,20,MI,4MI,4,5,8084,9753,3435,2391,260,055,099,686,9673
117,118,20,MI,6MI,6,5,9337,8939,3480,2422,257,102,099,591,9741
118,119,20,MI,10MI,10,5,8555,7314,2960,2914,183,113,119,482,9445


# Convert values and feature strings to appropriate input elements

In [4]:
# Every column from position 6 onward is treated as a measurement column
features_to_convert = df.columns[6:]

# Swap decimal commas for decimal points, parse the result as numbers
# (anything unparseable becomes NaN via errors='coerce'), then keep 4 decimals
decimal_point_text = df[features_to_convert].replace({',': '.'}, regex=True)
parsed_values = decimal_point_text.apply(pd.to_numeric, errors='coerce')
df[features_to_convert] = parsed_values.round(4)

df

Unnamed: 0,Sample_ID,Plant_ID,Treated pots,dayTreatment,Dpi,Replicate,sol.sugar nmol/mg,aa µmol/mg,flav µg/mg,phe µg/mg,Chla μg/mg,Chlb μg/mg,Caro μg/mg,pisatin µg/mg,Salicylic acid
0,1,1,PC,0PC,0,1,56.16,56.73,7.66,16.49,3.61,1.33,1.42,2.16,33.53
1,2,1,PC,2PC,2,1,51.87,73.45,17.79,18.49,3.96,1.54,1.39,3.17,30.93
2,3,1,PC,4PC,4,1,74.92,57.63,23.86,15.65,3.17,1.40,1.22,2.24,38.25
3,4,1,PC,6PC,6,1,75.05,55.59,20.66,18.09,3.71,1.42,1.24,1.96,37.69
4,5,1,PC,10PC,10,1,70.83,53.10,20.15,15.81,3.15,1.40,1.23,1.88,45.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,116,20,MI,2MI,2,5,58.58,89.13,23.05,21.07,3.03,0.68,0.77,6.36,83.55
116,117,20,MI,4MI,4,5,80.84,97.53,34.35,23.91,2.60,0.55,0.99,6.86,96.73
117,118,20,MI,6MI,6,5,93.37,89.39,34.80,24.22,2.57,1.02,0.99,5.91,97.41
118,119,20,MI,10MI,10,5,85.55,73.14,29.60,29.14,1.83,1.13,1.19,4.82,94.45


# Data imputation

In [5]:
def add_interpolated_dpi_rows(df, dpi_values, timeidentifier='Dpi'):
    """Insert linearly interpolated rows at the requested time points.

    The frame is ordered by ``timeidentifier`` and every pair of consecutive
    rows is inspected. For each value in ``dpi_values`` that lies strictly
    between the two time points of a pair, a new row is created:

    * every numeric column (including numeric identifier columns, which can
      therefore end up with fractional values) is linearly interpolated
      between the two neighbouring rows;
    * every non-numeric column is copied from the earlier row;
    * ``timeidentifier`` is set to the requested value.

    Requested values that are already present, or that fall outside the
    observed time range, produce no row (there is no extrapolation).

    Parameters
    ----------
    df : pandas.DataFrame
        Observations of a single series. The input is not modified.
    dpi_values : iterable of numbers
        Time points to add.
    timeidentifier : str, default 'Dpi'
        Name of the column holding the time points.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``timeidentifier`` containing the original rows
        plus any interpolated rows.
    """
    # Sort into a fresh frame: the caller's object (typically a groupby group)
    # must not be reordered or re-indexed as a side effect.
    ordered = df.sort_values(by=timeidentifier).reset_index(drop=True)

    # Column dtypes do not change inside the loop, so classify them once.
    numeric_columns = [
        col for col in ordered.columns
        if pd.api.types.is_numeric_dtype(ordered[col])
    ]

    new_rows = []

    # Stop one short of the end: the last row has no successor to interpolate
    # towards. This also makes empty and single-row frames a no-op.
    for position in range(len(ordered) - 1):
        row = ordered.iloc[position]
        next_row = ordered.iloc[position + 1]
        start = row[timeidentifier]
        end = next_row[timeidentifier]

        for dpi in dpi_values:
            # Strict bounds: never duplicate an existing time point and never
            # divide by zero when two rows share the same time point.
            if not (start < dpi < end):
                continue

            fraction = (dpi - start) / (end - start)
            interpolated_row = row.copy()
            for col in numeric_columns:
                interpolated_row[col] = row[col] + (next_row[col] - row[col]) * fraction
            interpolated_row[timeidentifier] = dpi  # exact requested value, not the float estimate
            new_rows.append(interpolated_row)

    # Append the new rows and restore the time ordering
    if new_rows:
        ordered = pd.concat([ordered, pd.DataFrame(new_rows)], ignore_index=True)
        ordered = ordered.sort_values(by=timeidentifier)

    return ordered

In [6]:
new_dpi_values = [8, 12]

# Each (Plant_ID, Treated pots, Replicate) combination is interpolated on its own
grouping_keys = ['Plant_ID', 'Treated pots', 'Replicate']
interpolated_groups = [
    add_interpolated_dpi_rows(group, new_dpi_values)
    for _, group in df.groupby(grouping_keys)
]

# Stitch the groups back together, ordered by group and then by time point,
# with a clean 0..n-1 index
interpolated_df = (
    pd.concat(interpolated_groups)
    .sort_values(by=grouping_keys + ['Dpi'])
    .reset_index(drop=True)
)

# Display the modified DataFrame
interpolated_df

Unnamed: 0,Sample_ID,Plant_ID,Treated pots,dayTreatment,Dpi,Replicate,sol.sugar nmol/mg,aa µmol/mg,flav µg/mg,phe µg/mg,Chla μg/mg,Chlb μg/mg,Caro μg/mg,pisatin µg/mg,Salicylic acid
0,1.0,1.0,PC,0PC,0,1.0,56.16,56.730,7.660,16.490,3.61,1.330,1.420,2.160,33.53
1,2.0,1.0,PC,2PC,2,1.0,51.87,73.450,17.790,18.490,3.96,1.540,1.390,3.170,30.93
2,3.0,1.0,PC,4PC,4,1.0,74.92,57.630,23.860,15.650,3.17,1.400,1.220,2.240,38.25
3,4.0,1.0,PC,6PC,6,1.0,75.05,55.590,20.660,18.090,3.71,1.420,1.240,1.960,37.69
4,4.5,1.0,PC,6PC,8,1.0,72.94,54.345,20.405,16.950,3.43,1.410,1.235,1.920,41.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,118.0,20.0,MI,6MI,6,5.0,93.37,89.390,34.800,24.220,2.57,1.020,0.990,5.910,97.41
156,118.5,20.0,MI,6MI,8,5.0,89.46,81.265,32.200,26.680,2.20,1.075,1.090,5.365,95.93
157,119.0,20.0,MI,10MI,10,5.0,85.55,73.140,29.600,29.140,1.83,1.130,1.190,4.820,94.45
158,119.5,20.0,MI,10MI,12,5.0,79.00,72.705,29.265,28.025,1.97,0.800,1.145,4.750,91.50


In [7]:
# Save the interpolated (not yet normalized) table. The relative file name means
# it is written to the current working directory; the row index is not written.
interpolated_df.to_csv('9Metabolite_pea_interpolated_preprocessed.csv', index=False)

# Normalization

In [8]:
# Re-display the interpolated frame that the normalization steps start from.
interpolated_df

Unnamed: 0,Sample_ID,Plant_ID,Treated pots,dayTreatment,Dpi,Replicate,sol.sugar nmol/mg,aa µmol/mg,flav µg/mg,phe µg/mg,Chla μg/mg,Chlb μg/mg,Caro μg/mg,pisatin µg/mg,Salicylic acid
0,1.0,1.0,PC,0PC,0,1.0,56.16,56.730,7.660,16.490,3.61,1.330,1.420,2.160,33.53
1,2.0,1.0,PC,2PC,2,1.0,51.87,73.450,17.790,18.490,3.96,1.540,1.390,3.170,30.93
2,3.0,1.0,PC,4PC,4,1.0,74.92,57.630,23.860,15.650,3.17,1.400,1.220,2.240,38.25
3,4.0,1.0,PC,6PC,6,1.0,75.05,55.590,20.660,18.090,3.71,1.420,1.240,1.960,37.69
4,4.5,1.0,PC,6PC,8,1.0,72.94,54.345,20.405,16.950,3.43,1.410,1.235,1.920,41.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,118.0,20.0,MI,6MI,6,5.0,93.37,89.390,34.800,24.220,2.57,1.020,0.990,5.910,97.41
156,118.5,20.0,MI,6MI,8,5.0,89.46,81.265,32.200,26.680,2.20,1.075,1.090,5.365,95.93
157,119.0,20.0,MI,10MI,10,5.0,85.55,73.140,29.600,29.140,1.83,1.130,1.190,4.820,94.45
158,119.5,20.0,MI,10MI,12,5.0,79.00,72.705,29.265,28.025,1.97,0.800,1.145,4.750,91.50


In [9]:
# Every column from position 6 onward is a feature to normalize
features_to_normalize = interpolated_df.columns[6:]

# Normalize a copy so interpolated_df keeps the untransformed values
df_normalized = interpolated_df.copy()

In [10]:
# Log1p Normalization
#-------------------------------
# Read the untransformed values from interpolated_df rather than from
# df_normalized: the cell then gives the same result however often it is run,
# instead of applying log1p (and the centering below) a second time.
log_features = np.log1p(interpolated_df[features_to_normalize])

#Normalize by average row mean
#-------------------------------
# row_means has one mean per row; average_row_mean is the mean of those, i.e. a
# single scalar. The same scalar is subtracted from every value, so this is a
# global shift and not a per-row centering.
row_means = log_features.mean(axis=1)
average_row_mean = row_means.mean()
print(f'Average row mean: {average_row_mean}')
df_normalized[features_to_normalize] = log_features.sub(average_row_mean)

# Show the log-transformed and centered DataFrame
df_normalized


Average row mean: 2.589407198819862


Unnamed: 0,Sample_ID,Plant_ID,Treated pots,dayTreatment,Dpi,Replicate,sol.sugar nmol/mg,aa µmol/mg,flav µg/mg,phe µg/mg,Chla μg/mg,Chlb μg/mg,Caro μg/mg,pisatin µg/mg,Salicylic acid
0,1.0,1.0,PC,0PC,0,1.0,1.456447,1.466370,-0.430692,0.272222,-1.061179,-1.743539,-1.705640,-1.438835,0.952421
1,2.0,1.0,PC,2PC,2,1.0,1.378429,1.720721,0.343918,0.380494,-0.988001,-1.657243,-1.718114,-1.161491,0.874139
2,3.0,1.0,PC,4PC,4,1.0,1.740273,1.481839,0.623853,0.223003,-1.161491,-1.713938,-1.791900,-1.413834,1.080544
3,4.0,1.0,PC,6PC,6,1.0,1.741984,1.446425,0.486060,0.359757,-1.039719,-1.705640,-1.782931,-1.504218,1.066174
4,4.5,1.0,PC,6PC,8,1.0,1.713847,1.424179,0.474217,0.298183,-1.101008,-1.709780,-1.785166,-1.517824,1.159862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,118.0,20.0,MI,6MI,6,5.0,1.957816,1.914726,0.988541,0.638230,-1.316842,-1.886310,-1.901273,-0.656438,1.999735
156,118.5,20.0,MI,6MI,8,5.0,1.915501,1.820539,0.913143,0.731303,-1.426256,-1.859446,-1.852243,-0.738593,1.984582
157,119.0,20.0,MI,10MI,10,5.0,1.871315,1.716548,0.831593,0.816446,-1.549130,-1.833285,-1.805506,-0.828107,1.969195
158,119.5,20.0,MI,10MI,12,5.0,1.792619,1.710663,0.820585,0.778750,-1.500845,-2.001621,-1.826268,-0.840207,1.937801


In [11]:
# Save the log1p-transformed, mean-shifted table. The relative file name means
# it is written to the current working directory; the row index is not written.
df_normalized.to_csv('9Metabolite_pea_interpolated_normalized.csv', index=False)