# Total preprocessing

Log2 and log10 transformations are applied first.
Then Pareto scaling, unit-variance scaling (standardization), or mean-centering is applied to both the log2 and the log10 datasets in order to find the best settings.

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

folder = "Li_Oi-DAP_DAM"

################################################# Log transform (2 and 10) the metabolome dataset

# Read the raw metabolome table. The first two columns carry the sample
# names and classes; every remaining column is treated as metabolite data.
raw_table = pd.read_excel(f'O2PLS_DA/{folder}/data/metabolome/metabolome_raw.xlsx')
annotations = raw_table.iloc[:, :2]
intensities = raw_table.iloc[:, 2:]

# Each entry pairs a log function with the workbook its result is written to.
log_targets = (
    (np.log2, f'O2PLS_DA/{folder}/data/metabolome/metabolome_log2.xlsx'),
    (np.log10, f'O2PLS_DA/{folder}/data/metabolome/metabolome_log10.xlsx'),
)

# Build both transformed tables (annotations + logged values) before any
# file is written, so a failing transform leaves no partial output behind.
# NOTE(review): zero or negative intensities turn into -inf / NaN under a
# log transform -- confirm the raw data is strictly positive.
log_tables = [
    (pd.concat([annotations, log_fn(intensities)], axis=1), target_path)
    for log_fn, target_path in log_targets
]

# Write one workbook per log base, without the row index.
for log_table, target_path in log_tables:
    log_table.to_excel(target_path, index=False)

print("log transformation done")




################################################# Pareto scaling on log2 transformed data

# Read back the log2 workbook: column 0 holds the sample names, column 1 the
# sample classes, and the remaining columns the features.
log2_table = pd.read_excel(f'O2PLS_DA/{folder}/data/metabolome/metabolome_log2.xlsx')
names_column = log2_table.iloc[:, 0]
classes_column = log2_table.iloc[:, 1]

# Coerce every feature column to numeric; unparsable cells become NaN.
numeric_features = log2_table.iloc[:, 2:].apply(pd.to_numeric, errors='coerce')

# Pareto scaling: centre each feature on its mean, then divide by the square
# root of its sample standard deviation (ddof=1).
# NOTE(review): a constant feature has std 0 and ends up as a NaN column.
centered_features = numeric_features - numeric_features.mean(axis=0)
pareto_divisor = np.sqrt(numeric_features.std(axis=0, ddof=1))

# Put names, classes and scaled features back together and save the result.
pareto_table = pd.concat(
    [names_column, classes_column, centered_features / pareto_divisor],
    axis=1,
)
pareto_table.to_excel(
    f'O2PLS_DA/{folder}/data/metabolome/metabolome_log2_PS.xlsx',
    index=False,
    header=True,
)

print("Pareto scaling on log2 transformed data done")



################################################# Pareto scaling on log10 transformed data

# Read back the log10 workbook: column 0 holds the sample names, column 1 the
# sample classes, and the remaining columns the features.
log10_table = pd.read_excel(f'O2PLS_DA/{folder}/data/metabolome/metabolome_log10.xlsx')
names_column = log10_table.iloc[:, 0]
classes_column = log10_table.iloc[:, 1]

# Coerce every feature column to numeric; unparsable cells become NaN.
numeric_features = log10_table.iloc[:, 2:].apply(pd.to_numeric, errors='coerce')

# Pareto scaling: centre each feature on its mean, then divide by the square
# root of its sample standard deviation (ddof=1).
# NOTE(review): a constant feature has std 0 and ends up as a NaN column.
centered_features = numeric_features - numeric_features.mean(axis=0)
pareto_divisor = np.sqrt(numeric_features.std(axis=0, ddof=1))

# Put names, classes and scaled features back together and save the result.
pareto_table = pd.concat(
    [names_column, classes_column, centered_features / pareto_divisor],
    axis=1,
)
pareto_table.to_excel(
    f'O2PLS_DA/{folder}/data/metabolome/metabolome_log10_PS.xlsx',
    index=False,
    header=True,
)

print("Pareto scaling on log10 transformed data done")



################################################# Unit variance scaling on log2 transformed data

# Read back the log2 workbook and split it into the two leading annotation
# columns (sample names and classes) and the metabolite columns.
log2_table = pd.read_excel(f'O2PLS_DA/{folder}/data/metabolome/metabolome_log2.xlsx')
annotations = log2_table.iloc[:, :2]
metabolites = log2_table.iloc[:, 2:]

# Standardize each metabolite column to zero mean and unit variance.
# NOTE(review): StandardScaler divides by the population standard deviation
# (ddof=0), while the Pareto sections of this script use ddof=1 -- confirm
# that the difference is intended.
uv_values = StandardScaler().fit_transform(metabolites)

# fit_transform returns a bare array, so restore the metabolite column labels
# before re-attaching the annotation columns.
uv_table = pd.concat(
    [annotations, pd.DataFrame(uv_values, columns=metabolites.columns)],
    axis=1,
)

# Save the scaled table without the row index.
uv_table.to_excel(f'O2PLS_DA/{folder}/data/metabolome/metabolome_log2_UV.xlsx', index=False)

print("Unit variance scaling on log2 trandformed data done")


################################################# Unit variance scaling on log10 transformed data

# Read back the log10 workbook and split it into the two leading annotation
# columns (sample names and classes) and the metabolite columns.
log10_table = pd.read_excel(f'O2PLS_DA/{folder}/data/metabolome/metabolome_log10.xlsx')
annotations = log10_table.iloc[:, :2]
metabolites = log10_table.iloc[:, 2:]

# Standardize each metabolite column to zero mean and unit variance.
# NOTE(review): StandardScaler divides by the population standard deviation
# (ddof=0), while the Pareto sections of this script use ddof=1 -- confirm
# that the difference is intended.
uv_values = StandardScaler().fit_transform(metabolites)

# fit_transform returns a bare array, so restore the metabolite column labels
# before re-attaching the annotation columns.
uv_table = pd.concat(
    [annotations, pd.DataFrame(uv_values, columns=metabolites.columns)],
    axis=1,
)

# Save the scaled table without the row index.
uv_table.to_excel(f'O2PLS_DA/{folder}/data/metabolome/metabolome_log10_UV.xlsx', index=False)

print("Unit variance scaling on log10 trandformed data done")



################################################# Mean centering on log2 transformed data

# Load the log2 transformed dataset
file_path = f'O2PLS_DA/{folder}/data/metabolome/metabolome_log2.xlsx'
df = pd.read_excel(file_path)

# Index the rows by sample name so the names are carried through the
# arithmetic below and written out as the first column of the result.
df = df.set_index('Sample name')

# Keep the class labels aside; only the metabolite columns are centered.
sample_class = df['Class']
data = df.drop(columns=['Class'])

# Mean center: subtract each metabolite's column-wise mean from its values.
mean_centered_data = data - data.mean()

# Put the class labels back as the first data column.
mean_centered_data.insert(0, 'Class', sample_class)

# Save the mean centered data to a new Excel file. The index is written too,
# because it holds the sample names.
output_path = f'O2PLS_DA/{folder}/data/metabolome/metabolome_log2_centered.xlsx'
mean_centered_data.to_excel(output_path, index=True)

print("Mean centering on log2 trandformed data done")




################################################# Mean centering on log10 transformed data

# Load the log10 transformed dataset
file_path = f'O2PLS_DA/{folder}/data/metabolome/metabolome_log10.xlsx'
df = pd.read_excel(file_path)

# Index the rows by sample name so the names are carried through the
# arithmetic below and written out as the first column of the result.
df = df.set_index('Sample name')

# Keep the class labels aside; only the metabolite columns are centered.
sample_class = df['Class']
data = df.drop(columns=['Class'])

# Mean center: subtract each metabolite's column-wise mean from its values.
mean_centered_data = data - data.mean()

# Put the class labels back as the first data column.
mean_centered_data.insert(0, 'Class', sample_class)

# Save the mean centered data to a new Excel file. The index is written too,
# because it holds the sample names.
output_path = f'O2PLS_DA/{folder}/data/metabolome/metabolome_log10_centered.xlsx'
mean_centered_data.to_excel(output_path, index=True)

print("Mean centering on log10 trandformed data done")
print("end of script")

log transformation done
Pareto scaling on log2 transformed data done
Pareto scaling on log10 transformed data done
Unit variance scaling on log2 trandformed data done
Unit variance scaling on log10 trandformed data done
Mean centering on log2 trandformed data done
Mean centering on log10 trandformed data done
end of script
