In [6]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# KNN imputation and outlier detection

In [2]:
# Dataset sub-folder under O2PLS_DA/; the file paths in the following cells are built from it
folder = "Li_Oi-DAP_DAM"

In [64]:
################################################################ KNN cross-validation

# Read the raw proteome table for the selected dataset folder
file_path = f'O2PLS_DA/{folder}/data/proteome/proteome_raw.xlsx'
data = pd.read_excel(file_path)

# Column 0 is taken as the sample names, column 1 as the class labels,
# and every remaining column as a feature
sample_names, class_labels = data.iloc[:, 0], data.iloc[:, 1]
features = data.iloc[:, 2:]

# Candidate neighbour counts to compare: k = 1, 2, 3
k_values = range(1, 4)

# Create a function to evaluate the KNN imputer with cross-validation
def evaluate_knn_imputer(k, features, n_splits=5, missing_rate=0.1, random_state=42):
    """Score a KNNImputer with ``k`` neighbours by hiding known values and imputing them.

    The rows of ``features`` are split with a shuffled KFold. For each fold the
    imputer is fitted on the training rows, a random subset of the *observed*
    cells of the held-out rows is replaced by NaN, and the mean squared error
    between the imputed and the true values of those cells is recorded.

    Parameters
    ----------
    k : int
        Number of neighbours passed to ``KNNImputer(n_neighbors=k)``.
    features : pandas.DataFrame
        Feature table (rows are split by KFold); may contain NaN.
    n_splits : int, default 5
        Number of KFold splits.
    missing_rate : float, default 0.1
        Probability with which each held-out cell is hidden.
    random_state : int or None, default 42
        Seed for both the KFold shuffle and the hold-out mask. Because the
        generator is re-created on every call, different ``k`` values are
        scored on the same hidden cells, so their MSEs are directly comparable.

    Returns
    -------
    float
        Mean of the per-fold MSEs (folds in which no cell was hidden are skipped).

    Raises
    ------
    ValueError
        If no fold produced any hidden cell to score.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    rng = np.random.default_rng(random_state)
    mse_values = []

    for train_index, test_index in kf.split(features):
        train_data, test_data = features.iloc[train_index], features.iloc[test_index]

        # KNNImputer discards columns that have no observed value in the data it
        # is fitted on; its output would then have fewer columns than the mask.
        # Restrict both parts of the fold to columns the imputer can handle.
        usable = train_data.notna().any(axis=0).to_numpy()
        train_data = train_data.loc[:, usable]
        test_data = test_data.loc[:, usable]

        # Hide only cells whose true value is known, so every hidden cell can be scored
        observed = test_data.notna().to_numpy()
        mask = (rng.random(test_data.shape) < missing_rate) & observed
        if not mask.any():
            # Nothing to score in this fold (e.g. very small fold or missing_rate == 0)
            continue

        test_data_missing = test_data.copy()
        test_data_missing[mask] = np.nan

        # Fit on the training rows only, then impute the artificially hidden cells
        imputer = KNNImputer(n_neighbors=k)
        imputer.fit(train_data)
        imputed_test_data = imputer.transform(test_data_missing)

        true_values = test_data.to_numpy()[mask]
        imputed_values = imputed_test_data[mask]
        mse_values.append(mean_squared_error(true_values, imputed_values))

    if not mse_values:
        raise ValueError(
            "No held-out values could be scored; check missing_rate and the "
            "amount of observed data in `features`."
        )

    return np.mean(mse_values)

# Score every candidate k, reporting each result as soon as it is available
mse_scores = []
for k in k_values:
    mse = evaluate_knn_imputer(k, features)
    mse_scores += [mse]
    print(f'k={k}, MSE={mse}')

# The candidate with the lowest mean MSE wins
lowest_mse_position = np.argmin(mse_scores)
best_k = k_values[lowest_mse_position]
print(f'The best k value is {best_k} with MSE={min(mse_scores)}')


################################################################ KNN imputation
# Validated script
# (pandas and KNNImputer are already imported in the imports cell at the top.)

# Load the proteome dataset
file_path = f'O2PLS_DA/{folder}/data/proteome/proteome_raw.xlsx'
data = pd.read_excel(file_path)

# Separate the sample names, class labels, and features
sample_names = data.iloc[:, 0]
class_labels = data.iloc[:, 1]
features = data.iloc[:, 2:]

# KNNImputer discards columns without a single observed value, which would make
# its output narrower than `features` and break the column labelling below.
# Impute only the columns that contain at least one value and report the rest.
has_observed_value = features.notna().any(axis=0)
imputable_features = features.loc[:, has_observed_value]
if not has_observed_value.all():
    dropped_columns = list(features.columns[~has_observed_value.to_numpy()])
    print(f"Columns without any observed value were left out of the imputation: {dropped_columns}")

# Initialize the KNN imputer with the k selected by the cross-validation above
imputer = KNNImputer(n_neighbors=best_k)

# Perform KNN imputation on the entire dataset
features_imputed = imputer.fit_transform(imputable_features)

# Convert the imputed features back to a DataFrame, keeping the original row
# index so the concat below aligns row-for-row with the names and labels
features_imputed_df = pd.DataFrame(
    features_imputed,
    columns=imputable_features.columns,
    index=features.index,
)

# Combine the imputed features with sample names and class labels
imputed_data = pd.concat([sample_names, class_labels, features_imputed_df], axis=1)

# Save the imputed dataset to an Excel file.
# The same table is written under two names: proteome_KNN.xlsx is read by the
# outlier step, proteome_KNN_CV.xlsx by the scaling cells.
output_file_path = f'O2PLS_DA/{folder}/data/proteome/proteome_KNN.xlsx'
output_file_path_2 = f'O2PLS_DA/{folder}/data/proteome/proteome_KNN_CV.xlsx'
imputed_data.to_excel(output_file_path, index=False)
imputed_data.to_excel(output_file_path_2, index=False)

print(f"KNN inputation finished with K = {best_k}")



################################################################ Outlier identification

# Load the KNN-imputed proteome dataset
file_path = f'O2PLS_DA/{folder}/data/proteome/proteome_KNN.xlsx'
data = pd.read_excel(file_path)

# Separate the sample names, class labels, and features
sample_names = data.iloc[:, 0]
class_labels = data.iloc[:, 1]
features = data.iloc[:, 2:]

# Maximum coefficient of variation (in percent) a protein may have to be kept
cv_threshold = 10

# Coefficient of variation (CV, %) per protein. The magnitude of the mean is
# used so that a protein with a negative mean gets a positive CV instead of a
# negative one that would always pass the threshold.
cv_values = features.std(axis=0) / features.mean(axis=0).abs() * 100

# Keep proteins whose CV is within the threshold
within_threshold = cv_values <= cv_threshold
filtered_features = features.loc[:, within_threshold]

# Everything that is not kept is reported, including proteins whose CV is
# undefined (NaN), so no protein disappears without appearing in the report
outliers = cv_values[~within_threshold]

# Combine the filtered features with the sample names and class labels
# NOTE(review): filtered_data is not written to disk in this cell, and the
# scaling cells read proteome_KNN_CV.xlsx, which the imputation step writes
# unfiltered. Confirm whether the filtered table was meant to be saved.
filtered_data = pd.concat([sample_names, class_labels, filtered_features], axis=1)

# Save the outliers to a new Excel file
outliers_df = pd.DataFrame({'Protein': outliers.index, 'CV%': outliers.values})
outliers_df.to_excel(f'O2PLS_DA/{folder}/data/proteome/outliers_based_on_cv.xlsx', index=False)
print("Outliers saved to 'outliers_based_on_cv.xlsx'")


k=1, MSE=0.47941998758294685
k=2, MSE=0.44891566604744326
k=3, MSE=0.5123694903487279
The best k value is 2 with MSE=0.44891566604744326
KNN inputation finished with K = 2
Outliers saved to 'outliers_based_on_cv.xlsx'


# Scaling and centering
### Pareto scaling, unit variance scaling, mean centering

In [8]:
################################################# Pareto scaling

# Load the KNN-imputed table written by the imputation step
file_path = f'O2PLS_DA/{folder}/data/proteome/proteome_KNN_CV.xlsx'
data = pd.read_excel(file_path)

# Separate sample names and classes from the features
sample_names = data.iloc[:, 0]
sample_classes = data.iloc[:, 1]
features = data.iloc[:, 2:]

# Ensure all data in features is numeric (non-numeric cells become NaN)
features = features.apply(pd.to_numeric, errors='coerce')

# Pareto scaling: centre each feature and divide by the square root of its
# sample standard deviation
mean = features.mean(axis=0)
std = features.std(axis=0, ddof=1)  # Use sample standard deviation

# A constant feature has std == 0, which would turn the whole column into NaN
# (0 / 0). Use a divisor of 1 for those columns so they come out as 0, matching
# how StandardScaler treats zero-variance features in the unit-variance block.
pareto_divisor = np.sqrt(std.replace(0, 1))
pareto_scaled_data = (features - mean) / pareto_divisor

# Combine the sample names, classes, and scaled features into a new DataFrame
pareto_scaled_df = pd.concat([sample_names, sample_classes, pareto_scaled_data], axis=1)

# Save the scaled data to a new Excel file
pareto_scaled_df.to_excel(f'O2PLS_DA/{folder}/data/proteome/proteome_KNN_CV_PS.xlsx', index=False, header=True)

print("Pareto scaling on KNN_CV data done")


################################################# Unit variance scaling

# Read the KNN-imputed proteome table
file_path = f'O2PLS_DA/{folder}/data/proteome/proteome_KNN_CV.xlsx'
data = pd.read_excel(file_path)

# The first two columns identify each sample; the remaining columns are the
# measured features (the variable keeps its historical "metabolite" name)
sample_info, metabolite_data = data.iloc[:, :2], data.iloc[:, 2:]

# Standardise every feature column with scikit-learn's StandardScaler:
# learn the per-column statistics first, then apply them to the same table
scaler = StandardScaler()
scaled_data = scaler.fit(metabolite_data).transform(metabolite_data)

# Re-attach the feature names to the scaled array
scaled_df = pd.DataFrame(scaled_data, columns=metabolite_data.columns)

# Put the sample identifiers back in front of the scaled features
final_df = pd.concat([sample_info, scaled_df], axis=1)

# Write the unit-variance-scaled table next to the input file
scaled_file_path = f'O2PLS_DA/{folder}/data/proteome/proteome_KNN_CV_UV.xlsx'
final_df.to_excel(scaled_file_path, index=False)

print("Unit variance scaling on KNN_CV data done")


################################################# Mean centering

# Load the KNN-imputed table and index it by sample name
# (set_index returns a new frame; nothing is mutated in place)
file_path = f'O2PLS_DA/{folder}/data/proteome/proteome_KNN_CV.xlsx'
df = pd.read_excel(file_path).set_index('Sample name')

# Extract the sample class column and the data columns separately
sample_class = df['Class']
data = df.drop(columns=['Class'])

# Column-wise mean of every feature
means = data.mean()

# Mean center the data by subtracting each feature's mean from its column
mean_centered_data = data - means

# Add the sample class column back as the first column
mean_centered_data.insert(0, 'Class', sample_class)

# Save the mean centered data to a new Excel file (sample names are the index,
# so the index is written as the first column)
output_path = f'O2PLS_DA/{folder}/data/proteome/proteome_KNN_CV_centered.xlsx'
# NOTE(review): file_path is rebound to the output path here, as the cell has
# always done; drop this line if nothing downstream relies on it.
file_path = output_path
mean_centered_data.to_excel(output_path, index=True)

print("Mean centering on KNN_CV data done")
print("script done")

Pareto scaling on KNN_CV data done
Unit variance scaling on KNN_CV data done
Mean centering on KNN_CV data done
script done
