In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Load the consortium input table from a CSV in the current working directory.
consortium_df = pd.read_csv("Consortium_covid_population_imputed_lesscols.csv")

In [None]:
# Load the general-population input table from a CSV in the current working directory.
general_df = pd.read_csv("General_population_imputed_lesscols.csv")

In [None]:
# Stack the two tables row-wise; each frame keeps its own original row labels.
df = pd.concat([consortium_df, general_df], axis=0)

In [None]:
# Use "eid" as the row label, then flip so the former columns become rows.
df_transposed = df.set_index("eid").transpose()

In [None]:
# Optional export, currently disabled: df_transposed.to_csv("Protein_abundance_matrix_variance_partitioning_new.csv")

In [None]:
# Read the processed covariate table and discard its first column by position.
covariates_path = "/rds/general/user/meb22/projects/ukbiobank/live/ukbiobank/data_2025/proteomics/Processed_all_covariates.csv"
covariates_raw = pd.read_csv(covariates_path)
covariates_df = covariates_raw.iloc[:, 1:]

# Every remaining column except the first one is taken as a covariate name.
covariate_cols = covariates_df.columns[1:]



In [None]:
# Number of distinct non-missing values in the "well" column.
covariates_df["well"].nunique()

In [None]:
# Number of distinct non-missing values in the "Batch" column.
covariates_df["Batch"].nunique()

In [None]:
# Number of distinct non-missing values in the "plate" column.
covariates_df["plate"].nunique()

In [None]:
# Display the covariate table as the cell output (visual inspection only).
covariates_df

In [None]:
# Covariates that are handled as categorical labels.
categorical_features = [
    "Season",
    "assessment_center",
    "Ethnicity",
    "Sex",
    "plate",
    "Batch",
    "well",
]

# Full covariate list: the categorical ones first, followed by the remaining ones.
covariate_cols = [
    *categorical_features,
    "fasting_time",
    "sample_age",
    "smoking",
    "alcohol",
    "Age",
    "BMI",
]

In [None]:
# Cast the categorical covariates to strings so they are handled as labels, not numbers.
# Missing entries are kept as missing: on object-backed columns a plain astype("str")
# turns NaN into the literal text "nan", which a later dropna() would not treat as
# missing and which one-hot encoding would count as a category of its own.
_categorical_block = covariates_df[categorical_features]
covariates_df[categorical_features] = _categorical_block.astype("str").where(
    _categorical_block.notna()
)

In [None]:
# Index the covariates by "eid". Guarded so that re-running this cell does not raise
# KeyError once "eid" has already been moved out of the columns, and written without
# inplace=True so the cell rebinds the name explicitly.
if "eid" in covariates_df.columns:
    covariates_df = covariates_df.set_index("eid")

In [None]:
# Columns stored as category/object dtype are the ones that get dummy-encoded.
categorical_cols = covariates_df.select_dtypes(include=["category", "object"]).columns

# Expand them into indicator columns (first level of each dropped), then take the
# Pearson correlation between every pair of resulting columns.
cov_encoded = covariates_df.pipe(
    pd.get_dummies,
    columns=categorical_cols,
    drop_first=True,
)
corr_matrix_all = cov_encoded.corr(method="pearson")


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import CCA

def _centered_orthonormal_basis(block):
    """Return an orthonormal basis for the column space of the mean-centred ``block``.

    ``block`` is a 2-D float array (samples x columns). Directions with numerically
    zero variance are dropped, so a constant column yields a basis with no columns.
    """
    if block.size == 0:
        return np.empty((block.shape[0], 0))
    centered = block - block.mean(axis=0, keepdims=True)
    left, singular_values, _ = np.linalg.svd(centered, full_matrices=False)
    # The tolerance is scaled by the un-centred magnitude so that round-off left over
    # from centring a constant column is not mistaken for real variance.
    tol = max(block.shape) * np.finfo(float).eps * np.linalg.norm(block)
    return left[:, singular_values > tol]


def _first_canonical_corr(basis_a, basis_b):
    """Largest canonical correlation between two subspaces given orthonormal bases.

    The singular values of ``basis_a.T @ basis_b`` are the canonical correlations;
    NaN is returned when either side has no variance (empty basis).
    """
    if basis_a.shape[1] == 0 or basis_b.shape[1] == 0:
        return np.nan
    cosines = np.linalg.svd(basis_a.T @ basis_b, compute_uv=False)
    # Guard against values such as 1.0000000000000002 caused by round-off.
    return float(min(cosines[0], 1.0))


def canCorPairs(covariates_df, categorical_encode="onehot"):
    """
    Compute pairwise canonical correlations between all covariates.

    Every covariate is treated as one block of columns: a numeric covariate is a
    single column and, with ``categorical_encode="onehot"``, a categorical covariate
    is the complete set of its indicator columns. The value reported for a pair is
    the first (largest) canonical correlation between the two blocks; for two
    single-column covariates this is the absolute Pearson correlation.

    Parameters
    ----------
    covariates_df : pd.DataFrame
        Covariates with samples as rows and variables as columns. Columns of dtype
        ``category`` or ``object`` are treated as categorical; every other column
        must be convertible to float. The frame is not modified.
    categorical_encode : str
        How to encode categorical variables: "onehot" (indicator block) or
        "label" (integer category codes used as a single numeric column).

    Returns
    -------
    corr_matrix : pd.DataFrame
        Symmetric matrix indexed (rows and columns) by the column names of
        ``covariates_df``, with values between 0 and 1. The diagonal is 1.0.
        Off-diagonal entries are NaN when one of the two covariates has no variance.

    Raises
    ------
    ValueError
        If an encoded covariate contains missing values, or a non-categorical
        column cannot be converted to float.
    """
    # detect categorical variables (same rule for both encodings)
    categorical_cols = set(
        covariates_df.select_dtypes(include=["category", "object"]).columns
    )
    names = covariates_df.columns.tolist()

    # Build one orthonormal basis per covariate up front, so that each pair below
    # only costs a small matrix product instead of a fresh model fit.
    bases = []
    for name in names:
        series = covariates_df[name]
        if name in categorical_cols and categorical_encode == "onehot":
            block = pd.get_dummies(series, drop_first=False).to_numpy(dtype=float)
        elif name in categorical_cols and categorical_encode == "label":
            codes = series.astype("category").cat.codes
            block = codes.to_numpy(dtype=float).reshape(-1, 1)
        else:
            block = series.to_numpy(dtype=float).reshape(-1, 1)

        if np.isnan(block).any():
            raise ValueError(
                f"Covariate {name!r} contains missing values; "
                "drop or impute them before calling canCorPairs."
            )
        bases.append(_centered_orthonormal_basis(block))

    n_vars = len(names)
    corr = np.ones((n_vars, n_vars))  # a covariate is perfectly related to itself
    for i in range(n_vars):
        for j in range(i + 1, n_vars):
            rho = _first_canonical_corr(bases[i], bases[j])
            corr[i, j] = rho
            corr[j, i] = rho  # symmetric

    return pd.DataFrame(corr, index=names, columns=names)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise covariate correlations for the covariate table, written to disk as CSV.
corr_matrix = canCorPairs(covariates_df)
corr_matrix.to_csv("covariates_pairwise_cca.csv")

# Annotated heatmap of the same matrix, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Pairwise canonical correlations between covariates")
plt.show()


In [None]:
# NOTE(review): this cell previously contained only a bare, undefined name ("nn"),
# which raised NameError and stopped Restart & Run All at this point. If a deliberate
# stop is wanted here, raise an explicit, descriptive exception instead.

In [None]:
# Inner-join the combined table with the covariates on the shared "eid" key.
df_all = df.merge(covariates_df, on="eid")

In [None]:
# Keep complete cases only: any row with at least one missing value is removed.
df_all_imp = df_all.dropna(axis=0, how="any")

In [None]:
# Every column of the combined table except the "eid" identifier. Selecting by name
# instead of position (columns[1:]) keeps the list correct even if "eid" has already
# been moved into the index, where slicing would silently drop a real column.
protein_cols = df.columns.drop("eid", errors="ignore").tolist()

In [None]:
# Move "eid" from a column into the row index of the complete-case table.
# Not idempotent: running this cell a second time raises KeyError because "eid"
# is no longer a column.
df_all_imp = df_all_imp.set_index("eid")

In [None]:
import statsmodels.formula.api as smf
import pandas as pd

# For each covariate, regress every column in protein_cols on that single covariate
# (ordinary least squares) and keep the residuals. One residual table per covariate is
# stored in residuals_by_cov and written to "Residualised_<covariate>.csv".
# (A stray undefined name, "nnn", used to sit at the top of this cell and raised
# NameError before any of the code below could run.)
residuals_by_cov = {}

for cov in covariate_cols:
    # The right-hand side depends only on the covariate, so build it once per
    # covariate: object-dtype columns are wrapped in C() to mark them as categorical.
    term = f"C({cov})" if df_all_imp[cov].dtype == "object" else cov

    residual_data = {}
    for protein in protein_cols:
        # Q("...") quotes the response name so that column names which are not valid
        # Python identifiers can still be used in the formula.
        formula = f'Q("{protein}") ~ {term}'
        model = smf.ols(formula, data=df_all_imp).fit()
        residual_data[protein] = model.resid

    # Save as DataFrame for this covariate, aligned to the sample index
    residuals_by_cov[cov] = pd.DataFrame(residual_data, index=df_all_imp.index)

    # Persist each covariate's residual table to its own CSV file
    residuals_by_cov[cov].to_csv(f"Residualised_{cov}.csv")


In [None]:
# Residual table for assessment_center, read with "eid" as the row index.
df_assessment = pd.read_csv("Residualised_assessment_center.csv", index_col="eid")

In [None]:
# Residual table for alcohol, read with "eid" as the row index.
df_alcohol = pd.read_csv("Residualised_alcohol.csv", index_col="eid")

In [None]:
# Residual table for sample_age, read with "eid" as the row index.
df_sample_age = pd.read_csv("Residualised_sample_age.csv", index_col="eid")

In [None]:
# Residual table for Ethnicity, read with "eid" as the row index.
df_Ethnicity = pd.read_csv("Residualised_Ethnicity.csv", index_col="eid")

In [None]:
# Residual table for Season, read with "eid" as the row index.
df_Season = pd.read_csv("Residualised_Season.csv", index_col="eid")

In [None]:
# Residual table for Age, read with "eid" as the row index.
df_age = pd.read_csv("Residualised_Age.csv", index_col="eid")

In [None]:
# Residual table for BMI, read with "eid" as the row index.
df_BMI = pd.read_csv("Residualised_BMI.csv", index_col="eid")

In [None]:
# Residual table for smoking, read with "eid" as the row index.
df_smoking = pd.read_csv("Residualised_smoking.csv", index_col="eid")

In [None]:
# Move "eid" from a column into the row index of the combined table, rebinding `df`.
# Not idempotent: a second run raises KeyError because "eid" is no longer a column.
df = df.set_index("eid")

In [None]:
# df_BMI is already indexed by "eid" when it is loaded from Residualised_BMI.csv, so an
# unconditional set_index("eid") raises KeyError. Only re-index if "eid" is still a
# column, which makes this cell safe to run in sequence and to re-run.
if "eid" in df_BMI.columns:
    df_BMI = df_BMI.set_index("eid")

In [None]:
# Display the BMI residual table as the cell output (visual inspection only).
df_BMI

In [None]:
# Map each covariate name to its residual table; insertion order is preserved.
cov_dfs = dict(
    assessment_center=df_assessment,
    alcohol=df_alcohol,
    sample_age=df_sample_age,
    Ethnicity=df_Ethnicity,
    Season=df_Season,
    Age=df_age,
    BMI=df_BMI,
    smoking=df_smoking,
)

In [None]:
# For every covariate, correlate each column of `df` with the same-named column of
# that covariate's residual table (rows aligned on the index), and collect one
# long-format table per covariate: protein, pearson_r, covariate.
results = []

for cov_name, cov_df in cov_dfs.items():
    tmp = (
        df.corrwith(cov_df, axis=0)
        .rename_axis("protein")
        .reset_index(name="pearson_r")
        .assign(covariate=cov_name)
    )
    results.append(tmp)
    


In [None]:
# Stack the per-covariate tables into one frame with a fresh 0..n-1 index.
corr_df = pd.concat(results, ignore_index=True)

# Order rows from the smallest to the largest correlation
# (pass ascending=False for the reverse order).
corr_df_sorted = corr_df.sort_values("pearson_r")

print(corr_df_sorted.head())


In [None]:
# Show only the rows that belong to the assessment_center covariate.
corr_df_sorted.loc[corr_df_sorted["covariate"].eq("assessment_center")]