In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from src.imputation import imputation_normal_distribution, log2, NP_LOG_FCT, IMPUTATION_MEAN_SHIFT, IMPUTATION_STD_SHRINKAGE
from src.correlation import pairwise_correlation
import pingouin as pg
from matplotlib_venn import venn3, venn3_circles
from venn import venn
import statsmodels.stats.multitest as multi
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.stats import zscore
import pickle

### Proteomics data processing

#### Define directories

In [None]:
import os
from pathlib import Path

# Get the number of available CPUs
CPUS = os.cpu_count()

# Define the paths for the raw and processed data folders
DATA_FOLDER_RAW = 'data/raw'
DATA_FOLDER_PROCESSED = 'data/processed'

# Folder holding the clinical data export. The default is the original
# network-share mount point; set the TARGET_DATA_FOLDER_CLINIC environment
# variable to run the notebook where the share is mounted elsewhere.
DATA_FOLDER_CLINIC = os.environ.get(
    'TARGET_DATA_FOLDER_CLINIC',
    '/Volumes/auditgroupdirs/SUND-CBMR-Childhood-Genetic-TCOC/Proteomics analysis/GitHub/TARGET/',
)

# Create the processed data folder if it doesn't exist
os.makedirs(DATA_FOLDER_PROCESSED, exist_ok=True)

# Define the paths for the table, result, and figure folders
TABLE_FOLDER = 'tables'
RESULT_FOLDER = 'results'
FIGURE_FOLDER = Path('figures')
FIGURE_FOLDER.mkdir(exist_ok=True)

#### Import data and format column headers

In [None]:
# Sample annotation table, indexed by the first column of the file
annotation_file = pd.read_csv(
    os.path.join(DATA_FOLDER_RAW, 'Experiment annotation file.csv'),
    index_col=[0],
)

# Spectronaut protein report; cells containing 'Filtered' are parsed as missing
report_filename = 'Protein_20210924_230719_TARGET_SNv15.4_Report.csv'
report_plasma = (
    pd.read_csv(os.path.join(DATA_FOLDER_RAW, report_filename), na_values='Filtered')
    # Columns that are not needed downstream
    .drop(columns=['PG.Qvalue', 'PG.MolecularWeight', 'PG.ProteinDescriptions'])
    # More descriptive identifier column names
    .rename(columns={'PG.Genes': 'Gene names', 'PG.ProteinAccessions': 'Protein IDs'})
    .set_index(['Protein IDs', 'Gene names'])
    # Keep the second space-separated token of each header, minus the '.PG.Quantity' suffix
    .rename(columns=lambda c: c.split(' ')[1].replace('.PG.Quantity', ''))
)

# Translate the remaining headers (annotation index values) into sample IDs
sample_id_mapping = dict(zip(annotation_file.index, annotation_file['Sample ID']))
report_plasma = report_plasma.rename(columns=sample_id_mapping).reset_index()

# Preview the first few rows of the DataFrame
report_plasma.head()

In [None]:
# Leading (first ';'-separated) protein ID and gene name of every protein group
report_plasma = report_plasma.assign(**{
    'Protein ID': report_plasma['Protein IDs'].str.split(';').str[0],
    'Gene name': report_plasma['Gene names'].astype(str).str.split(';').str[0],
})

# Lookups from the leading identifier back to the full identifier string
IDmapping_proteinid_to_proteinids = dict(zip(report_plasma['Protein ID'], report_plasma['Protein IDs']))
IDmapping_genename_to_genenames = dict(zip(report_plasma['Gene name'], report_plasma['Gene names']))
report_plasma = report_plasma.drop(columns=['Protein IDs', 'Gene names'])

# Remove contaminants (protein IDs starting with 'CON') and index by protein ID
is_contaminant = report_plasma['Protein ID'].str.startswith('CON')
contaminant_ids = report_plasma.loc[is_contaminant, 'Protein ID']
data_plasma_raw = report_plasma[~report_plasma['Protein ID'].isin(contaminant_ids)].set_index('Protein ID')

# Lookups: protein ID -> gene name, sample ID -> batch
IDmapping_proteinid_to_genename = data_plasma_raw['Gene name'].to_dict()
IDmapping_sampleID_to_Batch = dict(zip(annotation_file['Sample ID'], annotation_file['Grouping_batch']))

# Only sample columns remain once the gene name column is removed
data_plasma_raw = data_plasma_raw.drop(columns='Gene name')

# Sample IDs to analyse: every annotated sample whose plate grouping is not 'QA'
non_qa_samples = annotation_file[annotation_file['Grouping_plate'] != 'QA']
sample_ids = non_qa_samples['Sample ID'].tolist()

# Row labels become '<Protein ID>_<Gene name>'
data_plasma_raw.index = [
    '{}_{}'.format(protein_id, IDmapping_proteinid_to_genename[protein_id])
    for protein_id in data_plasma_raw.index
]

#### Filter data based on data completeness

In [None]:
# Minimum fraction of columns in which a protein must be quantified to be kept
DATA_COMPLETENESS = 0.4
min_quantified = data_plasma_raw.shape[1] * DATA_COMPLETENESS
data_plasma_filtered = data_plasma_raw.dropna(axis=0, thresh=min_quantified)
data_plasma_filtered_log = data_plasma_filtered.apply(log2)
proteins = data_plasma_filtered.index.tolist()

# Missingness after filtering: overall and per protein (row)
n_columns = data_plasma_filtered.shape[1]
missing_value_percent = data_plasma_filtered.isna().mean().mean()
missing_value_pct_perprotein = data_plasma_filtered.isnull().sum(axis=1) / n_columns
print(f'Missing data after filtering: {missing_value_percent:.2%}')

# Dimensions of the filtered table (proteins x columns)
print(f'Filtered data shape: {data_plasma_filtered.shape}')

#### Check if there is batch effect by PCA

In [None]:
# Compute PCA on the log2 data (samples as rows), using only the proteins
# that have no missing value in any sample; features are standardised first.
from sklearn.decomposition import PCA
from sklearn import preprocessing

N_PCA_COMPONENTS = 10
pc_labels = ['PC{}'.format(i) for i in range(1, N_PCA_COMPONENTS + 1)]

X_train = data_plasma_filtered_log.T.dropna(axis=1)
X_scaled = preprocessing.StandardScaler().fit_transform(X_train)
X=X_scaled
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=2021)
pca.fit(X)
X_pca = pca.transform(X)
X_df = pd.DataFrame(data=X_pca, columns=pc_labels)

# Loadings: component vectors scaled by the square root of their explained variance
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
df_loadings = pd.DataFrame(loadings, columns=pc_labels, index=X_train.columns)

# The loadings index holds combined '<Protein ID>_<Gene name>' labels, while
# IDmapping_proteinid_to_genename is keyed by the bare protein ID. Mapping the
# combined label directly never matches (all NaN), so look up the protein-ID
# part before the underscore instead.
protein_ids = df_loadings.index.str.split('_').str[0]
df_loadings['Gene name'] = protein_ids.map(IDmapping_proteinid_to_genename)
df_loadings['missing_value_pct']=pd.Series(missing_value_pct_perprotein)

In [None]:
# Share of the total variance explained by the first principal component, in percent (one decimal)
round(pca.explained_variance_ratio_[0]*100, 1)

In [None]:
# Scatter plot of the first two principal components, coloured by batch
color_batches = [IDmapping_sampleID_to_Batch[sample_id] for sample_id in X_train.index]
pc1_pct = round(pca.explained_variance_ratio_[0]*100, 1)
pc2_pct = round(pca.explained_variance_ratio_[1]*100, 1)

plt.figure(figsize=(4, 4))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=color_batches,
    s=10,
    palette='Paired',
    alpha=0.8,
)
plt.title('Principal component plot', fontsize=20)
plt.xlabel('principal component 1 ({}%)'.format(pc1_pct), fontsize=16)
plt.ylabel('principal component 2 ({}%)'.format(pc2_pct), fontsize=16)
plt.xticks(fontsize=16);
plt.yticks(fontsize=16);

# Font type 42 (TrueType) for the PDF export
plt.rcParams['pdf.fonttype'] = 42
plt.savefig(FIGURE_FOLDER / 'PCA_batches.pdf', bbox_inches='tight', dpi=120)

#### Imputation

In [None]:
# Two mutually exclusive ways of filling missing values:
#   SCALE_DATA = True  -> standardise with StandardScaler, then set remaining NaN to 0
#   SCALE_DATA = False -> apply imputation_normal_distribution to every column
SCALE_DATA = False
if SCALE_DATA:
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()

    data_plasma_filtered_log_imputed_np = scaler.fit_transform(data_plasma_filtered_log.values)
    data_plasma_filtered_log_imputed = data_plasma_filtered_log.copy()
    data_plasma_filtered_log_imputed.loc[:,:] = np.nan_to_num(data_plasma_filtered_log_imputed_np)
else:
    data_plasma_filtered_log_imputed = data_plasma_filtered_log.apply(imputation_normal_distribution)
    # Regression check against a reference value from a previous run. The
    # difference must be wrapped in abs(); otherwise any value *smaller* than
    # the reference passes the check unnoticed.
    assert abs(data_plasma_filtered_log_imputed.loc['Q9Y6Z7_COLEC10', 'Plate1_2'] - 9.370928) < 0.00001, 'Imputed value changed in comparison to previous run'

#### Normalization

In [None]:
from combat.pycombat import pycombat

# Batch-correct the imputed log2 data for the analysis samples (sample_ids),
# using each sample's batch label from the annotation file.
dm = data_plasma_filtered_log_imputed[sample_ids].copy()
batch = [IDmapping_sampleID_to_Batch[i] for i in dm.columns]
data_plasma_filtered_log_imputed_corrected = pycombat(dm, batch)

# Regression check against a reference value from a previous run. abs() is
# required so that deviations in either direction fail the check.
assert abs(data_plasma_filtered_log_imputed_corrected.loc['Q9Y6Z7_COLEC10', 'Plate1_2'] - 9.761459) < 0.00001, 'Corrected value changed in comparison to previous run'

#### Save dataset

In [None]:
FILE_RESULTS = 'data/processed/proteomics_datasets.xlsx'

def format_dataset(df):
    """Return a copy of ``df`` with identifier columns added for export.

    ``df`` must be indexed by '<Protein ID>_<Gene name>' labels. The copy gets
    the index name 'ProteinID_Genename' and four extra columns: 'Protein ID'
    and 'Gene name' (the two parts of the label), and 'Protein IDs' and
    'Gene names' (looked up in ``IDmapping_proteinid_to_proteinids`` and
    ``IDmapping_genename_to_genenames``; NaN when there is no entry).
    The input frame is not modified.
    """
    df_new = df.copy().rename_axis('ProteinID_Genename', axis=0)
    # Split on the first underscore only, so that a gene name which itself
    # contains an underscore is kept whole instead of being truncated.
    id_parts = df_new.index.str.split('_', n=1)
    df_new['Protein ID'] = id_parts.str[0]
    df_new['Gene name'] = id_parts.str[1]
    df_new['Protein IDs'] = df_new['Protein ID'].map(IDmapping_proteinid_to_proteinids)
    df_new['Gene names'] = df_new['Gene name'].map(IDmapping_genename_to_genenames)
    return df_new

# with pd.ExcelWriter(FILE_RESULTS) as writer:
#     format_dataset(data_plasma_raw).to_excel(writer, sheet_name='raw')
#     format_dataset(data_plasma_filtered_log).to_excel(writer, sheet_name='filtered_log2')
#     format_dataset(data_plasma_filtered_log_imputed).to_excel(writer, sheet_name='filtered_log2_imputed')
#     format_dataset(data_plasma_filtered_log_imputed_corrected).to_excel(writer, sheet_name='imputed_batchcorrected')

### Quality assessment

#### Calculate CV based on 94 quality assessment samples (pooled plasma allocated in 24 plates)

In [None]:
# Sample IDs of the quality-assessment samples (batch grouping 'QA')
qa_plasma = annotation_file.loc[annotation_file['Grouping_batch'] == 'QA', 'Sample ID'].tolist()
df_qa = data_plasma_filtered[qa_plasma].copy()


def coef_of_variation(x):
    """Coefficient of variation of ``x``: np.std(x) divided by np.mean(x)."""
    return np.std(x) / np.mean(x)


# One CV and one log2 median abundance per protein (row) across the QA samples
proteins_cv = df_qa.apply(coef_of_variation, axis=1)
proteins_log2_abundance = np.log2(df_qa.median(axis=1))

In [None]:
# Median of the per-protein coefficients of variation, rounded to two decimals
print('median cv: {}'.format(round(proteins_cv.median(),2)))

In [None]:
# Per-protein CV and log2 abundance, ordered from lowest to highest CV
cv_table = {
    'Coefficient of variation': proteins_cv,
    'Protein abundance [Log2]': proteins_log2_abundance,
}
df_cv = pd.DataFrame(cv_table).sort_values(by='Coefficient of variation')

# Label proteins whose CV is below 0.3
has_low_cv = df_cv['Coefficient of variation'] < 0.3
df_cv['color'] = np.where(has_low_cv, 'CV<30%', 'rest')
df_cv.tail(3)

#### Generate figures of data quality

In [None]:
# Number of non-missing values per column of the raw and the filtered table
prot_dep_wide = pd.DataFrame({'raw': data_plasma_raw.count(), 'filtered':data_plasma_filtered.count()})
# Long format with a 'dataset' label ('raw' / 'filtered') for plotting
prot_dep = pd.melt(prot_dep_wide, var_name='dataset', value_name='Number of proteins')

In [None]:
# Convert the log2 abundance to log10 (2**x undoes the log2 before log10 is taken)
df_cv['Protein abundance [Log10]'] = np.log10(2**df_cv['Protein abundance [Log2]'])
# Order proteins from highest to lowest abundance
df_cv = df_cv.sort_values(by='Protein abundance [Log2]', ascending = False)

In [None]:
# Zero-based position of each protein in the current row order of df_cv
df_cv['rank']=np.arange(df_cv.shape[0])

In [None]:
# Median of 'Number of proteins' within each dataset ('raw' and 'filtered')
prot_dep.groupby('dataset')['Number of proteins'].median()

#### Supplementary figure 1

In [None]:
# Three-panel data-quality figure:
#   ax1: box plot of 'Number of proteins' per dataset
#   ax2: abundance rank vs. log10 abundance
#   ax3: log2 abundance vs. coefficient of variation, coloured by the CV<30% label
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12,4))
sns.boxplot(data = prot_dep, x='dataset', y='Number of proteins', color='white', ax=ax1)
ax1.set_ylim(0, 520)
# Restyle the boxes as black outlines on white.
# NOTE(review): this relies on the boxes being listed in ax1.artists and on six
# line objects per box in ax1.lines; if ax1.artists is empty with the installed
# seaborn/matplotlib versions, the loop does nothing - confirm.
for i,box in enumerate(ax1.artists):
    box.set_edgecolor('black')
    box.set_facecolor('white')

    # iterate over whiskers and median lines
    for j in range(6*i,6*(i+1)):
         ax1.lines[j].set_color('black')
ax2 = sns.scatterplot(x=df_cv['rank'], y=df_cv['Protein abundance [Log10]'], ax=ax2)
ax3 = sns.scatterplot(x=df_cv['Protein abundance [Log2]'], y=df_cv['Coefficient of variation'], hue=df_cv['color'], ax=ax3)
# Font type 42 (TrueType) for the PDF export
plt.rcParams['pdf.fonttype'] = 42
plt.savefig('figures/data_quality.pdf', dpi=120, bbox_inches='tight')

### Double key

In [None]:
# Load the double-ID table indexed by blood sample ID, discarding rows that are entirely empty
double_id_path = os.path.join(DATA_FOLDER_RAW, 'Target_sampleID.csv')
double_id_file = (
    pd.read_csv(double_id_path, index_col='blood_sample_ID')
    .dropna(how='all')
)

# Two-way lookup between blood sample IDs and proteomics sample IDs
IDmapping_bloodSampleID_to_SampleID = double_id_file['Sample ID'].to_dict()
IDmapping_SampleID_to_bloodSampleID = {
    sample_id: blood_sample_id
    for sample_id, blood_sample_id in zip(double_id_file['Sample ID'], double_id_file.index)
}

In [None]:
# Destination of the pickled sample ID -> blood sample ID lookup
output_file_path = os.path.join(
    'pQTL',
    'phenomics',
    'IDmapping_SampleID_to_bloodSampleID.p',
)

# Serialise the dictionary in binary mode
with open(output_file_path, mode='wb') as output_file:
    pickle.dump(IDmapping_SampleID_to_bloodSampleID, output_file)

### Clinical data exploration

#### Import data and add relevant columns

In [None]:
# Read in the raw clinical data file (semicolon-separated, comma as decimal mark)
data_cli_raw = pd.read_csv(os.path.join(DATA_FOLDER_CLINIC, 'HOL_dataset_massspec_bas_fu_clean_v2022.07.07.csv'), 
                          sep=';', decimal=',', low_memory=False)

# Drop the 'Unnamed: 0' column and every column that contains only missing values
data_cli_raw.drop(['Unnamed: 0'], axis=1, inplace=True)
data_cli_raw.dropna(how='all', axis=1, inplace=True)

# Rename 'gender' to 'sex' and add a constant proteomics analysis date column
data_cli_raw.rename({'gender':'sex'}, axis=1, inplace=True)
data_cli_raw['proteomics_analysis_date'] = pd.Timestamp('2021-07-17')

# Flag overweight/obesity: 1 when z_BMI.Nysom >= 1.28, otherwise 0
# (rows with a missing z_BMI.Nysom also get 0, because the comparison is False)
data_cli_raw['obesity'] = np.where(data_cli_raw['z_BMI.Nysom']>=1.28, 1, 0)

# Add a column for the time (in days) between blood sample collection and analysis
#data_cli_raw['blood_sample_date_ts']=[pd.Timestamp(s) for s in data_cli_raw['blood_sample_date']]
data_cli_raw['blood_sample_date_ts'] = pd.to_datetime(data_cli_raw['blood_sample_date'])
storagetime = data_cli_raw['proteomics_analysis_date'] - data_cli_raw['blood_sample_date_ts']
data_cli_raw['time_to_analysis'] = storagetime.dt.days

# Keep only rows whose index label is not duplicated.
# NOTE(review): read_csv is called without index_col, so the index is presumably
# the default unique row number and this filter removes nothing - confirm whether
# de-duplication on an ID column was intended.
data_cli_filtered = data_cli_raw[~data_cli_raw.index.duplicated(keep=False)]

print('Filtered clinical data shape: {}'.format(data_cli_filtered.shape))

#### Extract clinical data for samples with proteomics measurement

In [None]:
# One row per proteomics sample column, with its blood sample ID looked up
measured_sample_ids = data_plasma_raw.T.index
sample_key = pd.DataFrame(data=measured_sample_ids, columns=['Sample ID'])
sample_key['blood_sample_ID'] = measured_sample_ids.map(IDmapping_SampleID_to_bloodSampleID)

# Left-join the clinical records on the biobank ID; samples without a match keep NaN
data_cli_prot = (
    sample_key
    .merge(data_cli_filtered, how='left', left_on='blood_sample_ID', right_on='biobank_ID')
    .set_index('Sample ID')
    .drop(columns='blood_sample_ID')
)

# Age rounded to zero decimals and BMI quantile bin number (up to 20 bins, duplicate edges dropped)
data_cli_prot['age_int'] = data_cli_prot['age'].round(0)
bin_numbers_bmi = pd.qcut(x=data_cli_prot['BMI'], q=20, labels=False, duplicates='drop')
data_cli_prot['bin_numbers_bmi'] = bin_numbers_bmi

In [None]:
# Extract the visit year from string dates (the part before the first '-');
# non-string entries (e.g. missing values) are skipped and map to NaN
dates = [i for i in data_cli_prot['visit_date'] if type(i)==str]
dates_year = [int(i.split('-')[0]) for i in dates]
data_cli_prot['visit_date_year'] = data_cli_prot['visit_date'].map(dict(zip(dates, dates_year)))
# NOTE(review): rows with a missing visit year fall into '≤2015', because NaN > 2015 is False - confirm intended
data_cli_prot['visit_date_year_binary'] = np.where(data_cli_prot['visit_date_year']>2015, '>2015', '≤2015')

# Binary liver-fat indicators at the 5% and 1.5% thresholds; kept missing where the measurement is missing
df = data_cli_prot.copy()
df['mr_liverfat_above5%'] = np.where(df['mr_real_liverfat_percent']>5, 1, 0)
df['mr_liverfat_above1.5%'] = np.where(df['mr_real_liverfat_percent']>1.5, 1, 0)
df.loc[df['mr_real_liverfat_percent'].isnull(), 'mr_liverfat_above5%'] = np.nan
df.loc[df['mr_real_liverfat_percent'].isnull(), 'mr_liverfat_above1.5%'] = np.nan

# One indicator column per proteomics batch label, joined on the sample ID index
prot_batch = pd.get_dummies(annotation_file.set_index('Sample ID')['Grouping_batch'])
prot_batches = prot_batch.columns.tolist()
df = df.join(prot_batch)
data_cli_prot = df.copy()

In [None]:
# Save clinical data for phenomics analysis 
# ('Sample ID' is moved from the index into a regular column before writing)
data_cli_prot.reset_index().to_csv('pQTL/phenomics/data_cli_prot.csv', index=False)

### Export data for Dash app

In [None]:
# Log2 protein levels (samples as rows) joined with the clinical data on the sample ID;
# only the protein columns plus 'age_int', 'sex' and 'z_BMI.Nysom' are exported
data_dash = data_plasma_filtered_log.T.join(data_cli_prot).rename_axis('Sample ID', axis=0)[proteins + ['age_int', 'sex', 'z_BMI.Nysom']]
data_dash.to_csv('dash/dataset/data_age_sex.csv')

### Baseline participant characteristics

In [None]:
# Clinical parameters reported in the baseline characteristics table
para_toinclude = [
    'age_year', 'sex', 'z_BMI.Nysom', 'BMI', 'height', 'weight',
    'tanner_stage', 'pubertal_status2', 'triglycerides',
    'chol_total', 'chol_ldl', 'chol_hdl',
    'glucose', 'insulin', 'HbA1c', 'ALAT', 'ASAT', 'GGT', 'obesity',
]

# Rows whose 'QA' batch indicator is 0, restricted to the parameters above
is_not_qa = data_cli_prot['QA'] == 0
data_cli_base = data_cli_prot.loc[is_not_qa, para_toinclude].copy()

In [None]:
# Relative frequency of each pubertal_status2 value within each obesity group (normalize=True passed positionally as 1)
data_cli_base.groupby('obesity')['pubertal_status2'].value_counts(1)

In [None]:
# First quartile (25th percentile) of every column within each obesity group
data_cli_base.groupby('obesity').quantile(0.25)

### Combine proteomics and clinical data

In [None]:
# Batch-corrected protein levels (samples as rows) joined with the clinical data on the sample ID index
data_combined = data_plasma_filtered_log_imputed_corrected.T.join(data_cli_prot).rename_axis('Sample ID', axis=0)

#### Check sample quality markers by sample collection time

In [None]:
# Proteins listed in the 'denoise_proteins' column of the quality marker table
quality_markers=pd.read_csv('data/processed/quality_markers.csv')['denoise_proteins']

# Set to True to regenerate one box plot per quality marker (level by visit year)
REPLOT_FIGURES = False
if REPLOT_FIGURES:
    for i in quality_markers.tolist():
        fig, ax = plt.subplots()
        # Draw on the axes created above rather than relying on the implicit current axes
        sns.boxplot(x='visit_date_year', y=i, data=data_combined, color='crimson', ax=ax)
        ax.set_title(i, fontsize=16)
        plt.xticks(rotation=45)
        fig.savefig('figures/quality_markers/{}'.format(i), bbox_inches='tight', dpi=120)
        # Close each figure once saved; otherwise every figure of the loop stays open in memory
        plt.close(fig)

### Data exploration

#### Normality test before transformation

In [None]:
# Long data format to ease computation: one row per (protein, sample) value,
# indexed by the protein label
data_long = data_combined[proteins].melt(var_name='ProteinID_Genename', value_name='MS signal [Log2]').set_index('ProteinID_Genename')

from src.statistical_testing import normality_pg
# Set a dummy variable needed for pingouin.normality
data_long['group_dummy']=1
normality_results = normality_pg(data=data_long, dv='MS signal [Log2]', group='group_dummy')

# Show results. Counting the False entries directly (instead of indexing
# value_counts() with False) also works when no protein is flagged, where the
# lookup would raise a KeyError.
n_non_normal = int((normality_results['normal'] == False).sum())
print('Number of protein with non-normal distribution:{}'.format(n_non_normal))

#### Rank-based inverse normal transformation (INT)
- https://stackoverflow.com/questions/15549836/transform-data-to-fit-normal-distribution
- [good discussion on violating OLS residual normality assumption](https://stats.stackexchange.com/questions/29731/regression-when-the-ols-residuals-are-not-normally-distributed)
- [good discussion on testing non-linear association](https://stats.stackexchange.com/questions/35893/how-do-i-test-a-nonlinear-association)

In [None]:
# Rank-based inverse normal transformation (INT) of the protein levels, one protein at a time
# Refer to https://github.com/edm1/rank-based-INT

from src.rank_based_int import rank_INT

# False: reuse the cached CSV; True: recompute the transformation and overwrite the cache
RE_INT = False

if RE_INT:
    samples_by_protein = data_plasma_filtered_log_imputed_corrected.T
    new_df = [
        pd.DataFrame(rank_INT(samples_by_protein[protein], stochastic=False), columns=[protein])
        for protein in tqdm(proteins)
    ]
    data_proteomics_int = pd.concat(new_df, axis=1).rename_axis('Sample ID', axis=0)
    data_proteomics_int.to_csv('data/processed/proteomics_filtered_imputed_corrected_int.csv')
else:
    data_proteomics_int = pd.read_csv('data/processed/proteomics_filtered_imputed_corrected_int.csv').set_index('Sample ID')

In [None]:
# Regression check against a reference value from a previous run. abs() is
# required so that deviations in either direction fail the check.
assert abs(data_proteomics_int.loc['Plate1_2', 'Q9Y6Z7_COLEC10'] - -1.055522) < 0.00001, 'Normalized value changed in comparison to previous run'

In [None]:
# INT-transformed protein levels joined with the clinical data on the sample ID index
data_combined_int = data_proteomics_int.join(data_cli_prot).rename_axis('Sample ID', axis=0)

#### Multivariate normality test after data transformation

In [None]:
# Covariates used when regressing every INT-transformed protein level
covariates = ['sex', 'z_BMI.Nysom', 'age', 'time_to_analysis', 'pubertal_status2']

# Residuals of one linear model per protein; rows with missing values are removed by pingouin
covariate_matrix = data_combined_int[covariates]
lr_residuals = {
    protein: pg.linear_regression(X=covariate_matrix, y=data_combined_int[protein], remove_na=True).residuals_
    for protein in tqdm(proteins)
}

In [None]:
# Normality test on the regression residuals of every protein, labelled with the protein
normality_test_residuals = [
    pg.normality(protein_residuals).assign(protein=protein_label)
    for protein_label, protein_residuals in lr_residuals.items()
]

# Number of proteins whose residuals are / are not flagged as normal
pd.concat(normality_test_residuals)['normal'].value_counts()

#### Multicollinearity test after data transformation

In [None]:
# Pairwise correlation matrix of the regression covariates
data_combined_int[covariates].corr()

### Protein-phenotype associations

#### Run linear regression adjusting for covariates

In [None]:
# Five covariates: age, sex, z_BMI.Nysom (BMI SDS, with beta estimated separately for normal weight and overweight (z_BMI.Nysom >= 1.28)),
# time_to_analysis (sample storage time), and pubertal_status2 (1 = prepubertal, 2 = pubertal/postpubertal)

In [None]:
# Weight-group indicators, split at BMI SDS >= 1.28 (overweight/obesity)
bmi_sds = data_combined_int['z_BMI.Nysom']
is_overweight = bmi_sds >= 1.28
data_combined_int['overweight'] = np.where(is_overweight, 1, 0)
data_combined_int['normalweight'] = np.where(is_overweight, 0, 1)

# Group-specific BMI SDS terms: the BMI SDS within the group, 0 outside of it
for weight_group in ('overweight', 'normalweight'):
    data_combined_int[weight_group + '*z_BMI.Nysom'] = data_combined_int[weight_group] * bmi_sds

In [None]:
# Covariates of the protein-phenotype models; BMI SDS enters as two
# group-specific terms (normal weight and overweight)
covariates_lm =['sex', 'normalweight*z_BMI.Nysom','overweight*z_BMI.Nysom' , 'age', 'time_to_analysis', 'pubertal_status2', ]
COVARIATES=covariates_lm
# False: load the cached results; True: refit all models and overwrite the cache
RE_LIREG = False

if not RE_LIREG:
    lireg_statistics = pd.read_csv('results/association/lireg_statistics.csv')
    lireg_residuals = pd.read_csv('results/association/lireg_residuals.csv').set_index('Sample ID')
else:
    lireg_statistics = []
    lireg_residuals = {}
    for protein in tqdm(proteins):
        df = data_combined_int
        # Complete cases only: rows with a missing protein level or covariate are dropped
        df_test = df[[protein]+COVARIATES].dropna()
        nr_obs = df_test.shape[0]
        X=df_test[COVARIATES]
        y=df_test[protein]
        lm = pg.linear_regression(X=X, y=y,relimp=True )
        # Label every coefficient row with the protein and the number of observations used
        lm['dep_var']=protein
        lm['nr_obs']=nr_obs
        # Re-attach the sample IDs of the complete cases to the residuals
        residuals = pd.Series(lm.residuals_, index=df_test.index)
        df_model = lm.df_model_
        df_residual = lm.df_resid_
        lm['df_model']=df_model
        lm['df_residual']=df_residual
        lireg_statistics.append(lm)
        lireg_residuals[protein]=residuals
    lireg_statistics = pd.concat(lireg_statistics)
        
    # FDR correction (alpha = 0.05) across all rows of the concatenated statistics table
    reject, qvalue = multi.fdrcorrection(lireg_statistics['pval'], alpha=0.05, method='indep')
    lireg_statistics['qvalue'] = qvalue
    lireg_statistics['rejected'] = reject
    
    # Calculate -log10(pval)
    lireg_statistics['-Log10 P-value'] = -np.log10(lireg_statistics['pval'])
    
    # Determine the direction of the coefficient
    lireg_statistics['direction']=np.where(lireg_statistics['coef']>0, 'pos', 'neg')
    
    # Set direction as 'not significant' for non-rejected coefficients
    lireg_statistics.loc[lireg_statistics['rejected']== False, 'direction']='not significant'  
    
    # Save results; the residual table is written with 'Sample ID' as index name
    # so that the cached branch above can restore the index
    lireg_statistics.to_csv('results/association/lireg_statistics.csv', index=False)
    lireg_residuals = pd.DataFrame.from_dict(lireg_residuals)
    lireg_residuals.rename_axis('Sample ID', axis=0, inplace=True)
    lireg_residuals.to_csv('results/association/lireg_residuals.csv')

In [None]:
# Check if regression results changed compared to previous run
# (one residual is compared with its reference value, absolute tolerance 1e-5)
assert abs(lireg_residuals.loc['Plate5_1', 'Q9Y6Z7_COLEC10'] - -1.673965) < 0.00001, 'Regressed value changed in comparison to previous run'

In [None]:
# Collapse the two group-specific BMI SDS terms into one factor label
dict_BMI = {'overweight*z_BMI.Nysom':'z_BMI.Nysom', 'normalweight*z_BMI.Nysom':'z_BMI.Nysom'}
lireg_statistics['names2'] = lireg_statistics['names'].replace(dict_BMI)

# Extract significant associations
lireg_statistics_sig = lireg_statistics[lireg_statistics.rejected]

# Number of distinct proteins significantly associated with at least one of
# the three factors. Only the last expression of a cell is displayed, so this
# count is printed explicitly instead of being computed and discarded.
factors_tokeep = ['sex', 'z_BMI.Nysom', 'age',]
n_proteins_associated = lireg_statistics_sig[lireg_statistics_sig['names2'].isin(factors_tokeep)]['dep_var'].nunique()
print('Number of proteins associated with sex, BMI SDS or age: {}'.format(n_proteins_associated))

# Number of significant associations per factor
lireg_statistics_sig['names2'].value_counts()

In [None]:
# Per-factor lookup for plotting: all results ('df'), the significant ones ('sig')
# and the set of significantly associated proteins ('sig_set')
lireg_dicts = {}
for covariate in ['age', 'sex','z_BMI.Nysom']:
    all_results = lireg_statistics[lireg_statistics['names2'] == covariate]
    significant_results = lireg_statistics_sig[lireg_statistics_sig['names2'] == covariate]
    lireg_dicts[covariate] = {
        'df': all_results,
        'sig': significant_results,
        'sig_set': set(significant_results['dep_var']),
    }

#### Write results to supplementary table

In [None]:
# Start the supplementary table from an independent copy of the significant associations
df_table1 = lireg_statistics_sig.copy()
# Factors to report and helper columns to leave out of the table
var_tokeep = ['z_BMI.Nysom', 'age', 'sex']
col_todrop = ['rejected', 'direction']

In [None]:
# Keep the reported factors, remove the helper columns and renumber the rows
is_reported_factor = df_table1['names2'].isin(var_tokeep)
df_table1 = (
    df_table1[is_reported_factor]
    .drop(columns=col_todrop)
    .sort_values(by='names2')
    .reset_index(drop=True)
)

# Split the '<Protein ID>_<Gene name>' label into two separate columns
label_parts = df_table1['dep_var'].str.split('_')
df_table1['Protein ID'] = label_parts.str[0]
df_table1['Gene name'] = label_parts.str[1]
df_table1 = df_table1.drop(columns=['dep_var'])

In [None]:
# Use reader-friendly labels for the two group-specific BMI SDS terms
df_table1['names']=df_table1['names'].replace({'overweight*z_BMI.Nysom':'overweight*BMI-SDS',
                                              'normalweight*z_BMI.Nysom':'normalweight*BMI-SDS'})

In [None]:
# Columns to export, in output order ...
cols_tokeep = ['Protein ID', 'Gene name', 'names', 'coef', 'se', 'T', 'pval', 'df_model', 'df_residual',
               'r2', 'adj_r2', 'CI[2.5%]', 'CI[97.5%]',
               'nr_obs', 'qvalue', '-Log10 P-value', ]
# ... and the header each of them gets (the two lists are paired position by position)
headers_toreplace = ['Protein ID', 'Gene name', 'variable', 'coefficient', 'standard error', 'T-value', 'p-value', 
                    'degree of freedom_model', 'degree of freedom_residual', 'R square', 'adjusted R square', 'CI[2.5%]', 'CI[97.5%]',
                    'observations', 'BH-corrected p-value', '-Log10 p-value']

# Replace column headers and sort by variable, then R square, both descending
new_names = dict(zip(cols_tokeep, headers_toreplace))
df_table1_formatted = df_table1[cols_tokeep].rename(new_names, axis=1).sort_values(by=['variable', 'R square'], ascending=False)

#### Export data for Dash app

In [None]:
# Regression results for the terms shown in the Dash app (sex, both BMI SDS terms, age)
# NOTE(review): the next cell starts with these same two statements, so this cell looks redundant
df_dash = lireg_statistics.copy()
df_dash = df_dash[df_dash['names'].isin(['sex', 'normalweight*z_BMI.Nysom','overweight*z_BMI.Nysom','age'])]

In [None]:
# Regression results for the terms shown in the Dash app, with reader-friendly headers
dash_terms = ['sex', 'normalweight*z_BMI.Nysom','overweight*z_BMI.Nysom','age']
is_dash_term = lireg_statistics['names'].isin(dash_terms)
df_dash = (
    lireg_statistics[is_dash_term]
    .copy()
    .rename(columns=dict(zip(cols_tokeep, headers_toreplace)))
)

# -log10 of the raw and the BH-corrected p-values
for pvalue_column in ('p-value', 'BH-corrected p-value'):
    df_dash[pvalue_column + ' [-Log10]'] = -np.log10(df_dash[pvalue_column])

# Final column selection and naming of the exported table
dash_columns = ['dep_var', 'variable', 'coefficient', 'standard error','observations', 'p-value [-Log10]', 'BH-corrected p-value [-Log10]', 'rejected']
df_dash = df_dash[dash_columns].rename(
    columns={'dep_var':'ProteinID_Genename', 'variable':'factor', 'rejected':'significant'}
)
df_dash.to_csv('dash/dataset/dataset2.csv', index=False)

#### Characteristics of participants included in the phenotype-protein association analysis

In [None]:
# Number of samples per pubertal_status2 value within each sex
data_combined_int[['sex', 'age_int', 'z_BMI.Nysom', 'BMI', 'pubertal_status2']].groupby('sex')['pubertal_status2'].value_counts()

In [None]:
# Overweight vs. normal-weight counts among samples with a known pubertal status
data_combined_int.dropna(subset=['pubertal_status2'])['overweight'].value_counts()

### Data processing for protein-genotype association
- Including puberty stage as a covariate results in losing >500 samples due to incomplete data. 
- In this step, we decided not to include puberty stage. 

In [None]:
# Define the list of covariates for the pQTL analysis
# (pubertal_status2 is not included in this list)
covariates_pqtl = ['sex', 'z_BMI.Nysom', 'age', 'time_to_analysis']

# Alternative covariates that were tested but did not change the main results are commented out below
#covariates_pqtl = ['sex', 'z_BMI.Nysom', 'age', 'time_to_analysis', 'overweight']
#covariates_pqtl = ['sex', 'overweight*z_BMI.Nysom', 'normalweight*z_BMI.Nysom', 'age', 'time_to_analysis']

In [None]:
# Regress every INT-transformed protein level on the pQTL covariates and keep
# the residuals as phenotypes for the genetic association analysis.
COVARIATES = covariates_pqtl
# False: load the cached results; True: refit all models and overwrite the cache
RE_LIREG = False

if not RE_LIREG:
    lireg_statistics_pqtl = pd.read_csv('pQTL/gemma/lireg_statistics.csv')
    lireg_residuals_pqtl = pd.read_csv('pQTL/gemma/lireg_residuals.csv').set_index('Sample ID')
else:
    lireg_statistics_pqtl = []
    lireg_residuals_pqtl = {}
    for protein in tqdm(proteins):
        df = data_combined_int
        # Complete cases only: rows with a missing protein level or covariate are dropped
        df_test = df[[protein]+COVARIATES].dropna()
        nr_obs = df_test.shape[0]
        X=df_test[COVARIATES]
        y=df_test[protein]
        lm = pg.linear_regression(X=X, y=y,relimp=True )
        lm['dep_var']=protein
        lm['nr_obs']=nr_obs
        # Re-attach the sample IDs of the complete cases to the residuals
        residuals = pd.Series(lm.residuals_, index=df_test.index)
        lireg_statistics_pqtl.append(lm)
        lireg_residuals_pqtl[protein]=residuals
    lireg_statistics_pqtl = pd.concat(lireg_statistics_pqtl)

    # FDR correction (alpha = 0.05) across all rows of the concatenated statistics table
    reject, qvalue = multi.fdrcorrection(lireg_statistics_pqtl['pval'], alpha=0.05, method='indep')
    lireg_statistics_pqtl['qvalue'] = qvalue
    lireg_statistics_pqtl['rejected'] = reject

    # Calculate -log10(pval)
    lireg_statistics_pqtl['-Log10 P-value'] = -np.log10(lireg_statistics_pqtl['pval'])

    # Determine the direction of the coefficient
    lireg_statistics_pqtl['direction']=np.where(lireg_statistics_pqtl['coef']>0, 'pos', 'neg')

    # Set direction as 'not significant' for non-rejected coefficients
    lireg_statistics_pqtl.loc[lireg_statistics_pqtl['rejected']== False, 'direction']='not significant'

    # Save results
    lireg_statistics_pqtl.to_csv('pQTL/gemma/lireg_statistics.csv', index=False)
    lireg_residuals_pqtl = pd.DataFrame.from_dict(lireg_residuals_pqtl)
    # rename_axis returns a new frame, so the result has to be kept; without it
    # the CSV is written without a 'Sample ID' column and the cached branch
    # above fails on set_index('Sample ID').
    lireg_residuals_pqtl = lireg_residuals_pqtl.rename_axis('Sample ID', axis=0)
    lireg_residuals_pqtl.to_csv('pQTL/gemma/lireg_residuals.csv')

In [None]:
# Check if regression results changed in comparison to previous run
# (one residual is compared with its reference value, absolute tolerance 1e-4)
assert abs(lireg_residuals_pqtl.loc['Plate5_1', 'Q9Y6Z7_COLEC10'] - -1.6455972111247719) < 0.0001, 'Regressed value for GWAS changed in comparison to previous run'

#### Perform INT on residuals

In [None]:
# False: reuse the cached CSV; True: recompute the transformation and overwrite the cache
RE_INT = False

if not RE_INT:
    data_gwas_int = pd.read_csv('pQTL/gemma/lireg_residuals_int.csv').set_index('Sample ID')
else:
    # Rank-based inverse normal transformation of the residuals, one protein at a time
    new_df = []
    data = lireg_residuals_pqtl
    for protein in tqdm(data.columns):
        new_df.append(pd.DataFrame(rank_INT(data[protein], stochastic=False), columns=[protein]))
    data_gwas_int = pd.concat(new_df, axis=1)
    data_gwas_int.rename_axis('Sample ID', axis=0, inplace=True)
    data_gwas_int.to_csv('pQTL/gemma/lireg_residuals_int.csv')

# Check if normalization changed in comparison to previous run. The value is
# compared within a small tolerance: exact float equality can fail between the
# freshly computed and the CSV-reloaded branch purely because of text
# round-tripping, without any real change in the result.
assert abs(data_gwas_int.loc['Plate5_1', 'Q9Y6Z7_COLEC10'] - -1.6443417914920349) < 0.00001, 'Normalized value for GWAS changed in comparison to previous run'

#### Prepare the data to fit the GEMMA input format

In [None]:
# Replace sample ID with participant ID so it's compatible with genotype data
# (participant ID = '66-' followed by the blood sample ID looked up for each sample ID)
participant_ids = '66-' + data_gwas_int.index.map(IDmapping_SampleID_to_bloodSampleID)
# NOTE(review): DataFrame.insert raises if the column already exists, so this cell
# fails when run a second time on the same data_gwas_int
data_gwas_int.insert(0, 'Participant ID', participant_ids)

In [None]:
# Load the .fam template (space-separated, no header row)
fam_tep = pd.read_csv('pQTL/QC_PLINK/target5.fam', header=None, sep=' ')

# Phenotypes rounded to five decimals, keyed by participant ID
phenotypes_by_participant = data_gwas_int.round(5).set_index('Participant ID')

# Attach the phenotypes to the template rows, drop template column 5 and mark gaps as 'NA'
data_gwas_export = (
    fam_tep.set_index(0)
    .join(phenotypes_by_participant)
    .reset_index()
    .drop(columns=[5])
    .fillna('NA')
)

# Check if data changed in comparison to previous run
assert data_gwas_export.loc[0, 'A0A024R6I7_SERPINA1'] == 1.08161, 'Exported value for GWAS changed in comparison to previous run'

# Export data for GWAS
data_gwas_export.to_csv('pQTL/gemma/phenotype.fam', header=None, index=False, sep=' ')

In [None]:
# Create a dataframe mapping phenotype IDs to protein IDs.
# The phenotype columns start at position 5 of the export table; phenotype IDs
# are their 1-based positions. The count is taken from the data instead of
# being hard-coded, so the table stays valid if the number of proteins changes.
protein_columns = data_gwas_export.columns[5:]
proteinID_pqtl=pd.DataFrame({'Phenotype ID':np.arange(1, len(protein_columns) + 1),'Protein ID':protein_columns})
proteinID_pqtl.to_csv('pQTL/gemma/ProteinID.txt', index=False, sep='\t')

# Association performed in Computerome (GEMMA v0.98.3)

#### Export data to look at "dose-response"

In [None]:
# Batch-corrected, imputed log2 data with participant IDs ('66-' + blood sample ID) as column labels
df = data_plasma_filtered_log_imputed_corrected.copy()
df.columns = '66-' + df.columns.map(IDmapping_SampleID_to_bloodSampleID)
df.to_csv('pQTL/dose-response/data_log2_imputed.csv')

# Same export for the filtered log2 data without imputation
df = data_plasma_filtered_log.copy()
df.columns = '66-' + df.columns.map(IDmapping_SampleID_to_bloodSampleID)
df.to_csv('pQTL/dose-response/data_log2.csv')

# Sex and age for samples where both are present, indexed by participant ID
df_cli = data_cli_prot[['sex', 'age_year']].dropna()
df_cli['Participant ID'] = '66-' + df_cli.index.map(IDmapping_SampleID_to_bloodSampleID)
df_cli = df_cli.set_index('Participant ID')
df_cli.to_csv('pQTL/dose-response/data_cli.csv')

#### Export sample ID annotation for peptide data inspection

In [None]:
# Annotation table extended with the participant ID ('66-' + blood sample ID) of every sample
df_annotation_pep = annotation_file.copy()
df_annotation_pep['Participant ID'] = '66-' + df_annotation_pep['Sample ID'].map(IDmapping_SampleID_to_bloodSampleID)
df_annotation_pep.to_csv('pQTL/peptide_annotation.csv')

# Peptide-level inspection was performed in Computerome

#### Generate datasets with doubling sample size from n=50
- sample size: 50, 100, 200, 400, 800, 1600

In [None]:
from random import sample, seed

# Every sub-sampled phenotype file keeps all rows of the GEMMA phenotype table;
# participants outside the random sample only have their protein values masked
# with 'NA'.
df = data_gwas_export.copy()
sample_size = [50, 100, 200, 400, 800, 1600]
indices = []    # sampled row labels, one list per sample size
sampleIDs = []  # participant-ID column (all rows) of each exported table

# Set a seed for the random number generator to ensure reproducibility
seed(42)

for size in sample_size:
    # Each sample size is drawn independently from the full set of rows
    sampled_indices = sample(df.index.tolist(), size)
    indices.append(sampled_indices)

    df_new = df.copy()
    df_new.loc[df.index.difference(sampled_indices), proteins] = 'NA'
    df_new.to_csv('pQTL/gemma/sample_size/phenotype_{}.fam'.format(size), header=None, index=False, sep=' ')

    sampleID_list = df_new[0].tolist()
    sampleIDs.append(sampleID_list)

    # Normality check per protein on the values that remain after masking.
    # The columns hold a mix of numbers and 'NA' strings, so coerce each one to
    # numeric and drop the missing entries explicitly; the former
    # `!= 'NA'` filter was a no-op once 'NA' had been replaced by NaN.
    df_new = df_new.replace('NA', np.nan)
    normality_test_results = []
    for protein in df_new.columns[5:]:
        observed_values = pd.to_numeric(df_new[protein], errors='coerce').dropna().to_numpy()
        normal = pg.normality(observed_values)
        normal['protein'] = protein
        normality_test_results.append(normal)
    print(pd.concat(normality_test_results)['normal'].value_counts())

#### Prepare genotype array batch data for use as covariates ####

In [None]:
# Import batch info: one row per participant with its genotype array batch label
df_batch = pd.read_csv('pQTL/ids_in_batch.txt', sep=' ', header=None, names=['participant ID', 'batch'])

# One-hot encode the batch label. `dtype=int` keeps the indicators numeric (0/1):
# pandas >= 2.0 returns booleans by default, which would end up as True/False
# in the covariate file written from this table.
onehot_batch = pd.get_dummies(df_batch['batch'], dtype=int)
onehot_batch['participant ID'] = df_batch['participant ID']

In [None]:
# Combine participant IDs (in .fam order) with the one-hot genotype array batch
# columns. The 'batch2015' indicator is dropped (presumably so that batch acts
# as the reference level next to the intercept -- confirm).
data_gwas_covariates = (
    fam_tep[[0]]
    .set_index(0)
    .join(onehot_batch.set_index('participant ID'))
    .drop(['batch2015'], axis=1)
)

# Add an intercept column to the covariate data
data_gwas_covariates.insert(0, 'intercept', 1)

# Replace missing values with 'NA'
data_gwas_covariates = data_gwas_covariates.fillna('NA')

# Participants without batch information. The original bare expression sat in
# the middle of the cell and therefore displayed nothing; keep the selection in
# a variable and report its size instead.
participants_missing_batch = data_gwas_covariates[data_gwas_covariates['batch2018'] == 'NA']
print('Participants without genotype batch information: {}'.format(len(participants_missing_batch)))

# Save the covariate data to a space-separated text file for use with GEMMA
data_gwas_covariates.to_csv('pQTL/gemma/covariates.txt', header=None, index=False, sep=' ')

# The per-sample-size covariate files (covariates_<size>.txt) are written by the
# following cell; the identical loop that used to be repeated here was removed.

In [None]:
# Write one covariate file per sample size, with rows selected and ordered by
# the participant IDs recorded for that sample size
for size, IDlist in zip(sample_size, sampleIDs):
    df = data_gwas_covariates.copy()
    df_new = df.loc[IDlist]
    covariate_path = 'pQTL/gemma/sample_size/covariates_{}.txt'.format(size)
    df_new.to_csv(covariate_path, sep=' ', index=False, header=None)

#### Check sex discrepancy

In [None]:
# Read in the PLINK sexcheck file and merge with clinical data
df_sex = pd.read_csv('pQTL/QC_PLINK/plink.sexcheck', sep='\s+')
df_sex['biobank_ID']=df_sex['FID'].str.split('-').str[1]
df_sex = df_sex.merge(data_cli_raw[['biobank_ID', 'sex']], on='biobank_ID', how='left')

# Select rows where the SNPSEX column does not match the sex column from clinical data
df_sex[df_sex['SNPSEX']!=df_sex['sex']]

### Can we predict age and BMI from the plasma proteome?

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
def get_data(data, outcome_col, predictors, stratify_col):
    """Split ``data`` into stratified training, validation and test sets.

    Rows with a missing value in any predictor, the outcome or the
    stratification column are dropped first. 30 % of the remaining rows form
    the test set; the other 70 % are split again 70/30 into training and
    validation sets. Both splits use ``random_state=42`` and are stratified on
    ``stratify_col``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the predictor, outcome and stratification columns.
    outcome_col : str
        Name of the column to predict.
    predictors : iterable of str
        Names of the predictor columns. Any iterable is accepted (list, tuple,
        pandas Index); it is copied into a list before use.
    stratify_col : str
        Name of the column used for stratification; may equal ``outcome_col``.

    Returns
    -------
    tuple
        ``(X_train_train, X_train_validation, X_test,
        y_train_train, y_train_validation, y_test)``.
    """
    # Copy into a list so that `+`/append below concatenate column names instead
    # of performing element-wise addition when an Index or array is passed in.
    predictors = list(predictors)
    columns = predictors + [outcome_col]
    if outcome_col != stratify_col:
        columns.append(stratify_col)
    df = data[columns].dropna()

    X = df[predictors]
    y = df[outcome_col]
    strata = df[stratify_col]

    # First split: hold out the test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=strata)
    # Second split: carve the validation set out of the training rows
    X_train_train, X_train_validation, y_train_train, y_train_validation = train_test_split(
        X_train, y_train, test_size=0.3, random_state=42,
        stratify=strata.loc[X_train.index])
    return X_train_train, X_train_validation, X_test, y_train_train, y_train_validation, y_test

In [None]:
# Get beta coefficients for features in training set
def get_features(outcome_col='age', predictors=proteins, stratify_col='age_int', data=data_combined,
                 max_features=80):
    """Rank predictors by linear-model coefficient and pick the best-performing top-k.

    A linear regression on all predictors is fitted on the training split and
    the predictors are ranked by absolute coefficient. Models using the top
    1, 2, ..., all ranked predictors are then fitted on the training split and
    scored by mean squared error on the validation split. The selected feature
    count is the smallest one reaching the minimum validation error, capped at
    ``max_features``.

    Note that the defaults for ``predictors`` and ``data`` are bound when this
    cell is executed, so ``proteins`` and ``data_combined`` must exist first.

    Parameters
    ----------
    outcome_col : str
        Column to predict.
    predictors : list of str
        Candidate predictor columns.
    stratify_col : str
        Column used to stratify the data splits (see ``get_data``).
    data : pandas.DataFrame
        Table with predictor, outcome and stratification columns.
    max_features : int, default 80
        Upper bound on the number of selected features.

    Returns
    -------
    features : pandas.Index
        Names of the selected features, strongest coefficient first.
    combined : pandas.DataFrame
        One row per predictor (ranked) with its coefficient, absolute
        coefficient, validation 'Squared mean error' and 'Nr. features'.
    """
    X_train_train, X_train_validation, X_test, y_train_train, y_train_validation, y_test = get_data(
        data=data, outcome_col=outcome_col, predictors=predictors, stratify_col=stratify_col)

    # Rank all predictors by the absolute coefficient of the full model
    model = LinearRegression()
    model.fit(X_train_train, y_train_train)
    feature_coef = pd.DataFrame(model.coef_, columns=['beta coef. (lm)'])
    feature_coef['features'] = X_train_train.columns
    feature_coef['abs(coef)'] = abs(feature_coef['beta coef. (lm)'])
    feature_coef = feature_coef.set_index('features').sort_values(by='abs(coef)', ascending=False)

    # Iterate the number of features from 1 to total and record the mean
    # squared error on the validation set for each model size
    mses = []
    for i in tqdm(np.arange(1, feature_coef.shape[0] + 1)):
        selected_features = feature_coef.iloc[:i].index
        model = LinearRegression()
        model.fit(X_train_train[selected_features], y_train_train)
        y_pred_validation = model.predict(X_train_validation[selected_features])
        mses.append(mean_squared_error(y_train_validation, y_pred_validation))

    sme_vs_features = pd.DataFrame(mses, columns=['Squared mean error'])
    sme_vs_features['Nr. features'] = np.arange(1, sme_vs_features.shape[0] + 1)
    sme_vs_features['features'] = feature_coef.index
    sme_vs_features.set_index('features', inplace=True)
    combined = feature_coef.join(sme_vs_features)

    # Smallest model size that attains the minimum validation error, capped
    lowest_error = combined['Squared mean error'].min()
    best_rows = combined[combined['Squared mean error'] == lowest_error]
    nr_of_features = min(int(best_rows.iloc[0]['Nr. features']), max_features)
    features = combined.index[:nr_of_features]
    return (features, combined)

In [None]:
# Feature selection for age (spelling out the function defaults) and for BMI
features_age, df_features_age = get_features(outcome_col='age', stratify_col='age_int')
features_bmi, df_features_bmi = get_features(
    outcome_col='BMI',
    stratify_col='bin_numbers_bmi',
)

In [None]:
def get_prediction_score(outcome_col, data, predictors, features, stratify_col):
    """Fit a linear model on the selected features and score it on the test set.

    The data are split with ``get_data`` (same split as used for feature
    selection), a linear regression on ``features`` is fitted on the training
    split and evaluated on the held-out test split.

    Parameters
    ----------
    outcome_col : str
        Column to predict.
    data : pandas.DataFrame
        Table with predictor, outcome and stratification columns.
    predictors : list of str
        All candidate predictor columns (passed through to ``get_data``).
    features : sequence of str
        Subset of ``predictors`` used in the model.
    stratify_col : str
        Column used to stratify the data splits.

    Returns
    -------
    pred : pandas.DataFrame
        Observed test values (column ``outcome_col``) and predictions ('y_pred').
    pred_score : pandas.DataFrame
        Pearson r (2 decimals), mean squared error and mean absolute error
        (1 decimal each), indexed by metric name with one column ``outcome_col``.
    coef : numpy.ndarray
        Fitted coefficients, in the order of ``features``.
    """
    X_train_train, X_train_validation, X_test, y_train_train, y_train_validation, y_test = get_data(
        outcome_col=outcome_col, data=data, predictors=predictors, stratify_col=stratify_col)

    model = LinearRegression()
    model.fit(X_train_train[features], y_train_train)
    coef = model.coef_

    y_pred_test = model.predict(X_test[features])
    corr, pvalue = pearsonr(y_test, y_pred_test)
    mse = mean_squared_error(y_test, y_pred_test)
    mae = mean_absolute_error(y_test, y_pred_test)

    # Built-in round() works for NumPy scalars and plain Python floats alike;
    # the `.round()` method used before exists only on NumPy scalars.
    pred_score = pd.DataFrame.from_dict({'Pearson r': round(corr, 2),
                                         'mean_squared_error': round(mse, 1),
                                         'mean_absolute_error': round(mae, 1)},
                                        orient='index', columns=[outcome_col])
    pred = pd.DataFrame(y_test)
    pred['y_pred'] = y_pred_test

    return (pred, pred_score, coef)

In [None]:
# Test-set predictions, scores and model coefficients for age
pred_age, pred_score_age, coef_age = get_prediction_score(
    outcome_col='age',
    data=data_combined,
    predictors=proteins,
    features=features_age,
    stratify_col='age_int',
)

# Test-set predictions, scores and model coefficients for BMI
pred_bmi, pred_score_bmi, coef_bmi = get_prediction_score(
    outcome_col='BMI',
    data=data_combined,
    predictors=proteins,
    features=features_bmi,
    stratify_col='bin_numbers_bmi',
)

### Export supplementary tables

In [None]:
# Supplementary table 2: selected predictors and their coefficients for age and
# BMI, placed side by side (the shorter list is padded with NaN by concat)
df_table2_age = pd.DataFrame({'predictors for age': features_age, 'coefs': coef_age})
df_table2_bmi = pd.DataFrame({'predictors for BMI': features_bmi, 'coefs': coef_bmi})

df_table2 = pd.concat([df_table2_age, df_table2_bmi], axis=1, ignore_index=True)
df_table2.columns = [
    'predictors for age (UniprotID_Genename)',
    'coefficient',
    'predictors for BMI (UniprotID_Genename)',
    'coefficient',
]

In [None]:
# Cluster assignment of the age-associated proteins (tab-separated file,
# presumably exported from Perseus -- confirm provenance)
cluster_table_path = 'tables/protein_age_clusters.txt'
df_table3 = pd.read_csv(cluster_table_path, sep='\t')

In [None]:
# Raw cluster labels listed in the order in which they are renumbered
# Cluster1 .. Cluster7
raw_cluster_labels = [
    'Cluster -156', 'Cluster -152', 'Cluster -142', 'Cluster -149',
    'Cluster -124', 'Cluster -99', 'Cluster -162',
]
cluster_dict = {
    raw_label: 'Cluster{}'.format(rank)
    for rank, raw_label in enumerate(raw_cluster_labels, start=1)
}

In [None]:
# Translate the raw cluster labels; anything not in `cluster_dict` becomes 'Unclassified'
df_table3['Cluster2'] = df_table3['Cluster'].map(cluster_dict).fillna('Unclassified')

# Skip the first row, replace the raw 'Cluster' column by the translated one,
# sort by cluster and move the identifying columns into the index so that only
# numeric columns remain. `np.float` was removed from NumPy (1.24), so the
# built-in `float` is used for the cast.
# NOTE: this cell overwrites `df_table3`; re-run the cell that reads the file
# before executing it a second time.
df_table3 = (
    df_table3[1:]
    .drop(['Cluster'], axis=1)
    .sort_values(by='Cluster2')
    .rename({'Cluster2': 'Cluster'}, axis=1)
    .reset_index(drop=True)
    .set_index(['Cluster', 'Protein ID', 'Gene name'])
    .astype(float)
)

In [None]:
# Keep the (Cluster, Protein ID, Gene name) index; the columns get a two-level
# header of (sex, age): boys 5-19 followed by girls 5-20
new_index = df_table3.index
boys_columns = [('boys', age) for age in range(5, 20)]
girls_columns = [('girls', age) for age in range(5, 21)]
new_cols = pd.MultiIndex.from_tuples(boys_columns + girls_columns)

In [None]:
# Rebuild the table with the two-level column header and name its levels
df_table3_formatted = (
    pd.DataFrame(df_table3.values, index=new_index, columns=new_cols)
    .rename_axis(['sex', 'age'], axis=1)
)

In [None]:
# Supplementary table 6 is read from an existing workbook
table6_path = 'tables/TableS6.xlsx'
df_table6 = pd.read_excel(table6_path, engine='openpyxl')

In [None]:
# Write tables 1-3 and 6 to the supplementary workbook, one sheet per table.
# ST1 and ST2 are written without their row index; ST3 and ST6 keep it.
supplementary_sheets = [
    ('ST1', df_table1_formatted, {'index': False}),
    ('ST2', df_table2, {'index': False}),
    ('ST3', df_table3_formatted, {}),
    ('ST6', df_table6, {}),
]
with pd.ExcelWriter('tables/Supplementary_tables.xlsx') as writer:
    for sheet_label, table, excel_options in supplementary_sheets:
        table.to_excel(writer, sheet_name=sheet_label, **excel_options)

### Figure 1

In [None]:
# Multi-panel figure on a 3x3 grid:
#   row 0: [0,0] left blank, [0,1] Venn diagram of significant sets, [0,2] strip plot of coefficients
#   row 1: one scatter plot of coef vs. -Log10 P-value per factor (age, BMI, sex)
#   row 2: [2,0] left blank, [2,1] predicted vs. observed age, [2,2] predicted vs. observed BMI
# NOTE(review): the section header says "Figure 1" but the file is saved as
# 'figures/figure 2.pdf' (and the next section saves 'figures/figure2.pdf') -- confirm the intended name.
import string
sns.set_style('ticks')
fig, axes = plt.subplots(3,3, figsize=(16,16))
FACTORS = ['age', 'z_BMI.Nysom', 'sex']
# Per-factor regression result tables, in FACTORS order
dfs=[lireg_dicts[i]['df'] for i in FACTORS]
axes[1, 0].set_title('Age associated proteome', fontsize=14)
axes[1, 1].set_title('BMI associated proteome', fontsize=14)
axes[1, 2].set_title('Sex associated proteome', fontsize=14)
axs = axes.flat
# Label every axes (including the two later switched off) with a bold lowercase letter a-i
for n, ax in enumerate(axs):
    ax.text(-0.2, 1.1, string.ascii_lowercase[n], transform=ax.transAxes, size=15, weight='bold')
    
# Middle row: coefficient vs. -Log10 P-value, coloured by direction, point size scaled by -Log10 P-value
for i in range(3):
    panel2 = sns.scatterplot(ax=axes[1, i], x='coef', y='-Log10 P-value', data=dfs[i], hue='direction',
                size='-Log10 P-value', palette = {'not significant':'gray', 'pos':'darkred', 'neg':'royalblue'},
                legend=False, alpha=1, edgecolor='white')
    panel2.set(xlim=(-1, 1))
    
# Uniform tick-label and axis-label font size on all nine axes
for i in range(3):
    for j in range(3):        
        axes[i, j].tick_params(axis='x', labelsize=13)
        axes[i, j].tick_params(axis='y', labelsize=13)
        axes[i, j].xaxis.label.set_size(fontsize=13)
        axes[i, j].yaxis.label.set_size(fontsize=13)

# Venn diagram of the significant sets of the three factors
subsets = [set(lireg_dicts[i]['sig_set']) for i in FACTORS]
venn3(ax=axes[0,1], subsets=subsets, set_labels=('Age', 'BMI', 'Sex'), 
      set_colors=('white', 'steelblue', 'red'))
venn3_circles(ax=axes[0,1], subsets=subsets, linewidth=0.8);

# Strip plot of the coefficients in `lireg_statistics_sig`, one strip per factor
c=sns.stripplot(ax=axes[0,2], x='names2', y='coef', data=lireg_statistics_sig[lireg_statistics_sig['names2'].isin(FACTORS)], palette = dict(zip(FACTORS, ['white', 'steelblue', 'pink'])), 
                       alpha=1, edgecolor='black', linewidth=0.5, size=5)

# Predicted vs. observed age on the test set, with regression line and score annotation.
# x, y, data and score are plain module-level names that are reassigned for the BMI panel below.
x='age'
y='y_pred'
data=pred_age
score = pred_score_age
h=sns.scatterplot(ax=axes[2,1], x=x, y=y, data=data, color='steelblue', edgecolor='black')
h=sns.regplot(ax=axes[2,1], x=x, y=y, data=data, scatter=False, color='darkred')
# Same limits on both axes; x ticks at 6, 8, ..., 20
h.set(xlim=(4.6, 20.6), ylim=(4.6,20.6), xticks=np.arange(3,11)*2)
h.annotate('Test dataset (n={})\nPearson: {}\nMean absolute error:{}'.format(data.shape[0],score.loc['Pearson r'][x], score.loc['mean_absolute_error'][x]), 
           xycoords='axes fraction', xy=(0.02, 0.8), fontsize=13)
# Predicted vs. observed BMI on the test set (y is still 'y_pred').
# Note that `i` is rebound here from the loop counter to the axes object.
x='BMI'
data=pred_bmi
score = pred_score_bmi
i=sns.scatterplot(ax=axes[2,2], x=x, y=y, data=data, color='steelblue', edgecolor='black')
i=sns.regplot(ax=axes[2,2], x=x, y=y, data=data, scatter=False, color='darkred')
i.set(xlim=(10, 50), ylim=(10,50))
i.annotate('Test dataset (n={})\nPearson: {}\nMean absolute error:{}'.format(data.shape[0],score.loc['Pearson r'][x], score.loc['mean_absolute_error'][x]), 
           xycoords='axes fraction', xy=(0.02, 0.8), fontsize=13)



# Hide the two unused panels
axes[0, 0].axis('off')
axes[2, 0].axis('off')
# Font type 42 (TrueType) for the PDF output
plt.rcParams['pdf.fonttype'] = 42
plt.subplots_adjust(wspace=0.3, hspace=0.3)
plt.savefig('figures/figure 2.pdf', dpi=120, bbox_inches='tight')

In [None]:
# Interactive look at the BMI associations: hovering over a point shows the
# protein ('dep_var')
FACTORS = ['age', 'z_BMI.Nysom', 'sex']
dfs = [lireg_dicts[factor]['df'] for factor in FACTORS]
px.scatter(
    data_frame=lireg_dicts['z_BMI.Nysom']['df'],
    x='coef',
    y='-Log10 P-value',
    hover_name='dep_var',
)

### Figure 2 - Trajectories of age associated proteins
- export to Perseus to generate heatmap

In [None]:
sig_age = lireg_dicts['age']['sig_set']
#sig_puberty = lireg_dicts['pubertal_status2']['sig_set']

# Age-significant proteins in the order they appear in `proteins`. A plain set
# intersection has no stable order, which made the row order of the exported
# CSV files differ between runs.
sig_age_lookup = set(sig_age)
sig_age_proteins = [protein for protein in dict.fromkeys(proteins) if protein in sig_age_lookup]

# Median per sex and integer age. The protein columns are selected before
# aggregating, so non-numeric columns of `data_combined` are never passed to
# median() (which raises on them in pandas >= 2.0) and no unused medians are
# computed. Proteins with a missing median in any group are dropped.
df_age = (
    data_combined
    .groupby(['sex', 'age_int'])[sig_age_proteins]
    .median()
    .T
    .dropna(axis=0)
)
# Row labels have the form '<Protein ID>_<Gene name>'
df_age['Gene name'] = df_age.index.str.split('_').str[1]
df_age['Protein ID'] = df_age.index.str.split('_').str[0]

# Proteins whose absolute age coefficient exceeds 0.04. The 'abs(coef)' column
# is added to the stored result table itself, as before.
df_sig_age = lireg_dicts['age']['sig']
df_sig_age.loc[:, 'abs(coef)'] = abs(df_sig_age['coef'])
toplot_proteins = df_sig_age[df_sig_age['abs(coef)'] > 0.04]['dep_var'].tolist()

df_age.to_csv('data/processed/age_sig.csv')
df_age.loc[toplot_proteins].to_csv('data/processed/age_sig_coef.csv')

#### Plot examples

In [None]:
# Example proteins ('<Protein ID>_<Gene name>'), in plotting order
toplot = [
    'I3L145_SHBG',
    'P01023_A2M',
    'P06276_BCHE',
    'E7ES19_THBS4',
    'P02741_CRP',
    'P18428_LBP',
    'P02790_HPX',
    'P04004_VTN',
    'P35858_IGFALS',
    'A6XND0_IGFBP3',
    'A0A087WWU8_TPM3',
    'P62736_ACTA2',
    'P20742_PZP',
    'P01019_AGT',
]

In [None]:
# One small panel per example protein on a 7x2 grid: signal vs. integer age,
# one line per sex
fig, axs = plt.subplots(nrows=7, ncols=2, figsize=(4, 12))
fig.subplots_adjust(wspace=0.8, hspace=0.6)

n = 1  # index into the alphabet for the panel letter; the first panel is 'b'
for ax, protein in zip(axs.flat, toplot):
    ax.text(-0.7, 1.05, string.ascii_lowercase[n], transform=ax.transAxes, weight='bold')
    n += 1

    sns.lineplot(data=data_combined, x='age_int', y=protein, hue='sex',
                 palette=['darkred', 'darkblue'], legend=False, ax=ax)

    # The panel title is the gene name, i.e. the part after the first '_'
    genename = protein.split('_')[1]
    ax.set_title(genename)
    ax.set_xlabel('')
    ax.set_ylabel('MS signal\n[Log2]')
    ax.set_xticks([5, 10, 15, 20])

    # Scale the automatic y-limits by 3 % at both ends
    y_lower, y_upper = ax.get_ylim()
    ax.set_ylim(y_lower * 0.97, y_upper * 1.03)

plt.rcParams['pdf.fonttype'] = 42
fig.savefig('figures/figure2.pdf', dpi=120, bbox_inches='tight')