In [1]:
import os

while os.getcwd()[-len('mcrc-cetuximab-analysis'):] != 'mcrc-cetuximab-analysis':
    os.chdir('..')
os.getcwd() 

'/home/max/mcrc-cetuximab-analysis'

In [2]:
import pandas as pd
import numpy as np
import gseapy as gp
from src.scaling import median_scaling



In [3]:
def parse_emt_signatures():
    def parse_emt(file):
        res = []
        
        if 'emt_bg' in file:
            for gene in open(file, 'r').read().split():\
                res.append(gene)
        else:
            for line in open(file, 'r').readlines():
                if not line.startswith('(') and not line.startswith('GO') and not line.startswith('Gene'):
                    res.append(line.split()[0])
        
        return res
    
    emt_raw_files = ['raw/' + file for file in os.listdir('raw') if file.startswith('emt')]
    emt_gene_lists = {'EMT_SIGNATURE_' + file[file.find('/') + 1:file.find('.')]: parse_emt(file) for file in emt_raw_files}
    
    return emt_gene_lists

def compute_ssgsea_on_signatures(log_tpm, signatures):
    ssgsea_results = gp.ssgsea(data=log_tpm.T,
                           gene_sets=signatures,
                           outdir=None,
                           sample_norm_method='rank',
                           permutation_num=0,
                           no_plot=True,
                           min_size=0)

    ssgsea_df = ssgsea_results.res2d.rename(columns={'Name': 'sample_id'})
    
    log_tpm_with_signatures = log_tpm.copy()
    
    for signature in signatures:
        add = ssgsea_df.loc[ssgsea_df['Term'] == signature, ['sample_id', 'ES']]
        log_tpm_with_signatures = log_tpm_with_signatures.merge(add, left_index=True,
                                                                          right_on='sample_id').rename(
            columns={'ES': signature}).set_index('sample_id').astype({signature: np.float64})
    
    # Scale separetely
    for i, signature in enumerate(signatures):
        log_tpm_with_signatures[signature + '_med'] = median_scaling(log_tpm_with_signatures[signature])

    return log_tpm_with_signatures


In [4]:
emt_signatures = parse_emt_signatures()

log_tpm = pd.read_csv('raw/exprs_PRJNA805525.csv', index_col=0)
ann = pd.read_csv('raw/ann_PRJNA805525.csv', index_col=0)
df = compute_ssgsea_on_signatures(log_tpm, emt_signatures)

  log_tpm_with_signatures[signature + '_med'] = median_scaling(log_tpm_with_signatures[signature])
  log_tpm_with_signatures[signature + '_med'] = median_scaling(log_tpm_with_signatures[signature])
  log_tpm_with_signatures[signature + '_med'] = median_scaling(log_tpm_with_signatures[signature])
  log_tpm_with_signatures[signature + '_med'] = median_scaling(log_tpm_with_signatures[signature])
  log_tpm_with_signatures[signature + '_med'] = median_scaling(log_tpm_with_signatures[signature])
  log_tpm_with_signatures[signature + '_med'] = median_scaling(log_tpm_with_signatures[signature])


In [5]:
print(df)

                A1BG      A1CF       A2M     A2ML1   A3GALT2    A4GALT  \
sample_id                                                                
GSM5889737  1.559915  4.850911  7.942278  0.638897  0.066639  2.698472   
GSM5889743  1.450136  3.875473  9.015265  2.489116  0.985855  5.120251   
GSM5889750  0.338123  4.927891  6.760120  0.382481  0.185159  3.395341   
GSM5889757  1.332154  4.730954  8.116149  0.873677  0.402757  3.661071   
GSM5889760  1.710781  5.000614  7.758387  0.502044  0.160701  1.917028   
...              ...       ...       ...       ...       ...       ...   
GSM5890186  1.090007  3.646969  7.747969  0.480760  0.058289  3.338732   
GSM5890205  1.500115  4.897984  8.066126  0.819012  0.193333  3.702913   
GSM5890210  1.462713  1.528432  7.231812  0.849680  0.054286  2.787786   
GSM5890228  1.211073  4.067158  7.702073  0.515349  0.212306  3.700164   
GSM5890257  1.889419  4.090588  7.690994  0.495112  0.317136  4.060411   

               A4GNT      AAAS      A

In [6]:
df.to_csv('data/cohort_2_expr_emt_signatures.csv')