In [22]:
import os

import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import gseapy as gp
from notebooks.tasks58.plots import plot_correlations, all_negative, all_positive
from notebooks.tasks58.scaling import median_scaling

import os

# Walk up from the kernel's start directory until the current working
# directory path ends with the repository folder name, so the relative
# 'raw/...' and 'data/...' paths used below resolve.
REPO_DIR_NAME = 'mcrc-cetuximab-analysis'

while not os.getcwd().endswith(REPO_DIR_NAME):
    previous_dir = os.getcwd()
    os.chdir('..')
    # At the filesystem root '..' resolves to the root itself; without this
    # guard the loop would spin forever when started outside the repository.
    if os.getcwd() == previous_dir:
        raise FileNotFoundError(
            "Could not find a parent directory ending with '" + REPO_DIR_NAME + "'"
        )
os.getcwd()

'/home/max/mcrc-cetuximab-analysis'

In [23]:
def parse_bg_signatures(path='raw/bg_gene_signatures.gmt'):
    """Parse a tab-separated gene-signature file into a dict.

    Each non-blank line is split on tabs: the first field is the signature
    name, the second field is skipped, and the remaining fields are taken as
    the gene list.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``'raw/bg_gene_signatures.gmt'``, resolved
        against the current working directory.

    Returns
    -------
    dict
        Maps ``'SIGNATURE_' + <name>`` to a list of gene names, with
        surrounding whitespace removed and empty entries dropped.
    """
    res = dict()
    # The context manager closes the handle even if a line fails to parse.
    with open(path, 'r') as file:
        for line in file:
            # Remove the line terminator so the last gene does not carry '\n'.
            line = line.rstrip('\r\n')
            if not line.strip():
                # A blank line would otherwise create a bogus 'SIGNATURE_' key.
                continue
            lst = line.split('\t')
            # Trailing tabs produce empty fields; they are not gene names.
            genes = [gene.strip() for gene in lst[2:] if gene.strip()]
            res['SIGNATURE_' + lst[0]] = genes
    return res

def compute_ssgsea_on_signatures(log_tpm, signatures):
    """Append per-signature ssGSEA scores, raw and median-scaled, to ``log_tpm``.

    ``gp.ssgsea`` is run on the transposed frame with rank normalisation and
    no permutations. For every key in ``signatures`` the ``ES`` values are
    joined back onto a copy of ``log_tpm`` by matching its index against the
    result's ``Name`` column, giving a float64 column named after the
    signature. A second column ``<signature>_med`` holds
    ``median_scaling`` applied to that score column.

    Parameters
    ----------
    log_tpm : pandas.DataFrame
        Expression frame; presumably samples as rows and genes as columns
        (it is transposed before being handed to gseapy) -- TODO confirm.
    signatures : dict
        Gene sets passed to gseapy as ``gene_sets``; the keys become the new
        column names.

    Returns
    -------
    pandas.DataFrame
        Copy of ``log_tpm`` indexed by ``sample_id`` with the added score
        columns. The input frame is not modified.
    """
    ssgsea_run = gp.ssgsea(
        data=log_tpm.T,
        gene_sets=signatures,
        outdir=None,
        sample_norm_method='rank',
        permutation_num=0,
        no_plot=True,
        min_size=0,
    )
    scores_long = ssgsea_run.res2d.rename(columns={'Name': 'sample_id'})

    enriched = log_tpm.copy()

    # Attach one score column per signature.
    for name in signatures:
        is_current = scores_long['Term'] == name
        per_sample = scores_long.loc[is_current, ['sample_id', 'ES']]
        joined = enriched.merge(per_sample, left_index=True, right_on='sample_id')
        joined = joined.rename(columns={'ES': name})
        joined = joined.set_index('sample_id')
        enriched = joined.astype({name: np.float64})

    # Scale each signature separately, after all raw scores are in place so
    # the '_med' columns come after the raw score columns.
    for name in signatures:
        enriched[name + '_med'] = median_scaling(enriched[name])

    return enriched


In [None]:
# Gene signatures shared by both datasets.
signatures = parse_bg_signatures()

# Read expression and annotation tables for both datasets up front; the
# first CSV column becomes the index.
log_tpm_1 = pd.read_csv('data/log_tpms_from_fpkm_hgnc_filtered_by_ann.csv', index_col=0)
ann_1 = pd.read_csv('data/ann.csv', index_col=0)
log_tpm_2 = pd.read_csv('raw/exprs_PRJNA805525.csv', index_col=0)
ann_2 = pd.read_csv('raw/ann_PRJNA805525.csv', index_col=0)

# Score each dataset independently with the same signatures.
df_1 = compute_ssgsea_on_signatures(log_tpm_1, signatures)
df_2 = compute_ssgsea_on_signatures(log_tpm_2, signatures)

In [21]:
# Keep only the median-scaled signature columns ('*_med') from each dataset
# and stack the two frames row-wise.
df_all = pd.concat([
    frame[[name for name in frame.columns if name.endswith('_med')]]
    for frame in (df_1, df_2)
])
df_all.index

Index(['18R654_0015', '18R670_0002', '18R674_0006', '18R678_0010',
       '18R683_0015', '18R699_0007', '18R429_0009', '18R298_0010',
       '18R647_0008', '18R295_0003',
       ...
       'GSM5890091', 'GSM5890104', 'GSM5890115', 'GSM5890157', 'GSM5890183',
       'GSM5890186', 'GSM5890205', 'GSM5890210', 'GSM5890228', 'GSM5890257'],
      dtype='object', length=199)