In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from utils.funcs import *
from pathlib import Path
from collections import defaultdict

# stats
from scipy.stats import mannwhitneyu
# survival analysis
from lifelines.statistics import logrank_test

# Notebook-wide plotting defaults handed to seaborn: font scale 1.25, the
# "whitegrid" style, and matplotlib rc overrides for display/save dpi (300)
# and figure size (8, 6).
sns.set(font_scale = 1.25,
        style="whitegrid",
        rc={"figure.dpi":300, 'savefig.dpi':300, 'figure.figsize':(8,6)})

import warnings
# Installs an "ignore" filter with no category/module restriction, i.e. it
# applies to every warning raised after this cell runs.
# NOTE(review): this would also hide any warnings coming from the statistics
# helpers used later in the notebook; consider narrowing the filter to the
# specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [3]:
# Root folder for every file this notebook reads or writes, resolved
# relative to the current user's home directory.
data_path = Path.home().joinpath("data/bioinfo_sm207")

In [4]:
# Load the step-04 response table (one row per sample; the preview below
# shows clinical columns, PFS event/time, the arm code and the per-tool
# "*_group" columns).
response = pd.read_parquet(
    data_path.joinpath("step04_rchop_response.parquet")
)
response.head()

Unnamed: 0.1,Unnamed: 0,Gene Expression Subgroup,Gender,Age,PFS event,PFS time,Actual Arm Code,ecotyper_B.cells_group,ecotyper_Plasma.cells_group,ecotyper_T.cells.CD8_group,...,granulator_Neutrophils.LD_group,granulator_NK_group,granulator_pDCs_group,granulator_Plasmablasts_group,granulator_T.CD4.Memory_group,granulator_T.CD4.Naive_group,granulator_T.CD8.Memory_group,granulator_T.CD8.Naive_group,granulator_T.gd.non.Vd2_group,granulator_T.gd.Vd2_group
0,DLBCL11537,GCB,M,58.0,0.0,47.901437,Schmitz,S02,Unassigned,S02,...,high,low,high,high,high,low,high,low,low,low
1,DLBCL10477,ABC,F,53.0,1.0,3.482546,Schmitz,S04,Unassigned,Unassigned,...,low,low,low,high,high,low,low,low,low,low
2,DLBCL10471,ABC,M,83.0,1.0,77.864476,Schmitz,S05,Unassigned,S02,...,low,low,high,high,low,high,high,low,low,low
3,DLBCL11186,ABC,M,71.0,0.0,121.034908,Schmitz,S04,S01,S03,...,low,low,low,low,high,low,low,low,low,high
4,DLBCL10542,ABC,M,30.0,1.0,6.439425,Schmitz,S04,Unassigned,Unassigned,...,high,low,low,high,low,high,low,low,high,low


In [5]:
# Keep the granulator columns themselves and drop their derived "*_group"
# companions (any column whose name contains "group" is excluded).
celltypes = [
    column
    for column in response.columns
    if column.startswith("granulator") and not ("group" in column)
]
len(celltypes)

17

In [6]:
# Spot check: run the helper once for a single arm ("Schmitz"), a single
# parameter ("PFS") and a single grouping column. The displayed result has
# one row with the columns Arm, group_col, param, P value, HR, lower, higher.
# NOTE(review): generate_stats is pulled in by the star import from
# utils.funcs; presumably it compares PFS between the groups of the given
# column (log-rank P value + hazard ratio) - confirm against that module.
generate_stats(response, "Schmitz", ["PFS"], "granulator_T.CD8.Naive_group")

Unnamed: 0,Arm,group_col,param,P value,HR,lower,higher
0,Schmitz,granulator_T.CD8.Naive_group,PFS,0.751151,0.922968,0.562073,1.515587


In [7]:
def get_stats(arm):
    """Collect the PFS P value and hazard ratio of every cell type for one arm.

    For each entry of the notebook-level ``celltypes`` list this calls
    ``generate_stats(response, arm, ["PFS"], f"{cell}_group")`` and keeps the
    first row's ``"P value"`` and ``"HR"`` values.

    Parameters
    ----------
    arm
        Arm label forwarded unchanged to ``generate_stats`` (the notebook
        passes ``"Schmitz"`` and ``"Reddy"``).

    Returns
    -------
    tuple of (dict, dict)
        ``(logrank_pvalues, hrs)``, both keyed by the entries of
        ``celltypes`` in iteration order. A cell type whose computation
        fails is stored as ``np.nan`` in both dicts, so the two mappings
        always share the same keys.
    """
    logrank_pvalues = {}
    hrs = {}
    for cell in celltypes:
        try:
            stats = generate_stats(response, arm, ["PFS"], f"{cell}_group")
            p_value = stats["P value"].values[0]
            hazard_ratio = stats["HR"].values[0]
        except Exception as exc:
            # Best-effort scan: one failing cell type must not abort the
            # others. Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit still stop the loop, and report
            # the failure so a NaN in the result table can be traced back to
            # its cause (warnings are silenced globally in this notebook).
            print(
                f"get_stats: {arm} / {cell} failed "
                f"({type(exc).__name__}: {exc}); storing NaN"
            )
            p_value = hazard_ratio = np.nan
        logrank_pvalues[cell] = p_value
        hrs[cell] = hazard_ratio

    return logrank_pvalues, hrs

In [8]:
# Scan every cell type once per arm; each call yields a pair of dicts
# (P values, hazard ratios).
pvalues_schmitz, hrs_schmitz = get_stats(arm="Schmitz")
pvalues_reddy, hrs_reddy = get_stats(arm="Reddy")

In [9]:
# Two-column table for the Schmitz arm, indexed by cell-type name.
schmitz_res = pd.concat(
    [pd.Series(pvalues_schmitz), pd.Series(hrs_schmitz)],
    axis=1,
    keys=["P value (Log rank)", "Hazard Ratio"],
)
schmitz_res.head()

Unnamed: 0,P value (Log rank),Hazard Ratio
granulator_B.Memory,0.013472,1.623544
granulator_B.Naive,0.884136,1.028598
granulator_Basophils.LD,0.612939,1.182529
granulator_MAIT,0.993311,1.001855
granulator_mDCs,0.719978,0.932691


In [10]:
# Two-column table for the Reddy arm, indexed by cell-type name.
reddy_res = pd.concat(
    [pd.Series(pvalues_reddy), pd.Series(hrs_reddy)],
    axis=1,
    keys=["P value (Log rank)", "Hazard Ratio"],
)
reddy_res.head()

Unnamed: 0,P value (Log rank),Hazard Ratio
granulator_B.Memory,0.695864,1.063773
granulator_B.Naive,0.584143,0.91655
granulator_Basophils.LD,0.184003,1.612091
granulator_MAIT,0.302531,0.848383
granulator_mDCs,0.104748,0.760901


In [11]:
# Put both arms side by side: index-on-index outer join, with the arm name
# appended to the (identical) column labels of each side.
rchop_res = schmitz_res.join(
    reddy_res,
    how="outer",
    lsuffix="_schmitz",
    rsuffix="_reddy",
)
rchop_res.head()

Unnamed: 0,P value (Log rank)_schmitz,Hazard Ratio_schmitz,P value (Log rank)_reddy,Hazard Ratio_reddy
granulator_B.Memory,0.013472,1.623544,0.695864,1.063773
granulator_B.Naive,0.884136,1.028598,0.584143,0.91655
granulator_Basophils.LD,0.612939,1.182529,0.184003,1.612091
granulator_MAIT,0.993311,1.001855,0.302531,0.848383
granulator_mDCs,0.719978,0.932691,0.104748,0.760901


In [12]:
# Shorten the row labels by removing the literal "granulator_" text.
rchop_res.index = rchop_res.index.str.replace("granulator_", "", regex=False)
rchop_res.head()

Unnamed: 0,P value (Log rank)_schmitz,Hazard Ratio_schmitz,P value (Log rank)_reddy,Hazard Ratio_reddy
B.Memory,0.013472,1.623544,0.695864,1.063773
B.Naive,0.884136,1.028598,0.584143,0.91655
Basophils.LD,0.612939,1.182529,0.184003,1.612091
MAIT,0.993311,1.001855,0.302531,0.848383
mDCs,0.719978,0.932691,0.104748,0.760901


In [13]:
# Persist the combined table (index included) as the step-07 output.
rchop_res.to_csv(data_path.joinpath("step07_rchop_results.csv"))