In [1]:
import sys 
sys.path.append("/cellar/users/mpagadal/Programs/anaconda3/lib/python3.7/site-packages")
sys.path.insert(1, '/cellar/users/mpagadal/Data/scripts')
import json

In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
from itertools import product
import seaborn as sns
import networkx as nx
from ndex2.nice_cx_network import NiceCXNetwork 
import ndex2.client as nc
import ndex2

In [3]:
import matplotlib
# Embed fonts as TrueType (Type 42) in both PDF and PostScript exports.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

In [4]:
from get_brian_rna import *

In [5]:
from upsetplot import generate_counts
from upsetplot import plot
from matplotlib import pyplot

## Collect phenotype IDs

Our goal was to collect the central phenotypes that shape the tumor immune microenvironment. These phenotypes include measures of immune state, immune infiltration, and gene expression, collected from a variety of sources: curated gene sets, the literature, and the outputs of computational tools.

In [6]:
def filter_zeros(df,threshold):
    '''
    Drop phenotype columns that contain too many zero values.

    inputs:
    df: dataframe of rna values (one column per phenotype)
    threshold: fraction (0-1) of rows allowed to be zero; a column is
        removed when its zero count is strictly greater than
        threshold * len(df)

    output:
    new dataframe holding only the columns whose zero count is
    <= threshold * len(df), in their original order. The input
    dataframe is not modified.
    '''

    # Count zeros for every column in one vectorised pass instead of
    # filtering the whole frame once per column.
    zero_counts=(df==0).sum(axis=0)

    # Use a positional boolean mask so that duplicated column names are
    # each judged on their own values rather than looked up by label.
    remove_mask=(zero_counts>threshold*len(df)).to_numpy(dtype=bool)

    print("{} phenotypes with > {} zeroes were removed".format(int(remove_mask.sum()),threshold))
    return(df.loc[:,~remove_mask])
        

In [12]:
# Directory holding the downloaded phenotype / gene-set files that the
# following cells read with pd.read_csv.
pheno_folder="/cellar/users/mpagadal/Data/projects/germline-immune/discovery/phenotypes/downloaded_phenos/"

In [13]:
# Antigen processing/presentation gene set. skiprows=[1] drops the second
# line of the file (presumably a description row under the set name -- confirm).
antigen_present=pd.read_csv(
    os.path.join(pheno_folder,"antigen_present_geneset.txt"),
    skiprows=[1],
)
# Gene symbols from the gene-set column, with MARCH1 added by hand at the end.
antigen_present_list=[
    *antigen_present["GO_ANTIGEN_PROCESSING_AND_PRESENTATION"].tolist(),
    "MARCH1",
]

In [14]:
# Read without a header row, so the first column is labelled 0.
im_landscape=pd.read_csv(
    os.path.join(pheno_folder,"im-landscape-markers.txt"),
    header=None,
)
# Marker names are taken from column 0.
im_landscape_list=im_landscape.loc[:,0].tolist()

In [19]:
# Marker table; only its "Gene" column is used here.
danaher=pd.read_csv(os.path.join(pheno_folder,"danaher_markers.csv"))
danaher_list=danaher.loc[:,"Gene"].tolist()

In [20]:
# Immune-checkpoint table; symbols come from the "HGNC Symbol" column.
im_checkpoint=pd.read_csv(os.path.join(pheno_folder,"immune-checkpoint.csv"))
# Keep every entry whose string form is not "nan" (drops missing symbols).
im_checkpoint_list=[
    symbol
    for symbol in im_checkpoint["HGNC Symbol"].tolist()
    if str(symbol) != "nan"
]

In [21]:
# Tab-separated table of immune-landscape components.
landscape=pd.read_csv(
    os.path.join(pheno_folder,"pheno-immune-landscape-comp"),
    sep="\t",
)
# Every column after the first two is treated as a component name.
landscape_comps=landscape.columns[2:]

In [22]:
# Read without a header row; marker names are taken from column 0.
ifng=pd.read_csv(
    os.path.join(pheno_folder,"ifng-markers.txt"),
    header=None,
)
ifng_list=ifng.loc[:,0].tolist()

In [23]:
# Read without a header row; marker names are taken from column 0.
tgfbeta=pd.read_csv(
    os.path.join(pheno_folder,"tgf-beta-markers.txt"),
    header=None,
)
tgfbeta_list=tgfbeta.loc[:,0].tolist()

In [24]:
# Compiled CIBERSORTx results (tab-separated).
cibersortx=pd.read_csv(
    "/cellar/users/mpagadal/Data/cibersort/cibersortx-results/tcga/tpm_cibersortx_compiled_allsig.tsv",
    sep="\t",
)
# Skip the first column and swap spaces for dots in the remaining column names.
cibersortx_list=[col.replace(" ",".") for col in cibersortx.columns[1:]]

In [25]:
# Report how many names each phenotype source contributes, one count per
# line, in the same order the sources were loaded above.
for pheno_names in (
    antigen_present_list,
    im_landscape_list,
    danaher_list,
    im_checkpoint_list,
    landscape_comps,
    ifng_list,
    tgfbeta_list,
    cibersortx_list,
):
    print(len(pheno_names))


228
436
60
78
6
6
19
61


### Create phenotype dictionary

In [26]:
# Ordered (names, category) pairs. Order matters: a name that appears in
# more than one source keeps the category of the LAST source listing it,
# because later assignments overwrite earlier ones.
pheno_sources=[
    (antigen_present_list,"antigen present"),
    (im_landscape_list,"immunomodulators"),
    (danaher_list,"cell type"),
    (im_checkpoint_list,"immune checkpoint"),
    (landscape_comps,"landscape components"),
    (ifng_list,"IFNG"),
    (tgfbeta_list,"TGFbeta"),
    (cibersortx_list,"immune infiltration"),
]

# Map each phenotype name to its category.
pheno_dict={}
for names,category in pheno_sources:
    for x in names:
        # Keys use "." in place of "-" (presumably to match the column
        # naming of downstream tables -- confirm).
        pheno_dict[x.replace("-",".")]=category

In [27]:
# Force every key containing the substring "HLA" into the antigen
# presentation category, whatever category it was given earlier.
hla_keys=[name for name in pheno_dict.keys() if "HLA" in name]
for name in hla_keys:
    pheno_dict[name]="antigen present"

In [28]:
# Number of distinct phenotype names in the dictionary (a name shared by
# several sources is a single key, so it is counted once).
print(len(pheno_dict))

833


In [36]:
import json
# Persist the phenotype -> category mapping as JSON.
with open('../data/pheno_dictionary.json', 'w') as out_handle:
    json.dump(pheno_dict, out_handle)

In [37]:
# Read the phenotype -> category mapping back from the JSON file,
# rebinding pheno_dict to the loaded copy.
with open('../data/pheno_dictionary.json', 'r') as in_handle:
    pheno_dict = json.load(in_handle)

In [38]:
# Spot-check: show the category stored for one gene in the dictionary.
pheno_dict["BRCA2"]

'immunomodulators'

In [39]:
# Build a one-row frame: every phenotype name becomes a column and the
# single row (index 0) holds its category.
pheno_map=pd.DataFrame(pheno_dict,index=[0])

In [40]:
# Transpose so phenotype names form the index and the category is the
# only column.
# NOTE: this rebinds pheno_map to its own transpose, so running the cell a
# second time flips the frame back to the one-row layout.
pheno_map=pheno_map.T

In [41]:
# Write the frame (index plus its column values) as CSV rows with no
# header line. header=False is the documented way to omit column labels.
pheno_map.to_csv("../data/phenotype.mapping.csv",header=False)