In [1]:
import pandas as pd
import numpy as np

In [2]:
def baitlist(file):
    """
    Read a bait file and return its proteins as a list.

    Every line of the .txt file is treated as one bait protein; leading and
    trailing whitespace (including the newline) is stripped from each line.
    Function is used in fullbait().
    """
    proteins = []
    with open(file) as handle:
        for raw_line in handle:
            proteins.append(raw_line.strip())
    return proteins

def fullbait(fileslist):
    """
    Takes a list of files where every line contains a bait protein. Returns a
    single list with all proteins of those files, in file order.

    Duplicates are kept. An empty fileslist yields an empty list.
    """
    # Accumulate into a fresh list so that no per-file list is mutated and an
    # empty fileslist does not fail on indexing the first element.
    outlist = []
    for file in fileslist:
        outlist.extend(baitlist(file))
    return outlist

def humap_to_list(file):
    """
    Parse the hu.MAP complexes file into a list of complexes.

    Each line of the file is a tab-separated list of the subunits of one
    complex. The returned list holds one list per line: its first element is
    the zero-based row number of the complex in the file, followed by the
    subunits of that complex.
    """
    parsed = []
    with open(file) as handle:
        for row_number, raw_line in enumerate(handle):
            members = raw_line.strip().split('\t')
            parsed.append([row_number, *members])
    return parsed

def corum_to_list(file):
    """
    Parse the CORUM complexes file, keeping human complexes only.

    Each line is a tab-separated record about one complex. For every record
    whose third field equals 'Human', a list
    [field 0, field 1, field 14, field 12] is collected, i.e.
    [complex nr, complex full name, PMID, 'subunit1;subunit2;etc'].
    A line with fewer than 15 fields raises an IndexError.
    """
    human_complexes = []
    with open(file) as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            # Read the far columns first: short lines fail here, whatever
            # organism they describe.
            pmid = fields[14]
            subunits = fields[12]
            if fields[2] != 'Human':
                continue
            human_complexes.append([fields[0], fields[1], pmid, subunits])
    return human_complexes

def nvs_to_list(file):
    """
    Parse the nvs complexes file (sorted by complex) into a list of lists.

    Every line is stripped and split on tabs; the resulting fields of one line
    form one complex in the returned list. The fields are not interpreted
    here.
    """
    with open(file) as handle:
        return [raw_line.strip().split('\t') for raw_line in handle]

def complexportal_to_list(file):
    """
    Takes the file with the complexportal complexes (corrected names) and returns
    a list of lists, where every sublist holds the tab-separated fields of one
    line: [complexID, complexname, subunit1, subunit2, ..., subunitn]
    """
    with open(file) as complexportal:
        return [line.strip().split('\t') for line in complexportal]
       
def remove_dupl(dataframe, column):
    """
    Takes a column in a dataframe and removes duplicates within the same cell.
    E.g. a cell containing 'abc;cde;cde;' will be replaced by the cell 'abc;cde'.

    The first occurrence of every entry is kept and the original order is
    preserved, so the result is identical from run to run. A single trailing
    ';' is dropped; a cell without a trailing ';' keeps all of its entries.

    The column is overwritten in place; the same dataframe is returned.
    """
    newcol = []
    for cell in dataframe[column]:
        # Cells are built as 'a;b;': strip the one trailing separator so that
        # splitting does not yield a spurious empty last entry.
        if cell.endswith(';'):
            cell = cell[:-1]
        # dict.fromkeys de-duplicates while keeping first-seen order (a set
        # would make the order depend on string hashing).
        unique_entries = dict.fromkeys(cell.split(';'))
        newcol.append(';'.join(unique_entries))
    dataframe[column] = newcol
    return dataframe

def cofactors_humap(complexes_humap, bait_list):
    """
    Build a dataframe of cofactors that share a hu.MAP complex with a bait.

    For every bait in bait_list and every complex in complexes_humap that
    contains the bait, one row per subunit of that complex is generated:
    [cofactor name, 'Drew<complex number>;', '<bait>;'].
    The rows are grouped by cofactor (string cells are concatenated by the
    sum) and duplicates inside each cell are removed with remove_dupl().
    Resulting columns: ComplexID_huMAP, Bait_huMAP, indexed by Cofactor.
    """
    rows = [
        [subunit, 'Drew{};'.format(compl[0]), '{};'.format(bait)]
        for bait in bait_list
        for compl in complexes_humap
        if bait in compl
        for subunit in compl[1:]
    ]
    header = ['Cofactor', 'ComplexID_huMAP', 'Bait_huMAP']
    grouped = pd.DataFrame(rows, columns = header).groupby('Cofactor').sum()
    for col in grouped.columns:
        grouped = remove_dupl(grouped, col)
    return grouped

def cofactors_corum(complexes_corum, bait_list):
    """
    Takes the complexes_corum list and bait list to generate a dataframe
    of cofactors that are found in complexes with a protein on the bait list.
    Columns in the dataframe are: cofactor name, complex ID;, complex full name;,
    PMID;, bait protein;.
    Dataframe is then grouped by cofactor and duplicates within each cell are
    removed. A row with an empty cofactor name is dropped when present.
    """
    cofactors_corum = []
    for bait in bait_list:
        for compl in complexes_corum:
            # compl[3] is the ';'-joined subunit string of the complex.
            current_complex = compl[3].split(';')
            if bait in current_complex:
                for cof in current_complex:
                    current_list = [cof, '{};'.format(compl[0]), '{};'.format(compl[1]), '{};'.format(compl[2]), '{};'.format(bait)]
                    cofactors_corum.append(current_list)
    cofactors_corum_df = pd.DataFrame(cofactors_corum, columns = ['Cofactor', 'ComplexID_CORUM', 'ComplexName_CORUM', 'PMID_CORUM', 'Bait_CORUM'])
    # Summing string columns per group concatenates the 'value;' cells.
    cofactors_corum_df = cofactors_corum_df.groupby('Cofactor').sum()
    for col in cofactors_corum_df.columns:
        cofactors_corum_df = remove_dupl(cofactors_corum_df, col)
    # An empty subunit entry produces a cofactor named ''. Drop that row if it
    # exists; errors='ignore' avoids a KeyError when it does not.
    cofactors_corum_df = cofactors_corum_df.drop('', errors = 'ignore')
    return cofactors_corum_df


def cofactors_nvs(complexes_nvs, bait_list):
    """
    Build a dataframe of cofactors that share an nvs complex with a bait.

    For every bait in bait_list and every complex whose elements after the
    first contain the bait, one row per such element is generated:
    [cofactor name, '<first element of the complex>;', '<bait>;'].
    The rows are grouped by cofactor (string cells are concatenated by the
    sum) and duplicates inside each cell are removed with remove_dupl().
    Resulting columns: ComplexName_nvs, Bait_nvs, indexed by Cofactor.
    """
    rows = [
        [member, '{};'.format(compl[0]), '{};'.format(bait)]
        for bait in bait_list
        for compl in complexes_nvs
        if bait in compl[1:]
        for member in compl[1:]
    ]
    header = ['Cofactor', 'ComplexName_nvs', 'Bait_nvs']
    grouped = pd.DataFrame(rows, columns = header).groupby('Cofactor').sum()
    for col in grouped.columns:
        grouped = remove_dupl(grouped, col)
    return grouped

def cofactors_complexportal(complexes_complexportal, bait_list):
    """
    Build a dataframe of cofactors that share a ComplexPortal complex with a bait.

    For every bait in bait_list and every complex whose subunits (elements
    from the third onwards) contain the bait, one row per subunit is
    generated: [cofactor name, '<complex ID>;', '<complex name>;', '<bait>;'].
    The rows are grouped by cofactor (string cells are concatenated by the
    sum) and duplicates inside each cell are removed with remove_dupl().
    Resulting columns: ComplexID_ComplexPortal, ComplexName_ComplexPortal,
    Bait_ComplexPortal, indexed by Cofactor.
    """
    rows = [
        [subunit, '{};'.format(compl[0]), '{};'.format(compl[1]), '{};'.format(bait)]
        for bait in bait_list
        for compl in complexes_complexportal
        if bait in compl[2:]
        for subunit in compl[2:]
    ]
    header = ['Cofactor', 'ComplexID_ComplexPortal', 'ComplexName_ComplexPortal', 'Bait_ComplexPortal']
    grouped = pd.DataFrame(rows, columns = header).groupby('Cofactor').sum()
    for col in grouped.columns:
        grouped = remove_dupl(grouped, col)
    return grouped

def merg_dfs(df1, df2, df3, df4):
    """
    Outer-merge four dataframes on 'Cofactor', left to right.

    A cofactor that is absent from one of the complex datasets gets NaN in
    that dataset's columns. Returns the merged dataframe.
    """
    merged = df1
    for other in (df2, df3, df4):
        merged = pd.merge(merged, other, how = 'outer', on = ['Cofactor'])
    return merged

def list_tfs(tf_file):
    """
    Read tf_file and return its lines as a list, one entry per line with
    surrounding whitespace stripped.
    """
    names = []
    with open(tf_file) as handle:
        for raw_line in handle:
            names.append(raw_line.strip())
    return names

def add_interactions(df, int_file, dataset):
    """
    Flag which rows of df are listed as interactors in int_file.

    int_file is a CSV with the columns 'Interactor' and
    'Interaction with small set of TFs'. Two 0/1 columns are added to df
    (in place), based on the index labels of df:
      'Interaction_TF_<dataset>'          - label occurs in 'Interactor'
      'Interaction_TF_<dataset>_smallset' - label occurs in 'Interactor' on a
                                            row where the small-set column is 1

    dataset = 'BioGRID' or 'IntACT'
    Returns the dataframe.
    """
    table = pd.read_csv(int_file)
    in_small_set = table['Interaction with small set of TFs'] == 1
    all_names = set(table['Interactor'])
    small_names = set(table.loc[in_small_set, 'Interactor'])

    column_all = 'Interaction_TF_{}'.format(dataset)
    column_small = 'Interaction_TF_{}_smallset'.format(dataset)
    df[column_all] = [int(label in all_names) for label in df.index]
    df[column_small] = [int(label in small_names) for label in df.index]
    return df

def baitcolumn(df, baitfiles, bait_types):
    """
    Takes the dataframe and checks for each index if it is found on one of the bait lists
    as provided in baitfiles. Adds a column 'In_bait_<type>' for each bait file, named
    after the entry at the same position in bait_types, where a 1 represents that the
    index is found in that bait list and a 0 that it is not.

    The columns are added in place.

    Returns the dataframe
    """
    for i, baitfile_path in enumerate(baitfiles):
        with open(baitfile_path) as baitfile:
            # A set gives constant-time membership tests for every row.
            baits = {bait.strip() for bait in baitfile}
        df['In_bait_{}'.format(bait_types[i])] = [1 if index in baits else 0 for index in df.index]
    return df

def add_nvs(df, nvs_proteins_file):
    """
    Takes the dataframe and adds all proteins on the nvs-list that are not already in the
    index of the dataframe. The protein name is the first tab-separated field of each line
    in nvs_proteins_file. Added rows have no values (NaN in every column).

    Returns a new dataframe; df itself is not modified.
    """
    with open(nvs_proteins_file) as nvs_prots:
        nvss = [line.strip().split('\t')[0] for line in nvs_prots]
    # Compare against the dataframe that was passed in, not a module-level one.
    nvss = [prot for prot in nvss if prot not in df.index]

    df_new = pd.DataFrame(index = nvss)
    return pd.concat([df, df_new], sort = False)

def add_baitlists(df, baitfiles):
    """
    Append the proteins of every bait file that are not yet in the index of df.

    The files are handled one after another, so a protein added from an
    earlier file is not added again by a later one. Appended rows carry no
    values. Returns the extended dataframe.
    """
    for path in baitfiles:
        with open(path) as handle:
            candidates = [raw_line.strip() for raw_line in handle]
        missing = [prot for prot in candidates if prot not in df.index]
        df = pd.concat([df, pd.DataFrame(index = missing)], sort = False)
    return df

def add_tfs(df, tfs):
    """
    Append every TF from the tfs-list that is not yet in the index of df.

    Appended rows carry no values. Returns the extended dataframe.
    """
    missing_tfs = []
    for tf in tfs:
        if tf not in df.index:
            missing_tfs.append(tf)
    placeholder_rows = pd.DataFrame(index = missing_tfs)
    return pd.concat([df, placeholder_rows], sort = False)

def tf_column(df, tfs):
    """
    Add the column 'Is_TF' to the dataframe (in place): 1 for every row whose
    index label occurs in tfs, 0 otherwise.
    Returns the dataframe.
    """
    df['Is_TF'] = [int(label in tfs) for label in df.index]
    return df

def nvs_column(df, nvs_proteins_file):
    """
    Add the column 'In_NVS' to the dataframe (in place): 1 for every row whose
    index label is the first tab-separated field of some line in
    nvs_proteins_file, 0 otherwise.
    Returns the dataframe.
    """
    nvs_names = []
    with open(nvs_proteins_file) as handle:
        for raw_line in handle:
            first_field = raw_line.strip().split('\t')[0]
            nvs_names.append(first_field)
    df['In_NVS'] = [int(label in nvs_names) for label in df.index]
    return df

def create_translation_dict(translation_file=None):
    """
    Build two lookup dictionaries from a tab-separated translation file.

    The first line of the file is skipped as a header. Every other line is read as
    up to four fields: [hugo, ensg1, uniprot, ensg2]; missing trailing fields are
    treated as empty strings.

    translation_file: path of the file to read. When omitted, the module-level
    variable hugo_ensg_uniprot_file is used.

    Returns (hugo_uniprot_dict, hugo_ensg_dict):
      hugo_uniprot_dict maps hugo -> uniprot
      hugo_ensg_dict maps hugo -> ensg1, or ensg2 when ensg1 is empty
    """
    if translation_file is None:
        translation_file = hugo_ensg_uniprot_file

    hugo_uniprot_dict = {}
    hugo_ensg_dict = {}

    with open(translation_file) as transl_file:
        # Skip the header line; the default keeps an empty file from raising.
        next(transl_file, None)
        for line in transl_file:
            fields = line.strip().split('\t')
            # strip() also removes trailing tabs, so pad back to four fields.
            fields += [''] * (4 - len(fields))

            hugo, ensg1, uniprot, ensg2 = fields[:4]
            hugo_uniprot_dict[hugo] = uniprot
            # Fall back to the second ENSG field when the first one is empty.
            hugo_ensg_dict[hugo] = ensg1 or ensg2
    return hugo_uniprot_dict, hugo_ensg_dict

def id_columns(df, hugo_uniprot, hugo_ensg):
    """
    Attach UniProt and ENSG identifiers to the dataframe.

    The index of df is named 'Gene Symbol'; every index label is looked up in
    hugo_uniprot and hugo_ensg (an empty string is used when it is missing).
    The identifiers are added as the columns 'UniProt ID' and 'ENSG ID' and
    appended to the index, all on df itself. The returned dataframe is df
    with that index reset, i.e. with 'Gene Symbol', 'UniProt ID' and
    'ENSG ID' as its leading columns.
    """
    df.index.name = 'Gene Symbol'
    symbols = list(df.index)
    df['UniProt ID'] = [hugo_uniprot.get(symbol, '') for symbol in symbols]
    df['ENSG ID'] = [hugo_ensg.get(symbol, '') for symbol in symbols]
    df.set_index(['UniProt ID', 'ENSG ID'], append = True, inplace = True)
    return df.reset_index()

def filter_df(df_full):
    """
    Return a copy of the rows of df_full that pass the selection.

    A row is kept when it is on the start list (Is_TF, In_NVS or any of the
    In_bait_crems / In_bait_snfs / In_bait_nursa / In_bait_gocofs flags is 1),
    or when it has a solid interaction: a complex ID in both hu.MAP and
    CORUM, or a complex ID in at least one of them together with a TF
    interaction in BioGRID or IntACT.
    """
    def flagged(column):
        return df_full[column] == 1

    # Start list: TFs, NVS proteins and the four bait lists.
    on_startlist = flagged('Is_TF') | flagged('In_NVS')
    for bait_column in ('In_bait_crems', 'In_bait_snfs', 'In_bait_nursa', 'In_bait_gocofs'):
        on_startlist = on_startlist | flagged(bait_column)

    # Complex membership and TF interaction evidence.
    has_humap = df_full['ComplexID_huMAP'].notnull()
    has_corum = df_full['ComplexID_CORUM'].notnull()
    interacts_with_tf = flagged('Interaction_TF_BioGRID') | flagged('Interaction_TF_IntACT')
    in_both = has_humap & has_corum
    in_any_with_tf = (has_humap | has_corum) & interacts_with_tf

    keep = on_startlist | in_both | in_any_with_tf
    return df_full.loc[keep].copy()

def found_by_tfs_small(df, tfs_all, tfs_small, bait):
    """
    Adds the 0/1 column 'Found with small TF set' to the dataframe (in place).

    df must contain the columns 'Bait_huMAP', 'Bait_CORUM',
    'Interaction_TF_BioGRID_smallset', 'Interaction_TF_IntACT_smallset' and
    'Gene Symbol'. The bait cells are ';'-separated strings or NaN.

    Per dataset (hu.MAP, CORUM) a row counts as "found with the small set" unless
    it has baits, all of them are in tfs_all and none of them is in tfs_small.
    A row gets a 1 when one of the following holds (checked in this order):
      - it is found with the small set in both hu.MAP and CORUM;
      - it is found in at least one of them and has a small-set TF interaction
        in BioGRID or IntACT;
      - the protein is in bait and not in tfs_all;
      - the protein is in tfs_small.
    Otherwise it gets a 0.

    Returns the dataframe.
    """
    def split_baits(cell):
        # Missing bait cells come out of the dataframe as float NaN.
        return [] if isinstance(cell, float) else cell.split(';')

    def found_with_small_set(baits):
        # 0 only when every bait is a TF that is missing from the small set.
        only_via_large_set = (
            len(baits) != 0
            and all(x in tfs_all for x in baits)
            and all(x not in tfs_small for x in baits)
        )
        return 0 if only_via_large_set else 1

    new_col = []
    for _, row in df.iterrows():
        humap_found_smallset = found_with_small_set(split_baits(row['Bait_huMAP']))
        corum_found_smallset = found_with_small_set(split_baits(row['Bait_CORUM']))

        tf_b_s = row['Interaction_TF_BioGRID_smallset']
        tf_i_s = row['Interaction_TF_IntACT_smallset']
        protein = row['Gene Symbol']

        if humap_found_smallset and corum_found_smallset:
            also_found_by_tfs_small = 1
        elif (humap_found_smallset or corum_found_smallset) and (tf_b_s or tf_i_s):
            also_found_by_tfs_small = 1
        # Use the tfs_all argument here rather than a module-level TF list.
        elif protein in bait and protein not in tfs_all:
            also_found_by_tfs_small = 1
        elif protein in tfs_small:
            also_found_by_tfs_small = 1
        else:
            also_found_by_tfs_small = 0

        new_col.append(also_found_by_tfs_small)

    df['Found with small TF set'] = new_col
    return df

In [3]:
# Data and folders:
# Absolute locations of the raw input files, the name-corrected input files
# and the directory that holds generated files.
# NOTE(review): these paths are hard-coded to one user's home directory and
# must be adapted before the notebook is run elsewhere.
folder_datafiles = '/mbshome/nvelthuijs/Cofactors/20200402_Transcriptionregulationlist/Datafiles/'
folder_corrected_names = '/mbshome/nvelthuijs/Cofactors/20200402_Transcriptionregulationlist/Datafiles_corrected_names/'
folder_output = '/mbshome/nvelthuijs/Cofactors/20200402_Transcriptionregulationlist/Output_files/'

# Bait lists, TF lists and the nvs ("golden standard") proteins/complexes.
crems_file = folder_datafiles + 'crems.txt'
snfs_file = folder_datafiles + 'snfs.txt'
nursa_file = folder_datafiles + 'nursa.txt'
gocofs_file = folder_datafiles + 'gocofs.txt'
gocofs_notfs_file = folder_datafiles + 'gocofs_notfs.txt'
tf_file = folder_datafiles + 'tfs.txt'
tf_smallset_file = folder_datafiles + 'tfs_smallset.txt'
nvs_proteins_file = folder_datafiles + 'golden_standard_proteins.txt'
nvs_proteins_notfs_file = folder_datafiles + 'golden_standard_proteins_notfs.txt'
nvs_complexes_file = folder_datafiles + 'golden_standard_complexes.txt'
complexportal_file = folder_corrected_names + 'complexportal_corrected_names.txt'

# Complex datasets with corrected names.
humap_file = folder_corrected_names + 'humap_corrected_names.txt'
corum_file = folder_corrected_names + 'corum_corrected_names.txt'

# TF-interactor tables; these are read from the output folder.
biogrid_file = folder_output + '03_tf_interactors_biogrid_simple.csv'
intact_file = folder_output + '05_tf_interactors_intact_simple.csv'

# Translation table read by create_translation_dict().
hugo_ensg_uniprot_file = folder_datafiles + 'hugo_ensg_uniprot_approved.txt'

In [4]:
# Create transcription regulation list with baitset 1:
# (CREMs, SNFs, NURSA, GO coregulators) --> Changed to only GO coregulators

# Create list of files and the complete baitlist:
# baitfiles/bait_types drive the added rows and the In_bait_* columns;
# baitfiles2 (GO coregulators without TFs) defines the actual bait proteins.
baitfiles = [crems_file, snfs_file, nursa_file, gocofs_file]
# File name without directory and without the 4-character '.txt' extension.
bait_types = [file.split('/')[-1][:-4] for file in baitfiles]
baitfiles2 = [gocofs_notfs_file]
# NOTE(review): bait_types2 is not used again in the visible cells.
bait_types2 = [file.split('/')[-1][:-4] for file in baitfiles2]
bait = fullbait(baitfiles2)
baitlength1 = len(bait)
tfs = list_tfs(tf_file)

# Create lists with all protein complexes:
# (these lists are reused by the baitset 2 and baitset 3 cells)
complexes_humap = humap_to_list(humap_file)
complexes_corum = corum_to_list(corum_file)
complexes_nvs = nvs_to_list(nvs_complexes_file)
complexes_complexportal = complexportal_to_list(complexportal_file)

# Create dataframes with proteins found in complexes with
# bait proteins in separate datasets, then merge into one df:
cof_humap_df = cofactors_humap(complexes_humap, bait)
cof_corum_df = cofactors_corum(complexes_corum, bait)
cof_nvs_df = cofactors_nvs(complexes_nvs, bait)
cof_complexportal_df = cofactors_complexportal(complexes_complexportal, bait)
df_full = merg_dfs(cof_humap_df, cof_corum_df, cof_nvs_df, cof_complexportal_df)

# Add proteins from bait lists, nvs and tfs:
df_full = add_nvs(df_full, nvs_proteins_file)
df_full = add_baitlists(df_full, baitfiles)
df_full = add_tfs(df_full, tfs)

# Add columns to cofactor-df on tf-interactions:
df_full = add_interactions(df_full, biogrid_file, 'BioGRID')
df_full = add_interactions(df_full, intact_file, 'IntACT')

# Add columns on whether something is present in a bait list.
df_full = baitcolumn(df_full, baitfiles, bait_types)
df_full = tf_column(df_full, tfs)
df_full = nvs_column(df_full, nvs_proteins_file)

# Filter dataframe:
df_filtered = filter_df(df_full)

# Add different IDs, write to csv:
# (the translation dictionaries are reused by the later cells)
hugo_uniprot_dict, hugo_ensg_dict = create_translation_dict()
df_filtered = id_columns(df_filtered, hugo_uniprot_dict, hugo_ensg_dict)

# NOTE(review): the disabled line below iterates over the characters of the
# path string tf_smallset_file, not over the lines of that file; open the
# file first if this step is re-enabled.
# tfs_small = set([x.strip() for x in tf_smallset_file])
# df_filtered = found_by_tfs_small(df_filtered, set(tfs), tfs_small, bait)

df_filtered.to_csv(folder_output + '06_Transcription_regulatorlist_baitset1_without_scores.csv', index = False)
# Display the result as the cell output.
df_filtered

Unnamed: 0,Gene Symbol,UniProt ID,ENSG ID,ComplexID_huMAP,Bait_huMAP,ComplexID_CORUM,ComplexName_CORUM,PMID_CORUM,Bait_CORUM,ComplexName_nvs,...,Interaction_TF_BioGRID,Interaction_TF_BioGRID_smallset,Interaction_TF_IntACT,Interaction_TF_IntACT_smallset,In_bait_crems,In_bait_snfs,In_bait_nursa,In_bait_gocofs,Is_TF,In_NVS
0,AATF,Q9NY61,ENSG00000275700,Drew1337;Drew1809;Drew3735,NOC2L;MYBBP1A;RIOX2;RRP1B;DDX54;CEBPZ,,,,,,...,1,1,1,1,0,0,0,0,0,0
1,ABL1,P00519,ENSG00000097007,Drew2259,ABL1,6592;6076;2811,c-Abl-cortactin-nmMLCK complex;BRCA1-cABL comp...,20861316;12024016;23740246,BRCA1;ABL1,,...,1,1,1,1,0,0,0,1,0,0
2,ACIN1,Q9UKV3,ENSG00000100813,Drew2802;Drew1090,THRAP3;SRSF2;BCLAF1;SPEN,351;760,Spliceosome;Apoptosis- and splicing-associated...,12226669;12665594,SRSF2;DHX9;SPEN;SNW1;DDX17;SAP18;SF1;TCERG1;PR...,,...,0,0,0,0,0,0,0,0,0,0
3,ACOT8,O14734,ENSG00000101473,Drew1995,PML,,,,,,...,0,0,1,1,0,0,0,0,0,0
4,ACTB,P60709,ENSG00000075624,Drew1704,BASP1,725;5615;778;189;149;5614;566;1729;5606;1166;5...,Emerin-actin-NMI-(alphaII)spectrin complex;BAF...,14729568;17620012;15607978;9845365;11078522;11...,HDAC1;SMARCC2;RCOR1;NMI;SMARCA4;TRRAP;NPM1;SMA...,BAF;PBAF;NuA4,...,1,1,1,1,0,0,0,0,0,1
5,ACTG1,P63261,ENSG00000184009,Drew780,ACTN1;HMGB2;BASP1;ACTN4,189,BAF complex,11078522,SMARCC1;ARID1A;ACTL6B;SMARCC2;SMARCD1;SMARCB1;...,,...,0,0,0,0,0,0,0,0,0,0
6,ACTL6A,O96019,ENSG00000136518,Drew4093;Drew2673,BRD7;SMARCC2;SMARCA4;SMARCD2;BICRAL;ACTL6A;ARI...,1251;807;304;564;238;565;1166;1175;798;1230;55...,BRM-SIN3A-HDAC complex;BRG1-SIN3A-HDAC contain...,8895581;9710619;11073988;11780067;10882073;984...,BRD7;HDAC1;SMARCC2;TRRAP;SMARCA4;SMARCD2;ACTL6...,BAF;PBAF;NuA4;SRCAP;INO80;GBAF,...,1,1,1,1,0,0,0,1,0,1
7,ACTL6B,O94805,ENSG00000077080,Drew4093;Drew2673,BRD7;SMARCC2;SMARCA4;SMARCD2;BICRAL;ACTL6A;ARI...,189,BAF complex,11078522,SMARCC1;ARID1A;ACTL6B;SMARCC2;SMARCD1;SMARCB1;...,BAF;PBAF,...,0,0,0,0,0,0,0,1,0,1
8,ACTN1,P12814,ENSG00000072110,Drew2322;Drew2122;Drew780,ACTN1;HMGB2;BASP1;ACTN4,4025;5177,"Polycystin-1 multiprotein complex (ACTN1, CDH1...",11113628;15159419,ACTN1;PXN;JUP,,...,0,0,1,1,0,0,0,1,0,0
9,ACTN2,P35609,ENSG00000077522,Drew1317;Drew216;Drew3559;Drew4632;Drew171;Dre...,ACTN2,,,,,,...,1,1,1,1,0,0,0,1,0,0


In [5]:
# Create transcription regulation list with baitset 2:
# (GO coregulators, CREMs, SNFs, NURSA, NVS)
# Relies on the complexes_* lists and the translation dictionaries created in
# the baitset 1 cell.

# Create list of files and the complete baitlist:
baitfiles = [crems_file, snfs_file, nursa_file, gocofs_file]
# File name without directory and without the 4-character '.txt' extension.
bait_types = [file.split('/')[-1][:-4] for file in baitfiles]
tfs = list_tfs(tf_file)

# Bait proteins: the three bait lists, the GO coregulators without TFs and the
# first tab-separated field of every line in the nvs "notfs" file, de-duplicated.
baitfiles2 = [crems_file, snfs_file, nursa_file, gocofs_notfs_file]
# NOTE(review): bait_types2 is not used again in the visible cells.
bait_types2 = [file.split('/')[-1][:-4] for file in baitfiles2]
with open(nvs_proteins_notfs_file) as nvs:
    bait = list(set(fullbait(baitfiles2) + [line.strip().split('\t')[0] for line in nvs]))
baitlength2 = len(bait)

# Create dataframes with proteins found in complexes with
# bait proteins in separate datasets, then merge into one df:
cof_humap_df = cofactors_humap(complexes_humap, bait)
cof_corum_df = cofactors_corum(complexes_corum, bait)
cof_nvs_df = cofactors_nvs(complexes_nvs, bait)
cof_complexportal_df = cofactors_complexportal(complexes_complexportal, bait)
df_full2 = merg_dfs(cof_humap_df, cof_corum_df, cof_nvs_df, cof_complexportal_df)

# Add proteins from bait lists and tfs (the nvs step is disabled here):
# df_full2 = add_nvs(df_full2, nvs_proteins_file)
df_full2 = add_baitlists(df_full2, baitfiles)
df_full2 = add_tfs(df_full2, tfs)

# Add columns to cofactor-df on tf-interactions:
df_full2 = add_interactions(df_full2, biogrid_file, 'BioGRID')
df_full2 = add_interactions(df_full2, intact_file, 'IntACT')

# Add columns on whether something is present in a bait list.
df_full2 = baitcolumn(df_full2, baitfiles, bait_types)
df_full2 = tf_column(df_full2, tfs)
df_full2 = nvs_column(df_full2, nvs_proteins_file)

# Filter dataframe:
df_filtered2 = filter_df(df_full2)

# Add different IDs, write to csv:
df_filtered2 = id_columns(df_filtered2, hugo_uniprot_dict, hugo_ensg_dict)
# df_filtered2 = found_by_tfs_small(df_filtered2, set(tfs), tfs_small, bait)

df_filtered2.to_csv(folder_output + '07_Transcription_regulatorlist_baitset2_without_scores.csv', index = False)
# Display the result as the cell output.
df_filtered2


Unnamed: 0,Gene Symbol,UniProt ID,ENSG ID,ComplexID_huMAP,Bait_huMAP,ComplexID_CORUM,ComplexName_CORUM,PMID_CORUM,Bait_CORUM,ComplexName_nvs,...,Interaction_TF_BioGRID,Interaction_TF_BioGRID_smallset,Interaction_TF_IntACT,Interaction_TF_IntACT_smallset,In_bait_crems,In_bait_snfs,In_bait_nursa,In_bait_gocofs,Is_TF,In_NVS
0,AATF,Q9NY61,ENSG00000275700,Drew1337;Drew1809;Drew3735,SMARCA5;NOC2L;MYBBP1A;RIOX1;NSD2;RIOX2;RRP1B;D...,,,,,,...,1,1,1,1,0,0,0,0,0,0
1,ABL1,P00519,ENSG00000097007,Drew2259,ABL1,6592;6076;2811,c-Abl-cortactin-nmMLCK complex;BRCA1-cABL comp...,20861316;12024016;23740246,BRCA1;ABL1,,...,1,1,1,1,0,0,0,1,0,0
2,ABRAXAS1,Q6UWZ7,ENSG00000163322,Drew1841,UIMC1,2786,BRCA1 A complex,17525340,BRCA1;UIMC1,,...,0,0,0,0,0,0,0,0,0,0
3,ACIN1,Q9UKV3,ENSG00000100813,Drew2802;Drew4295;Drew1090,SRSF2;SPEN;RBM39;THRAP3;BCLAF1;CDC73,351;760,Spliceosome;Apoptosis- and splicing-associated...,12226669;12665594,DDX5;SRSF2;DHX9;SPEN;BCAS2;SF1;DDX17;SNW1;SAP1...,,...,0,0,0,0,0,0,0,0,0,0
4,ACOT8,O14734,ENSG00000101473,Drew1995,PML,,,,,,...,0,0,1,1,0,0,0,0,0,0
5,ACTB,P60709,ENSG00000075624,Drew1704;Drew233;Drew4201;Drew2569,CORO2A;ACTB;GSN;BASP1,725;5615;1729;7298;5606;6984;1166;5736;5607;56...,Emerin architectural complex;CTGF/Hcs24-actin(...,14729568;15834686;12470643;17620012;11950878;1...,PBRM1;HDAC1;GATAD2B;PTMA;PARP1;SMARCC2;DPY30;N...,BAF;PBAF;NuA4,...,1,1,1,1,0,0,0,0,0,1
6,ACTG1,P63261,ENSG00000184009,Drew2569;Drew780,GSN;ACTN4;ACTN1;ACTB;CFL1;HMGB2;BASP1,189;2254,BAF complex;CTGF/Hcs24-actin(beta/gamma) complex,11078522;12470643,SMARCC1;ARID1B;ARID1A;ACTL6B;SMARCC2;SMARCD1;S...,,...,0,0,0,0,0,0,0,0,0,0
7,ACTL6A,O96019,ENSG00000136518,Drew4093;Drew2673,PBRM1;BRD9;BRD7;SMARCC2;SMARCA4;SMARCD2;BICRAL...,1251;807;304;564;238;565;1166;1175;798;1230;55...,BRM-SIN3A-HDAC complex;BRG1-SIN3A-HDAC contain...,8895581;9710619;11073988;11780067;10882073;984...,PBRM1;BRD7;HDAC1;GATAD2B;H2AZ1;SMARCC2;MBD3;CC...,BAF;PBAF;NuA4;SRCAP;INO80;GBAF,...,1,1,1,1,0,0,0,1,0,1
8,ACTL6B,O94805,ENSG00000077080,Drew4093;Drew2673,PBRM1;BRD9;BRD7;SMARCC2;SMARCA4;SMARCD2;BICRAL...,189,BAF complex,11078522,SMARCC1;ARID1B;ARID1A;ACTL6B;SMARCC2;SMARCD1;S...,BAF;PBAF,...,0,0,0,0,0,0,0,1,0,1
9,ACTN1,P12814,ENSG00000072110,Drew2322;Drew2122;Drew780,ACTN4;ACTN1;CFL1;HMGB2;BASP1,4025;5177,"Polycystin-1 multiprotein complex (ACTN1, CDH1...",11113628;15159419,ACTN1;CTNNB1;PXN;JUP,,...,0,0,1,1,0,0,0,1,0,0


In [6]:
# Create transcription regulation list with baitset 3:
# (CREMs, SNFs, NURSA, GO coregulators, NVS, TFs)
# Relies on the complexes_* lists and the translation dictionaries created in
# the baitset 1 cell.

# Create list of files and the complete baitlist:
baitfiles = [crems_file, snfs_file, nursa_file, gocofs_file]
# File name without directory and without the 4-character '.txt' extension.
bait_types = [file.split('/')[-1][:-4] for file in baitfiles]
# Bait proteins: all four bait lists plus the first tab-separated field of
# every line in the nvs proteins file, de-duplicated ...
with open(nvs_proteins_file) as nvs:
    bait = list(set(fullbait(baitfiles) + [line.strip().split('\t')[0] for line in nvs]))
tfs = list_tfs(tf_file)
# ... extended with all TFs.
bait = list(set(bait + tfs))
baitlength3 = len(bait)

# Create dataframes with proteins found in complexes with
# bait proteins in separate datasets, then merge into one df:
cof_humap_df = cofactors_humap(complexes_humap, bait)
cof_corum_df = cofactors_corum(complexes_corum, bait)
cof_nvs_df = cofactors_nvs(complexes_nvs, bait)
cof_complexportal_df = cofactors_complexportal(complexes_complexportal, bait)
df_full3 = merg_dfs(cof_humap_df, cof_corum_df, cof_nvs_df, cof_complexportal_df)

# Add proteins from bait lists and tfs (the nvs step is disabled here):
# df_full3 = add_nvs(df_full3, nvs_proteins_file)
df_full3 = add_baitlists(df_full3, baitfiles)
df_full3 = add_tfs(df_full3, tfs)

# Add columns to cofactor-df on tf-interactions:
df_full3 = add_interactions(df_full3, biogrid_file, 'BioGRID')
df_full3 = add_interactions(df_full3, intact_file, 'IntACT')

# Add columns on whether something is present in a bait list.
df_full3 = baitcolumn(df_full3, baitfiles, bait_types)
df_full3 = tf_column(df_full3, tfs)
df_full3 = nvs_column(df_full3, nvs_proteins_file)

# Filter dataframe, then add different IDs:
df_filtered3 = filter_df(df_full3)
df_filtered3 = id_columns(df_filtered3, hugo_uniprot_dict, hugo_ensg_dict)

# Add column 'Not found by small set of TFs' (disabled):
# df_filtered3 = found_by_tfs_small(df_filtered3, set(tfs), tfs_small, bait)

df_filtered3.to_csv(folder_output + '08_Transcription_regulatorlist_baitset3_without_scores.csv', index = False)
# Display the result as the cell output.
df_filtered3

Unnamed: 0,Gene Symbol,UniProt ID,ENSG ID,ComplexID_huMAP,Bait_huMAP,ComplexID_CORUM,ComplexName_CORUM,PMID_CORUM,Bait_CORUM,ComplexName_nvs,...,Interaction_TF_BioGRID,Interaction_TF_BioGRID_smallset,Interaction_TF_IntACT,Interaction_TF_IntACT_smallset,In_bait_crems,In_bait_snfs,In_bait_nursa,In_bait_gocofs,Is_TF,In_NVS
0,AATF,Q9NY61,ENSG00000275700,Drew1337;Drew1809;Drew3735,RIOX1;ZNF629;DHX30;ZBTB24;ZBTB11;MYBBP1A;PURA;...,,,,,,...,1,1,1,1,0,0,0,0,0,0
1,ABL1,P00519,ENSG00000097007,Drew2259,ABL1,6592;6076;2811,c-Abl-cortactin-nmMLCK complex;BRCA1-cABL comp...,20861316;12024016;23740246,BRCA1;ABL1,,...,1,1,1,1,0,0,0,1,0,0
2,ABRAXAS1,Q6UWZ7,ENSG00000163322,Drew1841,UIMC1,2786,BRCA1 A complex,17525340,BRCA1;UIMC1,,...,0,0,0,0,0,0,0,0,0,0
3,ACIN1,Q9UKV3,ENSG00000100813,Drew2802;Drew4295;Drew1090,SRSF2;SPEN;RBM39;THRAP3;BCLAF1;CDC73,351;760,Spliceosome;Apoptosis- and splicing-associated...,12226669;12665594,DDX5;SRSF2;DHX9;SPEN;BCAS2;SF1;SAP18;DDX17;SNW...,,...,0,0,0,0,0,0,0,0,0,0
4,ACOT1,Q86TX2,ENSG00000184227,Drew956,TFAP2A,,,,,,...,1,1,0,0,0,0,0,0,0,0
5,ACOT8,O14734,ENSG00000101473,Drew1995,PML,,,,,,...,0,0,1,1,0,0,0,0,0,0
6,ACTB,P60709,ENSG00000075624,Drew2569;Drew233;Drew4201;Drew1704,CORO2A;GSN;ACTB;LRRFIP1;BASP1,725;5615;1729;7298;5606;6984;1166;5736;5607;56...,CTGF/Hcs24-actin(beta/gamma) complex;Emerin ar...,14729568;15834686;12470643;17620012;15502823;1...,PBRM1;HDAC1;PARP1;PTMA;GATAD2B;SMARCC2;DPY30;N...,BAF;PBAF;NuA4,...,1,1,1,1,0,0,0,0,0,1
7,ACTG1,P63261,ENSG00000184009,Drew2569;Drew780,GSN;ACTN4;ACTN1;ACTB;CFL1;HMGB2;BASP1;ZBTB20,189;2254,BAF complex;CTGF/Hcs24-actin(beta/gamma) complex,11078522;12470643,SMARCC1;ARID1A;ACTB;SMARCC2;ACTL6B;SMARCD1;SMA...,,...,0,0,0,0,0,0,0,0,0,0
8,ACTL6A,O96019,ENSG00000136518,Drew4093;Drew2673,PBRM1;BRD9;BRD7;SMARCC2;SMARCA4;SMARCD2;BICRAL...,1251;807;304;564;238;565;1166;1175;798;1230;55...,BRM-SIN3A-HDAC complex;WINAC complex;BRG1-SIN3...,9710619;8895581;11073988;11780067;10882073;984...,PBRM1;BRD7;HDAC1;GATAD2B;H2AZ1;SMARCC2;MBD3;CC...,BAF;PBAF;NuA4;SRCAP;INO80;GBAF,...,1,1,1,1,0,0,0,1,0,1
9,ACTL6B,O94805,ENSG00000077080,Drew4093;Drew2673,PBRM1;BRD9;BRD7;SMARCC2;SMARCA4;SMARCD2;BICRAL...,189,BAF complex,11078522,SMARCC1;ARID1A;ACTB;SMARCC2;ACTL6B;SMARCD1;SMA...,BAF;PBAF,...,0,0,0,0,0,0,0,1,0,1


In [7]:
# Numbers for workflow flowchart:
# Every list below has three entries, one per baitset (1, 2, 3).
dfs = [df_filtered, df_filtered2, df_filtered3]
dfs_full = [df_full, df_full2, df_full3]

# Boolean masks over the FILTERED dataframes.
is_tf = [(df['Is_TF'] == 1) for df in dfs]
in_nvs = [(df['In_NVS'] == 1) for df in dfs]
is_crem = [(df['In_bait_crems'] == 1) for df in dfs]
is_snfs = [(df['In_bait_snfs'] == 1) for df in dfs]
is_nursa = [(df['In_bait_nursa'] == 1) for df in dfs]
is_gocofs = [(df['In_bait_gocofs'] == 1) for df in dfs]
# Baitset 2 additionally counts nvs proteins as bait, baitset 3 nvs proteins and TFs.
is_bait = [(is_crem[0] | is_snfs[0] | is_nursa[0] | is_gocofs[0]),
          (is_crem[1] | is_snfs[1] | is_nursa[1] | is_gocofs[1] | in_nvs[1]),
          (is_crem[2] | is_snfs[2] | is_nursa[2] | is_gocofs[2] | in_nvs[2] | is_tf[2])]
on_startlist = [(is_bait[i] | in_nvs[i] | is_tf[i]) for i in range(3)]

# Boolean masks over the FULL (unfiltered) dataframes.
in_humap = [(df['ComplexID_huMAP'].notnull()) for df in dfs_full]
in_corum = [(df['ComplexID_CORUM'].notnull()) for df in dfs_full]
both_humap_corum = [(in_humap[i] & in_corum[i]) for i in range(3)]
# XOR: in exactly one of the two complex datasets.
one_humap_corum = [(in_humap[i] ^ in_corum[i]) for i in range(3)]
int_biogrid = [(df['Interaction_TF_BioGRID'] == 1) for df in dfs_full]
int_intact = [(df['Interaction_TF_IntACT'] == 1) for df in dfs_full]
tf_int = [(int_biogrid[i] | int_intact[i]) for i in range(3)]
onecomplex_tfint = [(one_humap_corum[i] & tf_int[i]) for i in range(3)]

# The same masks, computed over the FILTERED dataframes.
filtered_in_humap = [(df['ComplexID_huMAP'].notnull()) for df in dfs]
filtered_in_corum = [(df['ComplexID_CORUM'].notnull()) for df in dfs]
filtered_both_humap_corum = [(filtered_in_humap[i] & filtered_in_corum[i]) for i in range(3)]
filtered_one_humap_corum = [(filtered_in_humap[i] ^ filtered_in_corum[i]) for i in range(3)]
filtered_int_biogrid = [(df['Interaction_TF_BioGRID'] == 1) for df in dfs]
filtered_int_intact = [(df['Interaction_TF_IntACT'] == 1) for df in dfs]
filtered_tf_int = [(filtered_int_biogrid[i] | filtered_int_intact[i]) for i in range(3)]
filtered_onecomplex_tfint = [(filtered_one_humap_corum[i] & filtered_tf_int[i]) for i in range(3)]
# On the start list, but neither in both complex datasets nor in exactly one
# of them together with a TF interaction.
bait_notlisted = [(on_startlist[i] & ~filtered_both_humap_corum[i] & ~filtered_onecomplex_tfint[i]) for i in range(3)]
# Neither a GO coregulator nor a TF.
new_cofs = [(~is_gocofs[i] & ~is_tf[i]) for i in range(3)]

print('Number of bait proteins:')
print(baitlength1)
print(baitlength2)
print(baitlength3)

# sum() over a boolean mask counts its True entries.
print('\nProteins from bait list not already on final list:')
for i in bait_notlisted:
    print(sum(i))

print('\nProteins in complexes with bait proteins:')
print(df_full.loc[df_full['ComplexID_huMAP'].notnull() | df_full['ComplexID_CORUM'].notnull()].shape[0])
print(df_full2.loc[df_full2['ComplexID_huMAP'].notnull() | df_full2['ComplexID_CORUM'].notnull()].shape[0])
print(df_full3.loc[df_full3['ComplexID_huMAP'].notnull() | df_full3['ComplexID_CORUM'].notnull()].shape[0])

print('\nProteins in both hu.MAP and CORUM complexes:')
for i in both_humap_corum:
    print(sum(i))

print('\nProteins in only hu.MAP or CORUM complexes:')
for i in one_humap_corum:
    print(sum(i))

print('\nProteins in only hu.MAP or CORUM complexes that interact with a TF:')
for i in onecomplex_tfint:
    print(sum(i))

print('\nTotal number of proteins on final list:')
for df in dfs:
    print(df.shape[0])
    
# Disabled: needs the 'Found with small TF set' column from found_by_tfs_small().
# print('\nTotal number of proteins in final list if small TF list were used:')
# for df in dfs:
#     print(df['Found with small TF set'].value_counts()[1])
    

# Unique interactor names over the BioGRID and IntACT tables together.
number_of_interactors = len(set(
    list(pd.read_csv(biogrid_file)['Interactor']) + list(pd.read_csv(intact_file)['Interactor'])))
print('\nNumber of TF-interactors:\n', number_of_interactors)

print('\nNew potential cofactors (no GO coregulator, no GO TF):')
for i in new_cofs:
    print(sum(i))

print('\nProteins in (no complex/hu.MAP/CORUM/BOTH):')
for i, j, k, l in zip(bait_notlisted, filtered_in_humap, filtered_in_corum, filtered_both_humap_corum):
    print(sum(i), '/', sum(j), '/', sum(k), '/', sum(l))
    
    
# Unique interactor names over the BioGRID and IntACT tables together.
interactors = set(list(pd.read_csv(biogrid_file)['Interactor']) + list(pd.read_csv(intact_file)['Interactor']))
print('\nTF-interactors in/not in list:')
# One line per baitset: interactors present in / absent from the final list.
for filtered in (df_filtered, df_filtered2, df_filtered3):
    # Build the lookup set once per dataframe instead of once per interactor.
    symbols = set(filtered['Gene Symbol'])
    n_listed = sum(1 for x in interactors if x in symbols)
    print(n_listed, '/', len(interactors) - n_listed)

# Number of flagged rows per baitset, counted on the filtered dataframes
# (the is_* / in_nvs masks are defined earlier in this cell).
n_crems = [x.sum() for x in is_crem]
n_snfs = [x.sum() for x in is_snfs]
n_nursa = [x.sum() for x in is_nursa]
n_coregs = [x.sum() for x in is_gocofs]
# The two "no TFs" numbers are line counts of the corresponding files.
with open(gocofs_notfs_file) as file:
    n_coregs_notfs = len([x.strip() for x in file])
n_nvs = [x.sum() for x in in_nvs]
with open(nvs_proteins_notfs_file) as file:
    n_nvs_notfs = len([x for x in file])
n_tfs = [x.sum() for x in is_tf]
print('\nNumber of baits:\nCREMs\t{}\nSNFs\t{}\nNURSA\t{}\nGO coregs\t{}\nGO coregs (no TFs)\t{}\nNVS\t{}\nNVS (no TFs)\t{}\nTFs\t{}\n'.format(n_crems, n_snfs, n_nursa, n_coregs, n_coregs_notfs, n_nvs, n_nvs_notfs, n_tfs))


Number of bait proteins:
419
893
2341

Proteins from bait list not already on final list:
1697
1541
1372

Proteins in complexes with bait proteins:
2345
3418
3932

Proteins in both hu.MAP and CORUM complexes:
667
1096
1183

Proteins in only hu.MAP or CORUM complexes:
1678
2322
2749

Proteins in only hu.MAP or CORUM complexes that interact with a TF:
818
1000
1248

Total number of proteins on final list:
3182
3637
3803

Number of TF-interactors:
 5205

New potential cofactors (no GO coregulator, no GO TF):
1306
1761
1927

Proteins in (no complex/hu.MAP/CORUM/BOTH):
1697 / 1188 / 1093 / 667
1541 / 1835 / 1508 / 1096
1372 / 2202 / 1611 / 1183

TF-interactors in/not in list:
2007 / 3198
2308 / 2897
2467 / 2738

Number of baits:
CREMs	[134, 134, 134]
SNFs	[22, 22, 22]
NURSA	[290, 290, 290]
GO coregs	[440, 440, 440]
GO coregs (no TFs)	419
NVS	[391, 391, 391]
NVS (no TFs)	364
TFs	[1457, 1457, 1457]

