# define_interactome.ipynb

This notebook takes evidence files from Affinity Purification Mass Spectrometry (APMS) experiments and uses the SAINTq method$^{1}$ to score the probability of a protein-protein interaction for bait-prey pairs. 

$^{1}$Teo G, Koh H, Fermin D, et al. SAINTq: Scoring protein-protein interactions in affinity purification - mass spectrometry experiments with fragment or peptide intensity data. Proteomics. 2016;16(15-16):2238‐2245. doi:10.1002/pmic.201500499

In [1]:
# Required libraries
library(artMS)
library(biomaRt)

Registered S3 method overwritten by 'openssl':
  method      from
  print.bytes Rcpp


Registered S3 method overwritten by 'xts':
  method     from
  as.zoo.xts zoo 


## Functions

In [2]:
# Complete the annotation (keys) file for artMS and create the SAINTq input files.
#
# Args:
#   annot_file: path to the tab-separated annotation (keys) file. It must contain
#               the Raw.file and Condition columns. The file is overwritten in
#               place with the completed table.
#   ev_file:    path to the evidence file handed to artMS.
#   out_dir:    directory that receives the contrast file and the artMS output.
#   bait:       bait name, used in the contrast string and contrast file name.
#
# Side effects: rewrites annot_file, writes <out_dir>/contrast_<bait>.txt and
# calls artmsEvidenceToSAINTq(). Returns whatever artmsEvidenceToSAINTq() returns.
create_artMSfiles <- function(annot_file, ev_file, out_dir, bait){

    keys <- read.table(annot_file, na.strings = c("", "NA"), sep = "\t",
                       header = TRUE, stringsAsFactors = FALSE)

    # Stop before anything is overwritten: every later step needs these columns,
    # and continuing would rewrite the annotation file with an incomplete table.
    if (!all(c("Raw.file", "Condition") %in% names(keys))){
        stop("Incorrect formatting of annotation file. ",
             "Check that the Raw.file and Condition columns exist.",
             call. = FALSE)
    }

    # Create bioreplicate column - in the format of 'Condition-(1,2,3)'
    suff <- ave(keys$Condition, keys$Condition, FUN = seq_along)
    keys$BioReplicate <- paste0(keys$Condition, '-', suff)

    # IsotopeLabelType column - if it doesn't exist, 'L' is used for label-free experiments
    if (!"IsotopeLabelType" %in% names(keys)){
        keys$IsotopeLabelType <- rep("L", nrow(keys))
    }

    # Run column - if it doesn't exist, a simple sequence of numbers is used
    # (seq_len() stays correct for an empty table, unlike seq(1:n))
    if (!"Run" %in% names(keys)){
        keys$Run <- seq_len(nrow(keys))
    }

    # If the SAINT column doesn't exist, create it ('C' for control runs, 'T' for test runs)
    if (!"SAINT" %in% names(keys)){
        keys$SAINT <- ifelse(keys$Condition == "control", "C", "T")
    }

    # Rewrite annotation file
    write.table(keys, file = annot_file, sep = "\t", quote = FALSE, row.names = FALSE)

    # Write out the contrast string
    write.table(paste0("control-", bait),
                file = paste0(out_dir, "/contrast_", bait, ".txt"),
                quote = FALSE, row.names = FALSE, col.names = FALSE)

    # Create SAINTq inputs
    artmsEvidenceToSAINTq(evidence_file = ev_file,
                          keys_file = annot_file,
                          output_dir = out_dir)
}

In [3]:
# Write a SAINTq parameter file, one key=value setting per line.
#
# Args:
#   input_file:  name of the SAINTq input file recorded as input_filename.
#   input_level: either "protein" or "peptide"; peptide-level files get the
#                additional pep_colname, min_n_pep and best_prop_pep settings.
#   param_file:  path of the parameter file to create (overwritten if present).
#
# Returns param_file invisibly. Stops with an explicit message for any other
# input_level instead of failing later on an undefined variable.
write_params <- function(input_file, input_level, param_file){

    levels_ok <- c("protein", "peptide")
    if (!is.character(input_level) || length(input_level) != 1 ||
        !input_level %in% levels_ok){
        stop("input_level must be one of: ", paste(levels_ok, collapse = ", "),
             call. = FALSE)
    }

    is_peptide <- input_level == "peptide"

    # Settings are listed in the order they appear in the written file
    settings <- c(
        "normalize_control=true",
        paste0("input_filename=", input_file),
        paste0("input_level=", input_level),
        "protein_colname=Proteins",
        if (is_peptide) "pep_colname=Sequence",
        "compress_n_ctrl=100",
        "compress_n_rep=100",
        if (is_peptide) c("min_n_pep=3", "best_prop_pep=0.5")
    )

    writeLines(settings, con = param_file)
    invisible(param_file)
}


In [4]:
# Run saintq on one parameter file and rename its score file.
#
# Args:
#   paramfilePath: path to the parameter file; only its file name is used,
#                  because saintq is started from inside configPath.
#   infileName:    name of the SAINTq input file; saintq's output is expected
#                  as scores_list__<infileName>__.tsv in configPath.
#   configPath:    directory holding the parameter and input files.
#   newPath:       new file name for the score file, relative to configPath.
#   saintq_bin:    path to the saintq executable, relative to configPath.
#
# The working directory is restored on exit, even if a step fails.
# Returns the saintq exit status invisibly.
run_saintq <- function(paramfilePath, infileName, configPath, newPath,
                       saintq_bin = "../../../saintq/bin/saintq"){

    notebook_dir <- setwd(configPath)
    on.exit(setwd(notebook_dir), add = TRUE)

    # Quote both parts so paths containing spaces survive the shell
    cmd <- paste(shQuote(saintq_bin), shQuote(basename(paramfilePath)))
    status <- system(cmd)
    if (status != 0){
        warning("saintq exited with status ", status, " for ", paramfilePath,
                call. = FALSE)
    }

    fname <- paste0("scores_list__", infileName, "__.tsv")
    if (!file.exists(fname)){
        warning("Expected saintq output not found: ",
                file.path(configPath, fname), call. = FALSE)
    } else if (!file.rename(fname, newPath)){
        warning("Could not rename ", fname, " to ", newPath, call. = FALSE)
    }

    invisible(status)
}

## GATA4 interactome from protein intensity data

In [5]:
# Score the GATA4 purification with SAINTq at the protein level.
bait <- 'GATA4'
input_level <- "protein"
annot_file <- paste0("../data/apms/", bait, "_annotation.txt")
ev_file <- paste0("../data/apms/", bait, "_evidence.txt")
config_dir <- paste0("../intermediate/saintq_config/", bait)

# recursive = TRUE also creates the saintq_config parent directory when it
# does not exist yet
if (!dir.exists(config_dir)){
    dir.create(config_dir, recursive = TRUE)
}

create_artMSfiles(annot_file, ev_file, config_dir, bait)


input_file <- "saintq_input_proteins.txt" # note: These input files are the same
param_file <- paste0(config_dir, "/", bait, "_parameters.txt")

write_params(input_file, input_level, param_file)


# out_path is resolved inside config_dir, because run_saintq() changes into
# that directory before renaming the score file
out_path <- paste0(bait, "_saintq_results.txt")
run_saintq(param_file, input_file, config_dir, out_path) #note: These output files are the same

>> GENERATING A SAINTq INPUT FILE 
>> CHECKING THE keys FILE FIRST 
>> MERGING FILES 
--- Making the <Leading.Razor.Protein> the <Proteins> column 
--- ALL peptides with intensities will be used to generate the 
      saintq input file (indepependently of the number of spectral counts 
--- Removing empty protein ids (if any) 
--- Removing Protein Groups (if any) 
--- Removing contaminants
-- CONTAMINANTS CON__|REV__ REMOVED 
>> NEW 4 FILES CREATED:
	- saintq-config-peptides
	- saintq-config-proteins
	- saintq_input_peptides.txt
	- saintq_input_proteins.txt
>> DONE! 


## TBX5 interactome from peptide intensity data

In [6]:
# Score the TBX5 purification with SAINTq at the peptide level.
bait <- 'TBX5'
input_level <- "peptide"
annot_file <- paste0("../data/apms/", bait, "_annotation.txt")
ev_file <- paste0("../data/apms/", bait, "_evidence.txt")
config_dir <- paste0("../intermediate/saintq_config/", bait)

# recursive = TRUE also creates the saintq_config parent directory when it
# does not exist yet
if (!dir.exists(config_dir)){
    dir.create(config_dir, recursive = TRUE)
}

create_artMSfiles(annot_file, ev_file, config_dir, bait)


input_file <- "saintq_input_peptides.txt"
param_file <- paste0(config_dir, "/", bait, "_parameters.txt")

write_params(input_file, input_level, param_file)

# out_path is resolved inside config_dir, because run_saintq() changes into
# that directory before renaming the score file
out_path <- paste0(bait, "_saintq_results.txt")
run_saintq(param_file, input_file, config_dir, out_path)

>> GENERATING A SAINTq INPUT FILE 
>> CHECKING THE keys FILE FIRST 
>> MERGING FILES 
--(-) Raw.files in keys not found in evidence file: qx007857	qx007859	qx007861	qx007863	qx007865	qx007867	qx007875	qx007877	qx007879	qx007881	qx007883	qx008455

--- Making the <Leading.Razor.Protein> the <Proteins> column 
--- ALL peptides with intensities will be used to generate the 
      saintq input file (indepependently of the number of spectral counts 
--- Removing empty protein ids (if any) 
--- Removing Protein Groups (if any) 
--- Removing contaminants
-- CONTAMINANTS CON__|REV__ REMOVED 
>> NEW 4 FILES CREATED:
	- saintq-config-peptides
	- saintq-config-proteins
	- saintq_input_peptides.txt
	- saintq_input_proteins.txt
>> DONE! 


## Examine results

In [7]:
uniprot_map = read.table("../data/databases/uniprot_to_gene.txt", 
                         header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Load the SAINTq scores of one bait, keep the preys whose BFDR is at or below
# the cutoff, and attach gene symbols looked up in the global `uniprot_map`.
#
# Args:
#   bait:        bait name; selects
#                ../intermediate/saintq_config/<bait>/<bait>_saintq_results.txt
#   bfdr_cutoff: largest BFDR that is still retained (default 0.05)
#
# Returns the filtered score table with an added Gene column.
summarize_results <- function(bait, bfdr_cutoff = 0.05){

    results_path <- paste0('../intermediate/saintq_config/', bait, '/', bait, "_saintq_results.txt")
    scores <- read.table(file = results_path, sep = "\t", stringsAsFactors = FALSE,
                         fill = TRUE, header = TRUE, comment.char = "")

    passing <- which(scores$BFDR <= bfdr_cutoff)
    scores <- scores[passing, ]

    symbol_idx <- match(scores$Prey, uniprot_map$UniProt)
    scores$Gene <- uniprot_map$GeneSymbol[symbol_idx]

    scores
}

In [8]:
gata_tab = summarize_results("GATA4", 0.001)
tbx5_tab = summarize_results("TBX5", 0.05)

## Observe nuclear compartment

In [9]:
compartments = read.table("../data/databases/BINGO_nuclear_localization.txt",
                         sep = "\t", header = TRUE, stringsAsFactors = FALSE)

gata_tab$Compartment = compartments$subcellular.location[match(gata_tab$Prey, compartments$UniprotID)]
tbx5_tab$Compartment = compartments$subcellular.location[match(tbx5_tab$Prey, compartments$UniprotID)]

unique(c(gata_tab$Compartment, tbx5_tab$Compartment))

In [10]:
# Compartment labels accepted as nuclear. NA is part of the set and `%in%`
# matches NA to NA, so preys without a compartment annotation are kept too.
nuclear_s = c("nucleus","Nucleus",NA) 
# Gene symbols of the preys that pass the compartment filter, per bait
gata_genes = gata_tab$Gene[which(gata_tab$Compartment %in% nuclear_s)]
tbx5_genes = tbx5_tab$Gene[which(tbx5_tab$Compartment %in% nuclear_s)]

In [11]:
print("These genes were removed from the interactome due to non-nuclear localization:")
gata_tab$Gene[which(!gata_tab$Compartment %in% nuclear_s)]
tbx5_tab$Gene[which(!tbx5_tab$Compartment %in% nuclear_s)]

[1] "These genes were removed from the interactome due to non-nuclear localization:"


## Filtering the interactomes based on RNA-seq expression in WT vs KO

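We remove any genes that were expressed significantly less in the control (KO) cell line (negative RNA-seq log fold change, FDR < 0.05) but whose protein intensities in the KO purifications were not correspondingly lower (log2 KO/WT intensity ratio > -0.5), suggesting that they are false positive interactors. 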

### Imputation and normalization functions

In [12]:
# Imputation Step 1:
# control rows that are zero in every run receive, for each run, 90% of the
# smallest non-zero intensity observed in that run.
#
# Args:
#   ko_df: data frame of control intensities (rows = proteins, columns = runs);
#          zeroes are treated as missing observations.
#
# Returns the data frame with the imputed rows moved to the end. Any NA in the
# input is returned as 0. A table without all-zero rows is returned unchanged
# instead of failing.
# NOTE(review): a run with no non-zero value at all yields Inf (with a warning
# from min()) as its replacement value - confirm this cannot occur upstream.
replace_all_zeroes <- function(ko_df){

    all_zero <- which(rowSums(ko_df) == 0)

    # Per-run minimum over the observed (non-zero) intensities only
    observed <- ko_df
    observed[observed == 0] <- NA
    run_mins <- apply(observed, 2, FUN = min, na.rm = TRUE)

    ko_df[is.na(ko_df)] <- 0

    if (length(all_zero) == 0){
        return(ko_df)
    }

    # drop = FALSE keeps a single-run table a data frame
    imputed <- ko_df[all_zero, , drop = FALSE]
    for (i in seq_along(run_mins)){
        imputed[[i]] <- rep(run_mins[[i]] * 0.9, nrow(imputed))
    }

    rbind(ko_df[-all_zero, , drop = FALSE], imputed)
}


# Imputation Step 2: if the prey protein had an observed intensity value in at
# least one control run, its missing intensities are replaced by 90% of the
# lowest observed intensity for that protein.
#
# Args:
#   ko_df: data frame of control intensities (rows = proteins, columns = runs);
#          zeroes are treated as missing observations.
#
# Returns the data frame with every zero/NA replaced. The per-protein minimum is
# kept in a separate vector, so a run that happens to be named "min" is left
# intact.
# NOTE(review): a row with no observed value at all gets Inf (min() of an empty
# set, with a warning) - this presumably relies on replace_all_zeroes() having
# been applied first.
replace_missing <- function(ko_df){

    ko_df[ko_df == 0] <- NA
    row_min <- unname(apply(ko_df, 1, FUN = min, na.rm = TRUE))

    # Fill one run (column) at a time instead of one protein at a time
    for (run in names(ko_df)){
        missing <- is.na(ko_df[[run]])
        ko_df[[run]][missing] <- 0.9 * row_min[missing]
    }

    ko_df
}

# Normalization -
# rescale the control intensities by one global factor so that the mean of the
# per-run mean intensities of the controls equals that of the bait purifications.
#
# Args:
#   ko_df: control intensities (rows = proteins, columns = runs)
#   wt_df: bait purification intensities, same layout
#
# Returns ko_df multiplied by the scaling factor.
normalize_intensities <- function(ko_df, wt_df){

    bait_level <- mean(colMeans(wt_df))
    control_level <- mean(colMeans(ko_df))

    ko_df * (bait_level / control_level)
}

In [13]:
# Identify preys whose RNA-seq expression drops in the KO line while their APMS
# intensity in the KO (control) purifications does not drop accordingly.
#
# Args:
#   bait:   bait name; selects ../intermediate/saintq_config/<bait>/ and, through
#           its first letter, the RNA-seq column 'logFC.Treatment<letter>KO'.
#   level:  "proteins" or "peptides"; selects saintq_input_<level>.txt.
#   rnaseq: differential expression table containing that logFC column plus
#           'FDR' and 'hgnc_symbol'.
#
# Returns the merged intensity / RNA-seq rows that meet the blacklist criteria.
# Depends on the global `uniprot_map` and on replace_all_zeroes(),
# replace_missing() and normalize_intensities().
get_blacklist <- function(bait, level, rnaseq){
    
    # Read data and separate control/bait experiments
    # (the first two lines of the saintq input file are skipped before the header)
    dat = read.table(paste0('../intermediate/saintq_config/', bait, '/saintq_input_', level, '.txt'),
                 skip=2, sep = "\t", header=TRUE, stringsAsFactors=FALSE)

    # Drop the peptide sequence column, if present, so only Proteins and run columns remain
    dat$Sequence <- NULL
    
    # Peptide-level input: sum the peptide intensities of each protein
    if (level=="peptides"){
        dat = aggregate(. ~ Proteins, dat, sum)
    }
    
    row.names(dat) = dat$Proteins
    dat$Proteins <- NULL

    # Runs whose column name does not contain "control" are the bait (WT) purifications
    # NOTE(review): with a single matching column dat[, cols] collapses to a
    # vector and rowMeans() below would fail - confirm every bait has >1 run
    wt_names = names(dat)[which(! grepl("control", names(dat)))]
    wt = dat[,c(wt_names)]

    # Runs whose column name contains "control" are the KO controls
    ko_names = names(dat)[which(grepl("control", names(dat)))]
    ko = dat[,c(ko_names)]
    
    # Impute and rescale the control intensities only; wt is used as read
    ko = replace_all_zeroes(ko)
    ko = replace_missing(ko)
    ko = normalize_intensities(ko, wt)
    
    # Determine fold change of average intensities across WT and KO lines
    wt$wt_avg_intensity = rowMeans(wt)
    ko$ko_avg_intensity = rowMeans(ko)

    # by=0 merges on row names (protein ids), which become the Row.names column
    intensities = merge(wt, ko, by=0, all=TRUE)
    # log2(KO / WT): negative values mean lower intensity in the KO controls
    intensities$logFC_intensity = log2(intensities$ko_avg_intensity/intensities$wt_avg_intensity)
    intensities$Gene = uniprot_map$GeneSymbol[match(intensities$Row.names, uniprot_map$UniProt)]

    # Merge with RNAseq data and identify prey with lower expression but equal/higher intensity
    # (the merge keeps only genes present in both tables)
    colname = paste0('logFC.Treatment',substring(bait, 1, 1),'KO')
    intensities = merge(intensities, rnaseq[,c(colname, 'FDR','hgnc_symbol')], by.x = "Gene", by.y = "hgnc_symbol")
    # Criteria: RNA-seq logFC < 0, FDR < 0.05, and intensity log2FC above -0.5
    blacklist = intensities[which((intensities[,colname] < 0) & (intensities$FDR < 0.05) & 
                                  (intensities$logFC_intensity > -0.5)),]
    
    return(blacklist)
    
}

In [14]:
rnaseq = read.csv("../data/rnaseq/DifferentialExpressionResults_koStudy_Sigma.csv")

In [15]:
gata4_blacklist = get_blacklist("GATA4","proteins",rnaseq)
tbx5_blacklist = get_blacklist("TBX5", "peptides",rnaseq)

In [16]:
gata4_blacklist[which(gata4_blacklist$Gene %in% gata_genes),]
tbx5_blacklist[which(tbx5_blacklist$Gene %in% tbx5_genes),]

Unnamed: 0,Gene,Row.names,GATA4.1,GATA4.2,GATA4.3,wt_avg_intensity,control.1,control.2,control.3,control.4,ko_avg_intensity,logFC_intensity,logFC.TreatmentGKO,FDR
320,FRG1,Q14331,1822100,2240000,2420700,2160933,1748479,645070.8,1853053,2400367,1661743,-0.3789577,-0.3123602,4.64573e-11


Gene,Row.names,TBX5.1,TBX5.2,TBX5.3,wt_avg_intensity,control.1,control.2,control.3,ko_avg_intensity,logFC_intensity,logFC.TreatmentTKO,FDR


We remove one gene that was scored as an interactor of GATA4 but whose protein intensity decreased less than expected in the KO controls: FRG1. No interactors met the criteria for removal in the TBX5 interactome.

In [17]:
# Merge with bait information and filter genesets
gata_tab$GATA4_blacklist = ifelse(gata_tab$Gene %in% gata4_blacklist$Gene, "blacklisted", "not blacklisted")
tbx5_tab$TBX5_blacklist = ifelse(tbx5_tab$Gene %in% tbx5_blacklist$Gene, "blacklisted", "not blacklisted")

gata_genes = gata_genes[which(!gata_genes %in% gata4_blacklist$Gene)]
tbx5_genes = tbx5_genes[which(!tbx5_genes %in% tbx5_blacklist$Gene)]

length(gata_genes)
length(tbx5_genes)

## Filtering the interactomes based on cell type co-expression

From embryonic mouse scRNA-seq data$^{2}$, we identify the broad embryonic cell populations and specific myocardium cell types in which each bait protein is expressed. For the proteins that interact in the APMS assay, we aim to confirm that these interactions are biologically relevant, based on whether they are expressed in any of the cell types that the baits are observed in.

$^{2}$Soysa, T. Yvanka de, Sanjeev S. Ranade, Satoshi Okawa, Srikanth Ravichandran, Yu Huang, Hazel T. Salunga, Amelia Schricker, Antonio Del Sol, Casey A. Gifford, and Deepak Srivastava. 2019. “Single-Cell Analysis of Cardiogenesis Reveals Basis for Organ-Level Developmental Defects.” Nature 572 (7767): 120–24.

In [18]:
broad_expr = read.csv('../data/rnaseq/BroadPopulations_AverageExpression.csv', 
                      row.names=1, stringsAsFactors=FALSE)
myo_expr = read.csv('../data/rnaseq/Myocardium_AverageExpression.csv', 
                   row.names=1,stringsAsFactors=FALSE)

expr = cbind(broad_expr, myo_expr)

expr[which(row.names(expr) %in% c("Gata4","Tbx5")),]

Unnamed: 0,Endocardial.Endothelial,Epicardium,Lateral.Plate.Mesoderm,Multipotent.Progenitors,Myocardium,Neural.Crest,Paraxial.Mesoderm,Atrial,AVC,EMP,OFT,SV,Ventricle
Gata4,0.49680854,1.0402156,0.2585975,0.2660332,0.7435623,0.011926803,0.01003805,0.6810797,0.8954255,0.7898763,0.55845915,1.211324,0.5504972
Tbx5,0.04964018,0.4602037,0.1221794,0.2131582,0.6715903,0.005005027,0.002291352,0.970663,1.2059016,0.6790237,0.07208931,1.189701,0.4328445


We identify genes that were not expressed at any detectable level in any cell type in which the baits appear (a bait is considered present in a cell type when its expression is greater than 0.05 tpm), calling them "non-physiological interactors."

GATA4 cell types: Endocardial.Endothelial, Epicardium, Lateral.Plate.Mesoderm, Multipotent.Progenitors, Myocardium, Atrial, AVC, EMP, OFT, SV, Ventricle

TBX5 cell types: Epicardium, Lateral.Plate.Mesoderm, Multipotent.Progenitors, Myocardium, Atrial, AVC, EMP, SV, Ventricle

Both GATA4 and TBX5 were detected at tpm>0.05 in the above listed cell types, so we include any putative interactors expressed at any detectable level in one of these cell types. It should be noted that one could choose a stricter threshold or different filtering criteria in order to emphasize interactions occurring in particular cell types. 

In [19]:
# Convert from mouse to human gene symbols, as this scRNAseq data is from mice
library(biomaRt)

# Map mouse (MGI) gene symbols to human (HGNC) symbols through biomaRt's
# Ensembl marts.
#
# Args:
#   x: character vector of mouse MGI symbols.
#
# Returns the data frame produced by getLDS(); the code below reads its
# MGI.symbol and HGNC.symbol columns. Duplicate rows are kept
# (uniqueRows = FALSE).
mouse_to_human <- function(x){

    human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
    mouse <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")

    # FALSE is spelled out: `F` is an ordinary variable that can be reassigned
    getLDS(attributes = c("mgi_symbol"), filters = "mgi_symbol", values = x,
           mart = mouse, attributesL = c("hgnc_symbol"), martL = human,
           uniqueRows = FALSE)
}

human_genes = mouse_to_human(row.names(expr))
head(human_genes)

expr$Gene = human_genes$HGNC.symbol[match(row.names(expr), human_genes$MGI.symbol)]
head(expr[,c('Myocardium','Gene')])

# To fill missing data, we insert the uppercase version of mouse symbol
expr$Gene[is.na(expr$Gene)] <- toupper(row.names(expr)[is.na(expr$Gene)])
head(expr[,c('Myocardium','Gene')])

MGI.symbol,HGNC.symbol
mt-Nd5,MT-ND5
mt-Nd2,MT-ND2
mt-Nd4l,MT-ND4L
mt-Nd3,MT-ND3
mt-Nd6,MT-ND6
mt-Nd4,MT-ND4


Unnamed: 0,Myocardium,Gene
Xkr4,0.000338232,XKR4
Rp1,0.000440246,RP1
Sox17,0.004260234,SOX17
U6.1,0.000105483,
Mrpl15,1.848417926,MRPL15
Lypla1,0.765071909,LYPLA1


Unnamed: 0,Myocardium,Gene
Xkr4,0.000338232,XKR4
Rp1,0.000440246,RP1
Sox17,0.004260234,SOX17
U6.1,0.000105483,U6.1
Mrpl15,1.848417926,MRPL15
Lypla1,0.765071909,LYPLA1


In [20]:
expr = aggregate(.~Gene, data=expr, sum) # Some entries are for the same human gene - we sum the tpm values
row.names(expr) <- expr$Gene
expr$Gene <- NULL

gata4_cols = c("Endocardial.Endothelial","Epicardium","Lateral.Plate.Mesoderm","Multipotent.Progenitors",
              "Myocardium","Atrial","AVC","EMP","OFT","SV","Ventricle")
tbx5_cols = c("Epicardium","Lateral.Plate.Mesoderm","Multipotent.Progenitors",
              "Myocardium","Atrial","AVC","EMP","SV","Ventricle")


In [21]:
# Label every prey in `tab` by whether its gene is expressed in at least one of
# the given cell types.
#
# Args:
#   cols:      names of the expression columns (cell types) to consider.
#   tab:       interactor table with a Gene column.
#   expr_tab:  expression table with gene symbols as row names; defaults to the
#              global `expr`.
#   threshold: a gene counts as expressed in a cell type when its value is
#              strictly greater than this (default 0).
#
# Returns `tab` with a `coexpression` column: "co-expressed" when any selected
# cell type exceeds the threshold, "not co-expressed" when all of them are known
# and at or below it, and "" for genes absent from expr_tab.
add_phys_col <- function(cols, tab, expr_tab = expr, threshold = 0){

    # drop = FALSE keeps a single selected cell type two-dimensional
    vals <- as.matrix(expr_tab[, cols, drop = FALSE])

    expressed <- rowSums(vals > threshold, na.rm = TRUE) > 0
    # Only genes with no missing value and no value above the threshold
    silent <- rowSums(is.na(vals) | vals > threshold) == 0

    # rep() keeps the assignment valid for a table with zero rows
    tab$coexpression <- rep('', nrow(tab))
    tab$coexpression[tab$Gene %in% rownames(vals)[expressed]] <- "co-expressed"
    tab$coexpression[tab$Gene %in% rownames(vals)[silent]] <- "not co-expressed"

    tab
}

                      
gata_tab = add_phys_col(gata4_cols, gata_tab)
tbx5_tab = add_phys_col(tbx5_cols, tbx5_tab)

In [22]:
# Genes flagged as not co-expressed with the bait
nonphys_gata <- gata_tab$Gene[which(gata_tab$coexpression == "not co-expressed")]
nonphys_tbx5 <- tbx5_tab$Gene[which(tbx5_tab$coexpression == "not co-expressed")]

# Members of the current interactomes that carry that flag
removed_gata <- gata_genes[which(gata_genes %in% nonphys_gata)]
removed_tbx5 <- tbx5_genes[which(tbx5_genes %in% nonphys_tbx5)]

# Collapse to one line per bait; without collapse, paste() would print one
# string per removed gene, each repeating the prefix
print(paste("Removed from the GATA4 interactome:", paste(removed_gata, collapse = ", ")))
print(paste("Removed from the TBX5 interactome:", paste(removed_tbx5, collapse = ", ")))

[1] "Removed from the GATA4 interactome: "
[1] "Removed from the TBX5 interactome: "


In [23]:
gata_genes <- gata_genes[!gata_genes %in% nonphys_gata]
tbx5_genes <- tbx5_genes[!tbx5_genes %in% nonphys_tbx5]

# separate ambiguous complexes into component genes
# Entries are split on ';' with surrounding whitespace trimmed, so a complex
# written without a space after the separator is split too instead of being
# dropped by the ';' filter.
is_complex <- grepl(";", tbx5_genes)
components <- trimws(unlist(strsplit(tbx5_genes[is_complex], ";", fixed = TRUE)))
components <- components[nzchar(components)]

# Individual genes first, component genes of the complexes appended after them
tbx5_genes <- c(tbx5_genes[!is_complex], components)

cat("New interactome size after removing non-physiological interactions:\n")
cat(paste("GATA4:", length(gata_genes),'\n','TBX5:', length(tbx5_genes)))

New interactome size after removing non-physiological interactions:
GATA4: 205 
 TBX5: 77

In [24]:
# Save lists of the final interactomes to intermediate data folder
dir.create("../intermediate/interactome_lists", showWarnings = FALSE)

write.table(gata_genes, file = "../intermediate/interactome_lists/GATA4_genes.txt",
           row.names=FALSE, col.names=FALSE, quote=FALSE)
write.table(tbx5_genes, file = "../intermediate/interactome_lists/TBX5_genes.txt",
           row.names=FALSE, col.names=FALSE, quote=FALSE)

combined = unique(c(gata_genes, tbx5_genes))
write.table(combined, file = "../intermediate/interactome_lists/GATA4-TBX5_genes.txt",
           row.names=FALSE, col.names=FALSE, quote=FALSE)

## Add details and save out for manuscript

In [25]:
g4 = read.table("../intermediate/interactome_lists/GATA4_genes.txt", stringsAsFactors=FALSE)
t5 = read.table("../intermediate/interactome_lists/TBX5_genes.txt", stringsAsFactors=FALSE)

In [26]:
gata4_genes = g4$V1
tbx5_genes = t5$V1

In [27]:
head(gata_tab)

Unnamed: 0,Bait,Prey,X.Rep,AvgP,BFDR,Gene,Compartment,GATA4_blacklist,coexpression
3,GATA4,A4D1P6,3,1,0,WDR91,Nucleus,not blacklisted,co-expressed
6,GATA4,A6NHQ2,3,1,0,FBLL1,Nucleus,not blacklisted,co-expressed
8,GATA4,A7E2V4,3,1,0,ZSWIM8,Nucleus,not blacklisted,
14,GATA4,O00422,3,1,0,SAP18,,not blacklisted,co-expressed
23,GATA4,O14497,3,1,0,ARID1A,,not blacklisted,co-expressed
26,GATA4,O14646,3,1,0,CHD1,,not blacklisted,co-expressed


### Identify proteins shared between baits

In [28]:
shared = gata_genes[gata_genes %in% tbx5_genes]

gata_tab$GT_shared_interactor <- ifelse(gata_tab$Gene %in% shared, "shared", 'not shared')
tbx5_tab$GT_shared_interactor <- ifelse(tbx5_tab$Gene %in% shared, "shared", 'not shared')

length(shared)

### Create a column for whether the interaction is known in iRefIndex

In [29]:
ppi = read.table('../data/databases/mammalian_iRefIndex.txt',sep="\t",header=TRUE)

# Flag each prey as a known or unknown interactor of the bait.
#
# Args:
#   bait_uid:  UniProt id of the bait.
#   prey_list: vector of prey UniProt ids.
#   ppi_tab:   interaction table with uidA and uidB columns; defaults to the
#              global `ppi`.
#
# An interaction counts in either orientation: the table lists each pair in one
# direction only, so the bait can appear as uidA or as uidB.
# Returns a character vector aligned with prey_list holding
# 'known interactor' or 'unknown interactor'.
known_interaction <- function(bait_uid, prey_list, ppi_tab = ppi){

    bait_is_a <- which(ppi_tab$uidA == bait_uid)
    bait_is_b <- which(ppi_tab$uidB == bait_uid)

    # as.character() guards against id columns that were read as factors
    partners <- unique(c(as.character(ppi_tab$uidB[bait_is_a]),
                         as.character(ppi_tab$uidA[bait_is_b])))

    ifelse(prey_list %in% partners, 'known interactor', 'unknown interactor')
}

gata_tab$known_interaction = known_interaction("P43694", gata_tab$Prey)
tbx5_tab$known_interaction = known_interaction("Q99593", tbx5_tab$Prey)

In [30]:
table(gata_tab$known_interaction)


  known interactor unknown interactor 
                 1                234 

In [31]:
bait_uid = "P43694"
df = ppi[which(ppi$uidA == bait_uid),]
df

Unnamed: 0,uidA,uidB,aliasA,aliasB,interactionType
223,P43694,Q02363,GATA4,ID2,MI:0915(physical association)
235,P43694,Q02363,GATA4,ID2,MI:0407(direct interaction)
248,P43694,Q02363,GATA4,ID2,MI:0407(direct interaction)
98857,P43694,P08123,GATA4,COL1A2,psi-mi:MI:0915(physical association)
217540,P43694,P28482,GATA4,MAPK1,MI:0407(direct interaction)
217670,P43694,Q9Y2Y9,GATA4,KLF13,MI:0915(physical association)
217671,P43694,Q9Y2Y9,GATA4,KLF13,MI:0915(physical association)
217672,P43694,Q9Y2Y9,GATA4,KLF13,MI:0915(physical association)
246120,P43694,Q8IX07,GATA4,ZFPM1,psi-mi:MI:0915(physical association)


### Create a column for whether the gene has been identified as a CHD risk gene previously

In [32]:
known = read.table("../data/databases/known_CHD_genes.txt", stringsAsFactors = FALSE)
known_genes = known$V1

gata_tab$known_CHD_gene <- ifelse(gata_tab$Gene %in% known_genes, "known CHD", 'not known CHD')
tbx5_tab$known_CHD_gene <- ifelse(tbx5_tab$Gene %in% known_genes, "known CHD", 'not known CHD')

head(gata_tab)

Unnamed: 0,Bait,Prey,X.Rep,AvgP,BFDR,Gene,Compartment,GATA4_blacklist,coexpression,GT_shared_interactor,known_interaction,known_CHD_gene
3,GATA4,A4D1P6,3,1,0,WDR91,Nucleus,not blacklisted,co-expressed,not shared,unknown interactor,not known CHD
6,GATA4,A6NHQ2,3,1,0,FBLL1,Nucleus,not blacklisted,co-expressed,not shared,unknown interactor,not known CHD
8,GATA4,A7E2V4,3,1,0,ZSWIM8,Nucleus,not blacklisted,,not shared,unknown interactor,not known CHD
14,GATA4,O00422,3,1,0,SAP18,,not blacklisted,co-expressed,not shared,unknown interactor,not known CHD
23,GATA4,O14497,3,1,0,ARID1A,,not blacklisted,co-expressed,not shared,unknown interactor,known CHD
26,GATA4,O14646,3,1,0,CHD1,,not blacklisted,co-expressed,not shared,unknown interactor,not known CHD


In [33]:
# Add column to indicate whether we consider it part of the final interactome
gata_tab$GT.PPI <- ifelse(gata_tab$Gene %in% gata_genes, 'GT.PPI','non-physiological interaction')
tbx5_tab$GT.PPI <- ifelse(tbx5_tab$Gene %in% tbx5_genes, 'GT.PPI','non-physiological interaction')

gata_tab = unique(gata_tab)
tbx5_tab = unique(tbx5_tab)

nrow(gata_tab)

In [34]:
write.csv(gata_tab, "../manuscript/tables/GATA4_interactor_data.csv", row.names=F, quote=F)
write.csv(tbx5_tab, "../manuscript/tables/TBX5_interactor_data.csv", row.names=F, quote=F)

## Repeat with HEK293T dataset

### Create GATA4 HEK interactome

In [35]:
# Score the HEK293T GATA4 purification with SAINTq at the protein level.
bait <- 'HEK-GATA4'
input_level <- "protein"
annot_file <- paste0("../data/apms/", bait, "_annotation.txt")
ev_file <- paste0("../data/apms/", bait, "_evidence.txt")
config_dir <- paste0("../intermediate/saintq_config/", bait)

# recursive = TRUE also creates the saintq_config parent directory when it
# does not exist yet
if (!dir.exists(config_dir)){
    dir.create(config_dir, recursive = TRUE)
}

create_artMSfiles(annot_file, ev_file, config_dir, bait)


input_file <- "saintq_input_proteins.txt"
param_file <- paste0(config_dir, "/", bait, "_parameters.txt")

write_params(input_file, input_level, param_file)


# out_path is resolved inside config_dir, because run_saintq() changes into
# that directory before renaming the score file
out_path <- paste0(bait, "_saintq_results.txt")
run_saintq(param_file, input_file, config_dir, out_path)

>> GENERATING A SAINTq INPUT FILE 
>> CHECKING THE keys FILE FIRST 
>> MERGING FILES 
--- Making the <Leading.Razor.Protein> the <Proteins> column 
--- ALL peptides with intensities will be used to generate the 
      saintq input file (indepependently of the number of spectral counts 
--- Removing empty protein ids (if any) 
--- Removing Protein Groups (if any) 
--- Removing contaminants
-- CONTAMINANTS CON__|REV__ REMOVED 
>> NEW 4 FILES CREATED:
	- saintq-config-peptides
	- saintq-config-proteins
	- saintq_input_peptides.txt
	- saintq_input_proteins.txt
>> DONE! 


### Create TBX5 HEK interactome

In [36]:
# Score the HEK293T TBX5 purification with SAINTq at the peptide level.
bait <- 'HEK-TBX5'
input_level <- "peptide"
annot_file <- paste0("../data/apms/", bait, "_annotation.txt")
ev_file <- paste0("../data/apms/", bait, "_evidence.txt")
config_dir <- paste0("../intermediate/saintq_config/", bait)

# recursive = TRUE also creates the saintq_config parent directory when it
# does not exist yet
if (!dir.exists(config_dir)){
    dir.create(config_dir, recursive = TRUE)
}

create_artMSfiles(annot_file, ev_file, config_dir, bait)


input_file <- "saintq_input_peptides.txt"
param_file <- paste0(config_dir, "/", bait, "_parameters.txt")

write_params(input_file, input_level, param_file)

# out_path is resolved inside config_dir, because run_saintq() changes into
# that directory before renaming the score file
out_path <- paste0(bait, "_saintq_results.txt")
run_saintq(param_file, input_file, config_dir, out_path)

>> GENERATING A SAINTq INPUT FILE 
>> CHECKING THE keys FILE FIRST 
>> MERGING FILES 
--- Making the <Leading.Razor.Protein> the <Proteins> column 
--- ALL peptides with intensities will be used to generate the 
      saintq input file (indepependently of the number of spectral counts 
--- Removing empty protein ids (if any) 
--- Removing Protein Groups (if any) 
--- Removing contaminants
-- CONTAMINANTS CON__|REV__ REMOVED 
>> NEW 4 FILES CREATED:
	- saintq-config-peptides
	- saintq-config-proteins
	- saintq_input_peptides.txt
	- saintq_input_proteins.txt
>> DONE! 


#### Filter by compartment and cell-type co-expression

In [37]:
hgata_tab = summarize_results("HEK-GATA4", 0.001)
htbx5_tab = summarize_results("HEK-TBX5", 0.05)

compartments = read.table("../data/databases/BINGO_nuclear_localization.txt",
                         sep = "\t", header = TRUE, stringsAsFactors = FALSE)

hgata_tab$Compartment = compartments$subcellular.location[match(hgata_tab$Prey, compartments$UniprotID)]
htbx5_tab$Compartment = compartments$subcellular.location[match(htbx5_tab$Prey, compartments$UniprotID)]

nuclear_s = c("nucleus","Nucleus",NA) 
hgata_genes = hgata_tab$Gene[which(hgata_tab$Compartment %in% nuclear_s)]
htbx5_genes = htbx5_tab$Gene[which(htbx5_tab$Compartment %in% nuclear_s)]

In [38]:
# Save out list
write.table(hgata_genes, file = "../intermediate/interactome_lists/HEK-GATA4_genes.txt",
           row.names=FALSE, col.names=FALSE, quote=FALSE)
write.table(htbx5_genes, file = "../intermediate/interactome_lists/HEK-TBX5_genes.txt",
           row.names=FALSE, col.names=FALSE, quote=FALSE)

hcombined = unique(c(hgata_genes, htbx5_genes))
write.table(hcombined, file = "../intermediate/interactome_lists/HEKGATA4-HEKTBX5_genes.txt",
           row.names=FALSE, col.names=FALSE, quote=FALSE)

In [39]:
# Write out data
# Keep only preys with a usable gene symbol: rows whose Gene is missing (NA) or
# is the literal string "NA" are dropped from both HEK tables
has_gene_hgata <- !is.na(hgata_tab$Gene) & hgata_tab$Gene != "NA"
hgata_tab <- hgata_tab[which(has_gene_hgata), ]

has_gene_htbx5 <- !is.na(htbx5_tab$Gene) & htbx5_tab$Gene != "NA"
htbx5_tab <- htbx5_tab[which(has_gene_htbx5), ]


In [40]:
write.csv(hgata_tab, "../manuscript/tables/HEK-GATA4_interactor_data.csv", quote=F, row.names=F)
write.csv(htbx5_tab, "../manuscript/tables/HEK-TBX5_interactor_data.csv",quote=F, row.names=F)

In [41]:
hgata_tab <- add_phys_col(gata4_cols, hgata_tab)
htbx5_tab <- add_phys_col(tbx5_cols, htbx5_tab)

# add_phys_col() writes its label to the `coexpression` column with the value
# "not co-expressed". Filtering on any other column name selects nothing and
# leaves the interactomes unfiltered.
nonphys_hgata <- hgata_tab$Gene[which(hgata_tab$coexpression == "not co-expressed")]
nonphys_htbx5 <- htbx5_tab$Gene[which(htbx5_tab$coexpression == "not co-expressed")]

print("Old interactome size: ")
print(length(hgata_genes))
print(length(htbx5_genes))

hgata_genes <- hgata_genes[which(!hgata_genes %in% nonphys_hgata)]
htbx5_genes <- htbx5_genes[which(!htbx5_genes %in% nonphys_htbx5)]

print("New interactome size after removing non-physiological interactions: ")
print(length(hgata_genes))
print(length(htbx5_genes))

[1] "Old interactome size: "
[1] 142
[1] 90
[1] "New interactome size after removing non-physiological interactions: "
[1] 142
[1] 90


In [42]:
# Save out list
write.table(hgata_genes, file = "../intermediate/interactome_lists/HEK-GATA4_genes.txt",
           row.names=FALSE, col.names=FALSE, quote=FALSE)
write.table(htbx5_genes, file = "../intermediate/interactome_lists/HEK-TBX5_genes.txt",
           row.names=FALSE, col.names=FALSE, quote=FALSE)

hcombined = unique(c(hgata_genes, htbx5_genes))
write.table(hcombined, file = "../intermediate/interactome_lists/HEKGATA4-HEKTBX5_genes.txt",
           row.names=FALSE, col.names=FALSE, quote=FALSE)

# Literature-based interactomes

We select physical interactors of GATA4 and TBX5 as listed in the BIOGRID, STRING, and iRefIndex databases.

In [17]:
# iRefIndex
# Load the iRefIndex table (tab-separated, first line is the header).
# TRUE/FALSE are spelled out because T and F can be reassigned.
iref <- read.table("../data/databases/mammalian_iRefIndex.txt", sep = "\t",
                   stringsAsFactors = FALSE, header = TRUE)

# Specify physical interactions: only these interactionType labels are kept
physical <- c("MI:0407(direct interaction)",
              "psi-mi:MI:0407(direct interaction)",
              "MI:0218(physical interaction)")
# %in% never yields NA, so the logical mask can index rows directly
iref <- iref[iref$interactionType %in% physical, ]

# Rows where either partner alias is the bait; which() drops rows whose
# comparison evaluates to NA (e.g. a missing alias)
iref_gata <- iref[which(iref$aliasA == "GATA4" | iref$aliasB == "GATA4"), ]
iref_tbx5 <- iref[which(iref$aliasA == "TBX5" | iref$aliasB == "TBX5"), ]

# Display the matching rows in the notebook
iref_gata
iref_tbx5

# Unique gene symbols from both alias columns (this includes the bait itself
# whenever at least one row matched)
iref_gata_genes <- unique(c(iref_gata$aliasA, iref_gata$aliasB))
iref_tbx5_genes <- unique(c(iref_tbx5$aliasA, iref_tbx5$aliasB))

Unnamed: 0,uidA,uidB,aliasA,aliasB,interactionType
235,P43694,Q02363,GATA4,ID2,MI:0407(direct interaction)
248,P43694,Q02363,GATA4,ID2,MI:0407(direct interaction)
2092,A0A024R5A6,P43694,ZBTB3,GATA4,MI:0407(direct interaction)
108671,Q8WW38,P43694,ZFPM2,GATA4,MI:0407(direct interaction)
108675,Q8WW38,P43694,ZFPM2,GATA4,MI:0407(direct interaction)
108820,Q8WW38,P43694,ZFPM2,GATA4,MI:0407(direct interaction)
123836,Q92833,P43694,JARID2,GATA4,MI:0407(direct interaction)
123839,Q92833,P43694,JARID2,GATA4,MI:0407(direct interaction)
203713,Q14676,P43694,MDC1,GATA4,MI:0407(direct interaction)
217540,P43694,P28482,GATA4,MAPK1,MI:0407(direct interaction)


uidA,uidB,aliasA,aliasB,interactionType


In [18]:
# STRING database
## Files downloaded 22 April 2021 from https://string-db.org/ 
## Settings:
    # Species: H. sapiens 
    # Network type: physical network
    # Active interaction sources: Experiments
    # Minimum required interaction score: medium confidence (0.400)

# Read without header = TRUE, so columns get the default names V1, V2, ...
# TRUE/FALSE are spelled out because T and F can be reassigned.
string_gata <- read.table("../data/databases/DB-GATA4_string_interactions.tsv",
                          sep = "\t", stringsAsFactors = FALSE)
string_tbx5 <- read.table("../data/databases/DB-TBX5_string_interactions.tsv",
                          sep = "\t", stringsAsFactors = FALSE)

# Display the interaction tables in the notebook
string_gata
string_tbx5

# V1 and V2 hold the gene symbols of the two interaction partners; collect
# the unique symbols across both columns
string_gata_genes <- unique(c(string_gata$V1, string_gata$V2))
string_tbx5_genes <- unique(c(string_tbx5$V1, string_tbx5$V2))

V1,V2,V3,V4,V5,V6,V7,V8,V9
GATA4,HEY2,9606.ENSP00000334458,9606.ENSP00000357348,0,0.412,0,0,0.412
GATA4,NKX2-3,9606.ENSP00000334458,9606.ENSP00000342828,0,0.412,0,0,0.412
GATA4,ZFPM2,9606.ENSP00000334458,9606.ENSP00000384179,0,0.472,0,0,0.472


V1,V2,V3,V4,V5,V6,V7,V8,V9
NKX2-3,TBX5,9606.ENSP00000342828,9606.ENSP00000309913,0,0.747,0,0,0.747
NKX2-5,TBX5,9606.ENSP00000327758,9606.ENSP00000309913,0,0.809,0,0,0.809
SMARCA4,TBX5,9606.ENSP00000395654,9606.ENSP00000309913,0,0.575,0,0,0.575


In [19]:
# BIOGRID
## Files downloaded 22 April 2021 from https://thebiogrid.org/
## Settings:
    # Species: H. sapiens and M. musculus

# Read without header = TRUE, so columns get the default names V1, V2, ...
# TRUE/FALSE are spelled out because T and F can be reassigned.
biogrid_gata <- read.table("../data/databases/BIOGRID-GATA4-4.3.196.tab3.txt",
                           sep = "\t", stringsAsFactors = FALSE)
biogrid_tbx5 <- read.table("../data/databases/BIOGRID-TBX5-4.3.196.tab3.txt",
                           sep = "\t", stringsAsFactors = FALSE)

# Display the interaction tables in the notebook
biogrid_gata
biogrid_tbx5

# V8 and V9 hold the gene symbols of the two interaction partners. Symbols
# are upper-cased before de-duplication so mixed-case entries (e.g. "Wwtr1")
# are normalised -- presumably to align mouse symbols with human ones.
biogrid_gata_genes <- unique(toupper(c(biogrid_gata$V8, biogrid_gata$V9)))
biogrid_tbx5_genes <- unique(toupper(c(biogrid_tbx5$V8, biogrid_tbx5$V9)))

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37
244617,9656,2626,115014,108896,DAAP-285E11.7,-,MDC1,GATA4,NFBD1,...,B3KUF4,NP_002043,-,-,-,-,-,-,Homo sapiens,Homo sapiens
256287,2626,1482,108896,107864,-,-,GATA4,NKX2-5,ASD2|TACHD|TOF|VSD1,...,-,NP_001159647|NP_001159648|NP_004378,-,-,-,-,-,-,Homo sapiens,Homo sapiens
256288,6910,2626,112773,108896,-,-,TBX5,GATA4,HOS,...,B3KUF4,NP_002043,-,-,-,-,-,-,Homo sapiens,Homo sapiens
256289,2626,6910,108896,112773,-,-,GATA4,TBX5,ASD2|TACHD|TOF|VSD1,...,-,NP_542448|NP_000183|NP_852259,-,-,-,-,-,-,Homo sapiens,Homo sapiens
262585,2626,9464,108896,114850,-,-,GATA4,HAND2,ASD2|TACHD|TOF|VSD1,...,-,NP_068808,-,-,-,-,-,-,Homo sapiens,Homo sapiens
262586,2626,9464,108896,114850,-,-,GATA4,HAND2,ASD2|TACHD|TOF|VSD1,...,-,NP_068808,-,-,-,-,-,-,Homo sapiens,Homo sapiens
277082,2516,2626,108792,108896,RP11-101K10.1,-,NR5A1,GATA4,AD4BP|ELP|FTZ1|FTZF1|POF7|SF-1|SF1|SPGF8|SRXY3,...,B3KUF4,NP_002043,-,-,-,-,-,-,Homo sapiens,Homo sapiens
277278,2626,23414,108896,116986,-,-,GATA4,ZFPM2,ASD2|TACHD|TOF|VSD1,...,Q9NPQ0,NP_036214,-,-,-,-,-,-,Homo sapiens,Homo sapiens
277279,2626,23414,108896,116986,-,-,GATA4,ZFPM2,ASD2|TACHD|TOF|VSD1,...,Q9NPQ0,NP_036214,-,-,-,-,-,-,Homo sapiens,Homo sapiens
277280,2626,23414,108896,116986,-,-,GATA4,ZFPM2,ASD2|TACHD|TOF|VSD1,...,Q9NPQ0,NP_036214,-,-,-,-,-,-,Homo sapiens,Homo sapiens


V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37
256288,6910,2626,112773,108896,-,-,TBX5,GATA4,HOS,...,B3KUF4,NP_002043,-,-,-,-,-,-,Homo sapiens,Homo sapiens
256289,2626,6910,108896,112773,-,-,GATA4,TBX5,ASD2|TACHD|TOF|VSD1,...,-,NP_542448|NP_000183|NP_852259,-,-,-,-,-,-,Homo sapiens,Homo sapiens
256290,1482,6910,107864,112773,-,-,NKX2-5,TBX5,CHNG5|CSX|CSX1|HLHS2|NKX2.5|NKX2E|NKX4-1|VSD3,...,-,NP_542448|NP_000183|NP_852259,-,-,-,-,-,-,Homo sapiens,Homo sapiens
281518,6910,1482,112773,107864,-,-,TBX5,NKX2-5,HOS,...,-,NP_001159647|NP_001159648|NP_004378,-,-,-,-,-,-,Homo sapiens,Homo sapiens
281520,1482,6910,107864,112773,-,-,NKX2-5,TBX5,CHNG5|CSX|CSX1|HLHS2|NKX2.5|NKX2E|NKX4-1|VSD3,...,-,NP_542448|NP_000183|NP_852259,-,-,-,-,-,-,Homo sapiens,Homo sapiens
505310,10524,6910,115779,112773,-,-,KAT5,TBX5,ESA1|HTATIP|HTATIP1|PLIP|TIP|TIP60|ZC2HC5|cPLA2,...,-,NP_542448|NP_000183|NP_852259,-,-,-,-,-,-,Homo sapiens,Homo sapiens
673871,6910,7403,112773,113246,-,RP13-886N14.3,TBX5,KDM6A,HOS,...,Q59HG3|Q86TD1|B4E0L8|B7ZKN6|E1U0S6|B7ZKN1|B7ZKN5,NP_066963|NP_001278350|NP_001278344|NP_001278345|NP_001278346|NP_001278347,-,-,-,-,-,-,Homo sapiens,Homo sapiens
673877,7403,6910,113246,112773,RP13-886N14.3,-,KDM6A,TBX5,KABUK2|UTX|bA386N14.2,...,-,NP_542448|NP_000183|NP_852259,-,-,-,-,-,-,Homo sapiens,Homo sapiens
673880,6910,6597,112773,112481,-,-,TBX5,SMARCA4,HOS,...,A7E2E1|Q9HBD4|B3KNW7,NP_003063|NP_001122319|NP_001122318|NP_001122317|NP_001122316|NP_001122320|NP_001122321|NP_001361386,-,-,-,-,-,-,Homo sapiens,Homo sapiens
1053994,6910,97064,112773,220584,-,-,TBX5,Wwtr1,HOS,...,-,NP_598545|NP_001161753,-,-,-,-,-,-,Homo sapiens,Mus musculus


In [20]:
# Merge the per-database gene lists (iRefIndex, STRING, BIOGRID) into one
# de-duplicated literature-based list per bait.
db_gata4 <- unique(c(
    iref_gata_genes,
    string_gata_genes,
    biogrid_gata_genes
))
db_tbx5 <- unique(c(
    iref_tbx5_genes,
    string_tbx5_genes,
    biogrid_tbx5_genes
))

In [24]:
# Save the literature-based gene lists as plain text, one gene per line
# (no header, no row names, no quoting).
write.table(db_gata4, "../intermediate/interactome_lists/DB-GATA4_genes.txt",
            row.names = FALSE, col.names = FALSE, quote = FALSE)
# The TBX5 list goes to its own file. It was previously written to
# DB-GATA4_genes.txt, which overwrote the GATA4 list saved just above.
write.table(db_tbx5, "../intermediate/interactome_lists/DB-TBX5_genes.txt",
            row.names = FALSE, col.names = FALSE, quote = FALSE)