In [1]:
import argparse
import os
import re
import shutil
import subprocess
import sys

In [2]:
# Append JASPAR-profile-inference to path
# NOTE: the directory is resolved relative to the current working directory,
# so the notebook has to be started from the folder that contains the
# "JASPAR-profile-inference" checkout.
jaspar_dir = os.path.abspath("./JASPAR-profile-inference")
# Insert at position 1, i.e. right after the first entry of sys.path
sys.path.insert(1, jaspar_dir)
# Bare expression: shows the resulting module search path as the cell output
sys.path

['/media/ofornes/Storage/Work/GRECO',
 '/media/ofornes/Storage/Work/GRECO/JASPAR-profile-inference',
 '/home/ofornes/Anaconda3/lib/python38.zip',
 '/home/ofornes/Anaconda3/lib/python3.8',
 '/home/ofornes/Anaconda3/lib/python3.8/lib-dynload',
 '',
 '/home/ofornes/Anaconda3/lib/python3.8/site-packages',
 '/home/ofornes/Anaconda3/lib/python3.8/site-packages/IPython/extensions',
 '/home/ofornes/.ipython']

In [3]:
# Import from JASPAR-profile-inference
from __init__ import Jglobals

In [4]:
#-------------#
# Class       #
#-------------#

class TF(object):
    """
    Record holding the identifiers of one transcription factor (TF) together
    with the experiment/profile identifiers collected for it from the
    different in vivo and in vitro resources.

    Printing a TF (str() or repr()) yields a single tab-separated line with
    21 columns; set-valued attributes are written as sorted, ";"-joined lists.
    """

    def __init__(self, gene_name, species):
        """
        @input:
        gene_name = gene name of the TF
        species = species the TF belongs to
        """

        self.gene_name = gene_name
        self.species = species
        self.uniacc = ""
        self.unientry = ""
        self.geneid = ""
        self.status = ""
        self.sequence = ""
        self.family = "Unknown"
        self.cluster_num = ""
        self.jaspar_id = ""
        self.hocomoco_id = set()

        # In vivo
        self.chip_atlas = set()
        self.cistromedb = set()
        self.gtrd = set()
        self.remap = set()
        self.dap_seq = set()

        # In vitro
        self.ht_selex = set()
        self.cisbp = set()
        self.uniprobe = set()
        self.smile_seq = set()

        # Hidden variables (for internal use only)
        self._uniaccs = set()
        self._unientries = set()
        self._geneids = set()
        self._sequences = set()

    @property
    def invivo(self):
        """
        Returns 1 if the TF has been profiled by in vivo methods,
        or 0 otherwise.
        @rtype = {int}
        """

        # For simplicity, ChIP-seq ~ ChIP-exo ~ DAP-seq
        if self.chip_atlas or self.cistromedb or self.gtrd or self.dap_seq or self.remap:
            return(1)

        return(0)

    @property
    def invitro(self):
        """
        Returns the number of different experimental methods by
        which a TF has been profiled in vitro.
        @rtype = {int}
        """

        n = 0

        if self.ht_selex:
            n += 1
        # i.e. PBM (CIS-BP and UniPROBE count as one and the same method)
        if self.cisbp or self.uniprobe:
            n += 1
        if self.smile_seq:
            n += 1

        return(n)

    @property
    def evidence(self):
        """
        Returns the amount of evidence associated with a TF, i.e. the sum
        of the in vivo flag and the number of in vitro methods.
        @rtype = {int}
        """
        return(self.invivo+self.invitro)

    def _columns(self):
        """
        Returns the values of the 21 output columns, in output order.
        Single source of truth shared by __str__ and __repr__ so that the
        two representations cannot drift apart.
        @rtype = {list}
        """

        columns = [
            self.gene_name,
            self.geneid,
            self.species,
            self.uniacc,
            self.unientry,
            self.status,
            self.sequence,
            self.family,
            self.cluster_num,
            self.evidence,
            self.jaspar_id,
        ]

        # Set-valued attributes are sorted to make the output deterministic
        for ids in (self.hocomoco_id, self.chip_atlas, self.cistromedb,
                    self.gtrd, self.remap, self.dap_seq, self.ht_selex,
                    self.cisbp, self.uniprobe, self.smile_seq):
            columns.append(";".join(sorted(ids)))

        return(columns)

    def __str__(self):
        """
        Returns the TF as one tab-separated line.
        @rtype = {str}
        """

        # "{}".format(...) applies the same formatting as the original template
        return("\t".join("{}".format(column) for column in self._columns()))

    def __repr__(self):
        """
        Same tab-separated line as __str__.
        @rtype = {str}
        """

        return(self.__str__())

In [5]:
#-------------#
# Paths       #
#-------------#

# All paths are relative to the directory the notebook is run from.

# Profile database (JASPAR-formatted HOCOMOCO motifs)
hocomoco_file = "./Data/Databases/HOCOMOCO/HOCOMOCOv11_full_jaspar_format.txt"
# ChIP-seq experiment tables
chip_atlas_file = "./Data/Experiments/ChIP-seq.ChIP-Atlas.tsv"
cistromedb_file = "./Data/Experiments/ChIP-seq.CistromeDB.tsv"
gtrd_file = "./Data/Experiments/ChIP-seq.GTRD.tsv"
remap_file = "./Data/Experiments/ChIP-seq.ReMap2020.tsv"
# DAP-seq experiment table
dap_seq_file = "./Data/Experiments/DAP-seq.PMID:27203113.tsv"
# HT-SELEX experiment tables (one per publication; the PMID is parsed from the file name)
ht_selex_files = ["./Data/Experiments/HT-SELEX.PMID:23332764.tsv",
                  "./Data/Experiments/HT-SELEX.PMID:28473536.tsv"]
# CIS-BP: list of PWM identifiers (txt) and the PBM experiment table (tsv)
cisbp_txt_file = "./Data/Databases/CisBP-2.0/PWMs.txt"
cisbp_tsv_file = "./Data/Experiments/PBM.CisBP-2.0.tsv"
# UniPROBE PBM experiment table
uniprobe_file = "./Data/Experiments/PBM.UniPROBE.tsv"
# NOTE(review): unlike the other names this one has no "_file" suffix and
# shares its name with the TF.smile_seq attribute; it is a file path.
smile_seq = "./Data/Experiments/SMiLE-seq.PMID:28092692.tsv"

In [8]:
#-------------#
# Parse Data  #
#-------------#

# Collect one TF object per row of the parsed TF table
tfs = set()

for line in Jglobals.parse_tsv_file("./Data/Parsed/TFs.tab.gz"):

    # Columns 0 and 1: gene name and species
    tf = TF(line[0], line[1])

    # Columns 2-5 may hold several ";"-separated values: the first one
    # becomes the primary value, the complete set goes to the hidden attribute
    uniaccs = line[2].split(";")
    tf.uniacc = uniaccs[0]
    tf._uniaccs.update(uniaccs)

    unientries = line[3].split(";")
    tf.unientry = unientries[0]
    tf._unientries.update(unientries)

    # The Entrez Gene ID is cast to str before splitting
    geneids = str(line[4]).split(";")
    tf.geneid = geneids[0]
    tf._geneids.update(geneids)

    sequences = line[5].split(";")
    tf.sequence = sequences[0]
    tf._sequences.update(sequences)

    # Column 6: status (i.e. reviewed or not); column 7: family
    tf.status, tf.family = line[6], line[7]

    # Column 8: JASPAR id, only kept when it is a string
    jaspar_id = line[8]
    if isinstance(jaspar_id, str):
        tf.jaspar_id = jaspar_id

    tfs.add(tf)

# Show one (arbitrary) TF as a sanity check
print(next(iter(tfs)))

T26I12.90	824687	Arabidopsis thaliana	Q9M3C9	Q9M3C9_ARATH	Unreviewed	MSPPSTIAYVLPPGFKFVPNDEEVIHCYLKPYSDGNTNVLLHVPIHLVNIYESNPQTLSEEFQKGNDKEWFIITERNKVDQGLSQTKRVGYGAKRQKRVDTNGGYWHATVAAQKINAGDGVVRNKRPLAYYVGKPSEGVKTDWLMQEYSLDHSSHNNDKDYTLCKIYLTPQATKMNKEVGEEKKKQKKGEAVVSVAPVEALEEQLPCNVEYHQPLAPLDSCQPQPHDLAYQQQQFCPGPLDSYQPQPHDMENQQPHNEKLKKEEDVEQLDLHQPDQGKGC	NAC/NAM		0											


In [9]:
#-------------#
# HOCOMOCO    #
#-------------#

#   678 Homo sapiens
#   451 Mus musculus

# SMCA5_MOUSE # Not a TF: SWI/SNF-related matrix-associated actin-dependent regulator of chromatin subfamily A member 5
# FUBP1_MOUSE # Not a TF: Far upstream element-binding protein 1
# BRCA1_MOUSE # Not a TF: Breast cancer type 1 susceptibility protein homolog
# EVI1_MOUSE  # Not a TF: Histone-lysine N-methyltransferase MECOM
# TAF1_MOUSE  # Not a TF: Transcription initiation factor TFIID subunit 1
# BRAC_MOUSE  # Not a valid UniProt Entry
# HLTF_MOUSE  # Not a TF: Helicase-like transcription factor
# TAF1_HUMAN  # Not a TF: Transcription initiation factor TFIID subunit 1
# HLTF_HUMAN  # Not a TF: Helicase-like transcription factor
# BRCA1_HUMAN # Not a TF: Breast cancer type 1 susceptibility protein
# SMCA5_HUMAN # Not a TF: SWI/SNF-related matrix-associated actin-dependent regulator of chromatin subfamily A member 5
# ZF64A_HUMAN # Not a valid UniProt Entry
# BRAC_HUMAN  # Not a valid UniProt Entry
# EVI1_HUMAN  # Not a TF: Histone-lysine N-methyltransferase MECOM
# FUBP1_HUMAN # Not a TF: Far upstream element-binding protein 1
# BPTF_HUMAN  # Not a TF: Nucleosome-remodeling factor subunit BPTF
# CENPB_HUMAN # Not a TF: Major centromere autoantigen B
# ZBT48_HUMAN # Not a valid UniProt Entry; should be TZAP_HUMAN

# Index the TFs once by UniProt entry instead of re-sorting and re-scanning
# the whole TF set for every header line. TFs are visited in gene-name order
# and setdefault keeps the first TF per entry, i.e. the same TF at which the
# per-line scan would stop.
hocomoco_index = {}
for tf in sorted(tfs, key=lambda x: x.gene_name):
    for unientry in tf._unientries:
        hocomoco_index.setdefault(unientry, tf)

# For each line...
for line in Jglobals.parse_file(hocomoco_file):

    # Only header lines (">...") carry the profile identifier
    if not line.startswith(">"):
        continue

    # Get unientry (raw string: "\w" is not a valid str escape sequence)
    m = re.search(r"(\w+_(HUMAN|MOUSE))", line)
    if m is None:
        raise ValueError("No HUMAN/MOUSE UniProt entry in HOCOMOCO header: %s" % line)
    unientry = m.group(1)

    # Assign the profile (header without the leading ">") to its TF, if any
    tf = hocomoco_index.get(unientry)
    if tf is not None:
        tf.hocomoco_id.add(line[1:])

In [10]:
# Summarize HOCOMOCO coverage: preview the first four TFs that have at least
# one profile, then report the number of such TFs and the number of profiles.
genes = feats = count = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    n_profiles = len(tf.hocomoco_id)
    if n_profiles == 0:
        continue
    genes += 1
    feats += n_profiles
    if count <= 3:
        print(tf.gene_name, tf.hocomoco_id)
        # Fourth hit: also print the full tab-separated record
        if count == 3:
            print(tf)
            print("...")
    count += 1
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

AHR {'AHR_HUMAN.H11MO.0.B'}
AIRE {'AIRE_HUMAN.H11MO.0.C'}
ALX1 {'ALX1_HUMAN.H11MO.0.B'}
ALX3 {'ALX3_HUMAN.H11MO.0.D'}
ALX3	257	Homo sapiens	O95076	ALX3_HUMAN	Reviewed	MDPEHCAPFRVGPAPGPYVASGDEPPGPQGTPAAAPHLHPAPPRGPRLTRFPACGPLEPYLPEPAKPPAKYLQDLGPGPALNGGHFYEGPAEAEEKTSKAASFPQLPLDCRGGPRDGPSNLQGSPGPCLASLHLPLSPGLPDSMELAKNKSKKRRNRTTFSTFQLEELEKVFQKTHYPDVYAREQLALRTDLTEARVQVWFQNRRAKWRKRERYGKIQEGRNPFTAAYDISVLPRTDSHPQLQNSLWASPGSGSPGGPCLVSPEGIPSPCMSPYSHPHGSVAGFMGVPAPSAAHPGIYSIHGFPPTLGGHSFEPSSDGDYKSPSLVSLRVKPKEPPGLLNWTT	Homeodomain		0	MA0634.1	ALX3_HUMAN.H11MO.0.D									
...
//
Total genes: 1057
Total feats: 1214


In [11]:
#-------------#
# ChIP-Atlas  #
#-------------#

#   155 ce10
#   234 dm3
#   975 hg19
#   723 mm9
#    52 rn6
#   135 sacCer3

# Genome assembly (as used by ChIP-Atlas) -> species name (as used by the TFs)
genomes = {
    "ce10": "Caenorhabditis elegans",
    "dm3": "Drosophila melanogaster",
    "hg19": "Homo sapiens",
    "mm9": "Mus musculus",
    "rn6": "Rattus norvegicus",
    "sacCer3": "Saccharomyces cerevisiae"
}

# Index the TFs once by (upper-cased gene name, species) instead of
# re-sorting and re-scanning the whole TF set for every line. TFs are visited
# in gene-name order and setdefault keeps the first TF per key, i.e. the same
# TF at which the per-line scan would stop.
chip_atlas_index = {}
for tf in sorted(tfs, key=lambda x: x.gene_name):
    chip_atlas_index.setdefault((tf.gene_name.upper(), tf.species), tf)

# For each line...
for line in Jglobals.parse_tsv_file(chip_atlas_file):

    # Initialize
    genome = line[0]
    antigen_class = line[1]
    antigen = line[2]
    experiment_ids = line[4]

    if antigen_class != "TFs and others":
        continue

    # A genome that is not listed above cannot belong to any TF: skip the
    # line rather than failing with a KeyError
    organism = genomes.get(genome)
    if organism is None:
        continue

    # Experiment ids are ","-separated
    tf = chip_atlas_index.get((antigen.upper(), organism))
    if tf is not None:
        tf.chip_atlas.update(experiment_ids.split(","))

In [12]:
# Summarize ChIP-Atlas coverage: preview the first four TFs with experiments,
# then report how many TFs are covered and by how many experiments in total.
genes, feats, count = 0, 0, 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    experiments = tf.chip_atlas
    if not experiments:
        continue
    genes, feats = genes + 1, feats + len(experiments)
    if count < 3:
        print(tf.gene_name, experiments)
    elif count == 3:
        # Fourth hit: print the ids and the full tab-separated record
        print(tf.gene_name, experiments)
        print(tf)
        print("...")
    count += 1
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ABF1 {'SRX3709366', 'SRX3544078', 'SRX3709365'}
ADNP2 {'SRX359980'}
AEBP2 {'SRX5426124', 'SRX6370210', 'SRX6370192', 'SRX5426125', 'SRX6370209', 'SRX6370191'}
AHCTF1 {'SRX359920'}
AHCTF1	25909	Homo sapiens	Q8WYP5	ELYS_HUMAN	Reviewed	MRDLRAQVTSGLLPFPEVTLQALGEDEITLESVLRGKFAAGKNGLACLACGPQLEVVNSITGERLSAYRFSGVNEQPPVVLAVKEFSWQKRTGLLIGLEETEGSVLCLYDLGISKVVKAVVLPGRVTAIEPIINHGGASASTQHLHPSLRWLFGVAAVVTDVGQILLVDLCLDDLSCNQNEVEASDLEVLTGIPAEVPHIRESVMRQGRHLCFQLVSPTGTAVSTLSYISRTNQLAVGFSDGYLALWNMKSMKREYYIQLESGQVPVYAVTFQEPENDPRNCCYLWAVQSTQDSEGDVLSLHLLQLAFGNRKCLASGQILYEGLEYCEERYTLDLTGGMFPLRGQTSNTKLLGCQSIEKFRSHGDREEGVNEALSPDTSVSVFTWQVNIYGQGKPSVYLGLFDINRWYHAQMPDSLRSGEYLHNCSYFALWSLESVVSRTSPHGILDILVHERSLNRGVPPSYPPPEQFFNPSTYNFDATCLLNSGVVHLTCTGFQKETLTFLKKSGPSLNELIPDGYNRCLVAGLLSPRFVDVQPSSLSQEEQLEAILSAAIQTSSLGLLTGYIRRWITEEQPNSATNLRFVLEWTWNKVVLTKEEFDRLCVPLFDGSCHFMDPQTIQSIQQCYLLLSNLNIVLSCFASEAREITERGLIDLSNKFVVSHLICQYAQVVLWFSHSGLLPEGIDDSVQLSRLCYNYPVIQNYYTSRRQKFERLSRGKWNPDCLMIDGLVSQLGERIEKLWKRDEGGTGKYPPASLHAVLDMYLLDGV

In [13]:
#-------------#
# CistromeDB  #
#-------------#

# 11348 Homo sapiens
#  9060 Mus musculus

# Index the TFs once by (upper-cased gene name, species) instead of
# re-sorting and re-scanning the whole TF set for every line. TFs are visited
# in gene-name order and setdefault keeps the first TF per key, i.e. the same
# TF at which the per-line scan would stop.
cistromedb_index = {}
for tf in sorted(tfs, key=lambda x: x.gene_name):
    cistromedb_index.setdefault((tf.gene_name.upper(), tf.species), tf)

# For each line...
for line in Jglobals.parse_tsv_file(cistromedb_file):

    # Initialize (the experiment id is stored as a string)
    experiment_id = str(line[0])
    species = line[1]
    gene = line[6]

    # Assign the experiment to the matching TF, if any
    tf = cistromedb_index.get((gene.upper(), species))
    if tf is not None:
        tf.cistromedb.add(experiment_id)

In [14]:
# Summarize CistromeDB coverage: preview the first four TFs with experiments,
# then report how many TFs are covered and by how many experiments in total.
genes = feats = count = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    n_experiments = len(tf.cistromedb)
    if n_experiments == 0:
        continue
    genes += 1
    feats += n_experiments
    if count <= 3:
        print(tf.gene_name, tf.cistromedb)
        # Fourth hit: also print the full tab-separated record
        if count == 3:
            print(tf)
            print("...")
    count += 1
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ADNP {'63582', '38734', '63581'}
ADNP2 {'42808'}
AEBP2 {'64454', '64455', '38782'}
AHCTF1 {'42748'}
AHCTF1	25909	Homo sapiens	Q8WYP5	ELYS_HUMAN	Reviewed	MRDLRAQVTSGLLPFPEVTLQALGEDEITLESVLRGKFAAGKNGLACLACGPQLEVVNSITGERLSAYRFSGVNEQPPVVLAVKEFSWQKRTGLLIGLEETEGSVLCLYDLGISKVVKAVVLPGRVTAIEPIINHGGASASTQHLHPSLRWLFGVAAVVTDVGQILLVDLCLDDLSCNQNEVEASDLEVLTGIPAEVPHIRESVMRQGRHLCFQLVSPTGTAVSTLSYISRTNQLAVGFSDGYLALWNMKSMKREYYIQLESGQVPVYAVTFQEPENDPRNCCYLWAVQSTQDSEGDVLSLHLLQLAFGNRKCLASGQILYEGLEYCEERYTLDLTGGMFPLRGQTSNTKLLGCQSIEKFRSHGDREEGVNEALSPDTSVSVFTWQVNIYGQGKPSVYLGLFDINRWYHAQMPDSLRSGEYLHNCSYFALWSLESVVSRTSPHGILDILVHERSLNRGVPPSYPPPEQFFNPSTYNFDATCLLNSGVVHLTCTGFQKETLTFLKKSGPSLNELIPDGYNRCLVAGLLSPRFVDVQPSSLSQEEQLEAILSAAIQTSSLGLLTGYIRRWITEEQPNSATNLRFVLEWTWNKVVLTKEEFDRLCVPLFDGSCHFMDPQTIQSIQQCYLLLSNLNIVLSCFASEAREITERGLIDLSNKFVVSHLICQYAQVVLWFSHSGLLPEGIDDSVQLSRLCYNYPVIQNYYTSRRQKFERLSRGKWNPDCLMIDGLVSQLGERIEKLWKRDEGGTGKYPPASLHAVLDMYLLDGVTEAAKHSITIYLLLDIMYSFPNKTDTPIESFPTVFAISWGQVKLIQGFWLIDHNDYESGLDLLFHPATAKPLSWQHSKII

In [15]:
#-------------#
# GTRD        #
#-------------#

#    71 Arabidopsis thaliana
#   213 Caenorhabditis elegans
#    11 Danio rerio
#   249 Drosophila melanogaster
#  1236 Homo sapiens
#   513 Mus musculus
#    12 Rattus norvegicus
#   137 Saccharomyces cerevisiae
#    32 Schizosaccharomyces pombe

# Index the TFs once by UniProt accession instead of re-sorting and
# re-scanning the whole TF set for every line. TFs are visited in gene-name
# order and setdefault keeps the first TF per accession, i.e. the same TF at
# which the per-line scan would stop.
gtrd_index = {}
for tf in sorted(tfs, key=lambda x: x.gene_name):
    for uniacc in tf._uniaccs:
        gtrd_index.setdefault(uniacc, tf)

# For each line...
for line in Jglobals.parse_tsv_file(gtrd_file):

    # Initialize
    experiment_id = line[0]
    uniacc = line[1]

    # Assign the experiment to the TF owning that accession, if any
    tf = gtrd_index.get(uniacc)
    if tf is not None:
        tf.gtrd.add(experiment_id)

In [16]:
# Summarize GTRD coverage: preview the first four TFs with experiments, then
# report how many TFs are covered and by how many experiments in total.
genes, feats, count = 0, 0, 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    experiments = tf.gtrd
    if not experiments:
        continue
    genes, feats = genes + 1, feats + len(experiments)
    if count < 3:
        print(tf.gene_name, experiments)
    elif count == 3:
        # Fourth hit: print the ids and the full tab-separated record
        print(tf.gene_name, experiments)
        print(tf)
        print("...")
    count += 1
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

11723 {'EXP043557'}
ABF4 {'EXP041418'}
ADD1 {'EXP044073'}
ADNP {'EXP039553'}
ADNP	23394	Homo sapiens	Q9H2P0	ADNP_HUMAN	Reviewed	MFQLPVNNLGSLRKARKTVKKILSDIGLEYCKEHIEDFKQFEPNDFYLKNTTWEDVGLWDPSLTKNQDYRTKPFCCSACPFSSKFFSAYKSHFRNVHSEDFENRILLNCPYCTFNADKKTLETHIKIFHAPNASAPSSSLSTFKDKNKNDGLKPKQADSVEQAVYYCKKCTYRDPLYEIVRKHIYREHFQHVAAPYIAKAGEKSLNGAVPLGSNAREESSIHCKRCLFMPKSYEALVQHVIEDHERIGYQVTAMIGHTNVVVPRSKPLMLIAPKPQDKKSMGLPPRIGSLASGNVRSLPSQQMVNRLSIPKPNLNSTGVNMMSSVHLQQNNYGVKSVGQGYSVGQSMRLGLGGNAPVSIPQQSQSVKQLLPSGNGRSYGLGSEQRSQAPARYSLQSANASSLSSGQLKSPSLSQSQASRVLGQSSSKPAAAATGPPPGNTSSTQKWKICTICNELFPENVYSVHFEKEHKAEKVPAVANYIMKIHNFTSKCLYCNRYLPTDTLLNHMLIHGLSCPYCRSTFNDVEKMAAHMRMVHIDEEMGPKTDSTLSFDLTLQQGSHTNIHLLVTTYNLRDAPAESVAYHAQNNPPVPPKPQPKVQEKADIPVKSSPQAAVPYKKDVGKTLCPLCFSILKGPISDALAHHLRERHQVIQTVHPVEKKLTYKCIHCLGVYTSNMTASTITLHLVHCRGVGKTQNGQDKTNAPSRLNQSPSLAPVKRTYEQMEFPLLKKRKLDDDSDSPSFFEEKPEEPVVLALDPKGHEDDSYEARKSFLTKYFNKQPYPTRREIEKLAASLWLWKSDIASHFSNKRKKCVRDCEKYKPGVLLGFNMKELNKVKHEMDFDAEWLFENHDEKDSRVNASKTADKKLNLGKED

In [17]:
#-------------#
# ReMap 2020  #
#-------------#

#   405 Arabidopsis thaliana
#  1135 Homo sapiens

# Index the TFs once by (upper-cased gene name, species) instead of
# re-sorting and re-scanning the whole TF set for every line. TFs are visited
# in gene-name order and setdefault keeps the first TF per key, i.e. the same
# TF at which the per-line scan would stop.
remap_index = {}
for tf in sorted(tfs, key=lambda x: x.gene_name):
    remap_index.setdefault((tf.gene_name.upper(), tf.species), tf)

# For each line...
for line in Jglobals.parse_tsv_file(remap_file):

    # Initialize
    gene = line[0]
    species = line[1]
    experiment_ids = line[2]

    # Experiment ids are ";"-separated
    tf = remap_index.get((gene.upper(), species))
    if tf is not None:
        tf.remap.update(experiment_ids.split(";"))

In [18]:
# Summarize ReMap coverage: preview the first four TFs with experiments, then
# report how many TFs are covered and by how many experiments in total.
genes = feats = count = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    n_experiments = len(tf.remap)
    if n_experiments == 0:
        continue
    genes += 1
    feats += n_experiments
    if count <= 3:
        print(tf.gene_name, tf.remap)
        # Fourth hit: also print the full tab-separated record
        if count == 3:
            print(tf)
            print("...")
    count += 1
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ABF1 {'GSE80564.ABF1.Col-0_seedling_3d-EtOH', 'GSE80564.ABF1.Col-0_seedling_3d-ABA'}
ABF3 {'GSE80564.ABF3.Col-0_seedling_3d-ABA', 'GSE80564.ABF3.Col-0_seedling_3d-EtOH'}
ABF4 {'GSE80564.ABF4.Col-0_seedling_3d-EtOH', 'GSE80564.ABF4.Col-0_seedling_3d-ABA'}
ABI5 {'GSE60141.ABI5.Col-0_leaves_tnt_col', 'GSE60142.ABI5.Col-0_seedling_36h-ypet', 'GSE60142.ABI5.Col-0_seedling_36h-wt'}
ABI5	818199.0	Arabidopsis thaliana	Q9SJN0	ABI5_ARATH	Reviewed	MVTRETKLTSEREVESSMAQARHNGGGGGENHPFTSLGRQSSIYSLTLDEFQHALCENGKNFGSMNMDEFLVSIWNAEENNNNQQQAAAAAGSHSVPANHNGFNNNNNNGGEGGVGVFSGGSRGNEDANNKRGIANESSLPRQGSLTLPAPLCRKTVDEVWSEIHRGGGSGNGGDSNGRSSSSNGQNNAQNGGETAARQPTFGEMTLEDFLVKAGVVREHPTNPKPNPNPNQNQNPSSVIPAAAQQQLYGVFQGTGDPSFPGQAMGVGDPSGYAKRTGGGGYQQAPPVQAGVCYGGGVGFGAGGQQMGMVGPLSPVSSDGLGHGQVDNIGGQYGVDMGGLRGRKRVVDGPVEKVVERRQRRMIKNRESAARSRARKQAYTVELEAELNQLKEENAQLKHALAELERKRKQQYFESLKSRAQPKLPKSNGRLRTLMRNPSCPL	bZIP		1	MA0931.1					GSE60141.ABI5.Col-0_leaves_tnt_col;GSE60142.ABI5.Col-0_seedling_36h-wt;GSE60142.ABI5.Col-0_seed

In [19]:
#-------------#
# DAP-seq     #
#-------------#

#   934 Arabidopsis thaliana
#     2 Zea mays

# Index the TFs once by (upper-cased gene name, species) instead of
# re-sorting and re-scanning the whole TF set for every line. Every TF
# sharing a key is kept, because each matching TF receives the run.
dap_seq_index = {}
for tf in tfs:
    dap_seq_index.setdefault((tf.gene_name.upper(), tf.species), []).append(tf)

# For each line...
for line in Jglobals.parse_tsv_file(dap_seq_file):

    # Skip the header line
    if line[0] == "AvgSpotLen":
        continue

    species = line[8]
    sra_run = line[9]
    gene = line[15]

    # Add the run to every matching TF
    for tf in dap_seq_index.get((gene.upper(), species), ()):
        tf.dap_seq.add(sra_run)

In [20]:
# Summarize DAP-seq coverage: preview the first four TFs with runs, then
# report how many TFs are covered and by how many runs in total.
genes, feats, count = 0, 0, 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    runs = tf.dap_seq
    if not runs:
        continue
    genes, feats = genes + 1, feats + len(runs)
    if count < 3:
        print(tf.gene_name, runs)
    elif count == 3:
        # Fourth hit: print the ids and the full tab-separated record
        print(tf.gene_name, runs)
        print(tf)
        print("...")
    count += 1
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ABF2 {'SRR2926839'}
ABI5 {'SRR2926840', 'SRR2926841'}
ABR1 {'SRR2926082', 'SRR2926081'}
AGL13 {'SRR2926463'}
AGL13	825284.0	Arabidopsis thaliana	Q38837	AGL13_ARATH	Reviewed	MGRGKVEVKRIENKITRQVTFSKRKSGLLKKAYELSVLCDAEVSLIIFSTGGKLYEFSNVGVGRTIERYYRCKDNLLDNDTLEDTQGLRQEVTKLKCKYESLLRTHRNLVGEDLEGMSIKELQTLERQLEGALSATRKQKTQVMMEQMEELRRKERELGDINNKLKLETEDHDFKGFQDLLLNPVLTAGCSTDFSLQSTHQNYISDCNLGYFLQIGFQQHYEQGEGSSVTKSNARSDAETNFVQ	MADS box		1	MA1204.1						SRR2926463				
...
//
Total genes: 290
Total feats: 487


In [21]:
#-------------#
# HT-SELEX    #
#-------------#

# HT-SELEX data only available for human and mouse: index those TFs once by
# (case-sensitive) gene name instead of re-sorting and re-scanning the whole
# TF set for every line. TFs are visited in gene-name order and setdefault
# keeps the first TF per name, i.e. the same TF at which the per-line scan
# would stop.
ht_selex_index = {}
for tf in sorted(tfs, key=lambda x: x.gene_name):
    if tf.species in ["Homo sapiens", "Mus musculus"]:
        ht_selex_index.setdefault(tf.gene_name, tf)

# For each file...
for file_name in ht_selex_files:

    # The PMID in the file name decides which column holds the SRA run
    # (raw string with escaped dots: "\d" is not a valid str escape sequence)
    pmid_match = re.search(r"HT-SELEX\.PMID:(\d+)\.tsv", file_name)
    if pmid_match is None:
        raise ValueError("Cannot get the PMID from HT-SELEX file name: %s" % file_name)
    pmid = int(pmid_match.group(1))
    sra_column = 14 if pmid == 23332764 else 19

    # For each line...
    for line in Jglobals.parse_tsv_file(file_name):

        # Skip the header line
        if line[0] == "Alias":
            continue

        # The gene name is the alphanumeric prefix before the first "_"
        alias_match = re.search(r"^([A-Za-z\d]+)_", line[0])
        if alias_match is None:
            continue

        gene_name = alias_match.group(1)
        sra_run = line[sra_column]

        # Assign the run to the matching human/mouse TF, if any
        tf = ht_selex_index.get(gene_name)
        if tf is not None:
            tf.ht_selex.add(sra_run)

In [22]:
# Summarize HT-SELEX coverage: preview the first four TFs with runs, then
# report how many TFs are covered and by how many runs in total.
genes = feats = count = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    n_runs = len(tf.ht_selex)
    if n_runs == 0:
        continue
    genes += 1
    feats += n_runs
    if count <= 3:
        print(tf.gene_name, tf.ht_selex)
        # Fourth hit: also print the full tab-separated record
        if count == 3:
            print(tf)
            print("...")
    count += 1
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ALX1 {'ERR1002054', 'ERR1002055', 'ERR1002057', 'ERR1002056', 'ERR1002050', 'ERR1002053', 'ERR1002052', 'ERR1002051'}
ALX3 {'ERR1002063', 'ERR1002067', 'ERR195525', 'ERR1002061', 'ERR1002072', 'ERR195526', 'ERR1002064', 'ERR193772', 'ERR1011038', 'ERR1002068', 'ERR1002069', 'ERR1002062', 'ERR1002065', 'ERR193770', 'ERR193773', 'ERR193771', 'ERR1010299', 'ERR1002071', 'ERR1002073', 'ERR1002070', 'ERR1002058', 'ERR1002060', 'ERR1002066', 'ERR195527', 'ERR1011404', 'ERR195528', 'ERR1010665', 'ERR1002059'}
ALX4 {'ERR194918', 'ERR1002081', 'ERR1002075', 'ERR1002074', 'ERR1002076', 'ERR194919', 'ERR1002079', 'ERR1002078', 'ERR1002080', 'ERR194917', 'ERR1002077', 'ERR194920'}
AR {'ERR193680', 'ERR193971', 'ERR193973', 'ERR193681', 'ERR193972', 'ERR193970', 'ERR193679', 'ERR193678'}
AR	367	Homo sapiens	P10275	ANDR_HUMAN	Reviewed	MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAASAAPPGASLLLLQQQQQQQQQQQQQQQQQQQQQQQETSPRQQQQQQGEDGSPQAHRRGPTGYLVLDEEQQPSQPQSALECHPERGCVPEPGAAVAASKGLPQQLPAPPDEDDSAAPSTLSLL

In [23]:
#-------------#
# CIS-BP      #
#-------------#

# Get valid PWMs (one matrix id per line of the CIS-BP PWM list)
valid_matrix_ids = set(Jglobals.parse_file(cisbp_txt_file))

# Index the TFs once by (upper-cased gene name, species) instead of
# re-sorting and re-scanning the whole TF set for every line. Every TF
# sharing a key is kept, because each matching TF receives the matrix ids.
cisbp_index = {}
for tf in tfs:
    cisbp_index.setdefault((tf.gene_name.upper(), tf.species), []).append(tf)

# For each line...
for line in Jglobals.parse_file(cisbp_tsv_file):

    # Raw string with an escaped dot: "\d" is not a valid str escape sequence
    # and the "." of the "_2.00" suffix must be matched literally
    matrix_ids = re.findall(r"(M\d{5}_2\.00)", line)
    if not matrix_ids:
        continue

    fields = line.strip("\n").split("\t")
    gene_name = fields[1]
    species = fields[2].replace("_", " ")

    # Skip inferred TFs
    if fields[3] != "D":
        continue

    # Only keep matrix ids that are listed as valid PWMs
    matrix_ids = valid_matrix_ids.intersection(matrix_ids)

    # Add the matrices to every matching TF
    for tf in cisbp_index.get((gene_name.upper(), species), ()):
        tf.cisbp.update(matrix_ids)

In [24]:
# Summarize CIS-BP coverage: preview the first four TFs with matrices, then
# report how many TFs are covered and by how many matrices in total.
genes, feats, count = 0, 0, 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    matrices = tf.cisbp
    if not matrices:
        continue
    genes, feats = genes + 1, feats + len(matrices)
    if count < 3:
        print(tf.gene_name, matrices)
    elif count == 3:
        # Fourth hit: print the ids and the full tab-separated record
        print(tf.gene_name, matrices)
        print(tf)
        print("...")
    count += 1
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ABF1 {'M00001_2.00', 'M00902_2.00'}
ABF1 {'M01772_2.00', 'M01773_2.00'}
ABF2 {'M00069_2.00'}
ABF2 {'M01771_2.00'}
ABF2	841095.0	Arabidopsis thaliana	Q9M7Q4	AI5L5_ARATH	Reviewed	MDGSMNLGNEPPGDGGGGGGLTRQGSIYSLTFDEFQSSVGKDFGSMNMDELLKNIWSAEETQAMASGVVPVLGGGQEGLQLQRQGSLTLPRTLSQKTVDQVWKDLSKVGSSGVGGSNLSQVAQAQSQSQSQRQQTLGEVTLEEFLVRAGVVREEAQVAARAQIAENNKGGYFGNDANTGFSVEFQQPSPRVVAAGVMGNLGAETANSLQVQGSSLPLNVNGARTTYQQSQQQQPIMPKQPGFGYGTQMGQLNSPGIRGGGLVGLGDQSLTNNVGFVQGASAAIPGALGVGAVSPVTPLSSEGIGKSNGDSSSLSPSPYMFNGGVRGRKSGTVEKVVERRQRRMIKNRESAARSRARKQAYTVELEAEVAKLKEENDELQRKQARIMEMQKNQETEMRNLLQGGPKKKLRRTESGPW	bZIP		2	MA0941.1						SRR2926839		M01771_2.00		
...
//
Total genes: 1200
Total feats: 1542


In [25]:
#-------------#
# UniPROBE    #
#-------------#

# Index the TFs once by (upper-cased gene name, species) instead of
# re-sorting and re-scanning the whole TF set for every line. Every TF
# sharing a key is kept, because each matching TF receives the id.
uniprobe_index = {}
for tf in tfs:
    uniprobe_index.setdefault((tf.gene_name.upper(), tf.species), []).append(tf)

# For each line...
for line in Jglobals.parse_tsv_file(uniprobe_file):

    # Skip the header line
    if line[0] == "Protein":
        continue

    gene_name = line[0]
    uniprobe_id = line[1]

    # Column 2 is compared against the TF species
    for tf in uniprobe_index.get((gene_name.upper(), line[2]), ()):
        tf.uniprobe.add(uniprobe_id)

In [26]:
# Summarize UniPROBE coverage: preview the first four TFs with entries, then
# report how many TFs are covered and by how many entries in total.
genes = feats = count = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    n_entries = len(tf.uniprobe)
    if n_entries == 0:
        continue
    genes += 1
    feats += n_entries
    if count <= 3:
        print(tf.gene_name, tf.uniprobe)
        # Fourth hit: also print the full tab-separated record
        if count == 3:
            print(tf)
            print("...")
    count += 1
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ABF1 {'UP00452'}
AFT1 {'UP00344'}
ARO80 {'UP00329'}
ARX {'UP00584'}
ARX	170302	Homo sapiens	Q96QS3	ARX_HUMAN	Reviewed	MSNQYQEEGCSERPECKSKSPTLLSSYCIDSILGRRSPCKMRLLGAAQSLPAPLTSRADPEKAVQGSPKSSSAPFEAELHLPPKLRRLYGPGGGRLLQGAAAAAAAAAAAAAAAATATAGPRGEAPPPPPPTARPGERPDGAGAAAAAAAAAAAAWDTLKISQAPQVSISRSKSYRENGAPFVPPPPALDELGGPGGVTHPEERLGVAGGPGSAPAAGGGTGTEDDEEELLEDEEDEDEEEELLEDDEEELLEDDARALLKEPRRCPVAATGAVAAAAAAAVATEGGELSPKEELLLHPEDAEGKDGEDSVCLSAGSDSEEGLLKRKQRRYRTTFTSYQLEELERAFQKTHYPDVFTREELAMRLDLTEARVQVWFQNRRAKWRKREKAGAQTHPPGLPFPGPLSATHPLSPYLDASPFPPHHPALDSAWTAAAAAAAAAFPSLPPPPGSASLPPSGAPLGLSTFLGAAVFRHPAFISPAFGRLFSTMAPLTSASTAAALLRQPTPAVEGAVASGALADPATAAADRRASSIAALRLKAKEHAAQLTQLNILPGTSTGKEVC	Homeodomain		2		ARX_HUMAN.H11MO.0.D						ERR1002106;ERR1002107;ERR1002108;ERR1002109;ERR1002110;ERR1002111;ERR1002112;ERR1002113;ERR195529;ERR195530;ERR195531;ERR195532	M00260_2.00;M00261_2.00;M00262_2.00;M00263_2.00;M00264_2.00;M00265_2.00	UP00584	
...
//
Total genes: 541
Total feats: 559


In [27]:
#-------------#
# SMiLE-seq   #
#-------------#

# Names used in the SMiLE-seq table -> gene names used by the TFs
synonyms = {
    "CEBPb": "CEBPB",
    "cFOS": "FOS",
    "cFOSL2": "FOSL2",
    "cJUN": "JUN",
    "PPARa": "PPARA",
    "PPARg": "PPARG",
    "RXRa": "RXRA",
    "RXRg": "RXRG"
}

# Index the TFs once by (upper-cased gene name, species) instead of
# re-sorting and re-scanning the whole TF set for every gene of every line.
# Every TF sharing a key is kept, because each matching TF receives the run.
smile_seq_index = {}
for tf in tfs:
    smile_seq_index.setdefault((tf.gene_name.upper(), tf.species), []).append(tf)

# For each line...
for line in Jglobals.parse_tsv_file(smile_seq):

    # Skip the header line
    if line[0] == "Assay_Type":
        continue

    sra_run = line[13]

    # Column 16: the part before the first "_" lists one or more
    # "-"-separated names, and the run is assigned to each of them
    for gene_name in line[16].split("_")[0].split("-"):
        gene_name = synonyms.get(gene_name, gene_name)

        # Column 12 is compared against the TF species
        for tf in smile_seq_index.get((gene_name.upper(), line[12]), ()):
            tf.smile_seq.add(sra_run)

In [28]:
# Summarize SMiLE-seq coverage: preview the first four TFs with runs, then
# report how many TFs are covered and by how many runs in total.
genes, feats, count = 0, 0, 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    runs = tf.smile_seq
    if not runs:
        continue
    genes, feats = genes + 1, feats + len(runs)
    if count < 3:
        print(tf.gene_name, runs)
    elif count == 3:
        # Fourth hit: print the ids and the full tab-separated record
        print(tf.gene_name, runs)
        print(tf)
        print("...")
    count += 1
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ARNTL {'SRR3405116', 'SRR3405117'}
CEBPB {'SRR3405054'}
CLOCK {'SRR3405116', 'SRR3405117'}
CTCF {'SRR3405066', 'SRR3405055', 'SRR3405078'}
CTCF	10664	Homo sapiens	P49711	CTCF_HUMAN	Reviewed	MEGDAVEAIVEESETFIKGKERKTYQRRREGGQEEDACHLPQNQTDGGEVVQDVNSSVQMVMMEQLDPTLLQMKTEVMEGTVAPEAEAAVDDTQIITLQVVNMEEQPINIGELQLVQVPVPVTVPVATTSVEELQGAYENEVSKEGLAESEPMICHTLPLPEGFQVVKVGANGEVETLEQGELPPQEDPSWQKDPDYQPPAKKTKKTKKSKLRYTEEGKDVDVSVYDFEEEQQEGLLSEVNAEKVVGNMKPPKPTKIKKKGVKKTFQCELCSYTCPRRSNLDRHMKSHTDERPHKCHLCGRAFRTVTLLRNHLNTHTGTRPHKCPDCDMAFVTSGELVRHRRYKHTHEKPFKCSMCDYASVEVSKLKRHIRSHTGERPFQCSLCSYASRDTYKLKRHMRTHSGEKPYECYICHARFTQSGTMKMHILQKHTENVAKFHCPHCDTVIARKSDLGVHLRKQHSYIEQGKKCRYCDAVFHERYALIQHQKSHKNEKRFKCDQCDYACRQERHMIMHKRTHTGEKPYACSHCDKTFRQKQLLDMHFKRYHDPNFVPAAFVCSKCGKTFTRRNTMARHADNCAGPDGVEGENGGETKKSKRGRKRKMRSKKEDSSDSENAEPDLDDNEDEEEPAVEIEPEPEPQPVTPAPPPAKKRRGRPPGRTNQPKQNQPTAIIQVEDQNTGAIENIIVEVKKEPDAEPAEGEEEEAQPAATDAPNGDLTPEMILSMMDR	C2H2 ZF		3	MA0139.1	CTCF_HUMAN.H11MO.0.A	ERX008573;ERX008585;ERX008591;ERX008596;E

In [30]:
#-------------#
# Pfam        #
#-------------#

# The following code is adapted from:
# https://github.com/wassermanlab/JASPAR-profile-inference/blob/master/files/get_files.py

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import json
from infer_profile import hmmAlign, hmmScan, _makeSeqFile

# Scan every TF sequence against the Pfam DBD HMM database and store, per
# UniProt entry, the list of DBD hits as (pfam_ac, alignment, start, end, evalue)
# in "Data/Pfam/TFs.json". The scan is skipped when the JSON file already exists.

# Initialize
seq_file = ".seq.fasta"
hmm_database = os.path.join(jaspar_dir, "files", "pfam-DBDs", "all_DBDs.hmm")
json_file = "TFs.json"

# Change dir
os.chdir(sys.path[0])
os.chdir(os.path.abspath("./Data/Pfam/"))

# The try/finally guarantees that the working directory is restored even if
# one of the HMMER helpers fails; otherwise later cells (which rely on paths
# relative to the project root) would silently run inside "Data/Pfam".
try:

    # Skip if JSON file already exists
    if not os.path.exists(json_file):

        # Initialize
        pfams = {}

        try:

            # For each TF...
            for tf in sorted(tfs, key=lambda x: x.gene_name):

                # Initialize
                unientry = tf.unientry
                pfams.setdefault(unientry, [])

                # Make seq file
                seq = Seq(tf.sequence)
                seq_record = SeqRecord(seq, id=unientry, name=unientry, description=unientry)
                _makeSeqFile(seq_record, seq_file)

                # For each DBD...
                for pfam_ac, start, end, evalue in hmmScan(seq_file, hmm_database, non_overlapping_domains=True):

                    # Initialize
                    hmm_file = os.path.join(jaspar_dir, "files", "pfam-DBDs", "%s.hmm" % pfam_ac)

                    # Make seq file (the same temporary file is reused for the DBD sub-sequence)
                    # NOTE(review): start is used as a slice offset here and stored as start+1
                    # below, so it looks 0-based while the stored coordinates are 1-based - confirm.
                    sub_seq = seq[start:end]
                    seq_record = SeqRecord(sub_seq, id=unientry, name=unientry, description=unientry)
                    _makeSeqFile(seq_record, seq_file)

                    # Add DBDs
                    alignment = hmmAlign(seq_file, hmm_file)
                    pfams[unientry].append((pfam_ac, alignment, start+1, end, evalue))

            # Write
            Jglobals.write(json_file, json.dumps(pfams, sort_keys=True, indent=4, separators=(",", ": ")))

        finally:

            # Remove seq file (also on failure, so no stale temporary file is left behind)
            if os.path.exists(seq_file):
                os.remove(seq_file)

finally:

    # Change dir back
    os.chdir(sys.path[0])



In [31]:
#-------------#
# Cluster     #
# (MMseqs2)   #
#-------------#

# For each species, TFs are split by family, each family is clustered with
# MMseqs2 (easy-cluster), and every TF gets a species-wide cluster number in
# `tf.cluster_num`. The UniProt entry -> cluster number mapping is cached in
# "./Data/Clusters/<species>/<species>.json" and reused when present.

# Get species (species name -> name with spaces replaced by underscores)
species = {}
tfs_by_name = sorted(tfs, key=lambda x: x.gene_name)
for tf in tfs_by_name:
    species.setdefault(tf.species, tf.species.replace(" ", "_"))

# Locate the MMseqs2 executable: keep using the original hard-coded location
# when it exists, otherwise fall back to whatever "mmseqs" is on the PATH.
default_mmseqs_path = "/Users/ofornes/.anaconda3/bin/mmseqs"
if os.path.exists(default_mmseqs_path):
    mmseqs_path = default_mmseqs_path
else:
    mmseqs_path = shutil.which("mmseqs") or default_mmseqs_path

# For each species...
for s in sorted(species):

    # Initialize
    cluster_num = 0
    clusters = {}
    cluster_ids = {}  # MMseqs2 cluster identifier -> cluster number
    clusters_dir = "./Data/Clusters/%s" % species[s]
    family_num = 0
    families = {}

    # Also creates missing parent directories (e.g. "./Data/Clusters")
    os.makedirs(clusters_dir, exist_ok=True)

    # Get families (numbered in order of first appearance) and delete any
    # family FASTA file left over from a previous run
    for tf in tfs_by_name:
        if tf.species != s:
            continue
        if tf.family not in families:
            family_num += 1
            families.setdefault(tf.family, family_num)
            prefix = os.path.join(clusters_dir, str(families[tf.family]))
            fasta_file = "%s.fasta" % prefix
            if os.path.exists(fasta_file):
                os.remove(fasta_file)

    # Create family-specific FASTA files
    # NOTE(review): this relies on Jglobals.write appending to the file - confirm.
    for tf in tfs_by_name:
        if tf.species != s:
            continue
        prefix = os.path.join(clusters_dir, str(families[tf.family]))
        fasta_file = "%s.fasta" % prefix
        Jglobals.write(fasta_file, ">%s\n%s" % (tf.unientry, tf.sequence))

    # Skip if JSON file already exists
    json_file = "%s/%s.json" % (clusters_dir, species[s])
    if not os.path.exists(json_file):

        for family in sorted(families.values()):

            # Initialize
            prefix = os.path.join(clusters_dir, str(family))
            cluster_all_seqs = "%s_all_seqs.fasta" % prefix
            cluster_clusters = "%s_cluster.tsv" % prefix
            cluster_rep_seqs = "%s_rep_seq.fasta" % prefix

            # Cluster TFs
            if not os.path.exists(cluster_rep_seqs):
                fasta_file = "%s.fasta" % prefix
                # From Burkhard Rost's "Twilight zone of protein sequence alignments":
                # Sequence alignments unambiguously distinguish between protein pairs of similar and
                # non-similar structure when the pairwise sequence identity is high (>40% for long alignments).
                # The command is passed as an argument list (no shell), so paths derived
                # from species names cannot be misinterpreted by the shell.
                # `prefix` is given twice: once as output prefix and once as temporary directory.
                cmd = [mmseqs_path, "easy-cluster", fasta_file, prefix, prefix,
                       "--min-seq-id", "0.4", "--cov-mode", "1"]
                subprocess.run(cmd, check=True)
                # Remove the temporary directory created by MMseqs2
                if os.path.isdir(prefix):
                    shutil.rmtree(prefix)

            # For each line...
            for line in Jglobals.parse_file(cluster_clusters):

                # Get cluster: each line is "<cluster identifier>\t<member>".
                # The member gets the number of ITS cluster (not the running
                # counter), so the result does not depend on the row order.
                cluster_id, unientry = line.split("\t")
                if cluster_id not in cluster_ids:
                    cluster_num += 1
                    cluster_ids[cluster_id] = cluster_num
                clusters.setdefault(unientry, cluster_ids[cluster_id])

        # Write
        Jglobals.write(json_file, json.dumps(clusters, sort_keys=True, indent=4, separators=(",", ": ")))

    else:
        with open(json_file) as f:
            clusters = json.load(f)

    # For each TF...
    for tf in tfs_by_name:
        if tf.species != s:
            continue
        tf.cluster_num = clusters[tf.unientry]

In [32]:
#-------------#
# Output TSV  #
#-------------#

# Column names of the annotation tables (also reused by later cells)
fields = [
    "Gene Name", "Gene ID", "Species", "UniProt Accession", "UniProt Entry",
    "Status", "Sequence", "Family", "Cluster", "Evidence", "JASPAR",
    "HOCOMOCO", "ChIP-Atlas", "CistromeDB", "GTRD", "ReMap", "DAP-seq",
    "HT-SELEX", "CIS-BP", "UniPROBE", "SMiLE-seq",
]
header_line = "\t".join(fields)
tfs_by_gene_name = sorted(tfs, key=lambda x: x.gene_name)

# One table per species: "./Annotations/<species>.tsv"
for s in sorted(species):

    # Start from a fresh file
    annotations_file = "./Annotations/%s.tsv" % species[s]
    if os.path.exists(annotations_file):
        os.remove(annotations_file)

    # Header first, then one row per TF of this species (sorted by gene name)
    Jglobals.write(annotations_file, header_line)
    for tf in tfs_by_gene_name:
        if tf.species == s:
            Jglobals.write(annotations_file, tf)

In [38]:
#-------------#
# Triads      #
#-------------#

# Per species, write to "./Annotations/<species>.triads.tsv" at most one TF per
# cluster: going through the TFs by decreasing evidence, the first one of each
# cluster that has `invivo` > 0 and `invitro` > 1 is kept.

# Initialize
count = 0
done = set()      # (species, cluster number) pairs already represented
evidence = {}     # species -> number of selected TFs
families = {}     # family -> number of selected TFs
strings = []      # one summary row per selected TF
header_line = "\t".join(fields)
tfs_by_evidence = sorted(tfs, key=lambda x: x.evidence, reverse=True)

# For each species...
for s in sorted(species):

    # Start from a fresh file
    triads_file = "./Annotations/%s.triads.tsv" % species[s]
    if os.path.exists(triads_file):
        os.remove(triads_file)
    Jglobals.write(triads_file, header_line)

    for tf in tfs_by_evidence:
        cluster_key = (tf.species, tf.cluster_num)
        if tf.species != s or cluster_key in done:
            continue
        if not (tf.invivo > 0 and tf.invitro > 1):
            continue
        Jglobals.write(triads_file, tf)
        strings.append([tf.gene_name, tf.species, tf.evidence, tf.invivo, tf.invitro])
        evidence[tf.species] = evidence.get(tf.species, 0) + 1
        families[tf.family] = families.get(tf.family, 0) + 1
        done.add(cluster_key)

# Preview the first three selected TFs
for count, s in enumerate(strings):
    if count == 3:
        print("...")
    elif count < 3:
        print(s)
count = len(strings)

# Selected TFs per species, then per family
print("//")
for e in sorted(evidence):
    print(e, evidence[e])
print("//")
for f in sorted(families):
    print(f, families[f])

['sna', 'Drosophila melanogaster', 3, 1, 2]
['JUN', 'Homo sapiens', 4, 1, 3]
['PAX7', 'Homo sapiens', 4, 1, 3]
...
//
Drosophila melanogaster 1
Homo sapiens 62
Mus musculus 32
//
C2H2 ZF 15
C2H2 ZF,MADF 1
CUT,Homeodomain 1
DM 1
E2F 1
Ets 4
Forkhead 7
GATA 2
Homeodomain 15
Homeodomain,POU 1
Homeodomain,Paired box 2
Nuclear receptor 13
RFX 2
Rel 3
SAND 1
Sox 10
bHLH 9
bZIP 7


In [39]:
#-------------#
# Duos        #
#-------------#

# Per species, write to "./Annotations/<species>.duos.tsv" at most one TF per
# cluster: going through the TFs by decreasing evidence, the first one of each
# cluster that has both `invivo` > 0 and `invitro` > 0 is kept.

# Initialize
count = 0
done = set()      # (species, cluster number) pairs already represented
evidence = {}     # species -> number of selected TFs
families = {}     # family -> number of selected TFs
strings = []      # one summary row per selected TF
header_line = "\t".join(fields)
tfs_by_evidence = sorted(tfs, key=lambda x: x.evidence, reverse=True)

# For each species...
for s in sorted(species):

    # Start from a fresh file
    duos_file = "./Annotations/%s.duos.tsv" % species[s]
    if os.path.exists(duos_file):
        os.remove(duos_file)
    Jglobals.write(duos_file, header_line)

    for tf in tfs_by_evidence:
        cluster_key = (tf.species, tf.cluster_num)
        if tf.species != s or cluster_key in done:
            continue
        if not (tf.invivo > 0 and tf.invitro > 0):
            continue
        Jglobals.write(duos_file, tf)
        strings.append([tf.gene_name, tf.species, tf.evidence, tf.invivo, tf.invitro])
        evidence[tf.species] = evidence.get(tf.species, 0) + 1
        families[tf.family] = families.get(tf.family, 0) + 1
        done.add(cluster_key)

# Preview the first three selected TFs
for count, s in enumerate(strings):
    if count == 3:
        print("...")
    elif count < 3:
        print(s)
count = len(strings)

# Selected TFs per species, then per family
print("//")
for e in sorted(evidence):
    print(e, evidence[e])
print("//")
for f in sorted(families):
    print(f, families[f])

['SPL1', 'Arabidopsis thaliana', 2, 1, 1]
['SPL11', 'Arabidopsis thaliana', 2, 1, 1]
['SPL14', 'Arabidopsis thaliana', 2, 1, 1]
...
//
Arabidopsis thaliana 111
Caenorhabditis elegans 30
Drosophila melanogaster 40
Homo sapiens 294
Mus musculus 153
Saccharomyces cerevisiae 33
//
ABF1 1
AP2 22
AP2,B3 1
APSES 1
ARID/BRIGHT 1
AT hook 2
BED ZF 3
Brinker 1
C2H2 ZF 105
C2H2 ZF,Homeodomain 3
C2H2 ZF,MADF 1
C2HC ZF 1
CSD 1
CSL 1
CUT,Homeodomain 3
CxxC 4
DM 3
Dof 2
E2F 6
EBF1 2
EIN3 2
Ets 26
Forkhead 24
GATA 8
GCM 1
GCR1 1
Grainyhead 2
HSF 5
Homeodomain 107
Homeodomain,POU 10
Homeodomain,Paired box 5
IRF 11
LOB 1
MADF 1
MADS box 4
MBD 1
Myb/SANT 31
NAC/NAM 6
Nuclear receptor 35
POU 1
Paired box 1
Pipsqueak 1
Prospero 2
RFX 4
Rap1 1
Rel 4
Runt 2
SAND 2
SBP 5
SMAD 4
Sox 20
T-box 7
TBP 3
TCP 6
TCR/CxC 1
TEA 1
Unknown 3
WRKY 13
Zinc cluster 12
bHLH 63
bHLH,T-box 1
bZIP 56
bZIP,C2H2 ZF 1
p53 2


In [40]:
#-------------#
# Evidence 2+ #
#-------------#

# Per species, write to "./Annotations/<species>.multiple_evidence.tsv" at most
# one TF per cluster: going through the TFs by decreasing evidence, the first
# one of each cluster whose `evidence` is greater than 1 is kept.

# Initialize
count = 0
done = set()      # (species, cluster number) pairs already represented
evidence = {}     # species -> number of selected TFs
families = {}     # family -> number of selected TFs
strings = []      # one summary row per selected TF
header_line = "\t".join(fields)
tfs_by_evidence = sorted(tfs, key=lambda x: x.evidence, reverse=True)

# For each species...
for s in sorted(species):

    # Start from a fresh file
    multiple_evidence_file = "./Annotations/%s.multiple_evidence.tsv" % species[s]
    if os.path.exists(multiple_evidence_file):
        os.remove(multiple_evidence_file)
    Jglobals.write(multiple_evidence_file, header_line)

    for tf in tfs_by_evidence:
        cluster_key = (tf.species, tf.cluster_num)
        if tf.species != s or cluster_key in done:
            continue
        if not tf.evidence > 1:
            continue
        Jglobals.write(multiple_evidence_file, tf)
        strings.append([tf.gene_name, tf.species, tf.evidence, tf.invivo, tf.invitro])
        evidence[tf.species] = evidence.get(tf.species, 0) + 1
        families[tf.family] = families.get(tf.family, 0) + 1
        done.add(cluster_key)

# Preview the first three selected TFs
for count, s in enumerate(strings):
    if count == 3:
        print("...")
    elif count < 3:
        print(s)
count = len(strings)

# Selected TFs per species, then per family
print("//")
for e in sorted(evidence):
    print(e, evidence[e])
print("//")
for f in sorted(families):
    print(f, families[f])

['SPL1', 'Arabidopsis thaliana', 2, 1, 1]
['SPL11', 'Arabidopsis thaliana', 2, 1, 1]
['SPL14', 'Arabidopsis thaliana', 2, 1, 1]
...
//
Arabidopsis thaliana 111
Caenorhabditis elegans 30
Drosophila melanogaster 41
Homo sapiens 312
Mus musculus 174
Saccharomyces cerevisiae 33
//
ABF1 1
AP2 22
AP2,B3 1
APSES 1
ARID/BRIGHT 1
AT hook 2
BED ZF 3
Brinker 1
C2H2 ZF 107
C2H2 ZF,Homeodomain 3
C2H2 ZF,MADF 1
C2HC ZF 1
CSD 1
CSL 1
CUT,Homeodomain 3
CxxC 4
DM 5
Dof 2
E2F 6
EBF1 2
EIN3 2
Ets 27
Forkhead 26
GATA 8
GCM 2
GCR1 1
Grainyhead 2
HSF 5
Homeodomain 133
Homeodomain,POU 12
Homeodomain,Paired box 5
IRF 11
LOB 1
MADF 1
MADS box 4
MBD 1
Myb/SANT 31
NAC/NAM 6
Nuclear receptor 35
POU 1
Paired box 1
Pipsqueak 1
Prospero 2
RFX 4
Rap1 1
Rel 4
Runt 2
SAND 2
SBP 5
SMAD 4
Sox 21
T-box 7
TBP 3
TCP 6
TCR/CxC 1
TEA 1
Unknown 3
WRKY 13
Zinc cluster 12
bHLH 66
bHLH,T-box 1
bZIP 56
bZIP,C2H2 ZF 1
p53 2


In [46]:
# Intersection with list of ~400 putative TFs
# r.lovering@ucl.ac.uk

# Writes to "./Annotations/Rende_400TFs_checked.tsv" the human TFs whose gene
# name appears in the curated list, most evidence first.

# Initialize
gene_names = set()
tf_file = "./Annotations/Rende_400TFs_checked.tsv"

# Start from a fresh file that only contains the header
if os.path.exists(tf_file):
    os.remove(tf_file)
Jglobals.write(tf_file, "\t".join(fields))

# Collect the gene names (6th column) of the curated list, ignoring rows
# whose 4th column is None
# NOTE(review): whether parse_tsv_file can actually yield None values is not
# visible from here - confirm.
for line in Jglobals.parse_tsv_file("./Data/Rende_400TFs_checked.txt"):
    if line[3] is not None:
        gene_names.add(line[5])

# Keep the human TFs that are part of the curated list
for tf in sorted(tfs, key=lambda x: x.evidence, reverse=True):
    if tf.species == "Homo sapiens" and tf.gene_name in gene_names:
        Jglobals.write(tf_file, tf)

In [None]:
#-------------#
# Extra stuff #
#-------------#

# # Get orthoDB cluster
# codec = coreapi.codecs.CoreJSONCodec()
# for uniacc in tf._uniaccs:
#     json_file = os.path.join(args.orthodb, "%s.json" % uniacc)
#     if not os.path.exists(json_file):
#         client = coreapi.Client()
#         response = client.get(
#             "https://www.orthodb.org/search?query=%s&level=2759&species=2759" % uniacc)
#         json_obj = json.loads(codec.encode(response))
#         with open(json_file, "w") as j:
#             j.write(json.dumps(json_obj, sort_keys=True, indent=4, separators=(",", ": ")))
#     with open(json_file, "r") as j:  
#         json_obj = json.load(j)
#         for orthodb in json_obj["data"]:
#             tf.orthodb.add(orthodb)

In [44]:
# #-------------#
# # Cluster     #
# # (CIS-BP)    #
# #-------------#

# # The following code is adapted from:
# # https://github.com/wassermanlab/JASPAR-profile-inference/blob/master/infer_profile.py

# import math
# import numpy as np
# from infer_profile import _filter_results_by_Rost, _get_pid

# def get_members(tf):

#     # Initialize
#     members = []
#     tf_DBDs = [dbd[0] for dbd in pfams[tf.unientry]]
#     tf_alignments = [dbd[1] for dbd in pfams[tf.unientry]]
#     seq = Seq(tf.sequence, IUPAC.protein)
#     seq_record = SeqRecord(seq, id=tf.unientry, name=tf.unientry, description=tf.unientry)

#     # Get cut-offs on the percentage of sequence identity
#     cutoffs = {}
#     for pfam_ac in pfam_cutoffs:
#         if pfam_cutoffs[pfam_ac][0] in tf_DBDs:
#             cutoffs.setdefault(pfam_cutoffs[pfam_ac][0], pfam_cutoffs[pfam_ac][1])

#     # BLAST+ search
#     blast_results = BLAST(seq_record)

#     # Filter results
#     for filtered_result in sorted(_filter_results_by_Rost(blast_results), key=lambda x: x[5], reverse=True):

#         # Both TFs have same DBD composition
#         if [dbd[0] for dbd in pfams[filtered_result[1]]] == tf_DBDs:
            
#             # Inference: percentage of sequence identity
#             if tf_DBDs:

#                 skip = False
#                 alignments = [dbd[1] for dbd in pfams[filtered_result[1]]]
#                 for a in range(len(alignments)):
#                     pid = _get_pid(tf_alignments[a], alignments[a])
#                     if pid < cutoffs[tf_DBDs[a]]:
#                         skip = True
#                         break
#                 if not skip:
#                     members.append(filtered_result[1])

#             # From PMID:25215497;
#             # For the remaining classes, with suggestive but insufficient data, we chose a threshold of 70%,
#             # which is the mean, median, and mode threshold across all DBD classes.
#             else:
#                 if filtered_result[6] < 70:
#                     continue
#                 members.append(filtered_result[1])

#     return(members)

# def BLAST(seq_record):

#     # Initialize
#     blast_results = set()
#     outfmt = "sseqid pident length qstart qend sstart send evalue bitscore ppos qlen slen"

#     # Run BLAST+
#     cmd = "blastp -db %s -outfmt \"6 %s\"" % (fasta_file, outfmt)
#     process = subprocess.Popen([cmd], shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
#     fasta_sequence = ">%s\n%s" % (seq_record.id, seq_record.seq)
#     process.stdin.write(fasta_sequence.encode())
#     (blast_records, blast_errors) = process.communicate()

#     # For each BLAST+ record...
#     for blast_record in blast_records.decode("utf-8").split("\n"):

#         # Custom BLAST+ record:
#         # (1) identifier of target sequence;
#         # (2) percentage of identical matches;
#         # (3) alignment length;
#         # (4-5, 6-7) start and end-position in query and in target;
#         # (8) E-value;
#         # (9) bit score;
#         # (10) percentage of positive-scoring matches; and
#         # (4-7, 11, 12) joint coverage (i.e. square root of the coverage
#         # on the query and the target).
#         blast_record = blast_record.split("\t")

#         # Skip if not a BLAST+ record
#         if len(blast_record) != 12: continue

#         # Get BLAST+ record
#         target_id = blast_record[0]
#         percent_identities = float(blast_record[1])
#         alignment_length = int(blast_record[2])
#         query_start_end = "%s-%s" % (blast_record[3], blast_record[4])
#         target_start_end = "%s-%s" % (blast_record[5], blast_record[6])
#         e_value = float(blast_record[7])
#         score = float(blast_record[8])
#         percent_similarity = float(blast_record[9])
#         query_aligned_residues = int(blast_record[4]) - int(blast_record[3]) + 1
#         query_length = float(blast_record[10])
#         target_aligned_residues = int(blast_record[6]) - int(blast_record[5]) + 1
#         target_length = float(blast_record[11])
#         query_coverage = query_aligned_residues * 100 / query_length
#         target_coverage = target_aligned_residues * 100 / target_length
#         joint_coverage = math.sqrt(query_coverage * target_coverage)

#         # Add BLAST+ record to search results
#         blast_results.add((seq_record.id, target_id, query_start_end, target_start_end, e_value, score, percent_identities, alignment_length, percent_similarity, joint_coverage))

#     # Return results sorted by score
#     return(list(sorted(blast_results, key=lambda x: x[-1], reverse=True)))

# # Initialize
# with open(os.path.join(jaspar_dir, "files", "pfam-DBDs.json")) as f:
#     pfam_cutoffs = json.load(f)

# # Get species
# species = {}
# for tf in sorted(tfs, key=lambda x: x.gene_name):
#     species.setdefault(tf.species, tf.species.replace(" ", "_"))

# # For each species...
# for s in sorted(species):

#     # Initialize
#     cluster_num = 0
#     clusters = {}
#     clusters_dir = "./Data/Clusters/%s" % species[s]
#     family_num = 0
#     families = {}

#     if not os.path.isdir(clusters_dir):
#         os.mkdir(clusters_dir)

#     # Get families
#     for tf in sorted(tfs, key=lambda x: x.gene_name):
#         if tf.species != s:
#             continue
#         if tf.family not in families:
#             family_num += 1
#             families.setdefault(tf.family, family_num)
#             prefix = os.path.join(clusters_dir, str(families[tf.family]))
#             fasta_file = "%s.fasta" % prefix
#             if os.path.exists(fasta_file):
#                 os.remove(fasta_file)

#     # Create family-specific FASTA files
#     for tf in sorted(tfs, key=lambda x: x.gene_name):
#         if tf.species != s:
#             continue
#         prefix = os.path.join(clusters_dir, str(families[tf.family]))
#         fasta_file = "%s.fasta" % prefix
#         Jglobals.write(fasta_file, ">%s\n%s" % (tf.unientry, tf.sequence))

#     # Skip if JSON file already exists
#     json_file = "%s/%s.json" % (clusters_dir, species[s])
#     if not os.path.exists(json_file):

#         for family in sorted(families.values()):

#             # Make BLAST+ databases
#             prefix = os.path.join(clusters_dir, str(family))
#             fasta_file = "%s.fasta" % prefix
#             if not os.path.exists("%s.psq" % fasta_file):
#                 cmd = "makeblastdb -in %s -dbtype prot" % fasta_file
#                 process = subprocess.run([cmd], shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

#             for tf in sorted(tfs, key=lambda x: x.evidence, reverse=True):

#                 # Exit loop
#                 if tf.evidence == 0:
#                     break
#                 if tf.species != s:
#                     continue
#                 if families[tf.family] != family:
#                     continue
#                 if tf.unientry in clusters:
#                     continue

#                 # Get cluster members
#                 members = get_members(tf)
#                 if members:
#                     cluster_num += 1
#                     for member in members:
#                         if member in clusters:
#                             continue
#                         clusters.setdefault(member, cluster_num)

#         # Write
#         Jglobals.write(json_file, json.dumps(clusters, sort_keys=True, indent=4, separators=(",", ": ")))

# # with open(json_file) as f:
# #     clusters = json.load(f)

# # for tf in sorted(tfs, key=lambda x: x.gene_name):
# #     if tf.uniacc in clusters:
# #         tf.cluster_num = clusters[tf.uniacc]