In [1]:
import argparse
import os
import re
import shutil
import subprocess
import sys

In [2]:
# Append JASPAR-profile-inference to path
jaspar_dir = os.path.abspath("./JASPAR-profile-inference")
sys.path.insert(0, jaspar_dir)
sys.path

['/Users/ofornes/Work/GRECO/JASPAR-profile-inference',
 '/Users/ofornes/Work/GRECO',
 '/Users/ofornes/.anaconda3/lib/python37.zip',
 '/Users/ofornes/.anaconda3/lib/python3.7',
 '/Users/ofornes/.anaconda3/lib/python3.7/lib-dynload',
 '',
 '/Users/ofornes/.anaconda3/lib/python3.7/site-packages',
 '/Users/ofornes/.anaconda3/lib/python3.7/site-packages/aeosa',
 '/Users/ofornes/.anaconda3/lib/python3.7/site-packages/IPython/extensions',
 '/Users/ofornes/.ipython']

In [3]:
# Import from JASPAR-profile-inference
from __init__ import Jglobals

In [10]:
#-------------#
# Class       #
#-------------#

class TF(object):

    def __init__(self, gene_name, species):

        self.gene_name = gene_name
        self.species = species
        self.uniacc = None
        self.unientry = None
        self.status = None
        self.sequence = None
        self.family = "Unknown"
        self.cluster_num = None
        # self.orthodb = set()
        self.jaspar_id = None
        self.hocomoco_id = set()
        
        # In vivo
        self.chip_atlas = set()
        self.cistromedb = set()
        self.gtrd = set()
        self.dap_seq = set()
        self.remap = set()

        # In vitro
        self.ht_selex = set()
        self.cisbp = set()
        self.uniprobe = set()
        self.smile_seq = set()

        # Hidden variables (for internal use only)
        self._uniaccs = set()
        self._unientries = set()
        self._sequences = set()

    def __str__(self):

        string = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
            self.gene_name,
            self.species,
            self.uniacc,
            # ";".join(sorted([i for i in self._uniaccs if i != self.uniacc])),
            self.unientry,
            # ";".join(sorted([i for i in self._unientries if i != self.unientry])),
            self.status,
            self.sequence,
            # ";".join(sorted([i for i in self._sequences if i != self.sequence])),
            self.family,
            self.cluster_num,
            # ";".join(sorted([i for i in self._pfam_ids if i != self.pfam_id])),
            # ";".join(sorted(self.orthodb)),
            self.jaspar_id,
            ";".join(sorted(self.hocomoco_id)),
            ";".join(sorted(self.chip_atlas)),
            ";".join(sorted(self.cistromedb)),
            ";".join(sorted(self.gtrd)),
            ";".join(sorted(self.remap)),
            ";".join(sorted(self.dap_seq)),
            ";".join(sorted(self.ht_selex)),
            ";".join(sorted(self.cisbp)),
            ";".join(sorted(self.uniprobe)),
            ";".join(sorted(self.smile_seq))
        )

        return(string)

    def __repr__(self):

        string = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
            self.gene_name,
            self.species,
            self.uniacc,
            # ";".join(sorted([i for i in self._uniaccs if i != self.uniacc])),
            self.unientry,
            # ";".join(sorted([i for i in self._unientries if i != self.unientry])),
            self.status,
            self.sequence,
            # ";".join(sorted([i for i in self._sequences if i != self.sequence])),
            self.family,
            self.cluster_num,
            # ";".join(sorted([i for i in self._pfam_ids if i != self.pfam_id])),
            # ";".join(sorted(self.orthodb)),
            self.jaspar_id,
            ";".join(sorted(self.hocomoco_id)),
            ";".join(sorted(self.chip_atlas)),
            ";".join(sorted(self.cistromedb)),
            ";".join(sorted(self.gtrd)),
            ";".join(sorted(self.remap)),
            ";".join(sorted(self.dap_seq)),
            ";".join(sorted(self.ht_selex)),
            ";".join(sorted(self.cisbp)),
            ";".join(sorted(self.uniprobe)),
            ";".join(sorted(self.smile_seq))
        )

        return(string)

In [30]:
#-------------#
# Paths       #
#-------------#

hocomoco_file = "./Data/Databases/HOCOMOCO/HOCOMOCOv11_full_jaspar_format.txt"
chip_atlas_file = "./Data/Experiments/ChIP-seq.ChIP-Atlas.tsv"
cistromedb_file = "./Data/Experiments/ChIP-seq.CistromeDB.tsv"
gtrd_file = "./Data/Experiments/ChIP-seq.GTRD.tsv"
remap_file = "./Data/Experiments/ChIP-seq.ReMap2020.tsv"
dap_seq_file = "./Data/Experiments/DAP-seq.PMID:27203113.tsv"
ht_selex_files = ["./Data/Experiments/HT-SELEX.PMID:23332764.tsv",
                  "./Data/Experiments/HT-SELEX.PMID:28473536.tsv"]
cisbp_txt_file = "./Data/Databases/CisBP-2.0/PWMs.txt"
cisbp_tsv_file = "./Data/Experiments/PBM.CisBP-2.0.tsv"
uniprobe_file = "./Data/Experiments/PBM.UniPROBE.tsv"
smile_seq = "./Data/Experiments/SMiLE-seq.PMID:28092692.tsv"

In [17]:
#-------------#
# Parse Data  #
#-------------#

# Initialize
tfs = set()
families = {}

# For each line...
for line in Jglobals.parse_tsv_file("./Data/Parsed/TFs.tab.gz"):

    # Initialize
    tf = TF(line[0], line[1])

    # Get UniProt Accession
    uniaccs = line[2].split(";")
    tf.uniacc = uniaccs[0]
    tf._uniaccs.update(set(uniaccs))

    # Get UniProt Entry
    unientries = line[3].split(";")
    tf.unientry = unientries[0]
    tf._unientries.update(set(unientries))

    # Get sequence
    sequences = line[4].split(";")
    tf.sequence = sequences[0]
    tf._sequences.update(set(sequences))

    # Get status (i.e. reviewed or not)
    tf.family = line[5]

    # Get family
    tf.family = line[6]

    # Get JASPAR ids
    tf.jaspar_id = line[7]

    # # Get orthoDB cluster
    # codec = coreapi.codecs.CoreJSONCodec()
    # for uniacc in tf._uniaccs:
    #     json_file = os.path.join(args.orthodb, "%s.json" % uniacc)
    #     if not os.path.exists(json_file):
    #         client = coreapi.Client()
    #         response = client.get(
    #             "https://www.orthodb.org/search?query=%s&level=2759&species=2759" % uniacc)
    #         json_obj = json.loads(codec.encode(response))
    #         with open(json_file, "w") as j:
    #             j.write(json.dumps(json_obj, sort_keys=True, indent=4, separators=(",", ": ")))
    #     with open(json_file, "r") as j:  
    #         json_obj = json.load(j)
    #         for orthodb in json_obj["data"]:
    #             tf.orthodb.add(orthodb)

    # Add TF to TFs
    tfs.add(tf)

print(next(iter(tfs)))

ASH1	Saccharomyces cerevisiae	P34233	ASH1_YEAST	None	MSSLYIKTPLHALSAGPDSHANSSYYDNLLLPSFSNLSSNISRNNITTDNNINSASPRKYSFHSLNVSPILSPISLANEILGKKSNTAPASPHHMDYNPISSLTPGNSPEFNKASLSQISFTNPLNYGSGLGFSSNSQPRLPLLDRLSSVSLSKRPERPQQSLPSLRHLQLLPSPLLQENAARFPDTSKRTSNWKTDLTHWCKDTNYQDYVKIREEVAHFKPLSIPNLTNNQNNDSFNYGKELESTRSSKFHSPSKESFDRTKLIPSILEAKDQFKDLSNNAWSITPPVTPPMSPPTNRTMERTTLRGVEASFFEGKSSNNDSIFNPIISEKLVQEVKHQRQLRGNSFPMPNASHKKTNSFKALQIKKLLANRDILSNNSKSNVRKPSKNKISKQASNVFGNTARQLVMKLDNASYSSVSASSSPSPSTPTKSGKMRSRSSSPVRPKAYTPSPRSPNYHRFALDSPPQSPRRSSNSSITKKGSRRSSGSSPTRHTTRVCVSCHSSDSPCWRPSWSPRKQDQLCNSCGLRYKKTHTRCLNDLCRKIPTKGEINIMKSNGIDKEFVPERNCEIEGYRCLFCNYITETVEN	GATA	None	MA0276.1										


In [18]:
#-------------#
# HOCOMOCO    #
#-------------#

#  678 Homo sapiens
#  451 Mus musculus

# SMCA5_MOUSE # Not a TF: SWI/SNF-related matrix-associated actin-dependent regulator of chromatin subfamily A member 5
# FUBP1_MOUSE # Not a TF: Far upstream element-binding protein 1
# BRCA1_MOUSE # Not a TF: Breast cancer type 1 susceptibility protein homolog
# EVI1_MOUSE  # Not a TF: Histone-lysine N-methyltransferase MECOM
# TAF1_MOUSE  # Not a TF: Transcription initiation factor TFIID subunit 1
# BRAC_MOUSE  # Not a valid UniProt Entry
# HLTF_MOUSE  # Not a TF: Helicase-like transcription factor
# TAF1_HUMAN  # Not a TF: Transcription initiation factor TFIID subunit 1
# HLTF_HUMAN  # Not a TF: Helicase-like transcription factor
# BRCA1_HUMAN # Not a TF: Breast cancer type 1 susceptibility protein
# SMCA5_HUMAN # Not a TF: SWI/SNF-related matrix-associated actin-dependent regulator of chromatin subfamily A member 5
# ZF64A_HUMAN # Not a valid UniProt Entry
# BRAC_HUMAN  # Not a valid UniProt Entry
# EVI1_HUMAN  # Not a TF: Histone-lysine N-methyltransferase MECOM
# FUBP1_HUMAN # Not a TF: Far upstream element-binding protein 1
# BPTF_HUMAN  # Not a TF: Nucleosome-remodeling factor subunit BPTF
# CENPB_HUMAN # Not a TF: Major centromere autoantigen B
# ZBT48_HUMAN # Not a valid UniProt Entry; should be TZAP_HUMAN

# For each line...
for line in Jglobals.parse_file(hocomoco_file):

    if line.startswith(">"):

        # Get unientry
        m = re.search("(\w+_(HUMAN|MOUSE))", line)
        unientry = m.group(1)

        # For each TF...
        for tf in sorted(tfs, key=lambda x: x.gene_name):
            if unientry in tf._unientries:
                tf.hocomoco_id.add(line[1:])

In [19]:
genes = 0
feats = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    subtotal_feats = len(tf.hocomoco_id)
    if subtotal_feats > 0:
        genes += 1
        feats += subtotal_feats
        print(tf.gene_name, tf.hocomoco_id)
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

AHR {'AHR_HUMAN.H11MO.0.B'}
AIRE {'AIRE_HUMAN.H11MO.0.C'}
ALX1 {'ALX1_HUMAN.H11MO.0.B'}
ALX3 {'ALX3_HUMAN.H11MO.0.D'}
ALX4 {'ALX4_HUMAN.H11MO.0.D'}
AR {'ANDR_HUMAN.H11MO.1.A', 'ANDR_HUMAN.H11MO.2.A', 'ANDR_HUMAN.H11MO.0.A'}
ARID3A {'ARI3A_HUMAN.H11MO.0.D'}
ARID5B {'ARI5B_HUMAN.H11MO.0.C'}
ARNT {'ARNT_HUMAN.H11MO.0.B'}
ARNT2 {'ARNT2_HUMAN.H11MO.0.D'}
ARNTL {'BMAL1_HUMAN.H11MO.0.A'}
ARX {'ARX_HUMAN.H11MO.0.D'}
ASCL1 {'ASCL1_HUMAN.H11MO.0.A'}
ASCL2 {'ASCL2_HUMAN.H11MO.0.D'}
ATF1 {'ATF1_HUMAN.H11MO.0.B'}
ATF2 {'ATF2_HUMAN.H11MO.1.B', 'ATF2_HUMAN.H11MO.2.C', 'ATF2_HUMAN.H11MO.0.B'}
ATF3 {'ATF3_HUMAN.H11MO.0.A'}
ATF4 {'ATF4_HUMAN.H11MO.0.A'}
ATF6 {'ATF6A_HUMAN.H11MO.0.B'}
ATF7 {'ATF7_HUMAN.H11MO.0.D'}
ATOH1 {'ATOH1_HUMAN.H11MO.0.B'}
Aire {'AIRE_MOUSE.H11MO.0.C'}
Alx1 {'ALX1_MOUSE.H11MO.0.B'}
Ar {'ANDR_MOUSE.H11MO.0.A', 'ANDR_MOUSE.H11MO.1.A'}
Arid3a {'ARI3A_MOUSE.H11MO.0.D'}
Arid5b {'ARI5B_MOUSE.H11MO.0.C'}
Arnt {'ARNT_MOUSE.H11MO.0.B'}
Arnt2 {'ARNT2_MOUSE.H11MO.0.D'}
Arntl {'BMAL1_MOUSE.H11

NFE2L1 {'NF2L1_HUMAN.H11MO.0.C'}
NFE2L2 {'NF2L2_HUMAN.H11MO.0.A'}
NFIA {'NFIA_HUMAN.H11MO.0.C', 'NFIA_HUMAN.H11MO.1.D'}
NFIB {'NFIB_HUMAN.H11MO.0.D'}
NFIC {'NFIC_HUMAN.H11MO.1.A', 'NFIC_HUMAN.H11MO.0.A'}
NFIL3 {'NFIL3_HUMAN.H11MO.0.D'}
NFKB1 {'NFKB1_HUMAN.H11MO.1.B'}
NFKB2 {'NFKB2_HUMAN.H11MO.0.B'}
NFYA {'NFYA_HUMAN.H11MO.0.A'}
NFYB {'NFYB_HUMAN.H11MO.0.A'}
NFYC {'NFYC_HUMAN.H11MO.0.A'}
NHLH1 {'HEN1_HUMAN.H11MO.0.C'}
NKX2-1 {'NKX21_HUMAN.H11MO.0.A'}
NKX2-2 {'NKX22_HUMAN.H11MO.0.D'}
NKX2-3 {'NKX23_HUMAN.H11MO.0.D'}
NKX2-5 {'NKX25_HUMAN.H11MO.0.B'}
NKX2-8 {'NKX28_HUMAN.H11MO.0.C'}
NKX3-1 {'NKX31_HUMAN.H11MO.0.C'}
NKX3-2 {'NKX32_HUMAN.H11MO.0.C'}
NKX6-1 {'NKX61_HUMAN.H11MO.0.B', 'NKX61_HUMAN.H11MO.1.B'}
NKX6-2 {'NKX62_HUMAN.H11MO.0.D'}
NOBOX {'NOBOX_HUMAN.H11MO.0.C'}
NOTO {'NOTO_HUMAN.H11MO.0.D'}
NR0B1 {'NR0B1_HUMAN.H11MO.0.D'}
NR1D1 {'NR1D1_HUMAN.H11MO.0.B', 'NR1D1_HUMAN.H11MO.1.D'}
NR1H2 {'NR1H2_HUMAN.H11MO.0.D'}
NR1H3 {'NR1H3_HUMAN.H11MO.0.B', 'NR1H3_HUMAN.H11MO.1.B'}
NR1H4 {'NR1H4_HUM

In [20]:
#-------------#
# GTRD        #
#-------------#

#   71 Arabidopsis thaliana
#  213 Caenorhabditis elegans
#   11 Danio rerio
#  249 Drosophila melanogaster
# 1236 Homo sapiens
#  513 Mus musculus
#   12 Rattus norvegicus
#  137 Saccharomyces cerevisiae
#   32 Schizosaccharomyces pombe

# For each line...
for line in Jglobals.parse_tsv_file(gtrd_file):

    # Inialize
    experiment_id = line[0]
    uniacc = line[1]

    # For each TF...
    for tf in sorted(tfs, key=lambda x: x.gene_name):
        if uniacc in tf._uniaccs:
            tf.gtrd.add(experiment_id)

In [21]:
genes = 0
feats = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    subtotal_feats = len(tf.gtrd)
    if subtotal_feats > 0:
        genes += 1
        feats += subtotal_feats
        print(tf.gene_name, tf.gtrd)
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

11723 {'EXP043557'}
ABF4 {'EXP041418'}
ADD1 {'EXP044073'}
ADNP {'EXP039553'}
AEBP2 {'EXP040109'}
AGL8 {'EXP040838'}
AP1 {'EXP041282'}
AR {'EXP038192', 'EXP038106', 'EXP038188', 'EXP038189', 'EXP040661', 'EXP036887', 'EXP040665', 'EXP038193', 'EXP037263', 'EXP040660', 'EXP037067', 'EXP038191', 'EXP038636'}
ARF6 {'EXP041307'}
ARID2 {'EXP040161'}
ARID3A {'EXP039839', 'EXP038545', 'EXP040176'}
ARNT {'EXP040255', 'EXP040098', 'EXP039934', 'EXP040302'}
ASCL1 {'EXP037901'}
ATF1 {'EXP040094', 'EXP040091'}
ATF2 {'EXP039489', 'EXP039894', 'EXP039835', 'EXP039995', 'EXP038563'}
ATF3 {'EXP040300', 'EXP039698', 'EXP037426', 'EXP037425', 'EXP039361', 'EXP039763'}
ATF4 {'EXP040245', 'EXP039884'}
ATF7 {'EXP039911', 'EXP040022', 'EXP039442', 'EXP039990'}
ATHB-7 {'EXP041419'}
Arntl {'EXP038302', 'EXP038301', 'EXP036940', 'EXP036943', 'EXP036939'}
Atf-2 {'EXP043626'}
Atf3 {'EXP043928'}
Atf3 {'EXP038726', 'EXP038729', 'EXP038725', 'EXP038727', 'EXP038728'}
B-H2 {'EXP044176'}
B0310.2 {'EXP044241'}
BACH1 {'

ZNF140 {'EXP039719'}
ZNF143 {'EXP040053', 'EXP036969', 'EXP039494'}
ZNF146 {'EXP039853'}
ZNF148 {'EXP039566', 'EXP040238'}
ZNF155 {'EXP039844'}
ZNF157 {'EXP040173'}
ZNF16 {'EXP039611'}
ZNF174 {'EXP039817'}
ZNF18 {'EXP040015', 'EXP038539'}
ZNF184 {'EXP040064', 'EXP040308'}
ZNF189 {'EXP040240'}
ZNF19 {'EXP039381'}
ZNF195 {'EXP039778'}
ZNF197 {'EXP040004'}
ZNF2 {'EXP039786'}
ZNF202 {'EXP039697'}
ZNF211 {'EXP039311'}
ZNF213 {'EXP040242'}
ZNF214 {'EXP040237'}
ZNF217 {'EXP039433', 'EXP040127', 'EXP040266'}
ZNF221 {'EXP040310'}
ZNF223 {'EXP040179'}
ZNF23 {'EXP040025'}
ZNF239 {'EXP039754'}
ZNF24 {'EXP040069', 'EXP039963', 'EXP039470', 'EXP039720', 'EXP039924', 'EXP039728', 'EXP040315', 'EXP039976'}
ZNF248 {'EXP039858'}
ZNF26 {'EXP040133'}
ZNF266 {'EXP039792'}
ZNF274 {'EXP040013', 'EXP039851'}
ZNF280A {'EXP040306'}
ZNF280C {'EXP039565'}
ZNF282 {'EXP040287', 'EXP040288'}
ZNF3 {'EXP040119'}
ZNF300 {'EXP039824'}
ZNF302 {'EXP039821'}
ZNF311 {'EXP039933'}
ZNF316 {'EXP039890', 'EXP039946'}
ZNF324 {'E

In [22]:
#-------------#
# DAP-seq     #
#-------------#

# For each line...
for line in Jglobals.parse_tsv_file(dap_seq_file):

    if line[0] == "AvgSpotLen":
        continue

    species = line[8]
    sra_run = line[9]
    gene = line[15]

    if line[8] not in species:
        continue

    # For each TF...
    for tf in sorted(tfs, key=lambda x: x.gene_name):
        if gene.upper() in tf.gene_name.upper():
            tf.dap_seq.add(sra_run)

In [23]:
genes = 0
feats = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    subtotal_feats = len(tf.dap_seq)
    if subtotal_feats > 0:
        genes += 1
        feats += subtotal_feats
        print(tf.gene_name, tf.dap_seq)
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ABF2 {'SRR2926839'}
ABF2 {'SRR2926839'}
ABI5 {'SRR2926840', 'SRR2926841'}
ABR1 {'SRR2926082', 'SRR2926081'}
AGL13 {'SRR2926463'}
AGL15 {'SRR2926464', 'SRR2926465'}
AGL16 {'SRR2926466', 'SRR2926467'}
AGL42 {'SRR2926470'}
AGL6 {'SRR2926472'}
AGL61 {'SRR2926472'}
AGL62 {'SRR2926472'}
AGL63 {'SRR2926472', 'SRR2926473'}
AGL64 {'SRR2926472'}
AGL65 {'SRR2926472'}
AGL66 {'SRR2926472'}
AGL67 {'SRR2926472'}
AGL95 {'SRR2926678'}
AIL7 {'SRR2926083', 'SRR2926084'}
ANAC011 {'SRR2926607'}
ANAC094 {'SRR2926641'}
ANL2 {'SRR2926395', 'SRR2926394'}
ARF16 {'SRR2926207'}
ARF2 {'SRR2926208'}
ARF20 {'SRR2926208'}
ARF21 {'SRR2926208'}
ARF22 {'SRR2926208'}
ARF23 {'SRR2926208'}
AS2 {'SRR2926449'}
ASHR1 {'SRR2926679'}
AZF1 {'SRR2926298'}
AZF1 {'SRR2926298'}
At1g10720 {'SRR2926223'}
At1g18960 {'SRR2926560'}
At1g19000 {'SRR2926572', 'SRR2926573'}
At1g19040 {'SRR2926645'}
At1g23810 {'SRR2926685'}
At1g24250 {'SRR2926686'}
At1g49010 {'SRR2926574', 'SRR2926575'}
At1g66420 {'SRR2926392'}
At1g72740 {'SRR2926562', 'SRR29

In [26]:
#-------------#
# HT-SELEX    #
#-------------#

# For each file...
for file_name in ht_selex_files:

    m = re.search("HT-SELEX.PMID:(\d+).tsv", file_name)
    pmid = int(m.group(1))

    # For each line...
    for line in Jglobals.parse_tsv_file(file_name):

        if line[0] == "Alias":
            continue

        m = re.search("^([A-Za-z\d]+)_", line[0])
        if m:

            gene_name = m.group(1)

            if pmid == 23332764:
                sra_run = line[14]
            else:
                sra_run = line[19]

            # For each TF...
            for tf in sorted(tfs, key=lambda x: x.gene_name):
                # HT-SELEX data only available for human and mouse
                if tf.species not in ["Homo sapiens", "Mus musculus"]:
                    continue
                if tf.gene_name == gene_name:
                    tf.ht_selex.add(sra_run)

In [27]:
genes = 0
feats = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    subtotal_feats = len(tf.ht_selex)
    if subtotal_feats > 0:
        genes += 1
        feats += subtotal_feats
        print(tf.gene_name, tf.ht_selex)
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ALX1 {'ERR1002053', 'ERR1002057', 'ERR1002051', 'ERR1002055', 'ERR1002056', 'ERR1002050', 'ERR1002052', 'ERR1002054'}
ALX3 {'ERR193772', 'ERR1002059', 'ERR1002064', 'ERR195525', 'ERR1002065', 'ERR1002061', 'ERR1011038', 'ERR1002063', 'ERR1010299', 'ERR195527', 'ERR1002060', 'ERR193771', 'ERR1002062', 'ERR1002071', 'ERR195526', 'ERR193773', 'ERR1002058', 'ERR1002073', 'ERR1011404', 'ERR1002066', 'ERR1010665', 'ERR1002069', 'ERR1002070', 'ERR1002068', 'ERR1002072', 'ERR193770', 'ERR195528', 'ERR1002067'}
ALX4 {'ERR1002075', 'ERR194919', 'ERR1002078', 'ERR1002074', 'ERR194920', 'ERR1002076', 'ERR1002080', 'ERR1002081', 'ERR1002079', 'ERR1002077', 'ERR194917', 'ERR194918'}
AR {'ERR193680', 'ERR193972', 'ERR193681', 'ERR193971', 'ERR193678', 'ERR193679', 'ERR193970', 'ERR193973'}
ARGFX {'ERR1002089', 'ERR1002087', 'ERR1002084', 'ERR1002083', 'ERR1002086', 'ERR1002088', 'ERR1002082', 'ERR1002085'}
ARNT2 {'ERR1002094', 'ERR1011039', 'ERR1002093', 'ERR1002097', 'ERR1002095', 'ERR1010300', 'ERR

SOX3 {'ERR1006146', 'ERR1006143', 'ERR1006142', 'ERR1006145', 'ERR1006148', 'ERR1006149', 'ERR1006144', 'ERR1006147'}
SOX30 {'ERR1895460', 'ERR1895459', 'ERR1895463', 'ERR1006154', 'ERR1006150', 'ERR1006157', 'ERR1006156', 'ERR1006155', 'ERR1895457', 'ERR1895458', 'ERR1895462', 'ERR1006153', 'ERR1006151', 'ERR1895456', 'ERR1006152', 'ERR1895461'}
SOX4 {'ERR1006159', 'ERR194235', 'ERR194232', 'ERR1006163', 'ERR1006162', 'ERR1006158', 'ERR194234', 'ERR1006161', 'ERR194233', 'ERR1006164', 'ERR1006165', 'ERR1006160'}
SOX7 {'ERR194106', 'ERR1006167', 'ERR1006172', 'ERR1006170', 'ERR194109', 'ERR1006173', 'ERR194108', 'ERR1006171', 'ERR194107', 'ERR1006168', 'ERR1006166', 'ERR1006169'}
SOX8 {'ERR1006181', 'ERR194509', 'ERR1006184', 'ERR1006185', 'ERR1006187', 'ERR194505', 'ERR194506', 'ERR193925', 'ERR193922', 'ERR1006178', 'ERR194504', 'ERR1006180', 'ERR1006183', 'ERR1006189', 'ERR193923', 'ERR193924', 'ERR1006188', 'ERR1006179', 'ERR1006182', 'ERR1006186', 'ERR194507', 'ERR1006176', 'ERR10

In [33]:
#-------------#
# CIS-BP      #
#-------------#

valid_matrix_ids = set()

# Get valid PWMs
for matrix_id in Jglobals.parse_file(cisbp_txt_file):
    valid_matrix_ids.add(matrix_id)

# For each line...
for line in Jglobals.parse_file(cisbp_tsv_file):

    matrix_ids = re.findall("(M\d{5}_2.00)", line)

    if matrix_ids:

        line = line.strip("\n").split("\t")
        gene_name = line[1]
        species = line[2].replace("_", " ")

        # Skip inferred TFs
        if line[3] != "D":
            continue

        # For each TF...
        for tf in sorted(tfs, key=lambda x: x.gene_name):
            if tf.gene_name.upper() == gene_name.upper() and species == tf.species:
                tf.cisbp.update(valid_matrix_ids.intersection(set(matrix_ids)))

In [34]:
genes = 0
feats = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    subtotal_feats = len(tf.cisbp)
    if subtotal_feats > 0:
        genes += 1
        feats += subtotal_feats
        print(tf.gene_name, tf.cisbp)
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ABF1 {'M01772_2.00', 'M01773_2.00'}
ABF1 {'M00001_2.00', 'M00902_2.00'}
ABF2 {'M01771_2.00'}
ABF2 {'M00069_2.00'}
ABF3 {'M01782_2.00'}
ABF4 {'M01057_2.00', 'M01056_2.00'}
ABI5 {'M01776_2.00'}
ABR1 {'M01051_2.00'}
ACE2 {'M00033_2.00'}
ADR1 {'M00021_2.00'}
AFT1 {'M01513_2.00'}
AFT2 {'M00002_2.00'}
AHL12 {'M00894_2.00'}
AHL13 {'M01672_2.00'}
AHL20 {'M00896_2.00'}
AHL25 {'M00897_2.00'}
ANHX {'M02096_2.00'}
ANL2 {'M02046_2.00'}
APRR2 {'M02300_2.00'}
ARF14 {'M01647_2.00', 'M01648_2.00'}
ARF14 {'M01647_2.00', 'M01648_2.00'}
ARF3 {'M00852_2.00'}
ARF8 {'M01697_2.00'}
ARO80 {'M01577_2.00'}
ARR1 {'M02296_2.00', 'M02297_2.00'}
ARR11 {'M00872_2.00'}
ARR14 {'M00873_2.00', 'M02293_2.00'}
ARR18 {'M02313_2.00'}
ARX {'M00262_2.00', 'M00261_2.00', 'M00264_2.00', 'M00265_2.00', 'M00263_2.00', 'M00260_2.00'}
ASG1 {'M00088_2.00', 'M01581_2.00'}
ASIL1 {'M02264_2.00'}
ATHB-12 {'M00865_2.00', 'M02045_2.00'}
ATHB-15 {'M00864_2.00'}
ATHB-16 {'M02050_2.00'}
ATHB-22 {'M02041_2.00'}
ATHB-4 {'M01065_2.00', 'M02042_2

MYB33 {'M02306_2.00'}
MYB4 {'M02303_2.00'}
MYB44 {'M01082_2.00'}
MYB46 {'M00877_2.00'}
MYB52 {'M00871_2.00'}
MYB57 {'M01072_2.00'}
MYB59 {'M00883_2.00'}
MYB83 {'M01073_2.00'}
MYB94 {'M02298_2.00'}
MYB98 {'M02301_2.00'}
MYC2 {'M01699_2.00'}
MYC3 {'M01712_2.00'}
MYC4 {'M01707_2.00'}
MYR1 {'M02309_2.00'}
MYR2 {'M02294_2.00'}
MYRF {'M02385_2.00'}
Mafb {'M00126_2.00'}
Mafg {'M01814_2.00'}
Mafk {'M00123_2.00'}
Max {'M01753_2.00', 'M00122_2.00', 'M00121_2.00'}
Mecp2 {'M00806_2.00'}
Meis1 {'M00407_2.00'}
Meis2 {'M00437_2.00'}
Meis3 {'M00488_2.00'}
Meox1 {'M00384_2.00'}
Mitf {'M01741_2.00'}
Mlx {'M00752_2.00', 'M01721_2.00'}
Mlxip {'M01743_2.00'}
Mnt {'M01716_2.00'}
Mnx1 {'M00388_2.00'}
Msc {'M01730_2.00'}
Msx1 {'M00501_2.00'}
Msx2 {'M00414_2.00'}
Msx3 {'M00429_2.00'}
Mtf1 {'M00136_2.00'}
Myb {'M00179_2.00'}
Mybl1 {'M00180_2.00'}
Mybl2 {'M00807_2.00'}
Myf6 {'M01742_2.00', 'M00120_2.00'}
Mypop {'M00808_2.00'}
Myrf {'M02386_2.00'}
Mzf1 {'M00772_2.00', 'M00773_2.00'}
NAC001 {'M00969_2.00'}
NAC002 

Zfp105 {'M00148_2.00'}
Zfp202 {'M00770_2.00'}
Zfp263 {'M00766_2.00'}
Zfp263 {'M00766_2.00'}
Zfp3 {'M00782_2.00'}
Zfp300 {'M00774_2.00'}
Zfp691 {'M00142_2.00'}
Zfp711 {'M01863_2.00'}
Zfx {'M00788_2.00'}
Zic1 {'M00137_2.00'}
Zic2 {'M00150_2.00'}
Zic3 {'M00151_2.00'}
Zic5 {'M00780_2.00'}
Zkscan1 {'M00771_2.00'}
Zkscan5 {'M00785_2.00'}
Zscan10 {'M00769_2.00', 'M00767_2.00', 'M00768_2.00'}
Zscan20 {'M00764_2.00'}
Zscan26 {'M00131_2.00'}
Zscan4c {'M00147_2.00'}
acj6 {'M02241_2.00'}
atf-2 {'M00592_2.00'}
atf-6 {'M00594_2.00'}
atf-7 {'M00595_2.00'}
atf-8 {'M00605_2.00'}
athp-1 {'M02543_2.00'}
athp-3 {'M01691_2.00'}
bap {'M00560_2.00', 'M02119_2.00'}
bcl-11 {'M00641_2.00'}
bed-3 {'M00587_2.00'}
bnc-1 {'M00648_2.00'}
bnl {'M01091_2.00'}
bsh {'M02113_2.00'}
cebp-1 {'M00604_2.00'}
ceh-18 {'M00681_2.00'}
ceh-19 {'M02180_2.00'}
ceh-22 {'M00376_2.00'}
ceh-23 {'M00671_2.00'}
ceh-24 {'M02181_2.00'}
ceh-34 {'M00672_2.00'}
ceh-36 {'M00673_2.00'}
ceh-37 {'M00674_2.00'}
ceh-48 {'M01905_2.00'}
ceh-5 {'M0217

In [36]:
#-------------#
# UniPROBE    #
#-------------#

# For each line...
for line in Jglobals.parse_tsv_file(uniprobe_file):
   
    if line[0] == "Protein":
        continue

    gene_name = line[0]
    uniprobe_id = line[1]

    # For each TF...
    for tf in sorted(tfs, key=lambda x: x.gene_name):
        if tf.gene_name.upper() == gene_name.upper() and line[2] == tf.species:
            tf.uniprobe.add(uniprobe_id)

In [37]:
genes = 0
feats = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    subtotal_feats = len(tf.uniprobe)
    if subtotal_feats > 0:
        genes += 1
        feats += subtotal_feats
        print(tf.gene_name, tf.uniprobe)
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

ABF1 {'UP00452'}
AFT1 {'UP00344'}
ARO80 {'UP00329'}
ARX {'UP00584'}
ASG1 {'UP00350'}
Abd-B {'UP00503'}
Ahctf1 {'UP01352'}
Alx3 {'UP00108'}
Alx4 {'UP00187'}
Ar {'UP01353'}
Arid3a {'UP00078'}
Arid5a {'UP00059'}
Arx {'UP00152'}
Ascl2 {'UP00099'}
Atf1 {'UP00020'}
Atf3 {'UP01354'}
BAS1 {'UP00355'}
BCL11A {'UP01464'}
BCL11B {'UP01465'}
BCL6 {'UP00585'}
BEAF-32 {'UP01546'}
Barhl1 {'UP00166'}
Barhl2 {'UP00145'}
Barx1 {'UP00181'}
Barx2 {'UP00151'}
Bbx {'UP00012'}
Bcl6b {'UP00043'}
Bsx {'UP00138'}
CAD1 {'UP00453'}
CBF1 {'UP00309', 'UP00397'}
CEP3 {'UP00278'}
CHA4 {'UP00300'}
CHES-1-like {'UP00520'}
CIN5 {'UP00454'}
CRX {'UP00586'}
CST6 {'UP00455'}
CUP9 {'UP00308'}
Cdx1 {'UP00240'}
Cdx2 {'UP00133'}
Cebpa {'UP01356'}
Cebpb {'UP01357'}
Clamp {'UP00534'}
CrebA {'UP01537'}
Crx {'UP00176'}
D {'UP01515'}
Dbp {'UP01359'}
Dbx1 {'UP00255'}
Dbx2 {'UP00218'}
Dlx1 {'UP00202'}
Dlx2 {'UP00126'}
Dlx3 {'UP00154'}
Dlx4 {'UP00110'}
Dlx5 {'UP00230'}
Dmbx1 {'UP00111'}
Dnajc21 {'UP01348'}
Doc1 {'UP01552'}
Doc2 {'UP01

In [40]:
#-------------#
# SMiLE-seq   #
#-------------#

synonyms = {
    "CEBPb": "CEBPB",
    "cFOS": "FOS",
    "cFOSL2": "FOSL2",
    "cJUN": "JUN",
    "PPARa": "PPARA",
    "PPARg": "PPARG",
    "RXRa": "RXRA",
    "RXRg": "RXRG"            
}

# For each line...
for line in Jglobals.parse_tsv_file(smile_seq):

    if line[0] == "Assay_Type":
        continue

    sra_run = line[13]

    for gene_name in line[16].split("_")[0].split("-"):
        if gene_name in synonyms:
            gene_name = synonyms[gene_name]

        # For each TF...
        for tf in sorted(tfs, key=lambda x: x.gene_name):
            if tf.gene_name.upper() == gene_name.upper() and line[12] == tf.species:
                tf.smile_seq.add(sra_run)

AttributeError: 'TF' object has no attribute '_species'

In [41]:
genes = 0
feats = 0
for tf in sorted(tfs, key=lambda x: x.gene_name):
    subtotal_feats = len(tf.smile_seq)
    if subtotal_feats > 0:
        genes += 1
        feats += subtotal_feats
        print(tf.gene_name, tf.smile_seq)
print("//")
print("Total genes: %s" % genes)
print("Total feats: %s" % feats)

//
Total genes: 0
Total feats: 0


In [37]:
#-------------#
# Cluster TFs #
#-------------#

# Initialize
cluster_num = 0
clusters = {}
clusters_dir = "./Clusters/"
prefix = os.path.join(clusters_dir, "all")
fasta_file = "%s_input.fa" % prefix
cluster_all_seqs = "%s_all_seqs.fasta" % prefix
cluster_clusters = "%s_cluster.tsv" % prefix
cluster_rep_seqs = "%s_rep_seq.fasta" % prefix

if not os.path.exists(fasta_file):
    for pfam_id in sorted(pfam_ids, key=lambda x: len(pfam_ids[x]), reverse=True):
        for uniacc, sequence in pfam_ids[pfam_id]:
            Jglobals.write(fasta_file, ">%s\n%s" % (uniacc, sequence))

if not os.path.exists(cluster_rep_seqs):

    opts = "--min-seq-id 0.3 -c 0.8 --cluster-mode 3 --cov-mode 1"
    cmd = "mmseqs easy-cluster %s %s %s %s" % (fasta_file, prefix, prefix, opts)
    process = subprocess.run(cmd, shell=True, check=True)

if os.path.isdir(prefix):
    shutil.rmtree(prefix)

# For each line...
for line in Jglobals.parse_file(cluster_clusters):

    # Get cluster
    cluster_id, uniacc = line.split("\t")
    if cluster_id not in clusters:
        cluster_num += 1
        clusters.setdefault(cluster_id, cluster_num)

    # For each TF...
    for tf in tfs:
        if uniacc in tf._uniaccs:
            tf.pfam_id = ";".join(tf._pfam_ids)
            tf.cluster_num = clusters[cluster_id]

In [38]:
for tf in sorted(tfs, key=lambda x: x.pfam_id):
    if tf.pfam_id != "HMG_box":
        continue
    print(tf.pfam_id, tf.gene_name, tf.cluster_num, sorted(tf._species)[0])

HMG_box pop-1 803 Caenorhabditis elegans
HMG_box sem-2 2120 Caenorhabditis elegans
HMG_box sox-2 2412 Caenorhabditis elegans
HMG_box sox-3 860 Caenorhabditis elegans
HMG_box NSD2 2036 Homo sapiens
HMG_box swsn-3 946 Caenorhabditis elegans
HMG_box Sry 290 Mus musculus
HMG_box Ssrp1 2119 Mus musculus
HMG_box SMARCE1 946 Homo sapiens
HMG_box Hmgb1 975 Mus musculus
HMG_box Hmgb2 975 Mus musculus
HMG_box Hmgb3 975 Mus musculus
HMG_box Hmgb4 422 Mus musculus
HMG_box Hmgxb4 548 Mus musculus
HMG_box HMGB1 975 Homo sapiens
HMG_box HMGB2 975 Homo sapiens
HMG_box HMGB3 975 Homo sapiens
HMG_box HMGB4 422 Homo sapiens
HMG_box HMGXB4 548 Homo sapiens
HMG_box Wdhd1 1716 Mus musculus
HMG_box UBTF 1023 Homo sapiens
HMG_box UBTFL1 1985 Homo sapiens
HMG_box bbx 418 Drosophila melanogaster
HMG_box Sox100B 2121 Drosophila melanogaster
HMG_box Gm4969 571 Mus musculus
HMG_box TCF7 1859 Homo sapiens
HMG_box TCF7L1 1859 Homo sapiens
HMG_box cic 811 Drosophila melanogaster
HMG_box TCF7L2 1859 Homo sapiens
HMG_b

In [79]:
# For each TF...
for tf in sorted(tfs, key=lambda x: x.gene_name):
    print(tf)

''nuclear factor Y	Q944I5	Q944I5_ARATH	None	MDPMDIVGKSKEDASLPKATMTKIIKEMLPADVRVARDAQDLLIECCVEFINLISSESNEVCNKEDKRTIAPEHVLKALQVLGFGEYVEEVYAAYEQHKYETMQDSQRSVKMNSGAEMTEEEAAAEQQRMFAEARARMNGGVTVPQPEQLEEPQQQQQTSLQS	Arabidopsis thaliana	Unknown	310	nan						SRR2926318;SRR2926319				
1	Q9VGG0	Q9VGG0_DROME	None	MDTPEIHVNISSQTCRVCLETHETNLYVHDEIKYNDLKLELWQLLEAVSKLKWTWTDPNLPMHLCQNCARRLIGAYEFIVEVENAHETLQNLFEQQEVAAKPDEVHVDVVELIDQDDVVSMAQYLSTSFAEQHVEMEEKYGDQDCSAFTSDVGEEPLYASEDRDDEPEDSFQLKPRPDEIENRELSRPSQLGSRLNHSANFIYKCAVCPRVFAKSESLTRHFSQAHKLTADVAAMKLANESCGTGLLTCEHCPRTFKRQDTLRRHMQAFHPDAIALEPEETTDNSARKRIAKRRDCPHCGLSFPVSSLTIHIRRHTGDNPYKCDQCEKAFPRSQDLSLHMRQHTGERPSECKICSKKFISQNKLARHMRLHTGQRPYSCKMCSKSFVQSNDLKIHMRRHTGERPYQCGVCGESFVCGSHLNIHRNRKGHLIAVIPGNEVEANFAADPYVNARVNQRRSEDIERMRLQRIPENQLQQRLENLPKPDVPAMCYKCGVCEQKFKSGALLTVHRNKMSHYEIERVYENPFGKNQKIIKAEYN	Drosophila melanogaster	zf-C2H2	490	nan										
2010315B03Rik	A0A087WQD2	A0A087WQD2_MOUSE	None	MYLEDSPATSFRLGPGWSVQRRRHQRNAMTYADVHVNFTWEEWALLNPSQKSLYKEVMLET

Batf3	Q9D275	BATF3_MOUSE	None	MSQGPPAVSVLQRSVDAPGNQPQSPKDDDRKVRRREKNRVAAQRSRKKQTQKADKLHEEHESLEQENSVLRREISKLKEELRHLSEVLKEHEKMCPLLLCPMNFVQLRSDPVASCLPR	Mus musculus	bZIP_1	34	nan	BATF3_MOUSE.H11MO.0.A									
Baz2a	Q91YE5	BAZ2A_MOUSE	None	MEMEANDHFNFTGLPPAPAASGLKPSPSSGEGLYTNGSPMNFPQQGKSLNGDVNVNGLSTVSHTTTSGILNSAPHSSSTSHLHHPNVAYDCLWNYSQYPSANPGNNLKDPPLLSQFPGGQYPLNGILGGNRQPSSPSHNTNLRAGSQEFWANGTQSPMGLNFDSQELYDSFPDQNFEVMPNGPPSFFTSPQTSPMLGSSIQTFAPSQDVSSDIHPDEAAEKELTSVVAENGTGLVGSLELEEEQPELKMCGYNGSVSSVESLHQEVSVLVPDPTVSCLDDPSHLPDQLEDTPILSEDSLEPFDSLAAEPVSGSLYGIDDAELMGAEDKLPLEGNPVISALDCPALSNANAFSLLADDSQTSASIFVSPTSPPVLGESVLQDNSFGLNSCSDSEQEEIETQSSNFQRPLTEPAPDQPPSTQLHPAVSPTASPAASLTASAEISPAVSPVASSPVPPEVFVAVSPASSPALPAISLEASMTTPVTSPQGSPEPSPAAAFQTVSPARKNVSSAPKARADREETTGGAVAVSGSGDVLKRRIATPEEVRLPLQHGWRREVRIKKGSHRWQGETWYYGPCGKRMKQFPEVIKYLSRNVVHSVRREHFSFSPRMPVGDFFEERDTPEGLQWVQLSAEEIPSRIQAITGKRGRPRNNEKAKNKEVPKVKRGRGRPPKIKMPELLNKTDNRLPKKLETQEILSEDDKAKMTKNKKKMRQKVQRGESQTPVQGQARNKRKQDTKSLKQKDTKKKLKAEKEKMKTKQEKLKEK

CG17822	Q8SZT7	Q8SZT7_DROME	None	MCNLHFERIVDLRAHIQLELKLSLSLHQSYDSPHNYSITNESGFELQLEDSETEDEMQPGVGSRPVYICELCSVQCKRKFEMIQHQRTMHRFDKMPHECDDCIFKCVSKSIMDHHRQGQCSSTEKKHACGKCSYKFMWPENLEQHILLQHSKSSVSNPTGDRHTQGTGDLEKDATEDGIPLLQCPHCDRTYQMKSRLNNHIRDVHINGDRKRKEAIKRFLCSLCGMETRSAAALVTHMRRHTGEKPFKCDLCEMAFPRHSELASHRRMHTGEKPFHCTVCGKDFARSDKLKRHMLTHSGLKPHKCTYCEKSYRQAKDLKLHLQQHTGECPFVCGTCGERFIQSSTLEKHRLMRRHFDEVEAWLRRQK	Drosophila melanogaster	zf-C2H2	301	nan				EXP043490						
CG18011	Q7K1V0	Q7K1V0_DROME	None	MSENAKNNSCLHCSVFSTKYQYQEIFDEFGIELGLQSLLSRHYQLEVSPDPQKQQLLCEVCVTNLIRLFDIDELEREREAAKDAAQGKDSKAEEDPIIITEVQATPPAAKPAKKTVPSRPLTKVLRPVPIIPTREPSARIRNRAAAASNVTPDTSRSPEPPETPSTEDAVSKSEPKLVSKVADQEHFSVLIQNILDEEEAVVEDETAVKEESSEAVQVTEETEAPGQIVILNSEAVTDTNPDENVYIYEHEDVMDPNLDNVKVKPISCSSRLVAQAEESQSDDDIDEGETSNVVLFNFVDIKEKDEVDNIPEYLATVVKTSFEKLTFQWCTVCKHCSLKCPTFESLFSHLSKAHKSRRDVYECPIEGCNKELKGRKFLAMHLVLLHAPVAEIPIYGSCPECKLTFSNILQYNKHSCAHVIKKKRGFRSYCEMCSLEFPSWKRFNFHSQFHLEKHRPRACFVCDYATTNIDELFQHLNYSHEPVGTLFCDLCDRTFRDPSVFMEHNKS

Foxn1	Q61575	FOXN1_MOUSE	None	MVSLLPPQSDVTLPGSTRLEGEPQGDLMQAPGLPDSPAPQNKHANFSCSSFVPDGPPERTPSLPPHSPSIASPDPEQIQGHCTAGPGPGSFRLSPSEKYPGFGFEEGPAGSPGRFLKGNHMPFHPYKRHFHEDIFSEAQTAMALDGHSFKTQGALEAFEEIPVDMGDAEAFLPSFPAEAWCNKLPYPSQEHNQILQGSEVKVKPQALDSGPGMYCYQPPLQHMYCSSQPAFHQYSPGGGSYPVPYLGSPHYPYQRIAPQANAEGHQPLFPKPIYSYSILIFMALKNSKTGSLPVSEIYNFMTEHFPYFKTAPDGWKNSVRHNLSLNKCFEKVENKSGSSSRKGCLWALNPSKIDKMQEELQKWKRKDPIAVRKSMAKPEELDSLIGDKREKLGSPLLGCPPPGLAGPGPIRPMAPSAGLSQPLHPMHPAPGPMPGKNPLQDLLGGHAPSCYGQTYPHLSPSLAPSGHQQPLFPQPDGHLELQAQPGTPQDSPLPAHTPPSHGAKLMAEPSSARTMHDTLLPDGDLGTDLDAINPSLTDFDFQGNLWEQLKDDSLALDPLVLVTSSPTSSSMLPPPPAAHCFPPGPCLAETGNEAGELAPPGSGGSGALGDMHLSTLYSAFVELESTPSSAAAGPAVYLSPGSKPLALA	Mus musculus	Forkhead	33	nan								M01010_2.00	UP00526	
Foxn2	D3Z6Z3	D3Z6Z3_MOUSE	None	MGPVIGMTPDKRAETPGAEKVAGLSQIYKMGSLPEAGDAARPKATLVGSESADDELTNLNWLHESSNLLTNLRLGSEGLPMVSPLYDIEGDEMPSFGPSCYQN	Mus musculus	Forkhead	49	nan										
Foxn3	Q499D0	FOXN3_MOUSE	None	MGPVMPASKKAESSGISVSSGLSQRYRGSGFSKALQEDDDLDFPLPDIRLEEGAMEDEEL

Gata3	P23772	GATA3_MOUSE	None	MEVTADQPRWVSHHHPAVLNGQHPDTHHPGLGHSYMEAQYPLTEEVDVLFNIDGQGNHVPSYYGNSVRATVQRYPPTHHGSQVCRPPLLHGSLPWLDGGKALSSHHTASPWNLSPFSKTSIHHGSPGPLSVYPPASSSSLAAGHSSPHLFTFPPTPPKDVSPDPSLSTPGSAGSARQDEKECLKYQVQLPDSMKLETSHSRGSMTTLGGASSSAHHPITTYPPYVPEYSSGLFPPSSLLGGSPTGFGCKSRPKARSSTEGRECVNCGATSTPLWRRDGTGHYLCNACGLYHKMNGQNRPLIKPKRRLSAARRAGTSCANCQTTTTTLWRRNANGDPVCNACGLYYKLHNINRPLTMKKEGIQTRNRKMSSKSKKCKKVHDALEDFPKSSSFNPAALSRHMSSLSHISPFSHSSHMLTTPTPMHPPSGLSFGPHHPSSMVTAMG	Mus musculus	GATA	29	nan	GATA3_MOUSE.H11MO.0.A			EXP038184				M00166_2.00	UP00032	SRR3402437
Gata4	Q08369	GATA4_MOUSE	None	MYQSLAMAANHGPPPGAYEAGGPGAFMHSAGAASSPVYVPTPRVPSSVLGLSYLQGGGSAAAAGTTSGGSSGAGPSGAGPGTQQGSPGWSQAGAEGAAYTPPPVSPRFSFPGTTGSLAAAAAAAAAREAAAYGSGGGAAGAGLAGREQYGRPGFAGSYSSPYPAYMADVGASWAAAAAASAGPFDSPVLHSLPGRANPGRHPNLDMFDDFSEGRECVNCGAMSTPLWRRDGTGHYLCNACGLYHKMNGINRPLIKPQRRLSASRRVGLSCANCQTTTTTLWRRNAEGEPVCNACGLYMKLHGVPRPLAMRKEGIQTRKRKPKNLNKSKTPAGPAGETLPPSSGASSGNSSNATSSSSSSEEMRPIKTEPGLSSHYGHSSSMSQTFSTVSGHGPSIHPVLSALK

MYB114	Q9FNV8	MY114_ARATH	None	MEGSSKGLRKGAWTAEEDSLLRQCIGKYGEGKWHQVPLRAGLNRCRKSCRLRWLNYLKPSIKRGKFSSDEVDLLLRLHKLLGNRWSLIAGRLPGRTANDVKNYWNTHLSKKHEPCCKTKIKRINIITPPNTPAQKVDIF	Arabidopsis thaliana	Myb_DNA-binding	11	nan						SRR2926483				
MYB115	Q1PDP9	MY115_ARATH	None	MYHQNLISSTPNQNSNPHDWDIQNPLFSIHPSAEIPSKYPFMGITSCPNTNVFEEFQYKITNDQNFPTTYNTPFPVISEGISYNMHDVQENTMCGYTAHNQGLIIGCHEPVLVHAVVESQQFNVPQSEDINLVSQSERVTEDKVMFKTDHKKKDIIGKGQWTPTEDELLVRMVKSKGTKNWTSIAKMFQGRVGKQCRERWHNHLRPNIKKNDWSEEEDQILIEVHKIVGNKWTEIAKRLPGRSENIVKNHWNATKRRLHSVRTKRSDAFSPRNNALENYIRSITINNNALMNREVDSITANSEIDSTRCENIVDEVMNLNLHATTSVYVPEQAVLTWGYDFTKCYEPMDDTWMLMNGWN	Arabidopsis thaliana	Unknown	95	nan						SRR2926483				
MYB117	Q9LQX5	MY117_ARATH	None	MFITEKQVWMDEIVARRASSSWDFPFNDINIHQHHHRHCNTSHEFEILKSPLGDVAVHEEESNNNNPNFSNSESGKKETTDSGQSWSSSSSKPSVLGRGHWRPAEDVKLKELVSIYGPQNWNLIAEKLQGRSGKSCRLRWFNQLDPRINRRAFTEEEEERLMQAHRLYGNKWAMIARLFPGRTDNSVKNHWHVVMARKYREHSSAYRRRKLMSNNPLKPHLTNNHHPNPNPNYHSFISTNHYFAQPFPEFNLTHHLVNNAPITSDHNQLVLPFHCFQGYENNEPPMV

MYRFL	Q96LU7	MRFL_HUMAN	None	MDVVGENEALQQFFEAQGANGTLENPALDTSLLEEFLGNDFDLGALQRQLPDTPPYSASDSCSPPQVKGACYPTLRPTAGRTPAPFLHPTAAPAMPPMHPLQSTSGMGDSCQIHGGFHSCHSNASHLATPLDQSVSSHLGIGCSYPQQPLCHSPGASLPPTKKRKCTQALEDSGECRVWACHCRPMTSRSRSSEVQDPDSEGQNRMPTDQCSPALKWQPCHSVPWHSLLNSHYEKLPDVGYRVVTDKGFNFSPADEAFVCQKKNHFQITIHIQVWGSPKFVETEMGLKPIEMFYLKVFGTKVEATNQIIAIEQSQADRSKKIFNPVKIDLLADQVTKVTLGRLHFSETTANNMRKKGKPNPDQRYFMLVVGLYAANQDQFYLLSAHISERIIVRASNPGQFENDSDALWQRGQVPESIVCHGRVGINTDAPDEALVVCGNMKVMGTIMHPSDSRAKQNIQEVDTNEQLKRIAQMRIVEYDYKPEFASAMGINTAHQTGMIAQEVQEILPRAVREVGDVTCGNGETLENFLMVDKDQIFMENVGAVKQLCKLTNNLEERIEELEIWNRKLARLKRLSSWKSSASEASTISKSSRAVSASSPRRAVHKKNNKVYFSGKRQACPNWVFQTLVITLIAVMAFCALTIVALYILSLKDQDRRVPNLPPSNITSSQEPALLPTASSSAPNTSLVTTPASLQVPEITFCEILPCQETYCCPIRGMKEVSSSPVQRQSEEKEFHQRRWSEDKSKSVLARNALSGPDWESDWIDTTISSIQIMEIQQIIDHQYCIQSLQCGSGNYNYNIPVNKHTPTNVKFSLEINTTEPLIVFQCKFTLGNICFHSKRGTKGLESHREISQEMTQGYQHIWSLPVAPFSDSMFHFRVAAPDLADCSTDPYFAGIFFTDYFFYFYRRCA	Homo sapiens	NDT80_PhoG	2	nan										
MYSM1	Q5VVJ2	MYSM1_H

RHOXF1	Q8NHV9	RHXF1_HUMAN	None	MARSLVHDTVFYCLSVYQVKISPTPQLGAASSAEGHVGQGAPGLMGNMNPEGGVNHENGMNRDGGMIPEGGGGNQEPRQQPQPPPEEPAQAAMEGPQPENMQPRTRRTKFTLLQVEELESVFRHTQYPDVPTRRELAENLGVTEDKVRVWFKNKRARCRRHQRELMLANELRADPDDCVYIVVD	Homo sapiens	Homeodomain	53	MA0719.1	RHXF1_HUMAN.H11MO.0.D						ERR1005874;ERR1005875;ERR1005876;ERR1005877;ERR1005878;ERR1005879;ERR1005880;ERR1005881;ERR193910;ERR193911;ERR193912;ERR193913;ERR194468;ERR194469;ERR194470;ERR194471;ERR194472;ERR194473			
RHOXF2	Q9BQY4	RHXF2_HUMAN	None	MEPPDQCSQYMTSLLSPAVDDEKELQDMNAMVLSLTEEVKEEEEDAQPEPEQGTAAGEKLKSAGAQGGEEKDGGGEEKDGGGAGVPGHLWEGDLEGTSGSDGNVEDSDQSEKEPGQQYSRPQGAVGGLEPGNAQQPNVHAFTPLQLQELERIFQREQFPSEFLRRRLARSMNVTELAVQIWFENRRAKWRRHQRALMARNMLPFMAVGQPVMVTAAEAITAPLFISGMRDDYFWDHSHSSSLCFPMPPFPPPSLPLPLMLLPPMPPAGQAEFGPFPFVIVPSFTFPNV	Homo sapiens	Homeodomain	130	nan							ERR1005882;ERR1005883;ERR1005884;ERR1005885;ERR1005886;ERR1005887;ERR1005888;ERR1005889			
RHOXF2B	P0C7M4	RHF2B_HUMAN	None	MEPPDQCSQYMTSLLSPAVDDEKELQDMNAMVLSLTEEVKEEEEDAQ

RunxB	Q9VRA9	Q9VRA9_DROME	None	MHISAEVSSTTSNQIQQQQQQHQQQQQHQLLQHQQQQTATTTTTKRRNAESSASSNNNNNNNTSTTNNNNTNNNNNSTTNNNNNNNNNNVKTKPVDTSPYLTPENLIERTVDVLLAEHPGELVKTGSPHVVCTTLPTHWRSNKTLPIAFKVLALGEVMDGTIVTIRAGNDENFCGELRNCTAVMKNQVAKFNDLRFVGRSGRGKSFTLTIVISTNPIQIATYTKAIKVTVDGPREPRSKVRHQGFHPFAFGPQRFGPDPLMAGLPFKLPGFAHHLVGMHSHLHAPDWRAHMALGGRPAAFTAAPFFGHHAAAFPTASGLRGLSGDSQQHQQQQQQHQLATVGAAHSTTSPEGSPTTTTTSGTQLSAFVQPPMTSSPPPVTSLQHDNNNNNSNNNNSSSHIDAGFESDSISVTGSPRKSLGSPLTHDEEEAEAEAEAEAEAEAEAEEAEVGGLSRNGGGIQGPLHSESSPGSGGAFTALIQRSGKNPTELFGGFAAAGGNHFAPSGHSFNPALAAQLFLQSPLLPQSSQWLYTQLYGSYSDLPWLRNAAAAAAANINPGQENSGIPPLGSNPDHDGVNLIKRCVTLITHNPPDAENANPNASPPVSSTRRSPSPVETIDLDDVSTTSRSASGSSGHGGGVGGGGAVGPIRTRTPKPSADVWRPY	Drosophila melanogaster	Runt	2	nan										
Rx	Q9W2Q1	RX_DROME	None	MSGSGGVANSSSTAAANNTAATFQHIFEQLVQQGGGNHKLPPKQLEQLRHLLGNVRDAKNLQMIVEKFKNLEQFHEHYAAHLANNNTVISTEDSNDLVKDNARKYGSGGQTLTPRHTIDAILGLKNRNGAANGSGRNPETVSDGSVDPSLGDDDATDLRCGMTLTQLRSMDNHMASMLQQHAKNGGALPYGPPTPPGGQQPQVPNATPLHHGQQMGGQAGHATHAGHGHPTHHGHA

WRKY9	Q9C9F0	WRKY9_ARATH	None	MGFDFSTSKSKAKRQKRIEVRFASPLMGIDLSLKLEAEEKKKEIEGSKHSRENKEDEEHDASGDEDEQMVKEDEDDSSSLGLRTREEENEREELLQLQIQMESVKEENTRLRKLVEQTLEDYRHLEMKFPVIDKTKKMDLEMFLGVQGKRCVDITSKARKRGAERSPSMEREIGLSLSLEKKQKQEESKEAVQSHHQRYNSSSLDMNMPRIISSSQGNRKARVSVRARCETATMNDGCQWRKYGQKTAKGNPCPRAYYRCTVAPGCPVRKQVQRCLEDMSILITTYEGTHNHPLPVGATAMASTASTSPFLLLDSSDNLSHPSYYQTPQAIDSSLITYPQNSSYNNRTIRSLNFDGPSRGDHVSSSQNRLNWMM	Arabidopsis thaliana	WRKY	3	nan										
WT1	P19544	WT1_HUMAN	None	MGSDVRDLNALLPAVPSLGGGGGCALPVSGAAQWAPVLDFAPPGASAYGSLGGPAPPPAPPPPPPPPPHSFIKQEPSWGGAEPHEEQCLSAFTVHFSGQFTGTAGACRYGPFGPPPPSQASSGQARMFPNAPYLPSCLESQPAIRNQGYSTVTFDGTPSYGHTPSHHAAQFPNHSFKHEDPMGQQGSLGEQQYSVPPPVYGCHTPTDSCTGSQALLLRTPYSSDNLYQMTSQLECMTWNQMNLGATLKGVAAGSSSSVKWTEGQSNHSTGYESDNHTTPILCGAQYRIHTHGVFRGIQDVRRVPGVAPTLVRSASETSEKRPFMCAYPGCNKRYFKLSHLQMHSRKHTGEKPYQCDFKDCERRFSRSDQLKRHQRRHTGVKPFQCKTCQRKFSRSDHLKTHTRTHTGKTSEKPFSCRWPSCQKKFARSDELVRHHNMHQRNMTKLQLAL	Homo sapiens	zf-C2H2	42	nan	WT1_HUMAN.H11MO.0.C;WT1_HUMAN.H11MO.1.B			EXP0373

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [107]:
#-------------#
# Cluster TFs #
#-------------#

# Initialize
clusters_dir = "./Clusters/"

# For each Pfam ID...
for pfam_id in sorted(pfam_ids, key=lambda x: len(pfam_ids[x]), reverse=True):

    # Initialize
    cluster_num = 0
    clusters = {}
    prefix = os.path.join(clusters_dir, "+".join(pfam_id))
    fasta_file = "%s_input.fa" % prefix
    cluster_all_seqs = "%s_all_seqs.fasta" % prefix
    cluster_clusters = "%s_cluster.tsv" % prefix
    cluster_rep_seqs = "%s_rep_seq.fasta" % prefix

    if not os.path.exists(fasta_file):
        for uniacc, sequence in pfam_ids[pfam_id]:
            Jglobals.write(fasta_file, ">%s\n%s" % (uniacc, sequence))

    if not os.path.exists(cluster_rep_seqs):

        # Default parameters for kClust (PMID:23945046):
        # Three similarity criteria are used to decide if a query is added to a cluster:
        #
        #   (1) The sequence similarity score from the 4-mer-based dynamic programming algorithm
        #       is larger than a minimum BLOSUM62 score per column (default 1.12 half bits, which
        #       corresponds to a sequence identity of 30%, see Additional file 1: Figure S2);
        #   (2) the alignment achieves an E-value less than a defined threshold (default value 1E-3); and
        #   (3) the alignment covers at least 80% of the residues of the representative sequence.
        #
        # This criterion ensures that clusters contain sequences with nearly identical domain composition.

        cmd = "mmseqs easy-cluster %s %s %s --min-seq-id 0.3 --cluster-reassign 1" % \
              (fasta_file, prefix, prefix)
        process = subprocess.run(cmd, shell=True, check=True)

    if os.path.isdir(prefix):
        shutil.rmtree(prefix)

    # For each line...
    for line in Jglobals.parse_file(cluster_clusters):

        # Get cluster
        cluster_id, uniacc = line.split("\t")
        if cluster_id not in clusters:
            cluster_num += 1
            clusters.setdefault(cluster_id, cluster_num)

        # For each TF...
        for tf in tfs:
            if uniacc in tf._uniaccs:
                tf.pfam_id = ";".join(pfam_id)
                tf.cluster_num = clusters[cluster_id]

KeyboardInterrupt: 