# Iterate in IPython!

Quick and interactive

In [7]:
from itertools import chain

def tf_to_gene_dict(gene_list, gene_to_tf_dict):
    """

    :param gene_list:
    :param gene_to_tf_dict:
    :return:
    """
    # Get a set of TFs from the dictionary values
    tf_set = set(chain(*gene_to_tf_dict.values()))

    # Initialize dictionary with TFs as keys
    tf_dict = {tf: None for tf in tf_set}

    # For each TF
    for tf in tf_dict.keys():

        # Add the gene to the set if the TF is a associated with it
        tf_match = []
        for gene in gene_list:
            try:
                if tf in gene_to_tf_dict[gene]:
                    tf_match.append(gene)
                    # break
            except KeyError:
                pass
        tf_dict[tf] = set(tf_match)
    return tf_dict

Generate some random data that works with the function

In [8]:
import random
import string

def rand_str():
    return ''.join(random.choices(string.ascii_uppercase, k=8))

def synth_data(n=1000, m=1000):
    genes = [rand_str() for _ in range(n)]
    tfs = [rand_str() for _ in range(m)]
    
    gene_list = [g for g in genes if random.random() < 0.9]
    genes_to_tfs = {
        g: [tf for tf in tfs if random.random() < 0.1] 
        for g 
        in genes 
        if random.random() < 0.9
    }
    return gene_list, genes_to_tfs

Scale it to a reasonable size so it takes a little time, but not too much.

In [15]:
n = 600

random.seed(0)
gene_list, gene_to_tf_dict = synth_data(n, n)

%time tf_dict = tf_to_gene_dict(gene_list, gene_to_tf_dict)

CPU times: user 345 ms, sys: 0 ns, total: 345 ms
Wall time: 346 ms


Try optimizing the code under test

In [16]:
import collections

def tf_to_gene_dict2(gene_list, gene_to_tf_dict):
    """
    For all the genes in *gene_list*, flip the *gene_to_tf_dict* mapping
    around and with *tf* as a key, return the set of all related genes.

    :param gene_list:
    :param gene_to_tf_dict:
    :return:
    """
    genes_to_return = set(gene_list)
    
    def gene_tf_pairs():
        for gene, tfs in gene_to_tf_dict.items():
            if gene not in genes_to_return:
                continue
            for tf in tfs:
                yield gene, tf

    tfs_to_genes = collections.defaultdict(set)

    for gene, tf in gene_tf_pairs():
        tfs_to_genes[tf].add(gene)
    
    return dict(tfs_to_genes)

In [20]:
n = 600

random.seed(0)
gene_list, gene_to_tf_dict = synth_data(n, n)

%timeit tf_dict2 = tf_to_gene_dict2(gene_list, gene_to_tf_dict)

6.6 ms ± 496 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
n = 200
random.seed(1)
for _ in range(50):
    gene_list, gene_to_tf_dict = synth_data(n, n)
    assert (
        tf_to_gene_dict(gene_list, gene_to_tf_dict) 
        == tf_to_gene_dict2(gene_list, gene_to_tf_dict)
    )