CisReg database:

* Cis_include_genome2  or Cis_include_mRNA
  * RFXXXX/
    * RFXXXX.filtered.clustal
    * RFXXXX.filtered.struct
    * Cis_flanks-XX/
      * ID-SS-EE_known_nt.fasta


Overall Procedure:
For each family:
    1. Read sequences and alignment from RFXXXX.filtered.clustal
    2. For each pair of sequences, compute PSI and SCI. For the PSI the sequences are realigned, but for the SCI the Rfam alignment is passed to RNAalifold
    3. From pairs with PSI>0.95 keep only one. This avoids having too-similar alignments at the end.
    4. Write the pair as a fasta ref file
    5. Find the two context-added fasta files of the pair from *known_nt.fasta
       - If not found, swap start and end and search for that file
    6. Merge the known_nt.fasta files and report as raw file
    
** Note: In LocalFold dataset seq ids on reverse strand have Rfam-id with swapped start and end locations**

In [1]:
# Notebook convenience only: turn on the 'usability/codefolding/main' nbextension
# through the notebook package. Not needed by any of the benchmark functions below.
import notebook
E = notebook.nbextensions.EnableNBExtensionApp()
E.enable_nbextension('usability/codefolding/main')

import glob
import os, sys
from Bio import AlignIO, SeqIO
from Bio.Align import MultipleSeqAlignment
import itertools

import pandas as pd


# Settings. IMPORTANT: update the paths below to point at the local CisReg dataset folders.
EXTREME_SI = 95 # from two extremely similar seqs discard one (not referenced by the visible cells)
# Roots of the two CisReg datasets; the code concatenates family ids directly, so keep the trailing '/'.
CIS_GENOME_PATH = '/home/milad/DataBase/CisReg/Cis_include_genome2/'
CIS_MRNA_PATH = '/home/milad/DataBase/CisReg/Cis_include_mRNA/'
regx_gaps = '[-.~_]'  # regex character class of symbols treated as alignment gaps (see remove_gap_columns)

# Directory holding the Vienna RNA executables; prepended verbatim to the command strings below.
VIENNA_BIN_PATH = '/home/milad/software/bin/'
# Energy parameter file handed to the folding tool via "-P" when quake_params is requested.
QUAKE_PARAM_FILE = '/home/milad/workspace/mmfold/src/misc/rna_turner2004_ML_up_penalty.par '
# Command strings; with spaces removed they also serve as output directory names in compute_part_func.
RNAFOLD =  'RNAfold -p --noPS ' 
RNAPLFOLD ='RNAplfold '
import numpy as np

# Import libraries located in relation to this file
# tools_dir = '/home/milad/workspace/rnaalignclust/bin/analysis/tools/' #os.path.join(parent_dir, '/bin/analysis/tools/')
# print "tools_dir", tools_dir
# sys.path.insert(0, tools_dir)


import re

# -------------
def remove_gap_columns(malign):
    ''' Remove all-gap columns from a biopython multiple-alignment.

    A column counts as all-gap when nothing is left of it after deleting
    every character matched by the module-level ``regx_gaps`` pattern.

    malign: alignment supporting len(), row indexing and 2D slicing
            (``malign[:, c]`` is expected to yield the column as a string).
    Returns the pruned multiple-alignment. An empty alignment is returned
    unchanged (previously None was returned, which contradicted the
    documented return value and broke callers that take len() of the result).
    '''
    if len(malign) == 0:
        # Nothing to prune: hand back the empty alignment itself.
        return malign

    # Walk right-to-left so that dropping a column never shifts the
    # indices of the columns that are still to be inspected.
    for c in reversed(range(len(malign[0]))):
        if not re.sub(regx_gaps, '', malign[:, c]):
            malign = malign[:, 0:c] + malign[:, c + 1:]  # concat left & right side of column c

    return malign

   # End def remove_gap_columns   


In [2]:
from subprocess import *

def sub_dotplot(dp, lcontext, rcontext):
    '''Cut the central square block out of a square dot-plot matrix.

    The first lcontext and the last rcontext rows and columns are dropped,
    which extracts the dot plot of the RNA itself from the dot plot of the
    context-extended RNA. Asserts that dp is square and at least
    lcontext + rcontext wide.'''
    side, width = dp.shape[0], dp.shape[1]
    assert (side == width)
    assert (side >= lcontext + rcontext)
    window = slice(lcontext, side - rcontext)
    return dp[window, window]

def upper_part (dpX):
    ''' Flatten the strictly-upper triangle (diagonal excluded) of dpX into a 1D array.
    The matrix is expected to hold base-pair probabilities.'''
    row_idx, col_idx = np.triu_indices(dpX.shape[0], 1)
    return dpX[row_idx, col_idx]

# TODO: Add all repeatedly used functions into a library accros all tools

def dotbracket_to_dict(struct):
    '''Convert a dot-bracket string into a dictionary of its base pairs.

    Keys are "left:right" with !ONE! based positions and every value is 1,
    e.g. "((..))" gives {'1:6': 1, '2:5': 1}.
    Asserts that struct only contains ".", "(" and ")" and that every
    opening bracket gets closed.'''
    leftover = struct.replace('(', '').replace(')', '').replace('.', '')
    assert len(leftover) == 0

    open_positions = []
    pairs = {}
    # Count positions from 1 so that the keys need no further shifting
    for position, symbol in enumerate(struct, 1):
        if symbol == '(':
            open_positions.append(position)
        elif symbol == ')':
            partner = open_positions.pop()
            pairs["{}:{}".format(partner, position)] = 1

    assert len(open_positions) == 0
    return pairs


def compute_part_func(infile_fa, seq_names, outdir_path="./", use_plfold=False, quake_params=False, use_cache = False):
    '''Runs Vienna RNAfold/RNAplfold with partition function for all sequences inside input fasta file.

    infile_fa:    fasta file that is fed to the tool on stdin.
    seq_names:    sequence names, only used to look for "<name>_dp.ps" files when use_cache is set.
    outdir_path:  prefix of the output directory. The tool name (command string without
                  spaces) is appended WITHOUT a path separator, so a prefix lacking a
                  trailing "/" yields a sibling directory, not a sub-directory.
    use_plfold:   run RNAPLFOLD instead of RNAFOLD.
    quake_params: add "-P QUAKE_PARAM_FILE" to the command line.
    use_cache:    caching is not implemented; if set and all ps files already exist a
                  NotImplementedError is raised.

    Returns the directory in which the tool was run (its output files are written there).
    Raises IOError if infile_fa is missing and RuntimeError if RNAfold wrote something to
    stderr other than the known "scaling factor" / "free energy" notices.
    The exit status of the tool is not inspected.
    '''
    import shlex
    from subprocess import Popen, PIPE

    tool = RNAPLFOLD if use_plfold else RNAFOLD
    # NOTE(review): plain concatenation is kept on purpose so the on-disk layout does
    # not change; os.path.join was probably intended.
    out_dir = outdir_path + tool.replace(' ', '')
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    if not os.path.isfile(infile_fa):
        raise IOError("Fastafile not found: {}".format(infile_fa))

    # Only scan for cached ps files when caching was actually requested
    if use_cache and all(os.path.isfile(os.path.join(out_dir, sname+'_dp.ps')) for sname in seq_names):
        raise NotImplementedError("Sequence names for caching are not correctly set")

    command = VIENNA_BIN_PATH + tool
    if quake_params:
        command += " -P %s " % QUAKE_PARAM_FILE

    with open(infile_fa) as in_rna:
        # Argument list + cwd instead of a "cd <dir>; <cmd>" shell string: directory
        # names are derived from sequence ids and must not be interpreted by a shell.
        p = Popen(shlex.split(command), cwd=out_dir, stdin=in_rna, stdout=PIPE, stderr=PIPE)
        out, err = p.communicate()

    if err:
        print("{} {}".format("Error in calling RNAfold for ", infile_fa))
        print(out)
        print(err)

        # With long sequences RNAfold prints the scaling factor to stderr; that is harmless
        harmless_notice = "scaling factor" in err or "free energy" in err
        if not use_plfold and not harmless_notice:
            raise RuntimeError

    return out_dir

def parse_dp_ps(ps_file):
    '''Extracts base pair probabilities from a vienna dot plot ps file.

    The sequence is collected from the lines between the "/sequence" line
    and the ") } def" line; its length fixes the size of the matrix.
    Every line of the form "i j sqrt(p) ubox" contributes p = sqrt(p)^2.

    returns: numpy matrix M of shape (len(seq), len(seq)) with
             M[i-1, j-1] = p(i,j); only the cell in the orientation written
             in the file is filled, all other cells are zero.
    Asserts that no base pair is listed twice.'''
    import re
    ubox_pattern = re.compile(r'^(\d+)\s+(\d+)\s+(\d+\.\d+)\s+ubox\s*')

    seq_pieces = []
    ubox_entries = []
    inside_seq = False
    with open(ps_file) as in_ps:
        for line in in_ps:
            # Sequence section
            if "/sequence" in line:
                inside_seq = True
            elif inside_seq and ") } def" in line:
                inside_seq = False
            elif inside_seq:
                seq_pieces.append(line.rstrip().rstrip("\\"))

            # Probability entries (header lines mentioning ubox do not match the pattern)
            if "ubox" in line:
                hit = ubox_pattern.match(line)
                if hit:
                    ubox_entries.append(hit.groups())

    seq_len = len("".join(seq_pieces))
    bp_prob_mat = np.zeros((seq_len, seq_len))

    seen_pairs = set()
    for i_txt, j_txt, sqrt_txt in ubox_entries:
        i, j = int(i_txt), int(j_txt)
        # A pair is the same pair whichever index comes first
        pair = (min(i, j), max(i, j))
        assert (pair not in seen_pairs)
        seen_pairs.add(pair)
        bp_prob_mat[i-1, j-1] = float(sqrt_txt)*float(sqrt_txt)
    return bp_prob_mat

In [57]:
def decode_cisreg_entry(famid, flank_len, runquake=False, dataset='genome'):
    ''' Input is a RFAM id and the flanking size
    Returns Constrained folded(reference) structure and the clustal alignment in format
    [AlignIO, pandas_df_struct]
    '''
    if dataset == 'genome':
        db_path = CIS_GENOME_PATH
    elif dataset == 'mrna':
        db_path = CIS_MRNA_PATH
    else:
        raise RuntimeError("Unknown dataset type: {}".format(dataset))
    
    # ============================= A =================================
    # set file and dirs names then check exitance and remove files within outdir
    clust_filtered_file = '{}{}/{}.filtered.clustal'.format(db_path, famid, famid )
    struct_file = '{}{}/{}.struct'.format(db_path, famid, famid )
    
    if not os.path.isfile(clust_filtered_file):
        raise IOError("Clustal file not found: {}".format(clust_filtered_file))
    if not os.path.isfile(struct_file):
        raise IOError("CisReg-Struct file not found: {}".format(clust_filtered_file))
    

    # ============================= B =================================
    # Read struct file
    import pandas as pd
    df_struct = pd.DataFrame.from_csv(struct_file, sep="\t")
    print "Number of sequences in .struct: ", len(df_struct)
    
    df_struct['flanked-id'] = 'flanked'

    # ============================= C =================================
    # Read clustal file

    clustal_handle = open(clust_filtered_file, 'r')
    print clust_filtered_file
    clustal_alignment = AlignIO.read(clustal_handle, "clustal")
    print "Number of sequences in alignment: ", len(clustal_alignment)
    
    # ============================= C =================================
    # Sanity check struct and clustal sequnces match
    if len(df_struct) !=  len(clustal_alignment):
#         raise RuntimeError
        print ("WARNING: decode_cisreg_entry({}) len(df_struct) !=  len(clustal_alignment) {}!={} \n".format(
                           famid, len(df_struct), len(clustal_alignment)))
    
    assert len(df_struct) <=  len(clustal_alignment) # TODO: Why some sequnces are missing from the .struct ?
    for seq in clustal_alignment:
        print seq.id,
        seq_reverese_corrected = seq.id
        if seq.id not in df_struct.index:
            # On reverse strand the starting ending positions are swapped, so check for both
            splits = seq.id.replace("/", " ").replace("-", " ").split()
            assert(len(splits)==3)
            seq_id_reverse = "{}/{}-{}".format(splits[0], splits[2], splits[1])
            if seq_id_reverse not in df_struct.index: # TODO: Why some sequnces are missing from the .struct ?
#                 raise RuntimeError
                print(" WARNING decode_cisreg_entry Fam:{} struct-clustal mismatch for seq {}\n".format( 
                       famid, seq.id))
                continue
            seq_reverese_corrected = seq_id_reverse
            

        # ============================= D =================================
        # Get the fasta file of specific flanking range
        fasta_flanked_seq = get_extended_fasta_file(seq, famid, flank_len, dataset=dataset)
        # Get the extended_id, which is different from seq.id when flanking is non-zero
        with open(fasta_flanked_seq, "r") as in_fasta_handle:
            fa_recs = list(SeqIO.parse(in_fasta_handle, "fasta"))
        assert len(fa_recs) == 1
        fasta_flanked_id =  fa_recs[0].id

        df_struct.set_value(seq_reverese_corrected, 'flanked-id', fasta_flanked_id)
        
        df_struct.set_value(seq_reverese_corrected, 'flanked-fasta-path', fasta_flanked_seq)
        
    print
    return clustal_alignment, df_struct

def get_extended_fasta_file(seq, famid, flank_len, dataset='genome'):
    '''Locate the flanked fasta file of one sequence for a given flank length.

    Candidates inside <db>/<famid>/Cis_flanks-<flank_len>/ are tried in this order:
      1. <id>_known_nt.fasta
      2. the same path with '_known_' replaced by '_unknown_' (a warning is printed)
      3. <id with start and end swapped>_known_nt.fasta
      4. its '_unknown_' counterpart (a warning is printed)
    where <id> is seq.id with "/" replaced by "_".

    Returns the path of the first candidate that exists.
    Raises RuntimeError for an unknown dataset and IOError if the flank
    directory or all four candidates are missing.
    '''
    #TODO: What to do with unknown_nt i.e. sequences with unknown nucleotides
    if dataset not in ('genome', 'mrna'):
        raise RuntimeError("Unknown dataset type: {}".format(dataset))
    db_path = CIS_GENOME_PATH if dataset == 'genome' else CIS_MRNA_PATH

    fam_flank_path = '{}{}/Cis_flanks-{}/'.format(db_path, famid, flank_len )
    if not os.path.isdir(fam_flank_path):
        raise IOError("Family flanking dir does not exist: {}".format(fam_flank_path))

    # --- id exactly as given ---
    direct_known = "{}/{}_known_nt.fasta".format(fam_flank_path, seq.id.replace("/", "_") )
    if os.path.isfile(direct_known):
        return direct_known

    direct_unknown = direct_known.replace('_known_', '_unknown_')
    if os.path.isfile(direct_unknown):
        print("Warning: benchmarking flanked sequnce with unknown nucleotides {}".format(direct_unknown))
        return direct_unknown

    # --- reverse case: sometimes the starting and ending positions are swapped ---
    id_parts = seq.id.replace("/", " ").replace("-", " ").split()
    assert(len(id_parts)==3)
    swapped_name = "{}_{}-{}".format(id_parts[0], id_parts[2], id_parts[1])

    reverse_known = "{}/{}_known_nt.fasta".format(fam_flank_path, swapped_name )
    if os.path.isfile(reverse_known):
        return reverse_known

    reverse_unknown = reverse_known.replace('_known_', '_unknown_')
    if os.path.isfile(reverse_unknown):
        print("Warning: benchmarking flanked sequnce with unknown nucleotides {}".format(reverse_unknown))
        return reverse_unknown

    # none of : direct, reverse, known, unknown
    raise IOError("Fasta file not found: {}".format(reverse_known))


def get_expected_accuracy(reference_struct, dp_matrix):
    '''Mean probability that dp_matrix assigns to the base pairs of reference_struct.

    reference_struct: dot-bracket string of the reference structure.
    dp_matrix: square numpy matrix with ZERO based base indices whose side
               equals len(reference_struct).
    The one based "i:j" keys delivered by dotbracket_to_dict are shifted by
    one to address the matrix.'''
    side, width = dp_matrix.shape[0], dp_matrix.shape[1]
    assert side == width
    assert side == len(reference_struct)

    reference_pairs = dotbracket_to_dict(reference_struct)
    pair_probs = []
    for pair_key in reference_pairs:
        left, right = [int(idx) for idx in pair_key.split(":")]
        pair_probs.append(dp_matrix[left-1, right-1])

    return (sum(pair_probs, 0.0)/len(reference_pairs))

def _split_seq_id(seq_id):
    '''Split "<accession>/<start>-<end>" (or "_" in place of "/") into [accession, start, end] strings.'''
    # Anchoring on the trailing "<sep>start-end" keeps accessions that contain
    # "_" or "-" themselves (e.g. NC_000913.1) in one piece.
    match = re.match(r'^(.+)[/_](\d+)-(\d+)$', seq_id)
    if match:
        return list(match.groups())
    # Fallback: the original tokenisation, for any other separator mix
    parts = seq_id.replace('/',' ').replace('_', ' ').replace('-', ' ').split()
    assert len(parts) == 3
    return parts


def get_left_right_context_lengths(motif_id, context_id):
    '''Derive the context sizes around a motif from the coordinates encoded in two ids.

    motif_id:   id of the motif,         "<accession>/<start>-<end>"
    context_id: id of the flanked motif, same layout ("_" may replace "/")
    On the reverse strand start is larger than end in both ids.

    Prints the two ids with the raw coordinate differences and returns
      (context_len_left, context_len_right, motif_len,
       context_start, context_end, motif_acc)
    with both context lengths non-negative and motif_len = |end - start| + 1.

    Raises RuntimeError if the accessions differ or the two ids lie on
    different strands; asserts that the context encloses the motif.
    '''
    motif_acc, motif_start, motif_end = _split_seq_id(motif_id)
    context_acc, context_start, context_end = _split_seq_id(context_id)

    if motif_acc != context_acc:
        raise RuntimeError("Mismtach motif and context accesions {} {}".format(motif_id, context_id))

    motif_start, motif_end = int(motif_start), int(motif_end)
    context_start, context_end = int(context_start), int(context_end)

    context_len_left = motif_start - context_start 
    context_len_right = context_end - motif_end 
    # Same text as the former Python-2 print statement, valid on Python 2 and 3
    print("{} {} {}".format(motif_id, context_id,
                            "context_len_left {}, context_len_right {}".format(context_len_left, context_len_right)))

    # Verify and adapt reverse strands
    # Sorry for the complication implemented below and imposed by the accession encoding
    on_reverse = False
    if motif_start > motif_end:
        if context_start > context_end:
            on_reverse = True
        else: 
            raise RuntimeError("Mismatch1 of context right left positions for seq {}".format(motif_id))
    else:
        if context_start > context_end:
            raise RuntimeError("Mismatch2 of context right left positions for seq {}".format(motif_id))

    if on_reverse:
        # Coordinates run downwards on the reverse strand, so both differences are <= 0
        context_len_left *= -1
        context_len_right *= -1
        assert motif_start > motif_end
        assert context_start > context_end 
        assert context_start >= motif_start
        assert motif_end >= context_end
    else:
        assert motif_start < motif_end
        assert context_start < context_end 
        assert context_start <= motif_start
        assert motif_end <= context_end
    
    motif_len = abs(motif_end - motif_start) + 1
    return context_len_left, context_len_right, motif_len, context_start, context_end, motif_acc
    
def generate_assymetric_fasta(motif_seq_id, target_context_len, fasta_supercontext, output_path):
    '''Trim a flanked ("supercontext") fasta record down to a total context of target_context_len.

    motif_seq_id:       id of the motif, "<accession>/<start>-<end>".
    target_context_len: total number of context nucleotides (left + right) to keep.
    fasta_supercontext: fasta file holding exactly one record, the motif with its full context.
    output_path:        directory of the output file, which gets the same base name
                        as fasta_supercontext.

    The surplus context is removed from both sides in proportion to the
    available left/right context, so the left:right ratio is roughly kept.
    The written record id carries the adjusted coordinates.

    Raises IOError if the input file is missing and RuntimeError if less
    context than requested is available. Returns None.
    '''
    if not os.path.isfile(fasta_supercontext):
        raise IOError("Not found fasta_supercontext:{} ".format(fasta_supercontext))
    # Open supercontext to get the id as well as the super-sequence
    with open(fasta_supercontext, "r") as in_fasta_handle:
        fa_recs = list(SeqIO.parse(in_fasta_handle, "fasta"))
    assert len(fa_recs) == 1
    super_seq =  fa_recs[0]
    
    super_left_len, super_right_len, motif_seq_len, super_start, super_end, accession = get_left_right_context_lengths(motif_seq_id, super_seq.id)
    
    total_super_len = super_left_len + super_right_len
    if total_super_len < target_context_len:
        raise RuntimeError("Not enough super context available l:{} r:{} target:{}".format( super_left_len, super_right_len, target_context_len))
    
    len_to_remove = total_super_len - target_context_len 
    # Share of the context that lies left of the motif. Without any context
    # (zero-flank record and target 0) there is nothing to split, and the
    # division below used to fail with ZeroDivisionError.
    if total_super_len > 0:
        left_ratio = super_left_len /(float)(total_super_len)
    else:
        left_ratio = 0.0

    left_len_to_remove = min(super_left_len, int(left_ratio * len_to_remove) )
    right_len_to_remove = len_to_remove - left_len_to_remove
    if right_len_to_remove > super_right_len:
        # Rounding pushed too much to the right side: cap it and move the rest left
        left_len_to_remove = len_to_remove - super_right_len
        right_len_to_remove = super_right_len
        
    assert 0 <= right_len_to_remove  and right_len_to_remove <= super_right_len
    assert 0 <= left_len_to_remove  and left_len_to_remove <= super_left_len
    print("{} {} {} {}".format("right_len_to_remove + left_len_to_remove == len_to_remove",
                               right_len_to_remove , left_len_to_remove , len_to_remove))
    assert right_len_to_remove + left_len_to_remove == len_to_remove
    super_seq_str = str(super_seq.seq)
    assert len(super_seq_str) == (len_to_remove + target_context_len + motif_seq_len)
    
    out_fasta_file = "{}/{}".format(output_path, os.path.basename(fasta_supercontext))
    keep_end = left_len_to_remove + target_context_len + motif_seq_len
    print("Range: {}:{}".format( left_len_to_remove, keep_end))
    out_seq_str = super_seq_str[left_len_to_remove:keep_end]
    assert len(out_seq_str) == target_context_len + motif_seq_len
    if super_start < super_end: # Direct strand
        out_seq_id = "{}/{}-{}".format(accession, super_start+left_len_to_remove, super_end-right_len_to_remove)
    else: # reverse strand: coordinates run downwards, so the signs flip
        out_seq_id = "{}/{}-{}".format(accession, super_start-left_len_to_remove, super_end+right_len_to_remove )
        
    with open(out_fasta_file, "w") as out_fasta_handle:
        out_fasta_handle.write(">{}\n{}\n".format(out_seq_id, out_seq_str))


    
    
def bechmark_sequence(return_dict, seq_id, df_fam_context, famid, context_len,  recalc_dotplots=True, runquake=False, 
                      dataset='genome', use_assymetric_context=False, create_assymetric_context=False, assym_path="./", 
                      target_context_len=400):
    '''Score one sequence: fold it with its context and compare to the reference structure.

    return_dict:    dict-like object; the score is stored as return_dict[seq_id].
    seq_id:         row label in df_fam_context, "<accession>/<start>-<end>".
    df_fam_context: table with the columns 'flanked-fasta-path', 'flanked-id' and
                    'STRUCTURE_CONSTRAINT_MFE' (reference dot-bracket structure).
    context_len:    only used to name the dot-plot output directory.
    recalc_dotplots: must be True; reusing stored dot plots is not implemented.
    runquake:       passed on to compute_part_func as quake_params.
    create_assymetric_context: if set, only write the trimmed fasta file via
                    generate_assymetric_fasta (to assym_path, with
                    target_context_len) and return without scoring.
    famid, dataset and use_assymetric_context (apart from the exclusivity
    assert) are not used here.

    Coordinate parsing and the strand sanity checks are delegated to
    get_left_right_context_lengths instead of repeating them; as a result a
    differing accession between motif and flanked id now raises RuntimeError.
    Returns None.
    '''
    flanked_fasta_path = df_fam_context['flanked-fasta-path'][seq_id]
    assert not (use_assymetric_context and create_assymetric_context)

    if create_assymetric_context:
        generate_assymetric_fasta(seq_id, target_context_len, flanked_fasta_path, assym_path)
        return
    
    flanked_seq_id = df_fam_context['flanked-id'][seq_id]
    extended_seq_id_flat = flanked_seq_id.replace('/','_')
    if not recalc_dotplots:
        raise NotImplementedError("Error: This feature is not implemented yet") #"benchmark_family() Use already computed dotplot ps files"

    import tempfile
    dp_out_base = "./dp-ps-Cis-flanks-{}/".format(context_len) 
    # benchmark_family may run this function in many processes at once, so the
    # directory can appear between an isdir() test and mkdir(); tolerate that.
    try:
        os.mkdir(dp_out_base)
    except OSError:
        if not os.path.isdir(dp_out_base):
            raise
    dp_out_path = tempfile.mkdtemp(suffix=extended_seq_id_flat, dir=dp_out_base)
    dp_outdir = compute_part_func(flanked_fasta_path, [ flanked_seq_id], 
                                  outdir_path=dp_out_path, use_plfold=False, quake_params=runquake)

    # Context sizes on both sides of the motif (strand-corrected, non-negative);
    # the helper also prints the ids together with the raw differences.
    context_len_left, context_len_right = get_left_right_context_lengths(seq_id, extended_seq_id_flat)[:2]

    # Verify and parse dotplot ps file into numpy matrix
    dp_ps = '{}/{}_dp.ps'.format(dp_outdir, extended_seq_id_flat)
    assert(os.path.isfile(dp_ps))
    dp_matrix = parse_dp_ps(dp_ps)

    # Get the subplot of motif from extended-sequence folded dotplot
    sub_dp_matrix = sub_dotplot(dp_matrix, context_len_left, context_len_right)

    seq_score = get_expected_accuracy(df_fam_context['STRUCTURE_CONSTRAINT_MFE'][seq_id], sub_dp_matrix)
    
    return_dict[seq_id] = seq_score
    
def benchmark_family(famid, context_len, recalc_dotplots=True, runquake=False, dataset='genome', use_assymetric_context=False,
                     create_assymetric_context=False, target_context_len=400, parallel=False):
    '''Run bechmark_sequence for every sequence of one family and one context length.

    famid, context_len, runquake, dataset: forwarded to decode_cisreg_entry and
                    bechmark_sequence.
    create_assymetric_context: instead of scoring, write trimmed fasta files into
                    <db>/<famid>/Cis_flanks-Assym<target_context_len // 2>/ and return None.
    target_context_len: total context kept by the trimming; it is now forwarded to
                    bechmark_sequence (it used to be dropped, so 400 was always used
                    while the directory was named after the requested value).
    parallel:       if True, one process is started per sequence and the scores are
                    collected in a multiprocessing manager dict.

    Returns a DataFrame indexed by 'instance-name' with the columns 'bp-accuracy',
    'seq-id', 'dataset', 'fam-id', 'context-len' and 'instance-name', or None when
    create_assymetric_context is set.
    '''
    # Run the tool per family and per context length
    fam_alignment, df_fam_context = decode_cisreg_entry(famid, context_len, runquake=runquake, dataset=dataset)
    assert not (use_assymetric_context and create_assymetric_context)
    
    # decode_cisreg_entry has already rejected any other dataset value
    if dataset == 'genome':
        db_path = CIS_GENOME_PATH
    elif dataset == 'mrna':
        db_path = CIS_MRNA_PATH
    assym_flank_dir = ""
    if create_assymetric_context:
        assym_fam_path = "{}/{}/".format(db_path, famid)
        if not os.path.isdir(assym_fam_path):
            os.mkdir(assym_fam_path)

        # Floor division keeps the directory name an integer on Python 2 and 3 alike
        assym_flank_dir = "{}/Cis_flanks-Assym{}/".format(assym_fam_path, target_context_len // 2)
        if not os.path.isdir(assym_flank_dir):
            os.mkdir(assym_flank_dir)

    # Positional arguments shared by every bechmark_sequence call, in signature order
    shared_args = (df_fam_context, famid, context_len, recalc_dotplots, runquake, dataset,
                   use_assymetric_context, create_assymetric_context, assym_flank_dir, target_context_len)
    seq_ids = list(df_fam_context.index)

    if parallel is True:
        import multiprocessing
        manager = multiprocessing.Manager()
        family_scores_dict = manager.dict()
        jobs = []
        for seq_id in seq_ids:
            p = multiprocessing.Process( target=bechmark_sequence,
                args=(family_scores_dict, seq_id) + shared_args)
            jobs.append(p)
            p.start()

        for proc in jobs:
            proc.join()
        print("RESULT: family_scores_dict")
        print(family_scores_dict)
    else:
        family_scores_dict = dict()
        for seq_id in seq_ids:
            bechmark_sequence(family_scores_dict, seq_id, *shared_args)
    
    if create_assymetric_context:
        # Only fasta files were written; there are no scores to tabulate
        return
    df_family_scores = pd.DataFrame.from_dict(family_scores_dict, orient='index')
    df_family_scores.columns = ['bp-accuracy']
    df_family_scores['seq-id'] = df_family_scores.index
    df_family_scores['dataset'] = dataset
    df_family_scores['fam-id'] = famid
    df_family_scores['context-len'] = context_len
    df_family_scores['instance-name'] = "C{}-{}-".format(context_len, famid) + df_family_scores.index 
    df_family_scores.set_index('instance-name', inplace=True, drop=False)
    return df_family_scores
#     get_expected_accuracy(df_fam_context['STRUCTURE_CONSTRAINT'][seq_id], parse_dp_ps(dp_ps))
# Driver cell: fix the random seeds, run benchmark_family for one family and print summary statistics.
import random
random.seed(10)
np.random.seed(10)
# TODO: CisReg erroneous families: RF00515 RF01418
# df_fam_scores = benchmark_family('RF00032', context_len=3000, runquake=False, dataset='mrna')
# Interesting fams: RF00050
# NOTE(review): with create_assymetric_context=True benchmark_family returns None
# (it only writes the trimmed fasta files), so len(df_fam_scores) below raises
# TypeError -- this matches the traceback recorded under this cell.
df_fam_scores = benchmark_family('RF00515', context_len=500, runquake=False, dataset='genome', create_assymetric_context=True, parallel=True)
import sys
sys.stdout.flush()

# df_fam_scoresq = benchmark_family('RF00032', context_len=100, runquake=True, dataset='mrna')
import sys
sys.stdout.flush()

print len(df_fam_scores)
print df_fam_scores.median()
print df_fam_scores.mean()
# NOTE(review): df_fam_scoresq is only assigned in the commented-out line above, so the
# next three prints depend on a value left over from an earlier notebook run.
print len(df_fam_scoresq)
print df_fam_scoresq.median()
print df_fam_scoresq.mean()
# fam_alignment, df_fam_context, dp_dir = decode_cisreg_entry('RF01418', 0)
# df_fam_context


Number of sequences in .struct:  56
/home/milad/DataBase/CisReg/Cis_include_genome2/RF00515/RF00515.filtered.clustal
Number of sequences in alignment:  56
CP000259.1/744106-744214 AL591981.1/165154-165262 M59757.2/1239-1357 AE017334.2/3710572-3710686 AL009126.3/1620329-1620445 AAPZ02000001.1/1290955-1291071 AL596170.1/182555-182663 CP000919.1/624651-624755 CP000705.1/148104-148220 AAMN01000002.1/828893-829048 AF044978.1/176-335 AE017333.1/1725773-1725889 AL766848.1/175198-175311 CP001033.1/1229224-1229356 AL009126.3/1618161-1618277 CP000023.1/493796-493904 X73308.1/235-367 CP000259.1/678792-678909 AE005176.1/1384872-1385029 AE017225.1/3711140-3711254 CP000557.1/1070259-1070361 CP000002.3/1724430-1724551 AL591981.1/164471-164574 AJ132624.1/224-346 X76083.1/1336-1438 M59757.2/2712-2828 Z54240.2/641-760 X76083.1/2797-2929 AE017225.1/3709736-3709821 AE014133.1/803283-803394 AF068902.1/6383-6487 M59757.2/547-663 X74207.1/3091-3251 X76083.1/647-773 AE005176.1/1080400-1080547 AM263198.1/18914

TypeError: object of type 'NoneType' has no len()

In [216]:
# List the family directories (names matching RF*[0-9]) found in both dataset roots.
# NOTE(review): db_path is only assigned inside functions in the visible cells; this
# print relies on a notebook-level variable defined elsewhere -- confirm.
print db_path
import glob
# Keep only the directory names, i.e. the family ids
rfam_fams_genome = [os.path.basename(d) for d in glob.glob(CIS_GENOME_PATH+'/RF*[0-9]')]
rfam_fams_mrna = [os.path.basename(d) for d in glob.glob(CIS_MRNA_PATH+'/RF*[0-9]')]
print "\n".join(sorted(rfam_fams_genome))
print
print "\n".join(sorted(rfam_fams_mrna))

/home/milad/DataBase/CisReg/Cis_include_genome2/
RF00023
RF00036
RF00038
RF00040
RF00041
RF00048
RF00140
RF00164
RF00171
RF00175
RF00176
RF00182
RF00184
RF00185
RF00192
RF00193
RF00194
RF00196
RF00214
RF00215
RF00220
RF00233
RF00243
RF00250
RF00252
RF00260
RF00290
RF00362
RF00374
RF00375
RF00376
RF00384
RF00385
RF00386
RF00388
RF00389
RF00434
RF00453
RF00459
RF00465
RF00467
RF00468
RF00469
RF00470
RF00481
RF00490
RF00491
RF00496
RF00499
RF00500
RF00501
RF00502
RF00510
RF00515
RF00525
RF00550
RF00552
RF00617
RF00620
RF01047
RF01065
RF01068
RF01313
RF01380
RF01381
RF01382
RF01415
RF01417
RF01418
RF01453
RF01454

RF00031
RF00032
RF00037
RF00109
RF00161
RF00172
RF00179
RF00180
RF00183
RF00207
RF00227
RF00232
RF00259
RF00433
RF00435
RF00436
RF00437
RF00454
RF00460
RF00463
RF00485
RF00524
RF00551
RF00626
RF00632
RF01046
RF01455


In [169]:
# Benchmark the first ten families with and without the quake parameter file and
# collect the per-family score tables.
import pandas as pd
# NOTE(review): the two frames created here (*_mrna) are not the ones appended to in
# the loop (df_benchmark_all, df_benchmark_all_quake), and rfam_fams is not assigned in
# any visible cell (only rfam_fams_genome / rfam_fams_mrna are) -- the loop depends on
# names left over in the notebook session.
df_benchmark_all_mrna = pd.DataFrame()
df_benchmark_all_quake_mrna = pd.DataFrame() #columns=['instance-name', 'seq-id', 'family-id', 'context-len'])
for famid in rfam_fams[0:10]:
    print "***** RFAM Family: ", famid
    for cont_len in [500]:#0 , 100, 200]:
        # No dataset argument is given, so benchmark_family uses its default 'genome'
        df_fam_scores = benchmark_family(famid, context_len=cont_len, runquake=False)
        df_benchmark_all = df_benchmark_all.append(df_fam_scores)

        df_fam_scores_quake = benchmark_family(famid, context_len=cont_len, runquake=True)
        df_benchmark_all_quake = df_benchmark_all_quake.append(df_fam_scores_quake)
    
df_benchmark_all_quake




Number of sequences in .struct:  25
/home/milad/DataBase/CisReg/Cis_include_genome2/RF00194/RF00194.filtered.clustal
Number of sequences in alignment:  25
AB003341.1/1417-1484 AY397695.1/1417-1484 AB047330.1/9674-9741 M15240.1/9667-9734 AB003353.1/1417-1484 AF435866.1/9655-9722 AB080731.1/1417-1484 AB072383.1/1417-1484 AB072387.1/1417-1484 AB003349.1/1417-1484 AB003345.1/1417-1484 AB080733.1/1417-1484 X05259.1/3294-3361 AY280704.1/1417-1484 AB003337.1/1417-1484 AY161374.1/1390-1443 AB003355.1/1417-1484 AY258323.1/9674-9741 AB003343.1/1417-1484 D00156.1/2363-2430 AY161373.1/1390-1443 AY258322.1/9674-9741 AB080198.1/1417-1484
D00156.1/2363-2430 D00156.1_1863-2451 context_len_left 500, context_len_right 21
    TP_score: 0.65
AB080198.1/1417-1484 AB080198.1_917-1484 context_len_left 500, context_len_right 0
    TP_score: 0.64
AB003343.1/1417-1484 AB003343.1_917-1484 context_len_left 500, context_len_right 0
    TP_score: 0.51
AY280704.1/1417-1484 AY280704.1_917-1484 context_len_left 500, c

TypeError: object of type 'float' has no len()

In [169]:
# Same benchmark loop as before, but the accumulator frames are NOT reset (their
# initialisation is commented out), so results are appended to whatever
# df_benchmark_all / df_benchmark_all_quake already hold in the notebook session.
import pandas as pd
# df_benchmark_all = pd.DataFrame()
# df_benchmark_all_quake = pd.DataFrame() #columns=['instance-name', 'seq-id', 'family-id', 'context-len'])
# NOTE(review): rfam_fams is not assigned in any visible cell -- confirm where it comes from.
for famid in rfam_fams[0:10]:
    print "***** RFAM Family: ", famid
    for cont_len in [500]:#0 , 100, 200]:
        # No dataset argument is given, so benchmark_family uses its default 'genome'
        df_fam_scores = benchmark_family(famid, context_len=cont_len, runquake=False)
        df_benchmark_all = df_benchmark_all.append(df_fam_scores)

        df_fam_scores_quake = benchmark_family(famid, context_len=cont_len, runquake=True)
        df_benchmark_all_quake = df_benchmark_all_quake.append(df_fam_scores_quake)
    
df_benchmark_all_quake




Number of sequences in .struct:  25
/home/milad/DataBase/CisReg/Cis_include_genome2/RF00194/RF00194.filtered.clustal
Number of sequences in alignment:  25
AB003341.1/1417-1484 AY397695.1/1417-1484 AB047330.1/9674-9741 M15240.1/9667-9734 AB003353.1/1417-1484 AF435866.1/9655-9722 AB080731.1/1417-1484 AB072383.1/1417-1484 AB072387.1/1417-1484 AB003349.1/1417-1484 AB003345.1/1417-1484 AB080733.1/1417-1484 X05259.1/3294-3361 AY280704.1/1417-1484 AB003337.1/1417-1484 AY161374.1/1390-1443 AB003355.1/1417-1484 AY258323.1/9674-9741 AB003343.1/1417-1484 D00156.1/2363-2430 AY161373.1/1390-1443 AY258322.1/9674-9741 AB080198.1/1417-1484
D00156.1/2363-2430 D00156.1_1863-2451 context_len_left 500, context_len_right 21
    TP_score: 0.65
AB080198.1/1417-1484 AB080198.1_917-1484 context_len_left 500, context_len_right 0
    TP_score: 0.64
AB003343.1/1417-1484 AB003343.1_917-1484 context_len_left 500, context_len_right 0
    TP_score: 0.51
AY280704.1/1417-1484 AY280704.1_917-1484 context_len_left 500, c

TypeError: object of type 'float' has no len()

In [170]:
def my_plot(df, context_len, famid='all'):
    '''Print summary statistics of one benchmark slice and return that slice.

    Despite its name nothing is plotted.

    df:          score table with the columns 'context-len', 'fam-id' and 'bp-accuracy'.
    context_len: rows whose 'context-len' equals this value are selected.
    famid:       additionally restrict to this 'fam-id'; 'all' keeps every family.

    Prints the number of selected rows, the context length and the median and
    mean of 'bp-accuracy'. Returns the selected rows as a DataFrame.
    The statistics are taken from the needed columns only; reducing the whole
    frame also touched the string columns, which is wasted work and is
    rejected by current pandas releases.
    '''
    df_selection = df[df['context-len'] == context_len]
    if famid != 'all':
        df_selection = df_selection[df_selection['fam-id']==famid]
    accuracy = df_selection['bp-accuracy']
    print(len(df_selection))
    print("{} {}".format("contextlen: ", int(df_selection['context-len'].median())))
    print("median:     %.3f" % accuracy.median())
    print("mean:       %.3f"% accuracy.mean())
    print("")
    return df_selection
# Summarise the default-parameter results for every context length, over all families.
# NOTE(review): df_benchmark_all is not created in this cell; it is whatever the
# benchmark loop cells accumulated in the notebook session.
desired_fam = 'all'
my_plot(df_benchmark_all, context_len=0, famid=desired_fam)
my_plot(df_benchmark_all, context_len=100, famid=desired_fam)
my_plot(df_benchmark_all, context_len=200, famid=desired_fam)
my_plot(df_benchmark_all, context_len=500, famid=desired_fam)


# df_benchmark_all.median()
    

282
contextlen:  0
median:     0.727
mean:       0.664

282
contextlen:  100
median:     0.500
mean:       0.457

282
contextlen:  200
median:     0.563
mean:       0.527

282
contextlen:  500
median:     0.501
mean:       0.481



Unnamed: 0_level_0,bp-accuracy,seq-id,fam-id,context-len,instance-name
instance-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C500-RF00194-M15240.1/9667-9734,0.635891,M15240.1/9667-9734,RF00194,500,C500-RF00194-M15240.1/9667-9734
C500-RF00194-AB047330.1/9674-9741,0.668492,AB047330.1/9674-9741,RF00194,500,C500-RF00194-AB047330.1/9674-9741
C500-RF00194-X05259.1/3294-3361,0.571225,X05259.1/3294-3361,RF00194,500,C500-RF00194-X05259.1/3294-3361
C500-RF00194-AB003355.1/1417-1484,0.664692,AB003355.1/1417-1484,RF00194,500,C500-RF00194-AB003355.1/1417-1484
C500-RF00194-AB003349.1/1417-1484,0.673725,AB003349.1/1417-1484,RF00194,500,C500-RF00194-AB003349.1/1417-1484
C500-RF00194-AB080731.1/1417-1484,0.639689,AB080731.1/1417-1484,RF00194,500,C500-RF00194-AB080731.1/1417-1484
C500-RF00194-D50677.1/1417-1484,0.640700,D50677.1/1417-1484,RF00194,500,C500-RF00194-D50677.1/1417-1484
C500-RF00194-D00156.1/2363-2430,0.650776,D00156.1/2363-2430,RF00194,500,C500-RF00194-D00156.1/2363-2430
C500-RF00194-AB003337.1/1417-1484,0.580462,AB003337.1/1417-1484,RF00194,500,C500-RF00194-AB003337.1/1417-1484
C500-RF00194-L19420.1/1530-1597,0.267234,L19420.1/1530-1597,RF00194,500,C500-RF00194-L19420.1/1530-1597


In [172]:
# Same summary for the runs that used the quake parameter file (runquake=True).
# desired_fam comes from the previous cell.
my_plot(df_benchmark_all_quake, context_len=0, famid=desired_fam)
my_plot(df_benchmark_all_quake, context_len=100, famid=desired_fam)
my_plot(df_benchmark_all_quake, context_len=200, famid=desired_fam)
my_plot(df_benchmark_all_quake, context_len=500,  famid=desired_fam)

282
contextlen:  0
median:     0.721
mean:       0.659

282
contextlen:  100
median:     0.483
mean:       0.452

282
contextlen:  200
median:     0.551
mean:       0.514

282
contextlen:  500
median:     0.488
mean:       0.468



Unnamed: 0_level_0,bp-accuracy,seq-id,fam-id,context-len,instance-name
instance-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C500-RF00194-M15240.1/9667-9734,0.642392,M15240.1/9667-9734,RF00194,500,C500-RF00194-M15240.1/9667-9734
C500-RF00194-AB047330.1/9674-9741,0.668756,AB047330.1/9674-9741,RF00194,500,C500-RF00194-AB047330.1/9674-9741
C500-RF00194-X05259.1/3294-3361,0.545181,X05259.1/3294-3361,RF00194,500,C500-RF00194-X05259.1/3294-3361
C500-RF00194-AB003355.1/1417-1484,0.660441,AB003355.1/1417-1484,RF00194,500,C500-RF00194-AB003355.1/1417-1484
C500-RF00194-AB003349.1/1417-1484,0.670451,AB003349.1/1417-1484,RF00194,500,C500-RF00194-AB003349.1/1417-1484
C500-RF00194-AB080731.1/1417-1484,0.630060,AB080731.1/1417-1484,RF00194,500,C500-RF00194-AB080731.1/1417-1484
C500-RF00194-D50677.1/1417-1484,0.635100,D50677.1/1417-1484,RF00194,500,C500-RF00194-D50677.1/1417-1484
C500-RF00194-D00156.1/2363-2430,0.655016,D00156.1/2363-2430,RF00194,500,C500-RF00194-D00156.1/2363-2430
C500-RF00194-AB003337.1/1417-1484,0.574499,AB003337.1/1417-1484,RF00194,500,C500-RF00194-AB003337.1/1417-1484
C500-RF00194-L19420.1/1530-1597,0.264798,L19420.1/1530-1597,RF00194,500,C500-RF00194-L19420.1/1530-1597
