In [1]:
import pandas as pd

## Pre-process VMIR output

In [2]:
def parse_vmir_sequence(path):
    """Return the 11th line of the text file at ``path`` with surrounding whitespace removed.

    Parameters
    ----------
    path : str or path-like
        File to read.

    Returns
    -------
    str
        The stripped content of the line at index 10.

    Raises
    ------
    IndexError
        If the file has fewer than 11 lines.
    """
    # NOTE(review): presumably index 10 is where a VMir export stores the
    # analysed sequence -- confirm against an actual export file.
    sequence_line_index = 10
    with open(path, 'r') as vmir_file:
        lines = list(vmir_file)
    return lines[sequence_line_index].strip()
    
    
def complementary_seq(sequence, reverse=False):
    """Return the complement of a DNA string, optionally reversed.

    Only the uppercase characters ``A``, ``T``, ``G`` and ``C`` are swapped
    (A<->T, G<->C); every other character is passed through unchanged.

    Parameters
    ----------
    sequence : str
        Sequence to complement.
    reverse : bool, optional
        When true, the complemented string is also reversed, which yields
        the reverse complement.

    Returns
    -------
    str
        The complemented (and possibly reversed) sequence.
    """
    base_pairs = {ord('A'): 'T', ord('T'): 'A', ord('G'): 'C', ord('C'): 'G'}
    complemented = sequence.translate(base_pairs)
    return complemented[::-1] if reverse else complemented

    
def get_seq_from_summary(summary, full_sequence):
    """Extract the sub-sequence described by one hairpin summary row.

    Parameters
    ----------
    summary : object
        Anything exposing ``Start``, ``Size`` and ``Orientation`` attributes
        (for example a row yielded by ``DataFrame.iterrows()``).
    full_sequence : str
        Sequence the ``Start``/``Size`` coordinates refer to.

    Returns
    -------
    str
        ``Size`` characters of ``full_sequence`` beginning at the 1-based
        position ``Start``. When ``Orientation`` equals ``'Reverse'`` the
        slice is passed through ``complementary_seq(..., reverse=True)``.
    """
    # Start is 1-based, Python slicing is 0-based.
    seq_start = summary.Start - 1
    # Slice the argument that was passed in. The previous version read the
    # notebook-level global ``full_seq`` here, silently ignoring the
    # parameter and failing with NameError when that global was not defined.
    sequence = full_sequence[seq_start:seq_start + summary.Size]
    if summary.Orientation == 'Reverse':
        sequence = complementary_seq(sequence, reverse=True)
    return sequence

In [3]:
# VMir hairpin tables to load; every file is parsed the same way and merged.
vmir_outputs = ['./non-conserved-region/raw/VMir_hairpins/VMir_hairpins_starting_from_S_gene.txt']

vmir_combined = pd.DataFrame()

for vmir_output in vmir_outputs:
    # The first 32 lines of each file are skipped before the table is parsed.
    # The separator is written as a raw string: '\s' in a plain literal is an
    # invalid escape sequence. A regex separator is only supported by the
    # python parser, so the engine is requested explicitly instead of relying
    # on pandas' fallback (which emits a warning). The parsed result is the same.
    df = pd.read_csv(vmir_output, skiprows=32, sep=r'\s', engine='python')
    # Remember which file each row came from.
    df['Source'] = vmir_output
    print(f'loaded dataframe of size {len(df)} from {vmir_output}')
    if vmir_combined.empty:
        vmir_combined = df
    else:
        # The newest frame goes first so that, for rows sharing the same
        # (Start, Size), drop_duplicates keeps the row from the latest file.
        vmir_combined = pd.concat([df, vmir_combined]).drop_duplicates(subset=['Start', 'Size']).reset_index(drop=True)

vmir_combined

loaded dataframe of size 14 from ./non-conserved-region/raw/VMir_hairpins/VMir_hairpins_starting_from_S_gene.txt


  df = pd.read_csv(vmir_output, skiprows=32, sep='\s')


Unnamed: 0,Rank,Name,Orientation,Start,Apex,Size,Score,Sub,HPs,Rep,HPs.1,Wind.Cnt.Abs.,Wind.Cnt.Rel.,Source
0,11,MD7,Direct,1406,1435,58,139.7,0,0,37,37,,,./non-conserved-region/raw/VMir_hairpins/VMir_...
1,10,MD13,Direct,3237,3271,62,140.1,1,0,2,43,,,./non-conserved-region/raw/VMir_hairpins/VMir_...
2,12,MD15,Direct,3655,3682,60,126.9,0,0,44,44,,,./non-conserved-region/raw/VMir_hairpins/VMir_...
3,14,MD16,Direct,4036,4080,89,115.6,0,0,37,37,,,./non-conserved-region/raw/VMir_hairpins/VMir_...
4,9,MD17,Direct,4504,4534,62,143.7,3,0,38,44,,,./non-conserved-region/raw/VMir_hairpins/VMir_...
5,3,MD20,Direct,5353,5396,85,188.2,4,0,4,42,,,./non-conserved-region/raw/VMir_hairpins/VMir_...
6,7,MD21,Direct,5433,5496,119,169.1,3,0,2,42,,,./non-conserved-region/raw/VMir_hairpins/VMir_...
7,1,MD23,Direct,6104,6159,104,205.6,4,0,12,44,,,./non-conserved-region/raw/VMir_hairpins/VMir_...
8,13,MD24,Direct,6254,6286,59,120.6,0,0,37,37,,,./non-conserved-region/raw/VMir_hairpins/VMir_...
9,8,MR4,Reverse,1052,1082,66,143.9,1,0,38,39,,,./non-conserved-region/raw/VMir_hairpins/VMir_...


In [4]:
# Load the reference sequence that the hairpin Start/Size coordinates index into.
# NOTE(review): the file content is used verbatim, so a FASTA header line or
# newline characters (if the file has any) would be part of full_seq and shift
# the coordinates -- confirm the file holds only the raw sequence.
full_seq = ''
with open('./non-conserved-region/raw/starting_from_S_gene.fasta', 'r') as fasta_file:
    full_seq = fasta_file.read()

len(full_seq)

8341

In [5]:
# Extract the hairpin sequence for every row of the VMir table, in row order,
# and attach the result as a new 'Sequence' column.
sequences = [
    get_seq_from_summary(hairpin, full_seq)
    for _, hairpin in vmir_combined.iterrows()
]

vmir_combined['Sequence'] = sequences
vmir_combined

Unnamed: 0,Rank,Name,Orientation,Start,Apex,Size,Score,Sub,HPs,Rep,HPs.1,Wind.Cnt.Abs.,Wind.Cnt.Rel.,Source,Sequence
0,11,MD7,Direct,1406,1435,58,139.7,0,0,37,37,,,./non-conserved-region/raw/VMir_hairpins/VMir_...,CAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGA...
1,10,MD13,Direct,3237,3271,62,140.1,1,0,2,43,,,./non-conserved-region/raw/VMir_hairpins/VMir_...,TGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTC...
2,12,MD15,Direct,3655,3682,60,126.9,0,0,44,44,,,./non-conserved-region/raw/VMir_hairpins/VMir_...,GGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGC...
3,14,MD16,Direct,4036,4080,89,115.6,0,0,37,37,,,./non-conserved-region/raw/VMir_hairpins/VMir_...,GGCAACTAGCACTCTCCAAGGGTGTTCACTTTGTTTGCAACTTGCT...
4,9,MD17,Direct,4504,4534,62,143.7,3,0,38,44,,,./non-conserved-region/raw/VMir_hairpins/VMir_...,TTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGCC...
5,3,MD20,Direct,5353,5396,85,188.2,4,0,4,42,,,./non-conserved-region/raw/VMir_hairpins/VMir_...,ACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGTGATCCTTCGT...
6,7,MD21,Direct,5433,5496,119,169.1,3,0,2,42,,,./non-conserved-region/raw/VMir_hairpins/VMir_...,GCTGTGACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATC...
7,1,MD23,Direct,6104,6159,104,205.6,4,0,12,44,,,./non-conserved-region/raw/VMir_hairpins/VMir_...,GGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCA...
8,13,MD24,Direct,6254,6286,59,120.6,0,0,37,37,,,./non-conserved-region/raw/VMir_hairpins/VMir_...,GTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAAG...
9,8,MR4,Reverse,1052,1082,66,143.9,1,0,38,39,,,./non-conserved-region/raw/VMir_hairpins/VMir_...,ATGCGGAATTATATAGGACAGAATAATCAGCAACACAGTTGCTGAT...


In [7]:
# Write the VMir hairpin table to CSV; index=False leaves the row index out of the file.
vmir_combined.to_csv('non-conserved-region/raw/vmir-output.csv', index=False)

## Pre-process miRNAFold output

In [8]:
# Records in the miRNAFold output are separated by blank lines; the second
# line of each record is taken as the hairpin sequence.
with open('./non-conserved-region/raw/miRNAFold_hairpins/miRNAFold_hairpins_starting_from_S_gene.txt', 'r') as mirnafold_file:
    mirnafold_records = mirnafold_file.read().split('\n\n')

mirnas = []
for record in mirnafold_records:
    if len(record) == 0:
        continue
    hairpin = record.split('\n')[1].strip()
    # NOTE: we replace uracil by thymine for compatibility with VMIR output
    mirnas.append(hairpin.replace('U', 'T'))

len(mirnas)

139

In [11]:
# Wrap the list of miRNAFold hairpin sequences in a one-column DataFrame.
# No column name is given, so pandas labels the column 0.
mirnafold = pd.DataFrame(mirnas)

In [12]:
# Write the miRNAFold sequences to CSV; index=False leaves the row index out of the file.
mirnafold.to_csv('non-conserved-region/raw/mirnafold-output.csv', index=False)

# Find common pre-miRNA predictions 

In [55]:
# Compare the two tools' predictions by exact string equality only: a hairpin
# counts as shared when the full sequences are identical.
vmir_seqs = set(vmir_combined['Sequence'])
mirnafold_seqs = set(mirnas)

# Sequences reported identically by both tools.
direct_intersection = mirnafold_seqs & vmir_seqs

# miRNAFold sequences whose reverse complement matches a VMir sequence.
mirnafold_reverse_complements = {
    complementary_seq(candidate, reverse=True) for candidate in mirnafold_seqs
}
reverse_intersection = mirnafold_reverse_complements & vmir_seqs

total_intersection = direct_intersection | reverse_intersection

print('Direct intersections:', len(direct_intersection))
print('Reverse intersections:', len(reverse_intersection))

Direct intersections: 4
Reverse intersections: 1


In [56]:
# Show the VMir rows whose sequence is in the set shared with miRNAFold.
print('miRNA candidates predicted by both VMIR and miRNAFold:')
is_shared_candidate = vmir_combined['Sequence'].isin(total_intersection)
vmir_combined[is_shared_candidate]

miRNA candidates predicted by both VMIR and miRNAFold:


Unnamed: 0,Rank,Name,Orientation,Start,Apex,Size,Score,Sub,HPs,Rep,HPs.1,Wind.Cnt.Abs.,Wind.Cnt.Rel.,Source,Sequence
18,21,MD76,Direct,17137,17166,61,159.0,1,0,1,44,,,default_min_score_115_win_count_35_size_50-220...,TCTGCTCGCATAGTGTATACAGCTTGCTCTCATGCCGCTGTTGATG...
20,23,MD79,Direct,17951,18020,131,151.5,7,0,8,41,,,default_min_score_115_win_count_35_size_50-220...,TACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTTGCAATT...
55,95,MD5,Direct,1083,1124,86,147.3,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt,TTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGT...
89,15,MD40,Direct,7723,7764,89,191.5,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt,GTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATC...
241,187,MR64,Reverse,24722,24762,79,123.0,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt,CAGGAGCAGTTGTGAAGTTCTTTTCTTGTGCAGGGACATAAGTCAC...


# MFEI calculation

In [1]:
# Pre-miRNA candidate sequence to evaluate (written in the RNA alphabet: A, U, G, C).
# A lower MFEI is generally better; a candidate should reach at least MFEI ≤ -0.85 kcal/mol
# per https://www.biorxiv.org/content/10.1101/2020.11.02.365049v1.full.pdf
string = 'GAUUGCUGCAGUCAUAACAAGAGAAGUGGGUUUUGUCGUGCCUGGUUUGCCUGGCACGAUAUUACGCACAACUAAUGGUGACUUUUUGCAUUUC'

def mfei_calculation(precursor, mfe):
    """Compute the minimal folding free energy index (MFEI) of a precursor.

    The value returned is ``mfe / len(precursor) / gc_fraction``, where
    ``gc_fraction`` is the share of characters in ``precursor`` that are an
    uppercase ``'G'`` or ``'C'``.

    Parameters
    ----------
    precursor : str
        Precursor sequence; only uppercase ``G``/``C`` count towards GC content.
    mfe : float
        Minimal folding free energy of the precursor.

    Returns
    -------
    float
        The MFEI value.

    Raises
    ------
    ZeroDivisionError
        If ``precursor`` is empty or contains no uppercase ``G``/``C``.
    """
    length = len(precursor)
    gc_count = precursor.count('G') + precursor.count('C')
    gc_fraction = gc_count / length
    # Normalise the energy by length first, then by GC content.
    return mfe / length / gc_fraction

# MFEI of the candidate above for an MFE of -37.3.
# NOTE(review): the MFE is hard-coded; presumably it came from an RNA folding tool -- record the source.
mfei_calculation(string,-37.3)

-0.8880952380952382