In [1]:
import pandas as pd

### Do we get novel sequences from running the pipeline on a slice (non-conserved region)? (raw)

In [43]:
# Read the raw full-genome output of each predictor, normalising the name of
# the sequence column to `sequence` as part of the load.
full_genome_vmir = (
    pd.read_csv('./full-genome/raw/vmir-output.csv')
    .rename(columns={'Sequence': 'sequence'})
)
full_genome_mirnafold = (
    pd.read_csv('./full-genome/raw/mirnafold-output.csv')
    .rename(columns={'0': 'sequence'})
)

# Stack the VMIR and miRNAFold rows into one table with a fresh 0..n-1 index.
full_genome = pd.concat(
    [full_genome_vmir, full_genome_mirnafold],
    ignore_index=True,
    verify_integrity=True,
)

print(
    f'Full-genome total: {len(full_genome)} '
    f'(VMIR: {len(full_genome_vmir)}, miRNAFold: {len(full_genome_mirnafold)})'
)

Full-genome total: 775 (VMIR: 256, miRNAFold: 519)


In [44]:
# Read the raw non-conserved-region output of each predictor, normalising the
# name of the sequence column to `sequence` as part of the load.
non_conserved_region_vmir = (
    pd.read_csv('./non-conserved-region/raw/vmir-output.csv')
    .rename(columns={'Sequence': 'sequence'})
)
non_conserved_region_mirnafold = (
    pd.read_csv('./non-conserved-region/raw/mirnafold-output.csv')
    .rename(columns={'0': 'sequence'})
)

# Stack the VMIR and miRNAFold rows into one table with a fresh 0..n-1 index.
non_conserved = pd.concat(
    [non_conserved_region_vmir, non_conserved_region_mirnafold],
    ignore_index=True,
    verify_integrity=True,
)

print(
    f'Non-conserved total: {len(non_conserved)} '
    f'(VMIR: {len(non_conserved_region_vmir)}, miRNAFold: {len(non_conserved_region_mirnafold)})'
)

Non-conserved total: 153 (VMIR: 14, miRNAFold: 139)


In [45]:
# Unique raw sequences reported by either tool, for each of the two inputs.
full_genome_seqs = set(full_genome.sequence)
non_conserved_seqs = set(non_conserved.sequence)

# `combination` is kept only because it is a notebook-level name; the result
# below no longer depends on it.
combination = full_genome_seqs | non_conserved_seqs
# (A | B) - A == B - A: sequences produced by the slice run that the
# full-genome run did not produce.
new_non_conserved = non_conserved_seqs - full_genome_seqs
print(f'Found {len(new_non_conserved)} new sequences in non-conserved region 😅')

# Build each tool's lookup once, as a set, instead of re-creating a list and
# scanning it linearly for every new sequence.
non_conserved_vmir_seqs = set(non_conserved_region_vmir.sequence)
non_conserved_mirnafold_seqs = set(non_conserved_region_mirnafold.sequence)

# Attribute the new sequences to the tool(s) that reported them; a sequence
# reported by both tools is counted once under each.
vmir = len(new_non_conserved & non_conserved_vmir_seqs)
mirnafold = len(new_non_conserved & non_conserved_mirnafold_seqs)

print(f'Vmir: {vmir} miRNAFold: {mirnafold}')

Found 6 new sequences in non-conserved region 😅
Vmir: 1 miRNAFold: 5


### Do we get novel sequences from running the pipeline on a slice (non-conserved region)? (after filtering)

In [37]:
# Read the filtered full-genome candidates of each predictor and bring their
# column names in line (`sequence`, plus lower-case `mfe` / `mfeis` for miRNAFold).
full_genome_vmir_filtered = (
    pd.read_csv('./full-genome/filtered/vmir_output_filtered.csv')
    .rename(columns={'Sequence': 'sequence'})
)
full_genome_mirnafold_filtered = (
    pd.read_csv('./full-genome/filtered/filtered-pre-mirnas.csv')
    .rename(columns={'pre-miRNA candidate seq': 'sequence', 'MFE': 'mfe', 'MFEIS': 'mfeis'})
)

# Stack the VMIR and miRNAFold rows into one table with a fresh 0..n-1 index.
full_genome_filtered = pd.concat(
    [full_genome_vmir_filtered, full_genome_mirnafold_filtered],
    ignore_index=True,
    verify_integrity=True,
)

print(
    f'Full-genome total: {len(full_genome_filtered)} '
    f'(VMIR: {len(full_genome_vmir_filtered)}, miRNAFold: {len(full_genome_mirnafold_filtered)})'
)

Full-genome total: 37 (VMIR: 10, miRNAFold: 27)


In [38]:
# Read the filtered non-conserved-region candidates of each predictor and bring
# their column names in line with the full-genome filtered tables.
non_conserved_region_vmir_filtered = (
    pd.read_csv('./non-conserved-region/filtered/vmir_output_from_S_filtered.csv')
    # Same normalisation the full-genome VMIR table gets; `rename` ignores the
    # mapping if the column is already called `sequence`.
    .rename(columns={'Sequence': 'sequence'})
)
non_conserved_region_mirnafold_filtered = (
    pd.read_csv('./non-conserved-region/filtered/mirnafold_output_from_S_filtered.csv')
    .rename(columns={'pre-miRNA candidate seq': 'sequence', 'MFE': 'mfe', 'MFEIS': 'mfeis'})
)

# Stack the VMIR and miRNAFold rows into one table with a fresh 0..n-1 index.
non_conserved_filtered = pd.concat(
    [non_conserved_region_vmir_filtered, non_conserved_region_mirnafold_filtered],
    ignore_index=True,
    verify_integrity=True,
)

# Report the sizes of the *filtered* frames loaded above (the previous version
# printed the raw, unfiltered frames under this label).
print(
    f'Non-conserved total: {len(non_conserved_filtered)} '
    f'(VMIR: {len(non_conserved_region_vmir_filtered)}, '
    f'miRNAFold: {len(non_conserved_region_mirnafold_filtered)})'
)

Non-conserved total: 15 (VMIR: 4, miRNAFold: 11)


In [39]:
# Unique filtered sequences reported by either tool, for each of the two inputs.
full_genome_seqs_filtered = set(full_genome_filtered.sequence)
non_conserved_seqs_filtered = set(non_conserved_filtered.sequence)

# `combination_filtered` is kept only because it is a notebook-level name; the
# result below no longer depends on it.
combination_filtered = full_genome_seqs_filtered | non_conserved_seqs_filtered
# (A | B) - A == B - A: filtered sequences from the slice run that the
# full-genome run did not produce.
new_non_conserved_filtered = non_conserved_seqs_filtered - full_genome_seqs_filtered
print(f'Found {len(new_non_conserved_filtered)} new sequences in non-conserved region 😅')

# Build each tool's lookup once, as a set, instead of re-creating a list and
# scanning it linearly for every new sequence.
non_conserved_vmir_seqs_filtered = set(non_conserved_region_vmir_filtered.sequence)
non_conserved_mirnafold_seqs_filtered = set(non_conserved_region_mirnafold_filtered.sequence)

# Attribute the new sequences to the tool(s) that reported them; a sequence
# reported by both tools is counted once under each.
vmir_filtered = len(new_non_conserved_filtered & non_conserved_vmir_seqs_filtered)
mirnafold_filtered = len(new_non_conserved_filtered & non_conserved_mirnafold_seqs_filtered)

print(f'Vmir: {vmir_filtered} miRNAFold: {mirnafold_filtered}')

Found 7 new sequences in non-conserved region 😅
Vmir: 3 miRNAFold: 4


## Does filtering pre-miRNAs add new sequences? (full genome)

In [54]:
# Filtered full-genome sequences that do not appear verbatim in the raw output.
full_genome_seqs_filtered - full_genome_seqs

{'AAAGUAUGAGCAGUAUAUAAAAUGGCCAUGGUACAUUUGGCUAGGUUUUAUAGCUGGCUUGAUUGCCAUAGUAAUGGUGACAAUUAUGCUUU',
 'AACCUGCUUCAAGAGAGCUUAAAGUUACAUUUUUCCCUGACUUAAAUGGUGAUGUGGUGGCUAUUGAUUAUAAACACUACACACCCUCUUUUAAGAAAGGAGCUAAAUUGUUACAUAAACCUAUUGUUUGGCAUGUU',
 'AGCGAUUAUGACUACUAUCGUUAUAAUCUACCAACAAUGUGUGAUAUCAGACAACUACUAUUUGUAGUUGAAGUUGUUGAUAAGUACUUUGAUUGUUACGAUGGUGGCUGUAUUAAUGCU',
 'AGGAGUCAAAUGGAAAUUGAUUUCUUAGAAUUAGCUAUGGAUGAAUUCAUUGAACGGUAUAAAUUAGAAGGCUAUGCCUUCGAACAUAUCGUUUAUGGAGAUUUUAGUCAUAGUCAGUUAGGUGGUUUACAUCUACUGAUUGGACU',
 'AUUAAGGGGUACUGCUGUUAUGUCUUUAAAAGAAGGUCAAAUCAAUGAUAUGAUUUUAUCUCUUCUUAGUAAAGGUAGACUUAUAAUUAGAGAAAACAACAGAGUUGUUAUUUCUAGUGAU',
 'CAUUAUUUUGGCAUCUUUUUCUGCUUCCACAAGUGCUUUUGUGGAAACUGUGAAAGGUUUGGAUUAUAAAGCAUUCAAACAAAUUGUUGAAUCCUGUGGUAAUUUUAAAGUUACAAAAGGAAAAGCUAAAAAAGGUGCCUGGAAUAUUG',
 'CUUUUCUUAUGGACCUUGAAGGAAAACAGGGUAAUUUCAAAAAUCUUAGGGAAUUUGUGUUUAAGAAUAUUGAUGGUUAUUUUAAAAUAUAUUCUAAGCACACGCCUAUUAAUUUAGUGCGUGAUCUCCCUCAGGGUUUUUCGGCUUUAG',
 'CUUUUGAAGAAGCUGCGCUGUGCACCUUUUUGUUAAAUAAAGAAAUGU

In [57]:
# Rewrite the filtered sequences with T in place of U, then repeat the
# comparison against the raw full-genome sequences.
full_genome_seqs_filtered_u_replaced_with_t = {
    seq.replace('U', 'T') for seq in full_genome_seqs_filtered
}
full_genome_seqs_filtered_u_replaced_with_t - full_genome_seqs

set()

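**No new sequences once U is replaced with T: the filtered sequences only looked new because they are written with U, while the raw output uses T. All good!**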
____

## Does filtering pre-miRNAs add new sequences? (non-conserved region)

In [60]:
# Filtered non-conserved-region sequences that do not appear verbatim in the raw output.
non_conserved_seqs_filtered - non_conserved_seqs

{'AAAGUAUGAGCAGUAUAUAAAAUGGCCAUGGUACAUUUGGCUAGGUUUUAUAGCUGGCUUGAUUGCCAUAGUAAUGGUGACAAUUAUGCUUU',
 'AGCAAGUGCACUUGGAAAACUUCAAGAUGUGGUCAACCAAAAUGCACAAGCU',
 'CAGAGUAGUAGUACUUUCUUUUGAACUUCUACAUGCACCAGCAACUGUUUGUGGACCUAAAAAGUCUACUAAUUUG',
 'CUUUUCUUAUGGACCUUGAAGGAAAACAGGGUAAUUUCAAAAAUCUUAGGGAAUUUGUGUUUAAGAAUAUUGAUGGUUAUUUUAAAAUAUAUUCUAAGCACACGCCUAUUAAUUUAGUGCGUGAUCUCCCUCAGGGUUUUUCGGCUUUAG',
 'GCAUCAUUUUCCACUUUUAAGUGUUAUGGAGUGUCUCCUACUAAAUUAAAUGAUCUCUGCUUUACUAAUGUCUAUGCAGAUUCAUUUGUAAUUAGAGGUGAUGAAGUCAGACAAAUCGCUCCAGGGCAAACUGGAAAGAUUGC',
 'GGAAGUUCAAGAACUUUACUCUCCAAUUUUUCUUAUUGUUGCGGCAAUAGUGUUUAUAACACUUUGCUUCACACUCAAAAGAAAGACAGAAUGAUUGAACUUUCAUUAAUUGACUUCU',
 'GUAACUUCUUCAAUUGUCAUUACUUCAGGUGAUGGCACAACAAGUCCUAUUUCUGAACAUGACUACCAGAUUGGUGGUUAUACUGAAAAAUGGGAAUCUGGAGUAAAAGACUGUGUUGUAUUACACAGUUAC',
 'UCAGACAAGAGGAAGUUCAAGAACUUUACUCUCCAAUUUUUCUUAUUGUUGCGGCAAUAGUGUUUAUAACACUUUGCUUCACACUCAAAAGAAAGACAGAAUGAUUGAACUUUCAUUAAUUGA',
 'UCAUUUGUAAUUAGAGGUGAUGAAGUCAGACAAAUCGCUCCAGGGCAAACUGGAAAGAUUGCUGAUUAUAAU

In [61]:
# Rewrite the filtered sequences with T in place of U, then repeat the
# comparison against the raw non-conserved-region sequences.
non_conserved_seqs_filtered_u_replaced_with_t = {
    seq.replace('U', 'T') for seq in non_conserved_seqs_filtered
}
non_conserved_seqs_filtered_u_replaced_with_t - non_conserved_seqs

set()

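**No new sequences once U is replaced with T: the filtered sequences only looked new because they are written with U, while the raw output uses T. All good!**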
____