In [12]:
import pandas as pd
from pathlib import Path
from Bio import SeqIO

import urllib.request
import gzip   
import shutil

# URL to the latest (CURRENT) mature miRNA sequences
url = "https://www.mirbase.org/download/CURRENT/mature.fa"

# local filename the download is first written to (relative to the working directory)
fa_file = "mature.fa"     # FASTA file


# download fasta file
print(f'fetching {url} ...') # f-string to embed variable, here the url
urllib.request.urlretrieve(url, fa_file) # download the file from url and save it as fa_file; errors raised by the download propagate
print(f'Download complete! file saved as: {fa_file}')


# move file from notebook folder to data folder

# Define paths
notebook_file = Path(fa_file)  # reuse fa_file so the downloaded name and the moved name cannot drift apart
data_folder = Path("../data") # create Path object for the data folder
destination = data_folder / notebook_file.name # create full destination path

# Create the data folder if it is missing; without this the move below fails
# with FileNotFoundError on a fresh checkout.
data_folder.mkdir(parents=True, exist_ok=True)

# Move file (paths passed as plain strings, which shutil.move accepts on every Python 3 version)
shutil.move(str(notebook_file), str(destination)) # move the file to the destination

print(f"Moved {notebook_file} → {destination}") # print confirmation message


In [13]:
# Sanity check: is the FASTA file present in the data folder?

fa_file = "mature.fa"  # name of the FASTA file
data_folder = Path("../data")  # folder the file is expected to live in
data_fa_file = data_folder.joinpath("mature.fa")  # full path of the expected file

# exists() is evaluated relative to the current working directory
file_is_present = data_fa_file.exists()
print(f"File in data folder: {file_is_present}")


File in data folder: True


In [14]:
import os

# Report which directory the kernel is currently running in; every relative
# path used in this notebook is resolved against it.
cwd = os.getcwd()
print(f"Current working directory: {cwd}")

Current working directory: \\wsl.localhost\Ubuntu\home\piets\miRNA_scanner\miRNA-target-scanner\data


In [15]:
# Make the data folder the working directory so that files in it can be
# opened by bare name in the cells that follow.
os.chdir("../data")
new_cwd = os.getcwd()  # read back the directory we actually ended up in
print(f"Changed working directory to: {new_cwd}")

Changed working directory to: \\wsl.localhost\Ubuntu\home\piets\miRNA_scanner\miRNA-target-scanner\data


In [16]:
# First look at the miRBase mature.fa file: load every record with Biopython
# and show how many there are.

mature_fasta_path = "../data/mature.fa"
records = [record for record in SeqIO.parse(mature_fasta_path, "fasta-pearson")]
len(records)


48885

In [17]:
# initial parse of miRBase mature.fa file: print the first few entries and count all of them

fa_file = 'mature.fa'
count = 0 # number of header lines (entries) seen so far
max_show = 3 # maximum number of entries to show (just want to have a peek at the data)
header, seq = None, []  # header starts as None; seq starts as an empty list collecting the sequence lines of the current entry

with open(fa_file, 'r') as fhand: # open in read mode, and with statement ensures proper closing of the file
    for line in fhand: # iterate over each line in the file
        line = line.strip() # remove whitespace including newline characters
        if line.startswith('>'): # if the line is a header line
            # A new header means the previous entry is complete. At this point
            # count is still the 1-based index of that previous entry.
            if header and count <= max_show:
                # Outer double quotes so the inner '' does not reuse the enclosing
                # quote character, which is a SyntaxError before Python 3.12.
                print(f"{header}\n{''.join(seq)}\n")
            count += 1 # increment the count of miRNA entries
            header = line # update the header to the current line
            seq = [] # reset the sequence list for the new entry
        else:
            seq.append(line) # add the line to the sequence list

    # The last entry is not followed by another header, so print it here
    # if it is still within the max_show limit.
    if header and count <= max_show:
        print(f"{header}\n{''.join(seq)}\n")

print(f' Total miRNA entries found: {count}')
            



>cel-let-7-5p MIMAT0000001 Caenorhabditis elegans let-7-5p
UGAGGUAGUAGGUUGUAUAGUU

>cel-let-7-3p MIMAT0015091 Caenorhabditis elegans let-7-3p
CUAUGCAAUUUUCUACCUUACC

>cel-lin-4-5p MIMAT0000002 Caenorhabditis elegans lin-4-5p
UCCCUGAGACCUCAAGUGUGA

 Total miRNA entries found: 48885


In [18]:
# filtering for human miRNAs: collect (header, sequence) tuples for every
# entry whose header starts with '>hsa-'

fa_file = 'mature.fa'
mature_human = [] # create list to store header and seq (tuples)

seq = None # sequence of the current entry; None until its first sequence line is read
header = None # holds current header

with open(fa_file, 'r') as fhand:
    for line in fhand:
        line = line.strip()

        if not line:
            # Skip blank lines; otherwise a trailing empty line would replace
            # the sequence of the entry before it with ''.
            continue

        if line.startswith('>'): # new entry begins
            if header is not None: # if this is NOT the first entry, process the previous one (e.g. 2nd entry, process 1st entry )
                if header.startswith('>hsa-'):
                    mature_human.append((header, seq)) # keep the finished entry if it is a human one

            header = line # replace current header with new header
            seq = None # reset sequence buffer

        else:
            # Concatenate instead of overwriting, so a sequence wrapped over
            # several lines is kept whole rather than reduced to its last line.
            seq = line if seq is None else seq + line

# Process the last entry (no following header triggers it inside the loop)
if header is not None and header.startswith('>hsa-'):
    mature_human.append((header, seq))

print(len(mature_human))
print(mature_human[:3])

2656
[('>hsa-let-7a-5p MIMAT0000062 Homo sapiens let-7a-5p', 'UGAGGUAGUAGGUUGUAUAGUU'), ('>hsa-let-7a-3p MIMAT0004481 Homo sapiens let-7a-3p', 'CUAUACAAUCUACUGUCUUUC'), ('>hsa-let-7a-2-3p MIMAT0010195 Homo sapiens let-7a-2-3p', 'CUGUACAGCCUCCUAGCUUUCC')]


In [19]:
# Turn the list of (header, sequence) tuples into a two-column DataFrame.

column_names = ['header', 'sequence']
df = pd.DataFrame(data=mature_human, columns=column_names)

df.head(n=5)


Unnamed: 0,header,sequence
0,>hsa-let-7a-5p MIMAT0000062 Homo sapiens let-7...,UGAGGUAGUAGGUUGUAUAGUU
1,>hsa-let-7a-3p MIMAT0004481 Homo sapiens let-7...,CUAUACAAUCUACUGUCUUUC
2,>hsa-let-7a-2-3p MIMAT0010195 Homo sapiens let...,CUGUACAGCCUCCUAGCUUUCC
3,>hsa-let-7b-5p MIMAT0000063 Homo sapiens let-7...,UGAGGUAGUAGGUUGUGUGGUU
4,>hsa-let-7b-3p MIMAT0004482 Homo sapiens let-7...,CUAUACAACCUACUGCCUUCCC


In [20]:
# Show the dimensions of df as a (rows, columns) tuple.
df.shape

(2656, 2)

In [23]:
# Derive two new columns from the FASTA header: the mature miRNA name
# (first whitespace-separated field) and the accession number (second field).

def _header_field(position):
    """Build a function returning the field at `position` of a header line, ignoring its leading '>'."""
    def extract(header_line):
        fields = header_line[1:].split()
        return fields[position]
    return extract

df['mature_name'] = df['header'].apply(_header_field(0))
df['accession'] = df['header'].apply(_header_field(1))

print(df.head(5))

                                              header                sequence  \
0  >hsa-let-7a-5p MIMAT0000062 Homo sapiens let-7...  UGAGGUAGUAGGUUGUAUAGUU   
1  >hsa-let-7a-3p MIMAT0004481 Homo sapiens let-7...   CUAUACAAUCUACUGUCUUUC   
2  >hsa-let-7a-2-3p MIMAT0010195 Homo sapiens let...  CUGUACAGCCUCCUAGCUUUCC   
3  >hsa-let-7b-5p MIMAT0000063 Homo sapiens let-7...  UGAGGUAGUAGGUUGUGUGGUU   
4  >hsa-let-7b-3p MIMAT0004482 Homo sapiens let-7...  CUAUACAACCUACUGCCUUCCC   

       mature_name     accession  
0    hsa-let-7a-5p  MIMAT0000062  
1    hsa-let-7a-3p  MIMAT0004481  
2  hsa-let-7a-2-3p  MIMAT0010195  
3    hsa-let-7b-5p  MIMAT0000063  
4    hsa-let-7b-3p  MIMAT0004482  


In [25]:
# Save the human miRNA table as CSV, leaving out the DataFrame index column.
output_csv = '../data/mature_human-miRNAs.csv'
df.to_csv(output_csv, index=False)