# dataframe 1

In [2]:
import pandas as pd
import re
from Bio.SeqIO.FastaIO import SimpleFastaParser
import gzip

# Function to count how often each header field occurs in the FASTA file, returned as a dictionary
def lncRNA_db(file_path, separator, RNA_index):
    """Count how often each header field value occurs in a gzipped FASTA file.

    Args:
        file_path: Path to a gzip-compressed FASTA file.
        separator: String used to split each header line (e.g. '|').
        RNA_index: Position of the field to count within the split header.

    Returns:
        dict mapping each field value to the number of header lines carrying it.

    Raises:
        IndexError: if a header line has no field at ``RNA_index``.
    """
    lncRNA_dictionary = {}
    # Text mode ('rt') is required: in binary mode every line is `bytes`, and
    # `bytes.startswith(">")` raises TypeError.
    with gzip.open(file_path, 'rt') as fasta_file:
        # Iterate lazily instead of readlines() so the whole file is never held in memory.
        for line in fasta_file:
            if line.startswith(">"):
                # Drop the line terminator so a trailing field is not keyed as "value\n".
                name = line.rstrip("\r\n").split(separator)[RNA_index]
                lncRNA_dictionary[name] = lncRNA_dictionary.get(name, 0) + 1
    return lncRNA_dictionary

# Function to read fasta and return a DataFrame
def read_fasta(file_path, columns):
    """Parse a gzipped FASTA file into a DataFrame of (id, length, sequence) rows.

    Args:
        file_path: Path to a gzip-compressed FASTA file. The path supplied by
            the caller is used as-is.
        columns: Three column names for the resulting DataFrame, in the order
            id, sequence length, sequence.

    Returns:
        pandas.DataFrame with one row per FASTA record.
    """
    records = []  # One [id, length, sequence] row per FASTA record
    with gzip.open(file_path, 'rt') as fasta_file:
        for title, sequence in SimpleFastaParser(fasta_file):
            # Split the header into word-like tokens; the first one is taken as the ID.
            title_splits = re.findall(r"[\w']+", title)
            # A header without any word characters yields no tokens; use an empty ID
            # rather than raising IndexError.
            record_id = title_splits[0] if title_splits else ""
            # The parser already yields the sequence as one string, so it is stored
            # unchanged (joining it with " " would insert a space between residues).
            records.append([record_id, len(sequence), sequence])

    return pd.DataFrame(records, columns=columns)

# Example usage: build the DataFrame and show it.
# Path handed to read_fasta (relative, so resolved against the kernel's working directory).
file_path = "lncRNA_LncBookv2.0.fa.gz"
# Column names for the frame, in the order read_fasta fills each row.
columns = ["id", "sequence_length", "sequence"]
data = read_fasta(file_path, columns)

print(data)


BadGzipFile: Not a gzipped file (b'\x1f\xef')

# dataframe 2

In [None]:
import pandas as pd
import re
from Bio import SeqIO
import gzip

# Function to return the count number of RNA types in the FASTA file
def lncRNA_db(file_path, separator, RNA_index):
    """Tally the values of one header field across a gzipped FASTA file.

    Each line starting with ">" is split on ``separator`` and the piece at
    position ``RNA_index`` is counted. The piece is used exactly as split, so
    a final field keeps its line terminator.

    Returns:
        dict mapping field value -> number of header lines with that value.
    """
    counts = {}
    with gzip.open(file_path, 'rt') as handle:
        header_lines = (row for row in handle if row.startswith(">"))
        for header in header_lines:
            key = header.split(separator)[RNA_index]
            counts[key] = counts.get(key, 0) + 1
    return counts

# Function to read FASTA file and return a DataFrame
def read_fasta(file_path, columns):
    """Load a gzipped FASTA file as a DataFrame and also count its header fields.

    Args:
        file_path: Path to a gzip-compressed FASTA file.
        columns: Three column names (id, sequence length, sequence).

    Returns:
        Tuple ``(DataFrame, dict)``: one row per record, plus the counts
        returned by ``lncRNA_db(file_path, '|', 1)``.
    """
    # First pass over the file: tally the second '|'-separated header field.
    header_counts = lncRNA_db(file_path, '|', 1)

    rows = []
    with gzip.open(file_path, 'rt') as handle:
        for entry in SeqIO.parse(handle, 'fasta'):
            # First word-like token of the description serves as the ID.
            tokens = re.findall(r"[\w']+", entry.description)
            rows.append([tokens[0], len(entry.seq), str(entry.seq)])

    frame = pd.DataFrame(rows, columns=columns)
    return frame, header_counts

# Example usage: build the DataFrame plus the header-field counts and show both.
# Path handed to read_fasta (relative, so resolved against the kernel's working directory).
file_path = "lncRNA_LncBookv2.0.fa.gz"
# Column names for the frame, in the order read_fasta fills each row.
columns = ["id", "sequence_length", "sequence"]
# read_fasta returns a (DataFrame, dict) pair here.
data, rna_counts = read_fasta(file_path, columns)

print(data)
print(rna_counts)


# dataframe 3

In [2]:
import pandas as pd
import re
from Bio.SeqIO.FastaIO import SimpleFastaParser

# Function to count how often each header field occurs in the FASTA file, returned as a dictionary
def lncRNA_db(file_path, separator, index):
    """Tally the values of one header field across a plain-text FASTA file.

    Each line starting with ">" is split on ``separator``; headers with no
    piece at position ``index`` are ignored. The piece is used exactly as
    split, so a final field keeps its line terminator.

    Returns:
        dict mapping field value -> number of header lines with that value.
    """
    counts = {}
    with open(file_path) as handle:
        for row in handle:
            if not row.startswith(">"):
                continue
            fields = row.split(separator)
            if len(fields) <= index:
                # Header too short to carry the requested field.
                continue
            key = fields[index]
            counts[key] = counts.get(key, 0) + 1
    return counts

# Function to read FASTA file and return a DataFrame
def read_fasta(file_path, columns):
    """Parse a plain-text FASTA file into a DataFrame of (id, sequence) rows.

    Args:
        file_path: Path to an uncompressed, UTF-8 encoded FASTA file.
        columns: Two column names for the resulting DataFrame, in the order
            id, sequence.

    Returns:
        pandas.DataFrame with one row per FASTA record.
    """
    records = []  # One [id, sequence] row per FASTA record
    with open(file_path, encoding="utf-8") as fasta_file:
        for title, sequence in SimpleFastaParser(fasta_file):
            # Split the header into word-like tokens; the first one is taken as the ID.
            title_splits = re.findall(r"[\w']+", title)
            # Fall back to an empty ID when the header has no word characters.
            record_id = title_splits[0] if title_splits else ""
            # The parser already yields the sequence as one string; store it unchanged.
            records.append([record_id, sequence])

    return pd.DataFrame(records, columns=columns)

# Column names for the frame, in the order read_fasta fills each row.
columns = ["id", "sequence"]

# Parse the FASTA file (relative path, resolved against the kernel's working
# directory) into a DataFrame.
data = read_fasta("rnacentral_seq.fasta", columns=columns)

# Print the DataFrame as plain text.
print(data)


              id                                           sequence
0  URS0000BC45A5  GUGAAAUGCAAAUGAGUUUUUUGAGUUUUUGUUGACACCUCUAACU...
1  URS00026A285D  GAAGCCCGGGCCAGCCCCAGCGCCCAGCUGGAUCCCGGAAUAACCG...
2  URS00026A23D0  CUCCCCUUCACCAGCUCCGCGGAAACCAUAGCAACGGGCAGCCGGA...
3  URS0000456120  AGACUCCUGCAAACAGCAGCUGGCUUAGUGGAGGCUGCCAGACUUC...
4  URS00007E3599  AAGAAGUGCCUUUCACCUCUCAUCAUGAUUCUGUGGCCUCCCCAGC...


In [4]:
# Export `data` to an Excel workbook in the working directory; index=True also
# writes the DataFrame's row index as a column, and the sheet is named "lncRNA Data".
# NOTE(review): Excel limits a cell to 32,767 characters — confirm that no
# sequence exceeds this, otherwise the export may not round-trip.
data.to_excel("lncRNA_data.xlsx", index=True, sheet_name="lncRNA Data")

# Reading using UtilityLib and Bio.SeqIO


In [None]:
# Project setup. UtilityLib is installed from GitHub:
# pip install git+https://github.com/TheBiomics/UtilityLib
from UtilityLib import ProjectManager
from UtilityLib.lib import EntityPath

# Project handle rooted at the ncRNA-ML working directory.
# NOTE(review): all paths below are absolute and machine-specific — adjust per host.
T0113  = ProjectManager(path_bases=["/mnt/DataDrive/MDD/T0113--ncRNA-TCS/ncRNA-ML/"])

# Data locations stored on the project config so later cells can reuse them.
T0113.config.path_bioinfo_large = EntityPath("/mnt/DataDrive/MDD/T0043--BioInfoData--part-2--Large")
T0113.config.path_bioinfo = EntityPath("/mnt/DataDrive/MDD/T0043--BioInfoData")
# Sub-directories derived from the two base locations via the `/` operator.
T0113.config.path_rnacentral = T0113.config.path_bioinfo_large / 'RNACentral/24.0/'
T0113.config.path_lincip =  T0113.config.path_bioinfo / 'LNCipedia/5.2/'


In [None]:
# Count total lines in the fa.gz file
# ~490_549_792 lines and ~32_524_827 fasta records
# `_rc_seq_path` is reused by the following cells, so run this cell first.
_rc_seq_path = T0113.config.path_rnacentral / 'sequences/rnacentral_active.fasta.gz'
# Last expression of the cell, so its return value is shown as the cell output.
T0113.count_file_lines(_rc_seq_path)


In [None]:
# Tally every FASTA record in the gzipped RNAcentral file

from Bio import SeqIO
# presumably makes the gzip module available as T0113.GZip — confirm against UtilityLib
T0113.require('gzip', 'GZip')

_counter = 0
with T0113.GZip.open(_rc_seq_path, 'rt') as _fh:
  # enumerate(..., start=1) leaves _counter at the number of records seen;
  # it stays 0 when the file holds no records.
  for _counter, _rec in enumerate(SeqIO.parse(_fh, 'fasta'), start=1):
    pass

# Total records ~32_524_827 fasta records
_counter

In [None]:
# Write LncRNA records in chunks
#
# Streams the RNAcentral FASTA and pickles the records as DataFrames of
# `_df_row_size` rows each, named lncRNA-1.df.gz, lncRNA-2.df.gz, ...
# A chunk whose file already exists is not rewritten, so an interrupted run
# can be resumed. Records are counted even while a chunk is being skipped;
# otherwise the chunk index would never advance past an existing file.
# NOTE(review): an existing chunk file is assumed to be complete — delete any
# partially written file before resuming.

from Bio import SeqIO
# presumably makes the gzip module available as T0113.GZip — confirm against UtilityLib
T0113.require('gzip', 'GZip')

_total_sequence_records = round(32_524_827, -6) + 10

_df_pkl_storage = (T0113.path_base / 'lnc-dfs').validate()
_df_file_template = "lncRNA-%s.df.gz"

_rc_seq_path = T0113.config.path_rnacentral / 'sequences/rnacentral_active.fasta.gz'

with T0113.GZip.open(_rc_seq_path, 'rt') as _fh:
  _fasta_seq_ids = []      # Rows buffered for the chunk currently being filled
  _file_written_num = 0    # Zero-based index of the chunk currently being filled
  _counter = 0             # Number of records consumed so far
  _df_row_size = 100_000   # Records per chunk file

  # Target file of the current chunk (file names are one-based).
  _seq_df_path = _df_pkl_storage / (_df_file_template % str(_file_written_num + 1))
  _chunk_done = _seq_df_path.exists()

  for _rec in SeqIO.parse(_fh, 'fasta'):
    _file_num = _counter // _df_row_size

    if not (_file_written_num == _file_num):
      # Crossed a chunk boundary: persist the finished chunk unless it was already on disk.
      if not _chunk_done:
        _DF = T0113.DF(_fasta_seq_ids, columns=['LncRNA', 'Description', 'Seq'])
        T0113.pickle(_seq_df_path, _DF)

      _fasta_seq_ids = []

      # Set next file
      _file_written_num = _file_num
      print(f"{_file_written_num=} {_counter}")

      _seq_df_path = _df_pkl_storage / (_df_file_template % str(_file_written_num + 1))
      _chunk_done = _seq_df_path.exists()

    # Always advance the counter, even for records of an already written chunk.
    _counter += 1
    if not _chunk_done:
      _fasta_seq_ids.append((_rec.id, _rec.description, str(_rec.seq)))

  # Persist the last (usually partial) chunk, which never reaches a boundary inside the loop.
  if _fasta_seq_ids:
    _DF = T0113.DF(_fasta_seq_ids, columns=['LncRNA', 'Description', 'Seq'])
    T0113.pickle(_seq_df_path, _DF)
