# dataframe 1

In [8]:
import pandas as pd
import re
from Bio.SeqIO.FastaIO import SimpleFastaParser

# Helper: count how often each value of one header field occurs in a FASTA file (headers are split on a separator)
def lncRNA_db(file_path, separator, index):
    """Count how often each value of one header field occurs in a FASTA file.

    Every line that starts with ">" is split on ``separator`` and the field at
    position ``index`` is tallied. Headers with too few fields are skipped.

    Args:
        file_path: Path to the FASTA file (read as UTF-8 text, matching read_fasta).
        separator: String the header line is split on, e.g. "|".
        index: Position of the field to count. Field 0 still carries the
            leading ">" because the header line is split as-is.

    Returns:
        dict mapping each field value to the number of headers it appeared in.
    """
    lncRNA_dictionary = {}
    with open(file_path, encoding="utf-8") as fasta_file:
        # Stream the file line by line rather than readlines(), so a large
        # FASTA file is never held in memory as a whole.
        for line in fasta_file:
            if not line.startswith(">"):
                continue
            # Drop the line terminator before splitting; otherwise the last
            # field of every header is counted with a trailing newline.
            parts = line.rstrip("\r\n").split(separator)
            if len(parts) > index:
                name = parts[index]
                lncRNA_dictionary[name] = lncRNA_dictionary.get(name, 0) + 1
    return lncRNA_dictionary

# MAIN Function to read FASTA file and return a DataFrame
def read_fasta(file_path, columns):
    """Parse a FASTA file into a DataFrame with one row per record.

    Each row holds three values, in this order: the record ID (the first run
    of word characters / apostrophes in the title line, or "" when the title
    has none), the sequence string, and the sequence length.

    Args:
        file_path: Path to the FASTA file (read as UTF-8 text).
        columns: Three column labels for the ID, sequence and length values.

    Returns:
        pandas.DataFrame built from the parsed records.
    """
    records = []
    with open(file_path, encoding="utf-8") as fasta_file:
        # SimpleFastaParser yields (title, sequence) string pairs, so the
        # sequence needs no further joining before it is measured.
        for title, sequence in SimpleFastaParser(fasta_file):
            # Keep only the leading token of the title as the ID.
            id_match = re.search(r"[\w']+", title)
            record_id = id_match.group(0) if id_match else ""
            records.append([record_id, sequence, len(sequence)])

    return pd.DataFrame(records, columns=columns)

# Column labels for the DataFrame, in the order read_fasta builds each record:
# record ID, sequence string, sequence length.
columns = ["id","sequence" , "seq_length"]

# Parse the FASTA file into a DataFrame with one row per record.
data = read_fasta("rnacentral_seq.fasta", columns=columns)

# Print the DataFrame as plain text.
print(data)


               id                                           sequence  \
0   URS0000BC45A5  GUGAAAUGCAAAUGAGUUUUUUGAGUUUUUGUUGACACCUCUAACU...   
1   URS00026A285D  GAAGCCCGGGCCAGCCCCAGCGCCCAGCUGGAUCCCGGAAUAACCG...   
2   URS00026A23D0  CUCCCCUUCACCAGCUCCGCGGAAACCAUAGCAACGGGCAGCCGGA...   
3   URS0000456120  AGACUCCUGCAAACAGCAGCUGGCUUAGUGGAGGCUGCCAGACUUC...   
4   URS00007E3599  AAGAAGUGCCUUUCACCUCUCAUCAUGAUUCUGUGGCCUCCCCAGC...   
5   URS000075E21A  GAACUCCUUACCCCAGCUGCCUGGCUGCCCUCAGCUUCCCAAAGCU...   
6   URS0000CCE02F  CAUCAGACACGAAGGAGAGGCCAACAGAUGAGGGAAGCCAUUUUUC...   
7   URS00026A1FD4  CCUUUCCUGCUUCCUCACCCGCUGUAGCAAGCAGGUGUGGAGUCGA...   
8   URS000075E59E  UUGAAGAGAUGAGUGCGGGGCUCAUCUAUCCCUGGAAUUGUCUUUC...   
9   URS000075A563  GAGAAGGGAGGAGUUAUUCAGGCCUCCGCCAGCUUCUAGGCCCUGG...   
10  URS0000193C7E  GGCCGGGCGCGGUGGCUCACGCCUGUAAUCCCAGCUCUCAGGGAGG...   
11  URS000075B07A  AGCGGGCUGCAGGGCUGCGGGCGCUUGGUUCGGCCUGGCCCGGCCG...   
12  URS000075AA30  AAACAGUUGCUGUGGGGAUUGAAUGACUAGUGCAUGUGAAGCUGC

In [9]:
# Save the parsed records to an Excel sheet named "lncRNA Data"; index=True also writes the row index as a column.
data.to_excel("lncRNA_data.xlsx", index=True, sheet_name="lncRNA Data")

# For large, gzip-compressed FASTA files, read them with gzip.open

In [None]:
#IN CASES FOR LARGE FILES, WE USE gzip.open 
import gzip
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio import SeqIO

_df = 'lncRNA_LncBookv2.0.fa.gz'   # gzip-compressed FASTA file to read

MAX_RECORDS = 100_000              # upper bound on the number of records kept in memory

blank_list = []                    # collected (id, description, sequence) tuples

_reads = 0                         # number of records collected so far

# Mode 'rt' opens the gzip file as text, so the FASTA parser receives text lines.
with gzip.open(_df, 'rt') as _fh:
    for _rec in SeqIO.parse(_fh, 'fasta'):
        # Check the limit BEFORE storing the record, so exactly MAX_RECORDS
        # records are kept (checking after the append keeps one extra).
        if _reads >= MAX_RECORDS:
            break
        _reads += 1
        # Store the sequence as a plain string rather than a Seq object.
        blank_list.append((_rec.id, _rec.description, str(_rec.seq)))



In [None]:
import pandas as pd

# Build a DataFrame from the (id, description, sequence) tuples collected in blank_list.
data2= pd.DataFrame(blank_list, columns=('id', 'desc', 'seq'))

# Save to CSV. index=True writes the row index as an extra leading column.
# NOTE(review): a plain pd.read_csv on this file will surface that column (typically as "Unnamed: 0") - confirm this is intended.
data2.to_csv("lncRNA_LncBookv2.csv" , index=True)

# Count the A, U, G and C nucleotides in a sequence and return each as a fraction of the sequence length

In [3]:
def nucleotide_fractions(rna_seq):
    """Return the A, U, G and C content of a sequence as fractions of its length.

    Only exact, case-sensitive matches of 'A', 'U', 'G' and 'C' are counted;
    any other character (for example 'T' or 'N') adds to the length but to
    none of the four counts, so the fractions need not sum to 1.

    Args:
        rna_seq: Sequence string to analyse.

    Returns:
        dict with keys 'A', 'U', 'G', 'C' mapping to each nucleotide's
        fraction. All four values are 0.0 for an empty sequence.
    """
    seq_length = len(rna_seq)

    # Count the occurrences of each nucleotide, in the fixed order A, U, G, C.
    counts = {base: rna_seq.count(base) for base in ('A', 'U', 'G', 'C')}

    # An empty sequence would otherwise raise ZeroDivisionError below.
    if seq_length == 0:
        return {base: 0.0 for base in counts}

    return {base: count / seq_length for base, count in counts.items()}

rna_seq= "CACACTCTGAGTTTCTGAGAGTAAGCCACTGTCAGTTCCTGGGGTGAGCCACCAGCCACATGGACACAATTTCCTCTTTTTGATGGAAACGTTTTCTTCCAAATTGTGCTGCACGTTTTTGGCGAGAGCATGGGGCTGTGCGGCGTCCCCTCCCTGGCGCCCACCTGTGCCCTGCACACTGGCCTGCACTGTGGTGATCTCGCTTGGCCCCCACCTGATTCCCGACATACAGCAGAGGAACCTTAGGCTCAGGTGGAACAGCCTCAACTGATTCTGTCCCTGAACTTCCGTACACAGCCCTGGAGTCGTCTTAGAGCCATGATTTATTTAACTGTTCTTTCATTTTACAGAACATAAAATGTATTGTTTCCAACTTTTTTCCTATGGTAAATAATACTAAAGTAAATATCTCTGTGCATGAATCTTTTTGTATATGTTGGAATATCCTTAAGATAAGGCCCCAGAACTAAAAGTACCCTGTCAAAAGGTGAGCATTTCCGGTTCCCCTGCTGTGCTTTGCTGCGTTGTTCTCTCCTGCTGCAACGTTCTCACTCCACAATCCTGGGGCAGGGAGGGGAGGCCCAGCTGAGTTTGGATCATAATCCTGAAAGACACAATCCCAAGCACCATAATGTGGAATGTTGAAATCCCTAAAGATCAAAATCCCTCAAGTCTAAAATCCCTGATATTTCAGATGACCACAGCTACAGGGCTAGGTGCACACAATTAGTAACCGTAGCGATATACGTGTACACGTTTCTCTTTTGACTTATTTCTTTATGGTCTGTCTTCTTATAACTGCTACACCCATGCCGCCGTCGTTAGTTACCTCAGTGTTTATGCAAAAATACCTGTTATCATTGCCTATTTTATTGTGTAAAGTGGCCTATGAAATGTTCTGTTGTGTTTTTATGTTTCTCAAATACATACCTTTTAAAAATGTAAA"
# NOTE(review): the example sequence above is written with T rather than U, so the
# 'U' fraction is 0.0 and the four reported fractions do not sum to 1.
fractions = nucleotide_fractions(rna_seq)
print(fractions)


{'A': 0.24735729386892177, 'U': 0.0, 'G': 0.19873150105708245, 'C': 0.24312896405919662}


# The next code processes a whole dataset and computes the fraction of A, U, G and C in each sequence

In [None]:
import pandas as pd

def nucleotide_fractions(rna_sequence):
    """Return the A, U, G and C content of a sequence as fractions of its length.

    Only exact, case-sensitive matches of 'A', 'U', 'G' and 'C' are counted.

    Args:
        rna_sequence: Sequence whose nucleotides are counted.

    Returns:
        Tuple ``(a_fraction, u_fraction, g_fraction, c_fraction)``; every
        entry is 0 when the sequence is empty.
    """
    total = len(rna_sequence)

    # Tally the four bases in the fixed order A, U, G, C.
    tallies = [rna_sequence.count(base) for base in ('A', 'U', 'G', 'C')]

    # Nothing to divide by for an empty sequence: report zeros instead.
    if total <= 0:
        return 0, 0, 0, 0

    a_part, u_part, g_part, c_part = (tally / total for tally in tallies)
    return a_part, u_part, g_part, c_part

def process_rna_sequences(file_path, sequence_column='seq'):
    """Read a CSV of sequences and append per-row nucleotide fractions.

    Relies on ``nucleotide_fractions`` returning a 4-tuple in the order
    (A, U, G, C).

    Args:
        file_path: Path to the CSV file to read.
        sequence_column: Name of the column holding the sequences
            (defaults to 'seq').

    Returns:
        The DataFrame read from the CSV with four extra columns:
        'A_fraction', 'U_fraction', 'G_fraction' and 'C_fraction'.

    Raises:
        KeyError: If ``sequence_column`` is not a column of the CSV file.
    """
    df = pd.read_csv(file_path)

    # Print the column names so the reader can verify which column holds the sequences.
    print("Column names in the CSV file:", df.columns.tolist())

    if sequence_column not in df.columns:
        raise KeyError(f"Column '{sequence_column}' not found in the CSV file.")

    # Empty CSV cells are read back as NaN (a float), which supports neither
    # len() nor .count(); treat any non-string value as an empty sequence so
    # its fractions come out as 0 instead of crashing the whole run.
    fractions = [
        nucleotide_fractions(sequence if isinstance(sequence, str) else "")
        for sequence in df[sequence_column]
    ]

    # Share df's index so the column-wise concat lines the rows up one-to-one.
    results_df = pd.DataFrame(
        fractions,
        columns=['A_fraction', 'U_fraction', 'G_fraction', 'C_fraction'],
        index=df.index,
    )

    return pd.concat([df, results_df], axis=1)

# Example usage: add nucleotide-fraction columns to the CSV written earlier in this notebook.
file_path = 'lncRNA_LncBookv2.csv'  # Replace with the path to your CSV file
processed_df = process_rna_sequences(file_path)
print(processed_df)

# Save the processed DataFrame to a new CSV file; index=False leaves the row index out of the file.
processed_df.to_csv('processed_rna_sequences.csv', index=False)
