In [7]:
import pandas as pd
from pathlib import Path
from Bio import AlignIO
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Blast import NCBIWWW, NCBIXML

In [8]:
# Input CSV tables; each file's stem (the name without directory or extension)
# becomes the key under which its DataFrame is stored.
input_files = [
    "data/EP.csv",
    "data/ESP.csv",
    "data/SP.csv",
]
data_dfs = dict(
    (Path(csv_path).stem, pd.read_csv(csv_path)) for csv_path in input_files
)

In [9]:
# Use the EP table's header as the reference schema.
columns = data_dfs["EP"].columns

# Check that all headers are the same.
# The comparison must be wrapped in `assert`: a bare `expr, msg` line only
# builds a tuple and throws it away, so a mismatch would pass silently.
for k, v in data_dfs.items():
    assert set(columns) == set(v.columns), f"Columns for {k} are not the same as EP"

data_dfs["EP"].head()

Unnamed: 0,RNA1 name,RNA2 name,RNA1 Strand,RNA1 from,RNA1 to,RNA1 ligation from,RNA1 ligation to,RNA2 Strand,RNA2 from,RNA2 to,RNA2 ligation from,RNA2 ligation to,Number of interactions,Odds Ratio,Fisher's exact test p-value,RNA1 type,RNA2 type,RNA1 seq,RNA2 seq
0,thrL(SL1344_0001),GcvB(ncRNA0014),+,146,170,182,198,+,3156915,3156984,3156808,3156914,10,7896213417,0.000118,5UTR,sRNA,AATACAAGACAGACAAATAAAAATGACAGAGTACACAACATCCATG...,GGAATGCGTGTTCTGATGGGCTTTTGGCTTACGGTTGTGATGTTGT...
1,SL1344_0018(SL1344_0018).SL1344_0019(SL1344_00...,ArcZ(ncRNA0002),+,19997,20004,20042,20047,+,3511915,3511919,--,--,22,2121390825,8.55e-55,IGR,sRNA,ATGATTCAAGCCCTGGTTTACCGGGGCTTTTCTCCACCAGGGACAG...,GGTCAAACCGGGGTCATTTTTTTTC
2,SL1344_0032(SL1344_0032).SL1344_0033(SL1344_00...,ArcZ(ncRNA0002),+,35145,35150,--,--,+,3511913,3511918,3511860,3511862,13,8773839147,2.28e-34,IGR,sRNA,CTTTTATAGTAGCGCCAGGGAAATAA,ATTTCCCTGGTGTTGGCGCAGTATTCGCGCACCCCGGTCAAACCGG...
3,StyR-29(ncRNA0419),CyaR(ncRNA0009),-,52558,52583,52529,52539,+,2228924,2228931,2228863,2228867,21,2308607016,1.37e-19,sRNA,sRNA,CCTTTGAATTGTCCATATAGAACACATTTGGGAGTTGGACCTTGGC...,ATAAATGCTAGCTGTACCAGGAACCACCTCCTTGGCCTGCGTAATC...
4,StyR-29(ncRNA0419),CpxQ(ncRNA0205),-,52559,52598,52529,52534,+,4292577,4292600,4292541,4292542,10,8940498395,3.2e-05,sRNA,sRNA,GGGCATATTCCTCGGCCTTTGAATTGTCCATATAGAACACATTTGG...,TTTTCCTTGCCATAGACACCATCCCTGTCTTCCCCCACATGATGTG...


In [19]:
def map_record_to_SeqRecord(r, col_name="RNA1"):
    """Build a Biopython ``SeqRecord`` for one RNA partner of an interaction row.

    Parameters
    ----------
    r : mapping (e.g. a pandas row ``Series``)
        Must provide the keys ``"<col_name> name"``, ``"<col_name> seq"``,
        ``"<col_name> Strand"``, ``"<col_name> from"``, ``"<col_name> to"``
        and ``"<col_name> type"``.
    col_name : str, default ``"RNA1"``
        Column prefix selecting which partner of the row to convert.

    Returns
    -------
    SeqRecord
        Record whose ``id`` and ``name`` are both the ``name`` column, whose
        sequence is the ``seq`` column, and which carries a single feature
        spanning ``from``..``to`` on the row's strand, typed by the ``type``
        column.
    """
    # Any strand value other than "+" is treated as the reverse strand.
    strand_code = 1 if r[f"{col_name} Strand"] == "+" else -1
    record_id = r[f"{col_name} name"]
    sequence = Seq(r[f"{col_name} seq"])
    # Coerce to built-in int so the location also accepts numpy integer
    # scalars coming out of a pandas row.
    from_pos = int(r[f"{col_name} from"])
    to_pos = int(r[f"{col_name} to"])
    feature_type = r[f"{col_name} type"]

    # NOTE(review): from/to look like genome coordinates rather than offsets
    # into `sequence`, so the feature lies outside the record's own sequence.
    # Harmless for FASTA output (features are not written) — confirm before
    # exporting to a feature-aware format.
    location = FeatureLocation(from_pos, to_pos, strand=strand_code)

    return SeqRecord(
        id=record_id,
        # The FASTA writer emits ">" and the id itself. The former description
        # ('>' + id + padding) therefore produced headers of the form
        # ">id >id  ". An empty description yields a clean ">id" header.
        description="",
        features=[SeqFeature(location, type=feature_type)],
        name=record_id,
        seq=sequence,
    )

# For every loaded table, convert each row into two SeqRecords (one per RNA
# partner) and store the resulting Series under "<table>_RNA1" / "<table>_RNA2".
bio_dfs = {}
for table_name, table in data_dfs.items():
    for partner in ("RNA1", "RNA2"):
        # Extra keyword arguments to DataFrame.apply are forwarded to the function.
        bio_dfs[f"{table_name}_{partner}"] = table.apply(
            map_record_to_SeqRecord, axis=1, col_name=partner
        )

In [20]:
# Write each collection of SeqRecords to data/<key>.fasta.
# SeqIO.write returns the number of records written; print it so the cell's
# output (e.g. "EP_RNA1: 436") is reproduced on a fresh run.
for k, records in bio_dfs.items():
    n_written = SeqIO.write(records, f"data/{k}.fasta", "fasta")
    print(f"{k}: {n_written}")

EP_RNA1: 436
EP_RNA2: 436
ESP_RNA1: 855
ESP_RNA2: 855
SP_RNA1: 1705
SP_RNA2: 1705
