In [172]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import pandas as pd
import matplotlib as plt
import os

In [125]:
# Make the project directory the working directory so that the relative
# paths used in this notebook resolve against it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# fail here on any machine where this directory does not exist.
os.chdir("/home/matvey/data/LshCas13a_RNA_cleavage/LshCas13a_in_vitro_total_RNA/")

In [144]:
# Input files, given relative to the current working directory.
# Tab-separated table (it is read with sep="\t").
LRTTableFile = "Results/Tables/TCS_detection_tables/LRTest_table_and_genome_features.tsv"
# FASTA file with the reference sequence (it is parsed with format "fasta").
EcoliChrFile = "Reference_sequences/NC_000913.3.fasta"

In [154]:
# Load the table and order it by adjusted p-value, smallest first.
# ignore_index=True renumbers the rows 0..n-1 after sorting.
LRTTable = pd.read_csv(LRTTableFile, sep="\t").sort_values(
    by="PValue.adj",
    ascending=True,
    ignore_index=True,
)

# Read the FASTA record and keep its transcribed sequence.
EcoliChrSeq = SeqIO.read(EcoliChrFile, "fasta").seq.transcribe()

In [164]:
def extract_adj_seqs(seq, pos, strand, width):
    """Return the strand-oriented window of up to ``2 * width`` bases around ``pos``.

    Parameters
    ----------
    seq : sliceable sequence
        Sequence to cut the window from. For ``strand == "-"`` the sliced
        object must provide ``reverse_complement()``.
    pos : int
        0-based coordinate of the site of interest.
    strand : str
        ``"+"`` or ``"-"``.
    width : int
        Number of bases taken on each side of the window.

    Returns
    -------
    The slice ``seq[pos-width : pos+width]`` for ``"+"``, or the reverse
    complement of ``seq[pos-width+1 : pos+width+1]`` for ``"-"``. In both
    cases ``pos`` ends up at offset ``width`` of the result, provided the
    window was not truncated at the start of the sequence.

    Raises
    ------
    AssertionError
        If ``strand`` is neither ``"+"`` nor ``"-"``.

    Notes
    -----
    The window is truncated at the sequence boundaries; it does not wrap
    around, so ``seq`` is treated as linear.
    """
    assert (strand in ["+", "-"]), "Invalid strand value"
    # should be 0-based coordinates
    # On the minus strand the window is shifted one base to the right so that
    # reverse-complementing puts ``pos`` at the same offset as on the plus strand.
    shift = 0 if strand == "+" else 1
    # Clamp at 0: a negative bound would be read by Python slicing as an offset
    # from the sequence end and silently give an empty or unrelated slice.
    start = max(0, pos - width + shift)
    end = max(0, pos + width + shift)
    seq_slice = seq[start:end]
    if strand == "-":
        seq_slice = seq_slice.reverse_complement()
    return seq_slice

In [183]:
Width = 100
SubSeqsList = []

# Keep only rows whose matched feature is a CDS, then take the first rows
# returned by head().
cds_rows = LRTTable[LRTTable["MatchedFeatureType"] == "CDS"].head()

for i, row in cds_rows.iterrows():
    # The same label serves as both the record id and the record name.
    record_id = f'{row["MatchedFeatureGene"]}-{row["Pos"]}-({row["Strand"]})'

    # "Pos" is shifted by one because extract_adj_seqs expects 0-based coordinates.
    record_seq = extract_adj_seqs(
        seq=EcoliChrSeq,
        pos=row["Pos"]-1,
        strand=row["Strand"],
        width=Width,
    )

    record = SeqRecord(
        seq=record_seq,
        id=record_id,
        name=record_id,
        description=f'{row["MatchedFeatureDescription"]} fragment',
    )
    SubSeqsList.append(record)

In [184]:
# Show the collected SeqRecord objects as the cell output.
SubSeqsList

[SeqRecord(seq=Seq('AAGGCCUGAAAGCUGGCGACCAGAUUCAGUCUGGCGUUGAUGCUGCAAUCAAAC...CUG'), id='rplB-3450942-(-)', name='rplB-3450942-(-)', description='50S ribosomal subunit protein L2 fragment', dbxrefs=[]),
 SeqRecord(seq=Seq('UGCAGGAGCACUCCGUGAUCCUGAUCCGUGGCGGUCGUGUUAAAGACCUCCCGG...CAA'), id='rpsL-3474232-(-)', name='rpsL-3474232-(-)', description='30S ribosomal subunit protein S12 fragment', dbxrefs=[]),
 SeqRecord(seq=Seq('AGGAGCAAAGCUAAUGGCUUUAAAUCUUCAAGACAAACAAGCGAUUGUUGCUGA...GUG'), id='rplJ-4180083-(+)', name='rplJ-4180083-(+)', description='50S ribosomal subunit protein L10 fragment', dbxrefs=[]),
 SeqRecord(seq=Seq('UUGACGAUCUGGAAUUGACUGUCCGCUCUGCUAACUGCCUUAAAGCAGAAGCUA...CCG'), id='rpoA-3440160-(-)', name='rpoA-3440160-(-)', description='RNA polymerase subunit alpha fragment', dbxrefs=[]),
 SeqRecord(seq=Seq('UCGUAACUCCGAUGUAGCCAAAGGUGUGGUUGAAACCUCCCUGAACGUCGGUGU...GCU'), id='pepD-254621-(-)', name='pepD-254621-(-)', description='peptidase D fragment', dbxrefs=[])]