In [2]:
import pandas as pd
from pathlib import Path
from Bio import AlignIO
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Blast import NCBIWWW, NCBIXML
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import re

from sup_data_to_fasta import load_xlsx, transform_raw_dfs_to_queries

sup5_file_path = Path("../data/Liu_sup5_data.xlsx")

# Análise Exploratória dos Dados Suplementares 5

## Sobre

Este notebook contém uma análise exploratória dos dados fornecidos por Liu, et al. No [suplemento 5](https://www.nature.com/articles/s41467-023-43632-1#additional-information
) com o objetivo de averiguar os tratamentos executados no artigo

### Objetivo

* Verificar a origem das tags dos pares sRNA/mRNA dos dados suplementares.
* Ganhar mais insight sobre as interações medidas.



In [3]:
import pandas as pd
from pathlib import Path

def load_alignments():
    output_dir = Path("/workspaces/sRNAs_Interactomes/output")
    alignments_files = output_dir.glob("*alignments_results.tsv")
    
    exp_word_size_list = []

    alignments_df_list = []
    for file_path in alignments_files:
        prefix = file_path.stem.split("_")[0]
        df = pd.read_csv(file_path, sep='\t')
        word_sz = prefix.split("w")[1]
        experiment = prefix.split("-")[0]
        exp_word_size_list.append((experiment, word_sz))
        df["experiment"] = experiment
        df["word_sz"] = word_sz
        alignments_df_list.append(df)
    
    alignments_dict = {(exp, word_sz): df for (exp, word_sz), df in zip(exp_word_size_list, alignments_df_list)}
    return alignments_dict, exp_word_size_list


In [4]:
alignments_dict, exp_word_size_list = load_alignments()

In [14]:
def load_queries():
    sup5_file_path = Path("/workspaces/sRNAs_Interactomes/data/Liu_sup5_data.xlsx")
    dict = load_xlsx(sup5_file_path)
    queries_dict = transform_raw_dfs_to_queries(dict)
    queries_dict = {k.split('-')[0]:v for k,v in queries_dict.items()}
    return queries_dict

In [15]:
queries_dict = load_queries()

In [16]:
queries_df = queries_dict['EP']
alignments_df = alignments_dict[('EP', '11')]
print(f"#queries {len(queries_df)}")
print(f"#alignments {len(alignments_df)}")

KeyError: ('EP', '11')

In [7]:
def join_dfs(queries_df, alignments_df):
    joined_df = pd.merge(queries_df, alignments_df, left_on='name', right_on='qseqid', how='left', suffixes=('_query', '_alignment'))
    return joined_df

In [8]:
aligned_queries_dict = {}
for (exp, word_sz) in exp_word_size_list:
    queries_df = queries_dict[exp]
    alignments_df = alignments_dict[(exp, word_sz)]
    joined_df = join_dfs(queries_df, alignments_df)
    aligned_queries_dict[(exp, word_sz)] = joined_df

In [9]:
not_aligned_dict = {}

for (exp, word_sz), df in sorted(aligned_queries_dict.items()):
    non_aligned_queries = df[df['qseqid'].isnull()]
    
    print(f"Number of queries in {exp} experiment not aligned for word_size={word_sz}: {len(non_aligned_queries)}")
    if len(non_aligned_queries):
        not_aligned_dict[(exp, word_sz)] = non_aligned_queries['name'].tolist()
        # print(f"Queries not aligned in {exp} experiment for word_size={word_sz}: {non_aligned_queries['name'].tolist()}")


In [30]:
for (exp, word_sz), not_aligned in sorted(not_aligned_dict.items()):
    print(f"Queries not aligned in {exp} experiment for word_size={word_sz}: {not_aligned}")

Queries not aligned in EP experiment for word_size=15: ['SL1344_1792(SL1344_1792).SL1344_1791(SL1344_1791).IGR', 'SL1344_1967(SL1344_1967)', 'SL1344_2696(SL1344_2696)', 'SL1344_2698(SL1344_2698)', 'STnc1680(ncRNA0265)', 'gpB(SL1344_2645).AS', 'rfbX(SL1344_2065)']
Queries not aligned in ESP experiment for word_size=15: ['SL1344_1967(SL1344_1967)', 'SL1344_1968(SL1344_1968)', 'SL1344_1976(SL1344_1976).AS', 'SL1344_2696(SL1344_2696)', 'SL1344_2698(SL1344_2698)', 'cspE(SL1344_0617)', 'folA(SL1344_0088)', 'gpB(SL1344_2645)', 'gpB(SL1344_2645).AS']
Queries not aligned in SP experiment for word_size=10: ['ibsC(SL1344_3172A ).SL1344_3172(SL1344_3172).IGR']
Queries not aligned in SP experiment for word_size=11: ['ibsC(SL1344_3172A ).SL1344_3172(SL1344_3172).IGR', 'sopE(SL1344_2674)']
Queries not aligned in SP experiment for word_size=15: ['SL1344_1967(SL1344_1967)', 'SL1344_2593(SL1344_2593)', 'SL1344_2696(SL1344_2696)', 'SL1344_2698(SL1344_2698)', 'SL1344_2715(SL1344_2715)', 'STnc1680(ncRNA026