In [12]:
import pandas as pd
from pathlib import Path
from Bio import AlignIO
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Blast import NCBIWWW, NCBIXML
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import re

from src.sup_data_to_fasta import load_xlsx

sup5_file_path = Path("data/Liu_sup5_data.xlsx")
alignments_files_paths = [
    Path("output/EP-OD0_alignments_results.tsv"),
    Path("output/ESP-OD2_alignments_results.tsv"),
    Path("output/SP-OD2_alignments_results.tsv")
    ]

# Análise Exploratória dos Dados Suplementares 5

## Sobre

Este notebook contém uma análise exploratória dos dados fornecidos por Liu, et al. No [suplemento 5](https://www.nature.com/articles/s41467-023-43632-1#additional-information
) com o objetivo de averiguar os tratamentos executados no artigo

In [13]:
def add_scenario_column(df_dict):
    return {k:df.assign(scenario=k) for k,df in df_dict.items()}

def extract_scenario_from_file_path(string_arg):
    return re.search(r"(.*)-(.*)", string_arg).group(1)

def concat_df_dict(df_dict):
    return pd.concat(df_dict.values())

alignments_df_list = [pd.read_csv(file_path, sep='\t') for file_path in alignments_files_paths]
alignments_df_dict = {extract_scenario_from_file_path(k.name):v for k,v in zip(alignments_files_paths, aligments_df_list)}
alignments_df_dict = add_scenario_column(alignments_df_dict)
alignments_df = concat_df_dict(alignments_df_dict)

sup5_data_df_dict = load_xlsx(sup5_file_path)
sup5_data_df_dict = {extract_scenario_from_file_path(k):v for k,v in sup5_data_df_dict.items()}
sup5_data_df_dict = add_scenario_column(sup5_data_df_dict)
sup5_data_df = concat_df_dict(sup5_data_df_dict)

In [18]:
alignments_df.head()

Unnamed: 0,Query_ID,Subject_ID,PIdentity,Alignment_Length,Mismatches,Gap_Openings,Query_Start,Query_End,Subject_Start,Subject_End,E_value,Bit_Score,scenario
0,ArcZ(ncRNA0002),NC_003197.2,100.0,58,0,0,1,58,3490451,3490508,5.09e-25,108.0,EP
1,STnc2110(ncRNA0286),NC_003197.2,100.0,49,0,0,1,49,4009394,4009346,3.9399999999999996e-20,91.6,EP
2,CpxQ(ncRNA0205),NC_003197.2,100.0,61,0,0,1,61,4271116,4271176,1.18e-26,113.0,EP
3,STnc2000(ncRNA0278),NC_003197.2,100.0,43,0,0,1,43,1248179,1248137,7.110000000000001e-17,80.5,EP
4,ArcZ(ncRNA0002),NC_003197.2,100.0,58,0,0,1,58,3490451,3490508,5.09e-25,108.0,EP


In [6]:
def should_be_only_one_genome(df):
    return df.Subject_ID.nunique() == 1

def query_start_should_always_be_less_than_query_end(df):
    return (df.Query_Start <= df.Query_End).all()

# validationg
assert should_be_only_one_genome(alignments_df), "o banco de dados deveria ter apenas um genoma de referência de Salmonella" # all alignments are done to same Salmonella enterica genome
assert query_start_should_always_be_less_than_query_end(alignments_df), "O ínicio da query deve ser menor ou igual ao fim da query"

In [55]:
def not_aligned(col):
    return col[~col.isin(alignments_df.Query_ID)]

RNA1_not_aligned = not_aligned(sup5_data_df["RNA1 name"])
RNA2_not_aligned = not_aligned(sup5_data_df["RNA2 name"])

filtered_alignments_df = alignments_df[alignments_df['Query_ID'].isin(RNA1_not_aligned) & alignments_df['Query_ID'].isin(RNA2_not_aligned)]
print(f"# queries não alinhadas: {len(filtered_alignments_df)}")

# queries não alinhadas: 0
