1. snakemake output
2. filtering steps
    - removing control SRRs
    - removing cellular contigs
    - removing duplicated studies
3. selected 510 contigs - add metadata

In [69]:
import os
import pandas as pd
import numpy as np
from Bio import SeqIO
from pysradb.sraweb import SRAweb

In [70]:
# import data
# Load the filtered non-cellular contigs and the curated input spreadsheets.
non_celular_path = '/home/tobamo/analize/project-tobamo/analysis/data/contigs/contigs_non_cellular_filtered.fasta'
records = list(SeqIO.parse(non_celular_path, 'fasta'))

# Unique run accessions, taken from the last '_'-separated token of each contig id.
# '=' is normalised to '_' first so these names are derived exactly like the
# `corresponding_srr` column (which is split from the '='-normalised contig id).
# Sorting gives a reproducible order; a bare set of strings has no stable order
# between kernel sessions.
record_names = sorted({r.id.replace('=', '_').split('_')[-1] for r in records})

gt = pd.read_excel('/home/tobamo/analize/project-tobamo/analysis/data/domain_sci_input/ground_truth_20250901.xlsx')

# Only the column labels of the report template are kept.
template_cols = pd.read_excel('../data/domain_sci_input/contig_report_columns.xlsx').columns

# NOTE: despite the `_cols` suffix this is the whole mapping sheet (a DataFrame),
# not just its column labels.
sra_metadata_template_cols = pd.read_excel('../data/domain_sci_input/mapping_sra_metadata.xlsx')

ground truth prep

In [71]:
# prep ground truth
# Numeric ground-truth codes -> short category labels.
# Codes that are not listed here end up as NaN in `category`.
gt_category_labels = {
    1: 'tob1',
    2: 'tob2',
    8: 'tob3',
    3: 'oth1',
    5: 'oth2',
    9: 'oth3',
    7: 'mas'
}

gt.columns = gt.columns.str.strip()

# Guarded, non-inplace rename so the cell can be re-run safely: without the guard a
# second run would rename the derived `category` column into a duplicate
# `category_old` column and break the `.map` call below.
if 'category_old' not in gt.columns:
    gt = gt.rename(columns={'category': 'category_old'})
gt['category'] = gt['category_old'].map(gt_category_labels)

# Lookups keyed by contig name: original numeric code and derived label.
gt_by_contig = gt.set_index('contig_name')
gt_category_mapper_old = gt_by_contig['category_old'].to_dict()
gt_category_mapper = gt_by_contig['category'].to_dict()

download metadata & keep columns of interest

In [72]:
# Fetch the SRA run metadata once and cache it on disk; later runs reuse the CSV.
metadata_path = 'results/metadata.csv'

if not os.path.exists(metadata_path):
    os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
    # Query SRA for every run accession found in the contig ids (needs network access).
    db = SRAweb()
    db.sra_metadata(record_names, detailed=True).to_csv(metadata_path)

# Always load from the CSV so `metadata` has the same layout whether it was
# downloaded just now or in an earlier session.
metadata = pd.read_csv(metadata_path, index_col=0)

In [73]:
# create sra_metadata_columnames_mapper
# (built from the complete rows of the mapping sheet: 'run_accession' value -> 'corresponding_srr' value)
sra_mapper = (
    sra_metadata_template_cols
    .dropna()
    .set_index('run_accession')['corresponding_srr']
    .to_dict()
)

# Keep only the SRA fields that go into the report, drop repeated rows and
# rename the run accession so it matches the report column name.
selected_fields = [
    'run_accession',
    'study_accession',
    'study_title',
    'organism_name',
    'collection_date',
    'geo_loc_name',
    'insdc center name',
]
metadata_sub = (
    metadata[selected_fields]
    .drop_duplicates()
    .rename(columns={'run_accession': 'corresponding_srr'})
)

# Report column -> column of `metadata_sub` it is filled from.
report_to_sra_field = {
    'collection_date': 'collection_date',
    'study_accession': 'study_accession',
    'study_title': 'study_title',
    'organism_name': 'organism_name',
    'country': 'geo_loc_name',
    'submitter': 'insdc center name',
}

# One {run accession -> value} lookup per report column.
col_map = {
    report_col: dict(zip(metadata_sub['corresponding_srr'], metadata_sub[sra_field]))
    for report_col, sra_field in report_to_sra_field.items()
}

create a supplementary table

In [74]:
# Create an empty DataFrame with the template columns
# (it has no rows yet; the columns are filled in step by step afterwards)
df = pd.DataFrame(columns=template_cols)

In [75]:
# start filling df columns
# One row per FASTA record; '=' in the record id is replaced by '_' for the contig id.
df['contig_id'] = [rec.id.replace('=', '_') for rec in records]
df['sequence'] = [str(rec.seq) for rec in records]

# The run accession is the last '_'-separated token of the contig id.
df['corresponding_srr'] = df['contig_id'].str.split('_').str[-1]

# Ids containing 'NODE' are labelled spades; everything else is labelled megahit.
df['assembler'] = np.where(df['contig_id'].str.contains('NODE'), 'spades', 'megahit')
df['contig_length'] = df['sequence'].map(len)

# Ground-truth code and label; contigs missing from the ground truth get NaN.
df['ground_truth_category_old'] = df['contig_id'].map(gt_category_mapper_old)
df['ground_truth_category'] = df['contig_id'].map(gt_category_mapper)

# `isin` already yields the boolean flag (NaN categories count as False),
# so no np.where(..., True, False) wrapper is needed.
df['known_or_potentially_novel_tobamovirus'] = df['ground_truth_category'].isin(['tob1', 'tob2', 'tob3'])

In [76]:
# Fill every metadata column by looking up each contig's run accession
srr_per_contig = df['corresponding_srr']
for target_col, srr_lookup in col_map.items():
    df[target_col] = srr_per_contig.map(srr_lookup)

domain scientists' input

In [77]:
# Spreadsheet filled in by the domain scientists; column labels are stripped of
# surrounding whitespace so they can be matched against the report columns.
table_path = '/home/tobamo/analize/project-tobamo/analysis/data/domain_sci_input/Tobamo - tabela za tobamo kontige - kategorije (1).xlsx'
table = pd.read_excel(table_path)
table.columns = table.columns.str.strip()

In [78]:
# Columns that hold no data yet (every value is NA)
is_all_na = df.isna().all()
na_cols = is_all_na[is_all_na].index.tolist()

In [79]:
# For every still-empty report column that also exists in `table`,
# build a {contig_id -> value} lookup from `table`.
na_col_mappers = {
    name: table.set_index('contig_id')[name].to_dict()
    for name in na_cols
    if name in table.columns
}

In [80]:
# Fill the previously empty columns from the per-contig lookups
for name in na_col_mappers:
    df[name] = df['contig_id'].map(na_col_mappers[name])

In [81]:
# Move the ground-truth columns to the far right; all other columns keep their order.
gt_cols = ['ground_truth_subcategory', 'ground_truth_category_old', 'ground_truth_category']
cols = [name for name in df.columns if name not in gt_cols] + gt_cols
df = df[cols]

add model predictions

In [82]:
# Per-contig model output written by the prediction pipeline
model_predictions_path = '/home/tobamo/analize/project-tobamo/analysis/model/results/snakemake/predictions/contig_predictions.csv'
model_predictions = pd.read_csv(model_predictions_path)

In [83]:
# Lookups keyed by contig name: predicted class and the `prob_1` probability
predictions_by_contig = model_predictions.set_index('contig_name')
model_prediction_mapper = predictions_by_contig['predicted_class'].to_dict()
model_probability_mapper = predictions_by_contig['prob_1'].to_dict()

# Correct the misspelled column label ('probabiility') before filling it
df = df.rename(columns={'model_prediction_probabiility': 'model_prediction_probability'})

# Attach the predictions to each contig
contig_ids = df['contig_id']
df['model_prediction'] = contig_ids.map(model_prediction_mapper)
df['model_prediction_probability'] = contig_ids.map(model_probability_mapper)

In [84]:
# Preview the assembled table (the display is truncated for large frames)
df

Unnamed: 0,contig_id,sequence,corresponding_srr,assembler,cluster_membership,known_or_potentially_novel_tobamovirus,contig_length,orf1_complete,orf1_partial,orf1_length,...,organism_name,submitter,country,publication_link,source_sample_category,genbank_accession_number,collection_date,ground_truth_subcategory,ground_truth_category_old,ground_truth_category
0,NODE_3447_length_1836_cov_746.295340_DRR146894,TTTTTTTTCTTCTTGAGTGTATGTTAAATATTTGTCAAATCCTTTC...,DRR146894,spades,,False,1836,,,,...,Eotetranychus uncatus,,Japan:Gumma,,,,2014,3,3,oth1
1,NODE_3346_length_1986_cov_333.205593_DRR146906,AAAAAAAACTTGCGAAGAGATCAAGGCAAGATTCGCTGATAAGCAG...,DRR146906,spades,,False,1986,,,,...,Schizotetranychus lespedezae,,Japan:Ibaraki,,,,2012,3,3,oth1
2,NODE_4755_length_1547_cov_49.162869_DRR146906,ACCAACGGGAACTACAAACCTTATCGCGACAACAAAGTCCTTGAGG...,DRR146906,spades,,False,1547,,,,...,Schizotetranychus lespedezae,,Japan:Ibaraki,,,,2012,3,3,oth1
3,NODE_7104_length_1501_cov_4.732365_ERR1356733,CTTGATGTGTTTAGCACCAAGTTTCGAAATCAGCTTCAAAGGGTCG...,ERR1356733,spades,,True,1501,,,,...,metagenome,EAWAG,,,,,,1,1,tob1
4,NODE_5816_length_1652_cov_2.914214_ERR1356733,CTGCGATATCCGAATTCACAAAGGAGAAATCACCGCCTTTCGGACA...,ERR1356733,spades,Cluster_15,True,1652,False,True,550,...,metagenome,EAWAG,,,,,,2,2,tob2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,NODE_199_length_2867_cov_7.196416_SRR9596414,TTTTTTTTTTTTTTTTAAAATAAAAGGGCGGAAAGGAAATGTTACA...,SRR9596414,spades,,False,2867,,,,...,Triticum aestivum,,Portugal: vora,,,,,3,3,oth1
506,NODE_34528_length_748_cov_7.391952_SRR9596414,TGAGTCGTTTCGCTTGCACTCGGCAAAAATGTATGGTGCGATGAAG...,SRR9596414,spades,,False,748,,,,...,Triticum aestivum,,Portugal: vora,,,,,5,5,oth2
507,NODE_2_length_3626_cov_0.985424_SRR9665731,CTGCAATCCGTGACTAGATCTTAAAGATGTTGCGAGGAATGTGATG...,SRR9665731,spades,,True,3626,,,,...,tick metagenome,,China:Shanghai,,,,2018-06,1,1,tob1
508,NODE_164_length_702_cov_1.326957_SRR9665731,CTTAAGTATTTATCTATAACTGACTTTAGTGTAGTTAAAGCTCAGA...,SRR9665731,spades,,True,702,,,,...,tick metagenome,,China:Shanghai,,,,2018-06,1,1,tob1


In [87]:
# Write the supplementary table in both formats.
# NOTE(review): the CSV keeps the row index as its first column while the Excel
# file drops it (index=False) - confirm this difference is intended.
csv_path = 'results/contigs_supp_data_20250902.csv'
xlsx_path = 'results/contigs_supp_data_20250902.xlsx'
df.to_csv(csv_path)
df.to_excel(xlsx_path, index=False)