#### steps overview:
0. import packages and data (Snakemake final output table)
1. remove control SRRs (keep only the 253 selected SRRs listed in config/samples_test.tsv)
2. check for duplicated contigs (same study, different SRA entries), keep only one copy (with better metadata)
3. remove cellular organisms as per megan taxonomy
4. remove problematic SRR6846476, then check numbers
5. make fasta files

In [6]:
# import packages
import pandas as pd
import os
from pysradb.sraweb import SRAweb
from Bio import SeqIO 

In [2]:
# Load the combined MEGAN6 results table (final Snakemake output); the first CSV column is the index.
df = pd.read_csv(
    '/home/tobamo/analize/project-tobamo/results/megan6_results_combined.csv',
    index_col=0,
)

# Load the list of selected samples (253 SRRs). The first line of the file is dropped
# and every remaining line is stripped of surrounding whitespace.
samples_path = '/home/tobamo/analize/project-tobamo/config/samples_test.tsv'
with open(samples_path) as file:
    samples = [line.strip() for line in file][1:]

How many SRRs are left in the Snakemake contigs out of the original selection of 253?

In [None]:
# Collect the distinct suffixes (text after the last "_") of the record ids in the
# filtered contig FASTA, then show how many there are.
# NOTE(review): this FASTA is written by the last cell of this notebook, so on a fresh
# checkout this cell only works after the notebook has been run to the end once.
ri = set(
    rec.id.rsplit("_", 1)[-1]
    for rec in SeqIO.parse("../data/contigs/contigs_non_cellular_filtered.fasta", "fasta")
)
len(ri)

1. remove control SRRs

In [3]:
# Drop control contigs: keep only the rows whose SRR is in the selected sample list.
test_results = df.loc[df['SRR'].isin(samples)]

2. check for duplicated contigs (same study, different SRA entries), keep only one copy (with better metadata)

In [4]:
# Map every distinct contig sequence to the unique qseqid values that carry it.
seq_to_qseqid = test_results.groupby('sequence')['qseqid'].unique()

# Keep only the sequences that occur under more than one qseqid.
duplicated_seqs = seq_to_qseqid[seq_to_qseqid.map(len) > 1]

# Flatten to the set of all qseqid values involved in a shared sequence.
duplicated_qseqids = {qseqid for qseqids in duplicated_seqs for qseqid in qseqids}

# Report the findings. The qseqids are printed sorted because the iteration order of a
# set of strings changes between kernel sessions, which would make the saved output
# differ on every re-run.
print(f"Number of sequences shared by multiple qseqid: {len(duplicated_seqs)}")
print(f"Number of qseqid involved: {len(duplicated_qseqids)}")
print("qseqid involved in duplicated sequences:")
print(sorted(duplicated_qseqids))
print('SRRs with duplicated qseqids:')
srr_list = test_results[test_results['qseqid'].isin(duplicated_qseqids)]['SRR'].unique()
print(srr_list)

Number of sequences shared by multiple qseqid: 15
Number of qseqid involved: 31
qseqid involved in duplicated sequences:
{'NODE_83_length_1545_cov_832.384344_ERR3179625', 'NODE_121_length_1179_cov_560.262357_ERR2737479', 'NODE_316_length_714_cov_3.042589_ERR2737479', 'NODE_242_length_5620_cov_498.148836_SRR8658357', 'NODE_316_length_714_cov_3.042589_ERR3179625', 'NODE_244_length_5617_cov_62.071480_SRR8749695', 'NODE_186_length_6730_cov_324.596573_SRR8749694', 'NODE_83_length_1545_cov_832.384344_ERR2737479', 'NODE_263_length_789_cov_170.131420_ERR2737479', 'NODE_228_length_845_cov_1276.600279_ERR2737479', 'NODE_80_length_1581_cov_92.495186_ERR2737479', 'NODE_251_length_6673_cov_596.344603_SRR8749693', 'NODE_138_length_1097_cov_1637.345361_ERR2737479', 'NODE_228_length_845_cov_1276.600279_ERR3179625', 'NODE_4862_length_2337_cov_106.419807_SRR5087400', 'NODE_323_length_705_cov_6047.968858_ERR3179625', 'NODE_244_length_5617_cov_62.071480_SRR8658357', 'NODE_14_length_5191_cov_521.552923_ERR

! Manual inspection needed: look the runs up online or fetch their metadata with SRAweb.

In [13]:
# Convert the array of accessions to a plain Python list. The guard keeps this cell safe
# to re-run: once srr_list is already a list it has no .tolist() method.
if not isinstance(srr_list, list):
    srr_list = srr_list.tolist()

In [15]:
# Initialize SRAweb
# (the client object stays available to later cells as `db`)
db = SRAweb()

# Fetch metadata for all SRRs in srr_list, requesting the detailed variant (detailed=True).
# NOTE(review): presumably this queries remote SRA/ENA services and needs network access — confirm.
metadata = db.sra_metadata(srr_list, detailed=True)

In [None]:
# Save the fetched metadata to CSV; the path is relative to the current working directory.
metadata.to_csv('results/metadata_of_duplicated_contigs.csv')

In [8]:
# Load the saved metadata table, using the first CSV column as the index, and display it.
metadata = pd.read_csv('results/metadata_of_duplicated_contigs.csv', index_col=0)
metadata

Unnamed: 0,run_accession,study_accession,study_title,experiment_accession,experiment_title,experiment_desc,organism_taxid,organism_name,library_name,library_strategy,...,cell_type,sample_type,treatment,replicate,ena_fastq_http,ena_fastq_http_1,ena_fastq_http_2,ena_fastq_ftp,ena_fastq_ftp_1,ena_fastq_ftp_2
0,ERR2737479,ERP108694,Virus Discovery for Vietnam Initiative on Zoon...,ERX2750552,Illumina HiSeq 2500 paired end sequencing; Ill...,Illumina HiSeq 2500 paired end sequencing; Ill...,1070528,viral metagenome,,WGS,...,,,,,,http://ftp.sra.ebi.ac.uk/vol1/fastq/ERR273/009...,http://ftp.sra.ebi.ac.uk/vol1/fastq/ERR273/009...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/ERR273/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/ERR273/...
1,ERR3179625,ERP006046,Virus_Discovery_for_Vietnam_Initiative_on_Zoon...,ERX3207476,Illumina HiSeq 2500 paired end sequencing,Illumina HiSeq 2500 paired end sequencing,1070528,viral metagenome,DN459406Q:F12,WGS,...,,,,,,http://ftp.sra.ebi.ac.uk/vol1/fastq/ERR317/005...,http://ftp.sra.ebi.ac.uk/vol1/fastq/ERR317/005...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/ERR317/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/ERR317/...
2,SRR5087400,SRP094756,Gene expression changes after microenvironment...,SRX2404651,"P493-6 cell line, untreated","P493-6 cell line, untreated",9606,Homo sapiens,P493-6 untreated replicate 5,RNA-Seq,...,B-cell,cells,none,Biological replicate 5,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR508/000...,,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR508/...,,
3,SRR5087405,SRP094756,Gene expression changes after microenvironment...,SRX2404656,"P493-6 cell line, dox-treated, environmental s...","P493-6 cell line, dox-treated, environmental s...",9606,Homo sapiens,P493-6 combination 3,RNA-Seq,...,B-cell,cells,"1 ng/ml doxycycline for 16h; CpG (0.5muM, ODN2...",Biological replicate 1,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR508/005...,,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR508/...,,
4,SRR6233765,SRP094756,Gene expression changes after microenvironment...,SRX3342207,"P493-6 cell line, a-IgM treated","P493-6 cell line, a-IgM treated",9606,Homo sapiens,P493-6 a-IgM replicate 1,RNA-Seq,...,B-cell,cells,"a-IgM F(ab)2 fragments (130ng/ml, Jackson Immu...",Biological replicate 1,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR623/005...,,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR623/...,,
5,SRR8658357,SRP187337,Panonychus citri Genome sequencing,SRX5456065,RNA-Seq of Panonychus citri: susceptible_3,RNA-Seq of Panonychus citri: susceptible_3,50023,Panonychus citri,Pc_SUS_3,RNA-Seq,...,,,,,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR865/007...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR865/007...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR865/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR865/...
6,SRR8658358,SRP187337,Panonychus citri Genome sequencing,SRX5456064,RNA-Seq of Panonychus citri: susceptible_2,RNA-Seq of Panonychus citri: susceptible_2,50023,Panonychus citri,Pc_SUS_2,RNA-Seq,...,,,,,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR865/008...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR865/008...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR865/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR865/...
7,SRR8658359,SRP187337,Panonychus citri Genome sequencing,SRX5456063,RNA-Seq of Panonychus citri: susceptible_1,RNA-Seq of Panonychus citri: susceptible_1,50023,Panonychus citri,Pc_SUS_1,RNA-Seq,...,,,,,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR865/009...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR865/009...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR865/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR865/...
8,SRR8749693,SRP188804,Panonychus citri Raw sequence reads,SRX5540672,RNA-Seq of Panonychus citri: susceptible_no-in...,RNA-Seq of Panonychus citri: susceptible_no-in...,50023,Panonychus citri,Pc_SUS_non1,RNA-Seq,...,,,,,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR874/003...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR874/003...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR874/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR874/...
9,SRR8749694,SRP188804,Panonychus citri Raw sequence reads,SRX5540671,RNA-Seq of Panonychus citri: susceptible_no-in...,RNA-Seq of Panonychus citri: susceptible_no-in...,50023,Panonychus citri,Pc_SUS_non2,RNA-Seq,...,,,,,,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR874/004...,http://ftp.sra.ebi.ac.uk/vol1/fastq/SRR874/004...,,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR874/...,era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR874/...


In [None]:
# Runs to drop after manual inspection of the metadata: duplicated SRRs (same study,
# different SRA entries), of which only the copy with better metadata is kept.
SRRs_to_remove = [
    'ERR2737479',
    'SRR8749694',
    'SRR8749695',
    'SRR8749693',
    'SRR5087405',
    'SRR6233765',
]

# Keep every row whose SRR is not on the removal list.
test_results_deduplicated = test_results.loc[~test_results['SRR'].isin(SRRs_to_remove)]

3. remove cellular organisms as per megan taxonomy

In [22]:
# Split the deduplicated contigs by whether their MEGAN taxonomy string contains 'cellular'.
# The mask is computed once so the two subsets are exact complements of each other.
# regex=False matches the literal text; na=False treats rows with a missing megan_tax as
# not cellular instead of letting NaN values break the boolean indexing.
is_cellular = test_results_deduplicated['megan_tax'].str.contains('cellular', regex=False, na=False)
test_results_non_cellular = test_results_deduplicated[~is_cellular]
test_results_cellular = test_results_deduplicated[is_cellular]

4. Remove problematic SRR6846476

In [23]:
# Exclude the problematic Singapore contigs, i.e. every row belonging to SRR6846476.
test_results_non_cellular_filtered = test_results_non_cellular[test_results_non_cellular['SRR'] != 'SRR6846476']

check numbers

In [24]:
# Report the number of unique contigs (qseqid) remaining after each filtering stage.
print("\n".join([
    "Contig counts report:",
    f"1. All contigs: {df['qseqid'].nunique()}",
    f"2. After removing control SRRs: {test_results['qseqid'].nunique()}",
    f"3. After removing duplicated SRRs: {test_results_deduplicated['qseqid'].nunique()}",
    f"4A. Non-cellular contigs: {test_results_non_cellular['qseqid'].nunique()}",
    f"4B. Cellular contigs: {test_results_cellular['qseqid'].nunique()}",
    f"5. Non-cellular without problematic SRR6846476: {test_results_non_cellular_filtered['qseqid'].nunique()}",
]))

Contig counts report:
1. All contigs: 3406
2. After removing control SRRs: 2567
3. After removing duplicated SRRs: 2549
4A. Non-cellular contigs: 2383
4B. Cellular contigs: 166
5. Non-cellular without problematic SRR6846476: 510


In [25]:
# Number of unique contigs in the final filtered set.
test_results_non_cellular_filtered['qseqid'].nunique()

510

5. make fasta files and .csv

In [26]:
# Build a {qseqid: sequence} dict for every filtering stage.
# all contigs
contigs_all = dict(zip(df['qseqid'], df['sequence']))
# test contigs (removed control SRRs)
contigs_all_test = dict(zip(test_results['qseqid'], test_results['sequence']))
# deduplicated contigs
contigs_all_deduplicated = dict(zip(test_results_deduplicated['qseqid'], test_results_deduplicated['sequence']))
# non-cellular contigs
contigs_non_cellular = dict(zip(test_results_non_cellular['qseqid'], test_results_non_cellular['sequence']))
# non-cellular without problematic SRR6846476
contigs_non_cellular_filtered = dict(
    zip(test_results_non_cellular_filtered['qseqid'], test_results_non_cellular_filtered['sequence'])
)

In [27]:
# define function for writing fasta from a dictionary
def write_fasta(seq_dict, output_file):
    """Write the entries of ``seq_dict`` to ``output_file`` in FASTA format.

    Each item is written as a header line (``>`` followed by the key) and one
    line holding the value. An existing file is overwritten.

    Parameters
    ----------
    seq_dict : dict
        Maps record id (str) to sequence (str); items are written in dict order.
    output_file : str or path-like
        Destination file. Its parent directory is created if it does not exist.

    Returns
    -------
    The ``output_file`` argument, unchanged.

    Raises
    ------
    TypeError
        If a key or value is not a string.
    """
    # Create the target directory first so a fresh checkout without it does not fail.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_file, 'w') as o:
        for key, val in seq_dict.items():
            o.write('>' + key + '\n')
            o.write(val + '\n')
    return output_file

In [30]:
# Write one FASTA file per filtering stage into `path`.
# Every file is written on each run; existing files are overwritten, not skipped.
path = '../data/contigs/'

fasta_outputs = {
    'contigs_all.fasta': contigs_all,
    'contigs_all_test.fasta': contigs_all_test,
    'contigs_all_deduplicated.fasta': contigs_all_deduplicated,
    'contigs_non_cellular.fasta': contigs_non_cellular,
    'contigs_non_cellular_filtered.fasta': contigs_non_cellular_filtered,
}

for file_name, seq_dict in fasta_outputs.items():
    output_file = path + file_name
    write_fasta(seq_dict, output_file)
    print(f"Wrote {output_file}")

Wrote ../data/contigs/contigs_all.fasta
Wrote ../data/contigs/contigs_all_test.fasta
Wrote ../data/contigs/contigs_all_deduplicated.fasta
Wrote ../data/contigs/contigs_non_cellular.fasta
Wrote ../data/contigs/contigs_non_cellular_filtered.fasta
