#### steps overview:
0. import packages and data (Snakemake final output table)
1. remove control SRRs (keep only selected 253 SRRs (config/samples_test.tsv))
2. check for duplicated contigs (same study, different SRA entries), keep only one copy (with better metadata)
3. remove cellular organisms as per megan taxonomy
4. remove problematic SRR6846476, then check numbers
5. make fasta files

In [1]:
# import packages
import pandas as pd
import os

In [2]:
# Load the combined Snakemake/MEGAN6 results table (first column is the index).
df = pd.read_csv('/home/tobamo/analize/project-tobamo/results/megan6_results_combined.csv', index_col=0)

# Load the list of selected samples (253 selected SRRs); the first line of the
# file is a header and is skipped.
samples_path = '/home/tobamo/analize/project-tobamo/config/samples_test.tsv'
with open(samples_path) as handle:
    sample_lines = handle.readlines()
samples = [entry.strip() for entry in sample_lines[1:]]

1. remove control SRRs

In [3]:
# Keep only rows whose SRR is in the selected-samples list; control SRRs are dropped.
in_selected_samples = df['SRR'].isin(samples)
test_results = df[in_selected_samples]

2. check for duplicated contigs (same study, different SRA entries), keep only one copy (with better metadata)

In [1]:
# For every distinct sequence, gather the unique qseqid values that carry it.
seq_to_qseqid = test_results.groupby('sequence')['qseqid'].unique()

# A sequence is "duplicated" when more than one qseqid shares it.
qseqids_per_seq = seq_to_qseqid.map(len)
duplicated_seqs = seq_to_qseqid[qseqids_per_seq > 1]

# Flatten the per-sequence qseqid arrays into a single set.
duplicated_qseqids = {contig_id for contig_ids in duplicated_seqs for contig_id in contig_ids}

# SRRs that contributed at least one of the duplicated contigs.
has_duplicated_qseqid = test_results['qseqid'].isin(duplicated_qseqids)
srr_list = test_results.loc[has_duplicated_qseqid, 'SRR'].unique()

# Report.
print(f"Number of sequences shared by multiple qseqid: {len(duplicated_seqs)}")
print(f"Number of qseqid involved: {len(duplicated_qseqids)}")
print("qseqid involved in duplicated sequences:")
print(duplicated_qseqids)
print('SRRs with duplicated qseqids:')
print(srr_list)

NameError: name 'test_results' is not defined

**Manual inspection required:** look up the SRRs listed above online, or fetch their metadata with SRAweb.

In [None]:
# Fetch SRA metadata for the SRRs so duplicated entries can be inspected manually.
# NOTE(review): this cell has no execution count, so it was apparently not run
# in the saved session.

# Get unique SRR values
# NOTE(review): every other cell reads the 'SRR' column of `df`; verify that a
# 'Corresponding SRR' column actually exists. This assignment also rebinds
# `srr_list`, which the preceding cell set to the SRRs with duplicated qseqids.
srr_list = df['Corresponding SRR'].unique().tolist()

# Initialize SRAweb
# NOTE(review): `SRAweb` is not imported in the notebook's import cell —
# presumably pysradb's SRAweb; confirm and add the import there.
db = SRAweb()

# Fetch metadata for all SRRs
metadata = db.sra_metadata(srr_list, detailed=True)

In [21]:
# After manual metadata inspection: SRRs that duplicate another entry of the
# same study (different SRA entries). Only one copy per study is kept (the one
# with better metadata), so the SRRs below are removed.
# NOTE(review): 'SRR8749695' was listed twice in the original list; the
# redundant entry is dropped here (the filter result is unchanged). Confirm that
# the second occurrence was not meant to be a different accession.
SRRs_to_remove = [
    'ERR2737479',
    'SRR8749694',
    'SRR8749695',
    'SRR8749693',
    'SRR5087405',
    'SRR6233765',
]

# Drop every row that belongs to one of the SRRs above.
is_removed_srr = test_results['SRR'].isin(SRRs_to_remove)
test_results_deduplicated = test_results[~is_removed_srr]

3. remove cellular organisms as per megan taxonomy

In [22]:
# Split the deduplicated table by whether the MEGAN taxonomy string contains
# 'cellular'; the mask is computed once and reused for both halves.
is_cellular = test_results_deduplicated['megan_tax'].str.contains('cellular')
test_results_non_cellular = test_results_deduplicated[~is_cellular]
test_results_cellular = test_results_deduplicated[is_cellular]

4. Remove problematic SRR6846476

In [23]:
# Drop all contigs that come from the problematic Singapore run SRR6846476.
is_problematic_run = test_results_non_cellular['SRR'] == 'SRR6846476'
test_results_non_cellular_filtered = test_results_non_cellular[~is_problematic_run]

check numbers

In [24]:
# Number of unique contigs (qseqid) remaining after each filtering stage.
count_stages = [
    ("1. All contigs", df),
    ("2. After removing control SRRs", test_results),
    ("3. After removing duplicated SRRs", test_results_deduplicated),
    ("4A. Non-cellular contigs", test_results_non_cellular),
    ("4B. Cellular contigs", test_results_cellular),
    ("5. Non-cellular without problematic SRR6846476", test_results_non_cellular_filtered),
]

print("Contig counts report:")
for stage_label, stage_frame in count_stages:
    print(f"{stage_label}: {stage_frame.qseqid.nunique()}")

Contig counts report:
1. All contigs: 3406
2. After removing control SRRs: 2567
3. After removing duplicated SRRs: 2549
4A. Non-cellular contigs: 2383
4B. Cellular contigs: 166
5. Non-cellular without problematic SRR6846476: 510


In [25]:
# Unique contig count in the final filtered table.
test_results_non_cellular_filtered['qseqid'].nunique()

510

5. make fasta files and .csv

In [26]:
def _contig_map(frame):
    """Return a {qseqid: sequence} dict built from the rows of *frame*.

    If a qseqid occurs in several rows, the sequence of the last row wins.
    """
    return dict(frame.filter(['qseqid', 'sequence']).values)


# One qseqid -> sequence mapping per filtering stage.
contigs_all = _contig_map(df)  # all contigs
contigs_all_test = _contig_map(test_results)  # test contigs (removed control SRRs)
contigs_all_deduplicated = _contig_map(test_results_deduplicated)  # deduplicated contigs
contigs_non_cellular = _contig_map(test_results_non_cellular)  # non-cellular contigs
contigs_non_cellular_filtered = _contig_map(test_results_non_cellular_filtered)  # non-cellular without problematic SRR6846476

In [27]:
# define function for writing fasta from a dictionary
def write_fasta(seq_dict, output_file):
    """Write the entries of a dictionary to a FASTA file.

    Each key becomes a header line (prefixed with '>') and its value is
    written on the following line. The file is opened in 'w' mode, so any
    existing content is replaced.

    Parameters:
        seq_dict: mapping of record name (str) to sequence (str).
        output_file: path of the file to write.

    Returns:
        The ``output_file`` argument, unchanged.
    """
    with open(output_file, 'w') as fasta_handle:
        for record_name, sequence in seq_dict.items():
            fasta_handle.write(''.join(('>', record_name, '\n')))
            fasta_handle.write(''.join((sequence, '\n')))
    return output_file

In [30]:
# Write one FASTA file per filtering stage. Existing files are overwritten
# (write_fasta opens its target in 'w' mode), so the outputs always reflect the
# tables built above.
path = '../data/contigs/'

# open(..., 'w') does not create missing directories, so make sure the output
# directory exists before writing.
os.makedirs(path, exist_ok=True)

# Output file name -> contig dictionary to write.
fasta_outputs = {
    'contigs_all.fasta': contigs_all,
    'contigs_all_test.fasta': contigs_all_test,
    'contigs_all_deduplicated.fasta': contigs_all_deduplicated,
    'contigs_non_cellular.fasta': contigs_non_cellular,
    'contigs_non_cellular_filtered.fasta': contigs_non_cellular_filtered,
}

for file_name, seq_dict in fasta_outputs.items():
    output_file = f'{path}{file_name}'
    write_fasta(seq_dict, output_file)
    print(f"Wrote {output_file}")

Wrote ../data/contigs/contigs_all.fasta
Wrote ../data/contigs/contigs_all_test.fasta
Wrote ../data/contigs/contigs_all_deduplicated.fasta
Wrote ../data/contigs/contigs_non_cellular.fasta
Wrote ../data/contigs/contigs_non_cellular_filtered.fasta
