In [37]:
import os
import json
import gzip
import subprocess
from collections import defaultdict
import numpy as np
from scipy.stats import gmean


dashboard = os.path.expanduser("~/code/mgs-pipeline/dashboard/")

with open(os.path.join(dashboard, "human_virus_sample_counts.json")) as inf:
    human_virus_sample_counts = json.load(inf)

with open(os.path.join(dashboard, "metadata_samples.json")) as inf:
    metadata_samples = json.load(inf)

with open(os.path.join(dashboard, "metadata_bioprojects.json")) as inf:
    metadata_bioprojects = json.load(inf)

with open(os.path.join(dashboard, "metadata_papers.json")) as inf:
    metadata_papers = json.load(inf)

studies = list(metadata_papers.keys())


def start():
    human_virus_shares = []
    zero_counts = 0

    for study in studies:
        # Dropping studies that aren't WTP based
        if study in [
            "Johnson 2023",  # unpublished data
            "Cui 2023",  # untreated undigested sludge
            "Wang 2022",  # COVID-19 hospital wastewater
            "Petersen 2015",  # air plane waste
            "Hendriksen 2019",  # man hole"
            "Moritz 2019",  # university wastewater
            "Wu 2020",  # lung sample
            "Fierer 2022",  # university campus
        ]:
            continue

        for bioproject in metadata_papers[study]["projects"]:
            samples = metadata_bioprojects[bioproject]

            if study == "Bengtsson-Palme 2016":
                samples = [
                    sample
                    for sample in samples
                    if metadata_samples[sample]["fine_location"].startswith(
                        "Inlet"
                    )
                ]

            if study == "Ng 2019":
                samples = [
                    sample
                    for sample in samples
                    if metadata_samples[sample]["fine_location"] == "Influent"
                ]

            for sample in samples:
                if metadata_samples[sample].get("enrichment") == "panel":
                    continue

                humanreads = "%s.humanviruses.tsv" % sample

                if not os.path.exists(f"humanviruses/{humanreads}"):
                    subprocess.check_call(
                        [
                            "aws",
                            "s3",
                            "cp",
                            "s3://nao-mgs/%s/humanviruses/%s"
                            % (bioproject, humanreads),
                            "humanviruses/",
                        ]
                    )

                with open(f"humanviruses/{humanreads}") as inf:
                    human_virus_reads = 0
                    for line in inf:
                        (
                            line_taxid,
                            clade_assignments,
                            _,
                        ) = line.strip().split("\t")
                        clade_hits = int(clade_assignments)
                        human_virus_reads += int(clade_hits)
                    human_virus_relative_abundance = (
                        human_virus_reads / metadata_samples[sample]["reads"]
                    )

                    # print (human_virus_relative_abundance)

                    if human_virus_relative_abundance == 0.0:
                        zero_counts += 1
                        continue

                    human_virus_shares.append(human_virus_relative_abundance)
    # Dropping all zeros from human_virus_shares

    perc_zero_human_read_samples = round(
        (zero_counts / len(human_virus_shares)) * 100, 2
    )

    gmean_viral_share = round(gmean(human_virus_shares), 7)

    return f"When dropping {zero_counts} samples without human reads ({perc_zero_human_read_samples}% of all samples), the geometric mean of samples' human read share is {gmean_viral_share * 100}% \n Put differently 1 in {round(1 / gmean_viral_share)} reads in the wastewater samples are human reads"


if __name__ == "__main__":
    print(start())

When dropping 62 samples without human reads (3.71% of all samples), the geometric mean of samples' human read share is 0.00025% 
 Put differently 1 in 400000 reads in the wastewater samples are human reads


In [38]:
def start():

    bengtsson_virus_shares = []
    ng_virus_shares = []
    
    virus_taxid = 10239

    for study in studies:
        # Dropping studies that aren't WTP based
        if study not in [
            "Bengtsson-Palme 2016", 
            "Ng 2019", ]:
            continue

        for bioproject in metadata_papers[study]["projects"]:
            samples = metadata_bioprojects[bioproject]

            if study == "Bengtsson-Palme 2016":
                samples = [
                    sample
                    for sample in samples
                    if metadata_samples[sample]["fine_location"].startswith(
                        "Inlet"
                    )
                ]

            if study == "Ng 2019":
                samples = [
                    sample
                    for sample in samples
                    if metadata_samples[sample]["fine_location"] == "Influent"
                ]
                           

            for sample in samples:
                cladecounts = "%s.tsv.gz" % sample
                with gzip.open(f"cladecounts/{cladecounts}") as inf:

                    for line in inf:
                        (
                            line_taxid,
                            _,
                            _,
                            clade_assignments,
                            _,
                        ) = line.strip().split()
                        taxid = int(line_taxid)
                        clade_hits = int(clade_assignments)
                        if taxid == virus_taxid:
                            if study == "Bengtsson-Palme 2016":
                                bengtsson_virus_shares.append(clade_hits / metadata_samples[sample]["reads"])
                            if study == "Ng 2019":
                                ng_virus_shares.append(clade_hits / metadata_samples[sample]["reads"])
                
                
            
    

    perc_gmean_bengtsson = round(gmean(bengtsson_virus_shares) * 100, 8)
    perc_gmean_ng = round(gmean(ng_virus_shares) * 100, 8)
    
    return f"The geometric mean of samples' viral read share is {perc_gmean_bengtsson}% for Bengtsson-Palme 2016 and {perc_gmean_ng}% for Ng 2019 \n In other words, 1 in {round(1 / (perc_gmean_bengtsson / 100), 2)} reads in Bengtsson-Palme 2016 and 1 in {round(1 / (perc_gmean_ng / 100), 2)} reads in Ng 2019 is a viral read."

if __name__ == "__main__":
    print(start())

The geometric mean of samples' viral read share is 0.01525929% for Bengtsson-Palme 2016 and 0.00904169% for Ng 2019 
 In other words, 1 in 6553.38 reads in Bengtsson-Palme 2016 and 1 in 11059.88 reads in Ng 2019 is a viral read.


In [39]:
def start():

    bengtsson_virus_shares = []
    yang_virus_shares = []
    
    virus_taxid = 10239

    for study in studies:
        # Dropping studies that aren't WTP based
        if study not in [
            "Bengtsson-Palme 2016", 
            "Yang 2020", ]:
            continue

        for bioproject in metadata_papers[study]["projects"]:
            samples = metadata_bioprojects[bioproject]

            if study == "Bengtsson-Palme 2016":
                samples = [
                    sample
                    for sample in samples
                    if metadata_samples[sample]["fine_location"].startswith(
                        "Inlet"
                    )
                ]


            for sample in samples:
                humanreads = "%s.humanviruses.tsv" % sample

                if not os.path.exists(f"humanviruses/{humanreads}"):
                    subprocess.check_call(
                        [
                            "aws",
                            "s3",
                            "cp",
                            "s3://nao-mgs/%s/humanviruses/%s"
                            % (bioproject, humanreads),
                            "humanviruses/",
                        ]
                    )

                with open(f"humanviruses/{humanreads}") as inf:
                    human_virus_reads = 0
                    for line in inf:
                        (
                            line_taxid,
                            clade_assignments,
                            _,
                        ) = line.strip().split("\t")
                        clade_hits = int(clade_assignments)
                        human_virus_reads += int(clade_hits)
                    human_virus_relative_abundance = (
                        human_virus_reads / metadata_samples[sample]["reads"]
                    )

                    # print (human_virus_relative_abundance)

                    if human_virus_relative_abundance == 0.0:

                        continue

                    if study == "Bengtsson-Palme 2016":
                        bengtsson_virus_shares.append(human_virus_relative_abundance)
                    if study == "Yang 2020":
                        yang_virus_shares.append(human_virus_relative_abundance)

                    
            
    

    share_gmean_bengtsson = gmean(bengtsson_virus_shares)
    share_gmean_yang = gmean(yang_virus_shares)
    
    return f"1 in {round(1 / share_gmean_bengtsson, 2)} reads in Bengtsson-Palme 2016 and 1 in {round(1 / share_gmean_yang, 2)} reads in Yang 2020 is a human-infecting virus read."


if __name__ == "__main__":
    print(start())

1 in 8324122.38 reads in Bengtsson-Palme 2016 and 1 in 1567.9 reads in Yang 2020 is a human-infecting virus read.


Creating SI table of human virus and all virus abundance.

In [50]:
def start():
    study_abundances = {}

    virus_taxid = 10239

    for study in studies:
        # Dropping studies that aren't WTP based
        if study in [
            "Johnson 2023",  # unpublished data
            "Cui 2023",  # untreated undigested sludge
            "Wang 2022",  # COVID-19 hospital wastewater
            "Petersen 2015",  # air plane waste
            "Hendriksen 2019",  # man hole
            "Moritz 2019",  # university wastewater
            "Wu 2020",  # lung sample
            "Fierer 2022",  # university campus
        ]:
            continue

       
        for bioproject in metadata_papers[study]["projects"]:
            samples = metadata_bioprojects[bioproject]

            if study == "Bengtsson-Palme 2016":
                samples = [
                    sample
                    for sample in samples
                    if metadata_samples[sample]["fine_location"].startswith(
                        "Inlet"
                    )
                ]

            if study == "Ng 2019":
                samples = [
                    sample
                    for sample in samples
                    if metadata_samples[sample]["fine_location"] == "Influent"
                ]

            
            for sample in samples:
                if metadata_samples[sample].get("enrichment") == "panel":
                    continue

    
                cladecounts = "%s.tsv.gz" % sample
                with gzip.open(f"cladecounts/{cladecounts}") as inf:
                    

                    for line in inf:
                        (
                            line_taxid,
                            _,
                            _,
                            clade_assignments,
                            _,
                        ) = line.strip().split()
                        taxid = int(line_taxid)
                        clade_hits = int(clade_assignments)
                        if taxid == virus_taxid:        
                            if study not in study_abundances:
                                study_abundances[study] = [[], []] # Initialize lists for the study if not already present
                            study_abundances[study][1].append(clade_hits /metadata_samples[sample]["reads"])
                            
            
                
                humanreads = "%s.humanviruses.tsv" % sample

                if not os.path.exists(f"humanviruses/{humanreads}"):
                    subprocess.check_call(
                        [
                            "aws",
                            "s3",
                            "cp",
                            "s3://nao-mgs/%s/humanviruses/%s"
                            % (bioproject, humanreads),
                            "humanviruses/",
                        ]
                    )

                with open(f"humanviruses/{humanreads}") as inf:
                    human_virus_reads = 0
                    for line in inf:
                        (
                            line_taxid,
                            clade_assignments,
                            _,
                        ) = line.strip().split("\t")
                        clade_hits = int(clade_assignments)
                        human_virus_reads += int(clade_hits)
                    human_virus_relative_abundance = (
                        human_virus_reads / metadata_samples[sample]["reads"]
                    )


                    if human_virus_relative_abundance > 0.0:
                        study_abundances[study][0].append(human_virus_relative_abundance) 



    # Create the table
    gmean_human_virus_shares = []
    gmean_all_virus_shares = []
    table = [("Study", "Geometric mean of human virus shares", "Geometric mean of all virus shares")]
    

    for study, abundances in study_abundances.items():
        gmean_human_virus_shares.append(gmean(abundances[0]))
        gmean_all_virus_shares.append(gmean(abundances[1]))
        table.append(
            (study,
             gmean(abundances[0]), 
             gmean(abundances[1]),
            )
        )

    table.append(("All studies", gmean(gmean_human_virus_shares), gmean(gmean_all_virus_shares)))



    return table

def format_number(num):
    ROUNDING_DIGITS = 2
    if "e" in "{:.9e}".format(num):
        base, exponent = "{:.9e}".format(num).split("e")
        rounded_base = round(float(base), ROUNDING_DIGITS)
        return "{} * 10^{}".format(rounded_base, int(exponent))
    else:
        return str(num)

if __name__ == "__main__":
    table = start()


    formatted_table = []
    for row in table:
        formatted_row = []
        for cell in row:
            if isinstance(cell, float):
                formatted_row.append(format_number(cell))
            else:
                formatted_row.append(str(cell))
        formatted_table.append("\t".join(formatted_row))
    print("\n".join(formatted_table))

    

Study	Geometric mean of human virus shares	Geometric mean of all virus shares
Bengtsson-Palme 2016	1.2 * 10^-7	1.53 * 10^-4
Brinch 2020	2.24 * 10^-6	3.59 * 10^-3
Brumfield 2022	1.52 * 10^-6	5.27 * 10^-3
Crits-Christoph 2021	5.01 * 10^-6	5.37 * 10^-3
Maritz 2019	9.23 * 10^-7	8.49 * 10^-4
McCall 2023	1.82 * 10^-6	6.71 * 10^-4
Munk 2022	2.73 * 10^-6	2.42 * 10^-3
Ng 2019	4.02 * 10^-7	9.04 * 10^-5
Rothman 2021	2.43 * 10^-6	4.2 * 10^-2
Spurbeck 2023	1.36 * 10^-6	8.38 * 10^-5
Yang 2020	6.38 * 10^-4	2.08 * 10^-1
All studies	2.26 * 10^-6	1.93 * 10^-3
