# Create a Krona plot for all viruses across all cells, grouped by animal and timepoint

In [36]:
# Project identifier used as the name of the per-project data folder
# (presumably an NCBI BioProject accession -- confirm).
bp = "PRJNA665227"
# Folder holding this project's data
fastq_folder = "/home/laura/projects/virus-watch-data/" + bp
# CSV mapping virus IDs to their taxonomy
u_tax = "/home/laura/projects/virus-watch-data/virus_ref/u_tax_nodup_clu.csv"

In [37]:
import anndata
import pandas as pd
import numpy as np

def nd(arr):
    """
    Flatten a numpy matrix (or any array-like) into a 1-D ndarray.

    The input is first converted with np.asarray and then reshaped to a
    single dimension, so e.g. a (1, n) matrix becomes an array of length n.
    """
    as_array = np.asarray(arr)
    return as_array.reshape(-1)

# Absolute path to Krona's ktImportText executable. It is invoked as a shell
# command (`!$ktImportText ...`) in the "Generate Krona html" section to turn
# the tab-separated counts/taxonomy text file into an HTML plot.
ktImportText = "/home/laura/bin/ktImportText"

___
# Load data

In [38]:
# Load the virus count matrix together with its per-cell metadata.
# `anndata.read` is a deprecated alias; `read_h5ad` is the supported reader
# for .h5ad files and returns the same AnnData object.
palmdb_adata = anndata.read_h5ad(f"{fastq_folder}/host_flag/virus_cdna_dna_masked_meta.h5ad")
palmdb_adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 225898 × 11176
    obs: 'sample_barcode', 'srr', 'Run', 'donor_animal', 'Experiment', 'mdck_spike_in', 'hours_post_innoculation', 'day_post_infection', 'dpi', 'dpi_accessions', 'dpi_clean', 'leiden', 'celltype_clusters', 'celltype', 'species', 'dpi_clean_merged'
    var: 'v_type'

Only keep macaque viruses and cells that passed QC:

In [39]:
# Row mask: cells that carry a cell type annotation
has_celltype = palmdb_adata.obs["celltype"].notnull()
# Column mask: viruses whose v_type is anything other than "below_threshold"
above_threshold = palmdb_adata.var["v_type"] != "below_threshold"

# Subset on both axes and copy so the result is a standalone AnnData, not a view
palmdb_adata = palmdb_adata[has_celltype, above_threshold].copy()
palmdb_adata

AnnData object with n_obs × n_vars = 195399 × 80
    obs: 'sample_barcode', 'srr', 'Run', 'donor_animal', 'Experiment', 'mdck_spike_in', 'hours_post_innoculation', 'day_post_infection', 'dpi', 'dpi_accessions', 'dpi_clean', 'leiden', 'celltype_clusters', 'celltype', 'species', 'dpi_clean_merged'
    var: 'v_type'

Load ID to phylogeny mapping:

In [40]:
# Read the ID-to-taxonomy table, drop the columns not needed here
# ("ID" and "strandedness") and collapse the resulting duplicate taxonomy rows
id2tax = (
    pd.read_csv(u_tax)
    .drop(columns=["ID", "strandedness"])
    .drop_duplicates()
)

id2tax

Unnamed: 0,rep_ID,phylum,class,order,family,genus,species
0,u1,Pisuviricota,Pisoniviricetes,Nidovirales,Coronaviridae,Betacoronavirus,Severe acute respiratory syndrome-related coro...
409,u10,Negarnaviricota,Monjiviricetes,Mononegavirales,Filoviridae,Ebolavirus,Zaire ebolavirus
475,u100,Kitrinoviricota,Flasuviricetes,Amarillovirales,Flaviviridae,Flavivirus,West Nile virus
590,u102,Kitrinoviricota,Alsuviricetes,Hepelivirales,Hepeviridae,Orthohepevirus,Orthohepevirus A
909,u113,Negarnaviricota,Monjiviricetes,Mononegavirales,Paramyxoviridae,Morbillivirus,Measles morbillivirus
...,...,...,...,...,...,...,...
296556,u296608,.,.,.,.,.,.
296557,u296609,.,.,.,.,.,.
296558,u296613,.,.,.,.,.,.
296559,u296616,.,.,.,.,.,.


___
# Generate Krona html

In [43]:
%%time
master = pd.DataFrame()
for idx, timepoint in enumerate(palmdb_adata.obs["dpi_clean_merged"].unique()):
    for animal_id in palmdb_adata.obs[palmdb_adata.obs["dpi_clean_merged"] == timepoint]["donor_animal"].unique():
        adata_temp = palmdb_adata[(palmdb_adata.obs["dpi_clean_merged"] == timepoint) & (palmdb_adata.obs["donor_animal"] == animal_id), :]

        # Add total number of counts (across all cells) for each virus ID to phylogeny data temp
        virus_ids = adata_temp.var.index.values
        total_counts = nd(adata_temp.X.sum(axis=0))
        total_count_dict = {virus_ids[i]: total_counts[i] for i in range(len(virus_ids))}

        phylogeny_data_temp = id2tax.copy()
        phylogeny_data_temp['total_count'] = phylogeny_data_temp['rep_ID'].map(total_count_dict)
        
        # Drop viruses not in filter list
        phylogeny_data_temp = phylogeny_data_temp.dropna()

        # Remove non-relevent columns and change order of columns  
        phylogeny_data_temp = phylogeny_data_temp[["total_count", "phylum", "class", "order", "family", "genus", "species", "rep_ID"]]

        # Replace dots with NaN
        phylogeny_data_temp = phylogeny_data_temp.replace(".", np.nan)

        # Add column with timepoint
        phylogeny_data_temp["timepoint"] = timepoint

        # Add column with animal id
        phylogeny_data_temp["animal_id"] = animal_id

        # Append to master dataframe
        if idx == 0:
            master = phylogeny_data_temp.copy()
        else:
            master = pd.concat([master, phylogeny_data_temp])

# Save counts + taxnomomies data to txt
master.to_csv(f'krona.txt', sep ='\t', header=None, index=False)

# Generate Krona plot
krona_out = "krona.html"
!$ktImportText krona.txt -o $krona_out -n "Virus-positive cells"

Writing krona.html...
CPU times: user 4.05 s, sys: 0 ns, total: 4.05 s
Wall time: 4.69 s


In [42]:
# List the current working directory in long format with human-readable sizes.
# NOTE(review): this cell's execution count (In [42]) is lower than that of the
# Krona cell above it (In [43]), i.e. it was run before that cell's last run --
# its listing may not reflect the freshly written krona.txt / krona.html.
!ls -lh

total 149M
-rw-rw-r-- 1 laura laura  160K Apr  3  2023 1_data_download.ipynb
-rw-rw-r-- 1 laura laura   34K Oct 18 20:34 2.1_align_HostGenomeTranscriptome_noGTF.ipynb
-rw-rw-r-- 1 laura laura  154K Oct 28 19:18 2.1_align_remove_reads_bus_AlignSRRindividually.ipynb
-rw-rw-r-- 1 laura laura  287K Oct 31 07:34 2.1_align_remove_reads_bus+d-list.ipynb
-rw-rw-r-- 1 laura laura  251K Oct 28 13:35 2.1_align_remove_reads_bus.ipynb
-rw-rw-r-- 1 laura laura   26K Oct 27 14:43 2.1_align_remove_reads_bus_testing.ipynb
-rw-rw-r-- 1 laura laura   50K Oct 18 07:59 2.1_align_virus_host_transcriptome+genome+ambigious.ipynb
-rw-rw-r-- 1 laura laura   54K Oct 16 11:15 2.1_align_virus_host_transcriptome+genome_CheckEachKmer.ipynb
-rw-rw-r-- 1 laura laura   11K Oct 18 16:45 2.1_align_virus_host_transcriptome+genome_DlistAll.ipynb
-rw-rw-r-- 1 laura laura   43K Oct  7 11:42 2.1_align_virus_host_transcriptome+genome.ipynb
-rw-rw-r-- 1 laura laura   48K Oct  4 16:37 2.1_align_virus_host_transcriptome.ipynb
-rw