<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/create_optimized_palmdb/2_create_RdRP_t2g.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# To reduce loss due to the multimapping of sequences, we will create a new t2g file that groups virus IDs with the same taxonomy
The resulting structure is similar to how transcripts of the same gene are handled.

In [None]:
import pandas as pd
import numpy as np

Load virus ID to taxonomy mapping as generated in the [previous notebook](https://github.com/pachterlab/LSCHWCP_2023/blob/main/Notebooks/create_optimized_palmdb/1_remove_cfc_duplicates.ipynb).

In [None]:
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/create_optimized_palmdb/u_tax_noduplicates.tsv
u_tax = pd.read_csv("u_tax_noduplicates.tsv", sep="\t")
u_tax

In [None]:
# Build one row per distinct taxonomy, holding the list of virus IDs that share it
# ("ID") and a representative ID for the group ("rep_ID").
tax_cols = ["phylum", "class", "order", "family", "genus", "species"]
u_tax_nolabel = u_tax[tax_cols]

# Collect the IDs of all rows sharing the same taxonomy (groups are sorted by taxonomy,
# IDs within a group keep their order of appearance in u_tax)
dup_rows = (
    u_tax.groupby(tax_cols)["Label"]
    .agg(list)
    .reset_index(name="label_list")
)

# IDs without any taxonomy (all ranks = ".") should not be grouped:
# split that group into one single-ID row per ID.
# Selecting by value instead of by position keeps real groups intact if no such row exists.
is_unclassified = dup_rows[tax_cols].eq(".").all(axis=1)
unclassified = dup_rows[is_unclassified].explode("label_list")
unclassified["label_list"] = [[label] for label in unclassified["label_list"]]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
dup_rows = pd.concat([dup_rows[~is_unclassified], unclassified])

# Move the ID list to the first position and add the representative ID
# (first ID of each group) as the second column
dup_rows = dup_rows[["label_list"] + tax_cols].rename(columns={"label_list": "ID"})
dup_rows.insert(1, "rep_ID", dup_rows["ID"].str[0])

# Sort by the integer obtained from the representative ID after removing "u"
dup_rows = dup_rows.sort_values(
    "rep_ID",
    key=lambda ids: ids.str.replace("u", "", regex=False).astype(int),
).reset_index(drop=True)

dup_rows

Add virus strandedness to the virus-ID-to-sOTU mapping:

In [None]:
# Strandedness label for every phylum that maps to a single label
phylum_to_strandedness = {
    "Negarnaviricota": "-ssRNA",
    "Kitrinoviricota": "+ssRNA",
    "Lenarviricota": "+ssRNA",
    "Duplornaviricota": "dsRNA",
    "Artverviricota": "ssRNA-RT or dsDNA-RT",
    "Nucleocytoviricota": "NCLDV",
    "Peploviricota": "dsDNA",
    "Uroviricota": "tailed bacteriophage",
}

# Pisuviricota is the only phylum whose label depends on the class
pisuviricota_class_to_strandedness = {
    "Pisoniviricetes": "+ssRNA",
    "Stelpaviricetes": "+ssRNA",
    "Duplopiviricetes": "dsRNA",
}

# Any other Pisuviricota class is ambiguous; any other phylum is "unknown"
virus_types = [
    pisuviricota_class_to_strandedness.get(tax_class, "+ssRNA or dsRNA")
    if tax_phylum == "Pisuviricota"
    else phylum_to_strandedness.get(tax_phylum, "unknown")
    for tax_phylum, tax_class in zip(dup_rows["phylum"].values, dup_rows["class"].values)
]

dup_rows["strandedness"] = virus_types
dup_rows

In [None]:
# Unpack the ID lists so that every virus ID gets its own row (with a fresh 0..n-1 index);
# all other columns, including rep_ID, are repeated for each ID of a group
dup_rows = dup_rows.explode("ID", ignore_index=True)

Save new ID2tax map:

In [None]:
# Write the ID -> taxonomy table (one row per virus ID) as CSV without the row index
new_u_tax_file = "ID_to_taxonomy_mapping.csv"
dup_rows.to_csv(path_or_buf=new_u_tax_file, index=False)

# Create corresponding t2g (transcripts to genes) file:

In [None]:
# Path of the t2g file to write: one tab-separated "ID<TAB>rep_ID" line per virus ID
new_t2g = "palmdb_clustered_t2g.txt"

In [None]:
%%time
with open(new_t2g, "w") as t2g:
    for rep_id in dup_rows["rep_ID"].unique():
        for group_id in dup_rows[dup_rows["rep_ID"]==rep_id]["ID"].values:
            t2g.write(group_id + "\t" + rep_id + "\n")