# To reduce the loss due to multimapping, we create a new t2g file in which all IDs that share the same taxonomy are grouped under a single representative ID

In [174]:
import pandas as pd
import numpy as np

Create a new ID2tax map in which IDs with identical taxonomies are grouped:

In [175]:
# Tab-separated ID -> taxonomy table. The displayed frame has a "Label"
# column (the ID) followed by one column per taxonomic rank
# (phylum, class, order, family, genus, species).
u_tax_file = "/home/laura/projects/virus-watch-data/virus_ref/u_tax_noduplicates.tsv"

u_tax = pd.read_csv(u_tax_file, sep="\t")
# Bare expression: shows the loaded table as the notebook cell output
u_tax

Unnamed: 0,Label,phylum,class,order,family,genus,species
0,u1,Pisuviricota,Pisoniviricetes,Nidovirales,Coronaviridae,Betacoronavirus,Severe acute respiratory syndrome-related coro...
1,u10,Negarnaviricota,Monjiviricetes,Mononegavirales,Filoviridae,Ebolavirus,Zaire ebolavirus
2,u100,Kitrinoviricota,Flasuviricetes,Amarillovirales,Flaviviridae,Flavivirus,West Nile virus
3,u1000,Negarnaviricota,Monjiviricetes,Mononegavirales,Paramyxoviridae,Henipavirus,Hendra henipavirus
4,u10000,Pisuviricota,Pisoniviricetes,Picornavirales,Caliciviridae,Norovirus,Norwalk virus
...,...,...,...,...,...,...,...
296556,u99995,.,.,.,.,.,.
296557,u99996,Lenarviricota,Howeltoviricetes,Cryppavirales,Mitoviridae,.,.
296558,u99997,.,.,.,.,.,.
296559,u99998,.,.,.,.,.,.


In [176]:
# Taxonomic ranks that together define a group: IDs whose values agree on all
# six ranks are collapsed onto one representative ID.
tax_cols = ["phylum", "class", "order", "family", "genus", "species"]
u_tax_nolabel = u_tax[tax_cols]

# For every distinct taxonomy, collect the list of IDs that carry it.
# Within a group the IDs keep the row order they have in u_tax.
dup_rows = u_tax.groupby(tax_cols)["Label"].agg(list).reset_index(name="ID")

# IDs without any taxonomy (all ranks == ".") must not be grouped with each
# other, so that group is split back into one single-ID row per ID.
# The group is selected by its content rather than by assuming it is the
# first row, so nothing is split by mistake if no such group exists.
is_unclassified = (dup_rows[tax_cols] == ".").all(axis=1)
dup_rows_exploded = dup_rows[is_unclassified].explode("ID")
dup_rows_exploded["ID"] = [[label] for label in dup_rows_exploded["ID"]]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
dup_rows = pd.concat([dup_rows[~is_unclassified], dup_rows_exploded])

# The first ID of each group acts as the representative ID of that group
dup_rows["rep_ID"] = [labels[0] for labels in dup_rows["ID"]]

# Move the ID columns in front of the taxonomy columns
dup_rows = dup_rows[["ID", "rep_ID"] + tax_cols]

# Sort by the numeric part of the representative ID ("u10" -> 10) so that
# u2 comes before u10, then renumber the rows from 0
dup_rows = dup_rows.sort_values(
    "rep_ID",
    key=lambda ids: ids.str.replace("u", "", regex=False).astype(int),
).reset_index(drop=True)

dup_rows

Unnamed: 0,ID,rep_ID,phylum,class,order,family,genus,species
0,"[u1, u10066, u10121, u10124, u102609, u102640,...",u1,Pisuviricota,Pisoniviricetes,Nidovirales,Coronaviridae,Betacoronavirus,Severe acute respiratory syndrome-related coro...
1,"[u10, u103032, u1124, u11800, u12185, u13484, ...",u10,Negarnaviricota,Monjiviricetes,Mononegavirales,Filoviridae,Ebolavirus,Zaire ebolavirus
2,"[u100, u10093, u10152, u10300, u10328, u10339,...",u100,Kitrinoviricota,Flasuviricetes,Amarillovirales,Flaviviridae,Flavivirus,West Nile virus
3,"[u102, u10330, u10369, u1053, u10598, u10748, ...",u102,Kitrinoviricota,Alsuviricetes,Hepelivirales,Hepeviridae,Orthohepevirus,Orthohepevirus A
4,"[u113, u11628, u14212, u14640, u15113, u16059,...",u113,Negarnaviricota,Monjiviricetes,Mononegavirales,Paramyxoviridae,Morbillivirus,Measles morbillivirus
...,...,...,...,...,...,...,...,...
99223,[u296608],u296608,.,.,.,.,.,.
99224,[u296609],u296609,.,.,.,.,.,.
99225,[u296613],u296613,.,.,.,.,.,.
99226,[u296616],u296616,.,.,.,.,.,.


Add virus type:

In [177]:
# Genome type implied by the phylum alone
type_by_phylum = {
    "Negarnaviricota": "-ssRNA",
    "Kitrinoviricota": "+ssRNA",
    "Lenarviricota": "+ssRNA",
    "Duplornaviricota": "dsRNA",
    "Artverviricota": "ssRNA-RT or dsDNA-RT",
    "Nucleocytoviricota": "NCLDV",
    "Peploviricota": "dsDNA",
    "Uroviricota": "tailed bacteriophage",
}

# For Pisuviricota the genome type is decided by the class instead
type_by_pisuviricota_class = {
    "Pisoniviricetes": "+ssRNA",
    "Stelpaviricetes": "+ssRNA",
    "Duplopiviricetes": "dsRNA",
}

virus_types = []
for phylum, class_ in zip(dup_rows["phylum"].values, dup_rows["class"].values):
    if phylum == "Pisuviricota":
        # Any other class of this phylum stays ambiguous
        label = type_by_pisuviricota_class.get(class_, "+ssRNA or dsRNA")
    else:
        # Phyla not listed above get no genome type
        label = type_by_phylum.get(phylum, "unknown")
    virus_types.append(label)

# One entry per row of dup_rows, in row order
dup_rows["strandedness"] = virus_types
dup_rows

Unnamed: 0,ID,rep_ID,phylum,class,order,family,genus,species,strandedness
0,"[u1, u10066, u10121, u10124, u102609, u102640,...",u1,Pisuviricota,Pisoniviricetes,Nidovirales,Coronaviridae,Betacoronavirus,Severe acute respiratory syndrome-related coro...,+ssRNA
1,"[u10, u103032, u1124, u11800, u12185, u13484, ...",u10,Negarnaviricota,Monjiviricetes,Mononegavirales,Filoviridae,Ebolavirus,Zaire ebolavirus,-ssRNA
2,"[u100, u10093, u10152, u10300, u10328, u10339,...",u100,Kitrinoviricota,Flasuviricetes,Amarillovirales,Flaviviridae,Flavivirus,West Nile virus,+ssRNA
3,"[u102, u10330, u10369, u1053, u10598, u10748, ...",u102,Kitrinoviricota,Alsuviricetes,Hepelivirales,Hepeviridae,Orthohepevirus,Orthohepevirus A,+ssRNA
4,"[u113, u11628, u14212, u14640, u15113, u16059,...",u113,Negarnaviricota,Monjiviricetes,Mononegavirales,Paramyxoviridae,Morbillivirus,Measles morbillivirus,-ssRNA
...,...,...,...,...,...,...,...,...,...
99223,[u296608],u296608,.,.,.,.,.,.,unknown
99224,[u296609],u296609,.,.,.,.,.,.,unknown
99225,[u296613],u296613,.,.,.,.,.,.,unknown
99226,[u296616],u296616,.,.,.,.,.,.,unknown


In [179]:
# Give every member ID its own row (the list-valued "ID" column becomes one
# ID per row, all other columns are repeated) and renumber the rows from 0
dup_rows = dup_rows.explode("ID", ignore_index=True)

Save new ID2tax map:

In [180]:
# Destination of the new ID2tax map (comma-separated)
new_u_tax_file = "/home/laura/projects/virus-watch-data/virus_ref/u_tax_nodup_clu.csv"
# index=False: do not write the DataFrame row index as an extra column
dup_rows.to_csv(new_u_tax_file, index=False)

#### Create t2g:

In [128]:
# Output path of the t2g file that maps every ID to its representative ID
new_t2g = "/home/laura/projects/virus-watch-data/virus_ref/nodup_clu_t2g.txt"

In [129]:
%%time
# Write one "<ID>\t<rep_ID>" line per row of dup_rows, so that every ID is
# mapped to the representative ID of its taxonomy group.
with open(new_t2g, "w") as t2g:
    # A single groupby pass replaces re-filtering the whole frame once per
    # representative ID. sort=False keeps the groups in order of first
    # appearance and the rows inside each group in their original order, so
    # the lines come out in the same order as with the per-ID filtering.
    for rep_id, group_ids in dup_rows.groupby("rep_ID", sort=False)["ID"]:
        t2g.writelines(
            group_id + "\t" + rep_id + "\n" for group_id in group_ids.values
        )