# Prepare data for the analysis of lysine deserts in selected Orthologous Groups (OGs) from the eggNOG5 database

<br />

**It is necessary to download data from the eggNOG5 database** - run cells from par. 2. 

Par. 3-6 may be omitted, so the authors' preprocessed data will be used in further analysis.

# 1. Import libraries and select taxonomic ranks to analyze

In [2]:
import pandas as pd
import pickle
import numpy as np
import statistics
import sys
import csv
csv.field_size_limit(sys.maxsize)
import gzip
import os
from Bio import AlignIO
from Bio import SeqIO
from ipywidgets import Text, HBox, Label
from IPython.display import Markdown, display

from ete3 import NCBITaxa
ncbi = NCBITaxa()

In [3]:
# Provide NCBI ID of selected eukaryotic families/order
eukaryota = ['4893', '6236', '7214', '9989', '9604']

# 2. Download data

Download Orthologous Groups (OGs) data *Saccharomycetaceae, Rhabditida, Drosophilidae, Rodentia*, and *Hominidae*.

In [None]:
%%bash -s "{" ".join(eukaryota)}" 
mkdir -p data

# Download data
for i in $1; do mkdir -p data/$i; wget -e robots=off -P data/$i/ http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/$i/${i}_members.tsv.gz; wget -e robots=off -P data/$i/ http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/$i/${i}_raw_algs.tar; wget -e robots=off -P data/$i/ http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/$i/${i}_annotations.tsv.gz; done

# Untar directory with MSA files
for i in $1; do tar xf data/$i/${i}_raw_algs.tar -C data/$i/; done

# 3. Select OGs' MSA files for further analyses

Select MSA files from OG of each of the abovementioned eukaryotic family/order which fulfill the following conditions:

1. presence of minimum four sequences
2. presence of minimum 60% of taxonomic group’s taxons 
3. median protein length of minimum 150 aa
4. mean number of protein transmembrane helices of maximum 2.0
5. presence of at least one sequence from the representative organism for particular taxonomic group - representative organisms are *S. cerevisiae*, *C. elegans*, *D. melanogaster*, *M. musculus*, and *H. sapiens* 



## 3.1. Calculate maximum number of taxons in each downloaded eukaryotic family/order

In [3]:
taxon_dict = {}

for group in eukaryota:
    taxon_set = set()
    df = pd.read_csv('data/{}/{}_members.tsv.gz'.format(group, group), sep = '\t', header = None)

    for taxon_row in df[5].values:
        taxons = taxon_row.split(',')
        for taxon in taxons:
            taxon_set.add(taxon)
        
    taxon_dict[group] = len(taxon_set)

## 3.2. Exclude MSAs which do not fulfill conditions 1-3 & 5

Create the "pre-filtered" MSA files lists

In [4]:
# Eukaryotic reference organisms: S. cerevisiae, C. elegans, D. melanogaster, M. musculus, H. sapiens
eukaryotic_orgs = {'4893': '4932', '6236': '6239', '7214': '7227', '9989': '10090', '9604':'9606'} 
# Minimal number of sequences in MSA
min_no_MSA_sequences = 4 
# Minimal fraction of taxonomic group's taxons present in MSA
min_MSA_taxon_coverage = 0.6 

# Minimal median protein length in MSA in Bacteria
min_MSA_median_prot_length_bacteria = 150
# Minimal median protein length in MSA in Eukaryota
min_MSA_median_prot_length_eukaryota = 150

class CheckMedian():

    def __init__(self, MSA):
        """Open, read and parse alignment file
        
        :param MSA: path to alignment file
        :type MSA: str
        :return: parsed alignment file object
        :rtype: Bio.Align.MultipleSeqAlignment
        """
        
        f = gzip.open(MSA, "rt")
        alignment = AlignIO.read(f, "fasta")
        f.close()
        
        self.matrix = np.array([list(rec) for rec in alignment], order="F")

    def __get_seq(self, idx):
        """Get sequence with particular index from alignment
        
        :param idx: sequence index in alignment matrix
        :type idx: int
        :return: sequence without dashes
        :rtype: str
        """
        
        return ''.join(self.matrix[idx]).replace('-', '')

    def calculate_median_seq_length(self):
        """Calculate median of sequence length of given alignment
        
        :return: median sequence length
        :rtype: float
        """

        all_seq_lengths = []
        
        for seq_index in range(len(self.matrix)):
            all_seq_lengths.append(len(self.__get_seq(seq_index))) 

        return statistics.median(all_seq_lengths)

In [5]:
# prepare MSA files lists from OGs of each abovementioned eukaryotic family/order which fulfill conditions 1-3 & 5 described in par. 3
eukaryota_filtered_MSAs = {} 

for group in eukaryota:

    eukaryota_filtered_MSAs[group] = []
    df = pd.read_csv('data/{}/{}_members.tsv.gz'.format(group, group), sep = '\t', header = None)

    # from all Orthologous Groups, take only those fulfilling conditions 1&2 and in case of eukaryotic taxonomic groups also condition 5
    if group in eukaryota:
        df = df[(df[2] >= min_no_MSA_sequences) & ((df[3] / taxon_dict[str(group)]) >= min_MSA_taxon_coverage) & (df[5].str.contains(eukaryotic_orgs[group]) == True)]
        min_MSA_median = min_MSA_median_prot_length_eukaryota
    else:
        df = df[(df[2] >= min_no_MSA_sequences) & ((df[3] / taxon_dict[str(group)]) >= min_MSA_taxon_coverage)]
        min_MSA_median = min_MSA_median_prot_length_bacteria
    
    for ident, value in df.iterrows():
        MSA_id = value[1]
        MSA_parsed = CheckMedian('data/{}/{}/{}.raw_alg.faa.gz'.format(group, group, MSA_id))
        median = MSA_parsed.calculate_median_seq_length()

        if median >= min_MSA_median:
            eukaryota_filtered_MSAs[group].append(MSA_id)

## 3.3. Predict number of transmembrane helices (TMH) for the pre-filtered MSA files lists

Predict number of transmembrane helices (TMH) for each sequence from all the "pre-filtered" MSA files using the standalone version of [TMHMM-2.0](https://services.healthtech.dtu.dk/service.php?TMHMM-2.0) software.

In [4]:
%%bash
mkdir -p results
mkdir -p results/TMH_predictions

### 3.3.1. Prepare fasta file input for the TMHMM-2.0 software

Save sequences from the "pre-filtered" MSA files of the abovementioned eukaryotic families/order to one fasta file `TMH/all_prefiltered_seqs.fasta`.

* modify sequence headers so they contain information about the family/order and a MSA identifier
        
    original sequence header
    
    `>1121949.AQXT01000002_gene2085`
        
    after edit
    
    `>69657:43W4A:1121949.AQXT01000002_gene2085`
        
        
* remove dashes from sequence

In [13]:
all_fasta = open('results/TMH_predictions/all_prefiltered_seqs.fasta', 'w')

for group in eukaryota:
    for msa in eukaryota_filtered_MSAs[group]:
        
        msa_path = 'data/{}/{}/{}.raw_alg.faa.gz'.format(group, group, msa)
        f = gzip.open(msa_path, "rt")
        alignment = AlignIO.read(f, "fasta")
        
        for seq in alignment:
            new_id = '>{}:{}:{}\n'.format(group, msa, seq.description)
            all_fasta.write(str(new_id))
            all_fasta.write(str(seq.seq).replace('-', '') + '\n')
            
        f.close()

all_fasta.close()

###  3.3.2. Downolad TMHMM-2.0

Download the TMHMM-2.0 standalone version [here](https://services.healthtech.dtu.dk/software.php).



### 3.3.3. Run TMHMM-2.0

#### Provide path to TMHMM

Simply write it down in the field below 

e.g. `/Users/ns/Install/tmhmm-2.0c.Linux/tmhmm-2.0c/bin/tmhmm`

**Do not hit ENTER after writing it in the field below.**

In [16]:
db = HBox([Label('Full path to TMHMM-2.0 software:'), Text()])
display(db)

HBox(children=(Label(value='Full path to TMHMM-2.0 software:'), Text(value='')))

In [None]:
tmhmm_path = db.children[1].value # get path to TMHMM-2.0 software path from Text object

#### Run TMHMM-2.0

Run TMHMM-2.0 on `results/TMH_predictions/all_prefiltered_seqs.fasta`. Choose short output & save it to `results/TMH_predictions/tmhmm_results.tsv`.

In [None]:
%%bash -s "$tmhmm_path"

$1 results/TMH_predictions/all_prefiltered_seqs.fasta >> results/TMH_predictions/tmhmm_results.tsv
gzip -9 results/TMH_predictions/tmhmm_results.tsv

#### Delete the unnecessary file

Delete file with conacatenated sequences from MSA files.

In [17]:
%%bash
rm results/TMH_predictions/all_prefiltered_seqs.fasta

### 3.3.4. Create dictionary of predicted number of TMH for each sequence of each MSA

Create dictionary 
```
{
taxonomic_rank1 : {MSA1.1 : {seq1: X, seq2: Y, ...}, MSA1.2 : {seq1: Z, seq2: W, ...}, ...}, 
taxonomic_rank2 : {MSA2.1 : {seq1: Z, seq2: W, ...}, MSA2.2 : {seq1: X, seq2: Y, ...}, ...}, 
...
} 
```

where W, X, Y, Z are numbers of predicted TMH.

In [7]:
# Initialize dictionary
tmhmm_dict = {}
for group in eukaryota:
    tmhmm_dict[group] = {}

df = pd.read_csv('results/TMH_predictions/tmhmm_results.tsv.gz', header=None, sep='\t')
df.columns = ['ID', 'Sequence Length', 'ExpAA', 'First60', 'Predicted Helices', 'Topology']
# See TMHMM-2.0 documentation for the description of short output fields

# Delete PredHel= from all rows of the Predicted Helices column
df['Predicted Helices'] = df['Predicted Helices'].str.replace('PredHel=', '')

for ident, value in df.iterrows():
    
    identifiers = value[0].split(':')
    taxonomic_group = identifiers[0]
    MSA_id = identifiers[1]
    seq_id = identifiers[2]
    tmh_no = int(value[4])
    
    if MSA_id not in eukaryota_filtered_MSAs[taxonomic_group]:
        continue
    
    if MSA_id not in tmhmm_dict[taxonomic_group].keys():
        tmhmm_dict[taxonomic_group][MSA_id] = {}
    tmhmm_dict[taxonomic_group][MSA_id][seq_id] = tmh_no
        

with open('results/TMH_predictions/tmhmm_dict.pickle', 'wb') as handle:
    pickle.dump(tmhmm_dict, handle, protocol = pickle.HIGHEST_PROTOCOL)

## 3.4. Create final lists of selected MSA files for each aforementioned eukaryotic family/order

Calculate average number of TMH per sequence for each "pre-filtered" MSA file.

Discard MSA files for which average number of TMH per sequence is above 2.0, according to condition 4 described in par. 3.

In [8]:
# Initialize dictionary
final_filtered_MSA = {}
for group in eukaryota:
    final_filtered_MSA[group] = []

for group in tmhmm_dict.keys():
    for msa in tmhmm_dict[group]:
        tmh = []
        for seq in tmhmm_dict[group][msa]:
            tmh.append(tmhmm_dict[group][msa][seq])
        avg_tmh = statistics.mean(tmh)
        if avg_tmh <= 2.0:
            final_filtered_MSA[group].append(msa)
        
with open('results/additional/filtered_msa_dict.pickle', 'wb') as handle:
    pickle.dump(final_filtered_MSA, handle, protocol = pickle.HIGHEST_PROTOCOL)

# 4. Summarize selected MSA files

Prepare table `Table_selected_eukaryotic_OGs.tsv` summarizing MSA properties (all MSA files vs. selected MSA files) from the aforementioned eukaryotic family/order.


### Indexing

Table is indexed by eukaryotic family/order.

### Columns description

Table contains the following information:

| Column  | Description  |
|:--|:--|
| **Number of all MSA files**  | Number of all MSA files from OGs of a given eukaryotic family/order |
| **Number of selected MSA files**  |  Number of MSA files from OGs of a given eukaryotic family/order fulfilling conditions as described in par. 3 of `Download_data.ipynb` |
| **Fraction of selected MSA files** | Fraction of selected MSA files among all available MSA files from OGs of a given eukaryotic family/order |
| **Median of median lenghts of all MSA files** | Median of all median sequence lengths of all MSA files from OGs of a given eukaryotic family/order |
|**Median of median lenghts of selected MSA files** | Median of all median sequence lengths of selected MSA files fulfilling conditions as described in par. 3 of `Download_data.ipynb` from OGs of a given eukaryotic family/order |

In [9]:
%%bash
#Create results directories
mkdir -p results
mkdir -p results/supplementary

In [10]:
eukaryotes_dict = {'Saccharomycetaceae' : ['4893'], 'Rhabditida' : ['6236'], 'Drosophilidae' : ['7214'], 
                   'Rodentia' : ['9989'], 'Hominidae' : ['9604']}

with open('results/additional/filtered_msa_dict.pickle', 'rb') as handle:
    final_filtered_MSA = pickle.load(handle)

def make_summary(groups_dict, out_name):
    
    first_col = 'Eukaryotic family/order'
    
    columns = [first_col, 'Number of all MSA files', 'Number of filtered MSA files', 
               'Percentage of filtered MSA files', 'Median of median lenghts of all MSA files', 
               'Median of median lenghts of filtered MSA files']
    summary_table = []
    
    for key in groups_dict.keys():
        all_MSA = 0
        all_seq_median_lengths = []
        filtered_MSA = 0
        filtered_MSA_median_lengths = []

        for group in groups_dict[key]:
        
            df = pd.read_csv('data/{}/{}_members.tsv.gz'.format(group, group), sep = '\t', header = None)
            all_MSA += len(df)
            filtered_MSA += len(final_filtered_MSA[group])
            
            for ident, value in df.iterrows():
                eggNOG_id = value[1]
                f = gzip.open('data/{}/{}/{}.raw_alg.faa.gz'.format(group, group, eggNOG_id), "rt")
                all_seqs = list(SeqIO.parse(f, 'fasta'))
                lengths = []
                for s in all_seqs:
                    seq = s.seq
                    lengths.append(len(seq.ungap('-')))
                f.close()
                
                all_seq_median_lengths.append(statistics.median(lengths))
                
                if eggNOG_id in final_filtered_MSA[group]:
                    filtered_MSA_median_lengths.append(statistics.median(lengths))
                
        summary_table.append([key, all_MSA, filtered_MSA, round((filtered_MSA/all_MSA) *100, 2), statistics.median(all_seq_median_lengths), statistics.median(filtered_MSA_median_lengths)])
    
    df_summary = pd.DataFrame(summary_table, columns = columns)
    df_summary.set_index(first_col, inplace=True)
    display(df_summary)
    df_summary.to_csv('results/supplementary/Table_selected_eukaryotic_OGs.tsv', sep='\t')

In [11]:
make_summary(eukaryotes_dict, 'Eukaryotes')

Unnamed: 0_level_0,Number of all MSA files,Number of filtered MSA files,Percentage of filtered MSA files,Median of median lenghts of all MSA files,Median of median lenghts of filtered MSA files
Eukaryotic family/order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Saccharomycetaceae,5524,3654,66.15,415.25,447.0
Rhabditida,18187,9123,50.16,337.0,398.0
Drosophilidae,13686,8638,63.12,404.0,463.0
Rodentia,21313,12714,59.65,414.0,483.0
Hominidae,20461,13988,68.36,407.0,478.0


# 5. Get information about Gene Symbols associated with sequences from selected MSA files

eggNOG5 provides 14 GB file [e5.sequence_aliases.tsv](http://eggnog5.embl.de/download/eggnog_5.0/e5.sequence_aliases.tsv) with multiple information about each sequence including its Gene Symbols.


## 5.1. Download e5.sequence_aliases.tsv

In [None]:
%%bash
mkdir -p files
wget http://eggnog5.embl.de/download/eggnog_5.0/e5.sequence_aliases.tsv -P files

## 5.2. Trim e5.sequence_aliases.tsv

Cut lines containing only information about Gene Symbols from `files/e5.sequence_aliases.tsv` to reduce file size for further processing.

In [17]:
%%bash
grep -E "BLAST_KEGG_NAME|RefSeq_gene" files/e5.sequence_aliases.tsv | grep -vE "BLAST_KEGG_NAME_SYNONYM|Ensembl_EntrezGene_synonym|RefSeq_synonym" > files/e5.sequence_aliases_trimmed.tsv

## 5.3. Map sequence ID to Gene Symbol

In [21]:
seqID_gene_symbol = {}
with open('files/e5.sequence_aliases_trimmed.tsv') as f:
    reader = csv.reader(f, delimiter="\t")
    for line in reader:
        if len(line[1]) < 11 and (any(char.isalpha() for char in line[1])): # to get proper gene symbols, working in most cases
            seqID_gene_symbol[line[0]] = line[1].lower()
            
with open('results/additional/SequenceID_to_GeneSymbol.pickle', 'wb') as handle:
    pickle.dump(seqID_gene_symbol, handle, protocol = pickle.HIGHEST_PROTOCOL)

### Gzip files

In [22]:
%%bash
gzip -9 files/e5.sequence_aliases.tsv
gzip -9 files/e5.sequence_aliases_trimmed.tsv

# 6. Prepare vectors of corresponding Orthologous Groups

Prepare table with corresponding filtered MSA files from the aforementioned eukaryotic family/order.

---

We will obtain orthologous sequences from the OGs constructed by the eggNOG5 database for **Ophistokonta**. 

These are lower quality alignments due to evolutionary distances between taxons (e.g. both yeast and primates belong to Ophistokonta), but we can use information which sequences are orthologous to construct vectors of corresponding MSA files, however for the aforementioned eukaryotic families/order.

e.g.

Ophistokonta alignment
```
>seq1.yeast
AAAA---------------
>seq2.drosophilidae
------AAAAA--------
>seq3.hominidae
---------------AAAA
```

We checked previously and know, that 
* seq1.yeast can be found in MSA_A.yeast
* seq2.drosophilidae can be found in MSA_B.drosophilidae
* seq3.hominidae can be found in MSA_C.hominidae.

**Our vector will be as follows**:

MSA_A.yeast; MSA_B.drosophilidae; MSA_C.hominidae

## 6.1. Read and download data

In [5]:
with open('results/additional/filtered_msa_dict.pickle', 'rb') as handle:
    final_filtered_MSA = pickle.load(handle)
with open('results/additional/SequenceID_to_GeneSymbol.pickle', 'rb') as handle:
    seqID_gene_symbol = pickle.load(handle)

In [None]:
%%bash
mkdir -p results/vectors

# Download Ophistokonta Orthologous Groups summary and COG annotations files
wget -O results/vectors/ophistokonta_OGs.tsv.gz http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/33154/33154_members.tsv.gz 
wget -O results/vectors/ophistokonta_COG.tsv.gz http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/33154/33154_annotations.tsv.gz

## 6.2. Map sequence ID to its MSA file name

In [13]:
def create_all_genes_dict(taxonomic_groups):
    """Create dictionary mapping each sequence ID (from all filtered MSA files) to MSA file name."""
    
    seq_id_dict = {}
    
    for group in taxonomic_groups:
        
        df = pd.read_csv('data/{}/{}_members.tsv.gz'.format(group, group), sep='\t', header=None)
        
        MSA = list(df[1].values)
        genes = list(df[4].values)
        
        for i in range(len(genes)):
            if MSA[i] not in final_filtered_MSA[group]:
                continue
                
            all_genes = genes[i].split(',')
            for gene in all_genes:
                seq_id_dict[gene] = '{}/{}/{}.raw_alg.faa.gz'.format(group, group, MSA[i])
                
    return seq_id_dict

In [15]:
seq_id_dict = create_all_genes_dict(eukaryota)

## 6.3. Map Ophistokonta MSA ID to its COG annotation

In [16]:
with gzip.open('results/vectors/ophistokonta_COG.tsv.gz', "rt") as f:   
    ophistokonta_cog_dict = {}
    reader = csv.reader(f, delimiter="\t")

    for line in reader:
        MSA_id = line[1]
        COG = line[2]
        ophistokonta_cog_dict[MSA_id] = COG

## Construct and save vectors

Prepare table `eukaryotes_vectors.tsv.gz` with corresponding orthologous MSA IDs on the level of each selected eukaryotic family/order. 


### Indexing

Table is indexed by Ophistokonta OG ID. 

### Columns description

Table contains the following information for each Ophistokonta OG:

| Column  | Description  |
|:--|:--|
| **MSA paths**  | Paths to orthologous MSA files on the level of each selected eukaryotic family/order |
| **OG summary**  | Summary of names and numbers of OGs among selected eukaryotic families/orders |
|  **Genes** | Gene symbols among selected eukaryotic families/orders |
| **Number of proteins** | Total number of orthologous proteins from selected eukaryotic families/orders |
| **Number of taxons** | Total number of taxons from selected eukaryotic families/orders |
| **Number of taxonomic ranks** | Total number of selected eukaryotic families/orders |
| **Number of MSA files** | Total number of MSA files among selected eukaryotic families/orders|
| **COG** | COG annotation for Ophistokonta OG|

In [24]:
def create_vectors(taxonomic_groups, out_name):
    
    vectors_dict = {}
    
    with gzip.open('results/vectors/ophistokonta_OGs.tsv.gz', "rt") as f:
    
        reader = csv.reader(f, delimiter="\t")
        
        for line in reader:
            all_proteins = line[4].split(',')
            MSA_set = set()
            OG_set = set()
            OG_dict = {}
            prot_names_set = set()
            
            for prot in all_proteins:
                if prot in seq_id_dict.keys(): # we will construct vectors with filtered MSA files only
                    
                    taxid = str(seq_id_dict[prot].split('/')[0])
                    if taxid in taxonomic_groups: # make set only for selected taxonomic groups
                        
                        MSA_set.add(seq_id_dict[prot])
                        OG_set.add(seq_id_dict[prot].split('/')[0])

                        if taxid not in OG_dict.keys():
                            OG_dict[taxid] = 0

                        if prot in seqID_gene_symbol.keys():
                            prot_names_set.add(seqID_gene_symbol[prot])
          
            OG_names = ncbi.get_taxid_translator(list(OG_set))
            new_dict = {}

            for el in MSA_set:
                taxid = el.split('/')[0]
                OG_dict[taxid] += 1
                
            for key in OG_dict.keys():
                    
                new_dict[OG_names[int(key)]] = OG_dict[key]
                
            if len(MSA_set) > 1 and len(OG_set) > 1: # vector contains minimum two taxonomic groups
                vectors_dict[line[1]] = [','.join(list(MSA_set)), new_dict, ','.join(list(prot_names_set)), 
                                         int(line[2]), int(line[3]), len(OG_set), len(MSA_set),
                                         ophistokonta_cog_dict[line[1]]]
    
    df = pd.DataFrame.from_dict(vectors_dict, orient='index')
    df.columns = ['MSA paths', 'OG summary', 'Genes', 'Number of proteins', 
                  'Number of taxons', 'Number of taxonomic ranks', 
                  'Number of MSA files', 'COG']
    df.index.name = 'Ophistokonta OG ID'
    
    save_vectors_path = 'results/vectors'
    if not os.path.exists(save_vectors_path):
        os.makedirs(save_vectors_path) 

    df.to_csv('results/vectors/{}_vectors.tsv.gz'.format(out_name), sep='\t', compression='gzip')

In [25]:
create_vectors(eukaryota, 'eukaryotes')

In [6]:
df_vectors = pd.read_csv('results/vectors/eukaryotes_vectors.tsv.gz', sep='\t')
display(Markdown('### Sample of constructed orthologous MSA files table for <br /> Saccharomycetaceae, Rhabditida, Drosophilidae, Rodentia, and Hominidae'))
df_vectors.head()

### Sample of constructed orthologous MSA files table for <br /> Saccharomycetaceae, Rhabditida, Drosophilidae, Rodentia, and Hominidae

Unnamed: 0,Ophistokonta OG ID,MSA paths,OG summary,Genes,Number of proteins,Number of taxons,Number of taxonomic ranks,Number of MSA files,COG
0,38B3U,"9989/9989/4PSAG.raw_alg.faa.gz,9604/9604/4MZTR...","{'Rodentia': 1, 'Hominidae': 1}","usf3,kiaa2018",179,157,2,2,K
1,38B3V,"6236/6236/40UNT.raw_alg.faa.gz,9989/9989/4PUU6...","{'Rodentia': 4, 'Rhabditida': 1, 'Hominidae': 4}","ctnnd2,arvcf,jac-1,ctnnd1,cbr-jac-1,pkp4",546,152,3,9,TW
2,38B3W,"9604/9604/4N2E5.raw_alg.faa.gz,9989/9989/4Q1E1...","{'Rodentia': 1, 'Hominidae': 1}",fkbp14,142,125,2,2,O
3,38B3X,"9989/9989/4Q2TC.raw_alg.faa.gz,7214/7214/45XNB...","{'Rodentia': 3, 'Rhabditida': 1, 'Drosophilida...","lamc1,lam-2,lanb2,cbr-lam-2,dyak_lanb2,dere_la...",464,163,4,8,W
4,38B3Y,"7214/7214/45VNG.raw_alg.faa.gz,9604/9604/4N26V...","{'Saccharomycetaceae': 1, 'Rhabditida': 1, 'Dr...","nsa2,dyak_ip259,w09c5.1,ip259",387,338,4,4,J
