# Taxonomy analysis
This notebook builds the reference taxonomy (NCBI lineage) of each species. These lineages serve as the ground truth for checking the phylogenetic trees that were generated with hierarchical clustering.

In [1]:
import os
from tqdm.notebook import trange
import pandas as pd
import numpy as np

In [None]:
# Config
# Step up one directory the first time this cell runs in a kernel (presumably to
# the project root, so the relative 'dat/...' paths resolve). The `chdir` flag
# left in globals() turns any re-run of this cell into a no-op.
already_moved = 'chdir' in globals()
if not already_moved:
    os.chdir('..')
chdir = True

---

## Create taxonomy dataset
`dat/tax.csv`

In [50]:
# Load data
# Read the species table, then discard its `dna` column right away; the cells
# below only use the remaining columns.
species = pd.read_csv('dat/data.csv')
species = species.drop(columns='dna')

In [12]:
# Write species names to file
# One name per line, in the same order as the rows of `species`.
species_names = species['name'].to_list()
with open('dat/species_names.txt', 'w') as names_file:
    for species_name in species_names:
        names_file.write(species_name + '\n')

In [13]:
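# Manual step: submit dat/species_names.txt to the NCBI Taxonomy name/id status
# report tool (link below) and save the resulting report as dat/tax_report.txt
# before running the remaining cells.
# https://www.ncbi.nlm.nih.gov/Taxonomy/TaxIdentifier/tax_identifier.cgi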

In [14]:
# Read in generated taxonomy information
# Keep the report as a list of raw lines (newlines included); parsing happens
# in the next cell.
with open('dat/tax_report.txt') as report_file:
    data = list(report_file)

In [51]:
# Attach one lineage string per species, taken from the taxonomy report.
#
# Each report line is split on the '\t|\t' separator. The second field is used
# as the de-duplication key (presumably the submitted species name -- confirm
# against the report format) and the last field is stored as the lineage.
# Only the first line seen for each key is kept.
#
# NOTE(review): lineages are matched to `species` rows purely by position, i.e.
# the k-th distinct key in the report is assumed to belong to the k-th row of
# `species`. Only the total count is verified below.
seen = set()  # set membership is O(1); a list made this loop quadratic
tax_lineages = []
for raw_line in data[2:]:  # the first two report lines are skipped (presumably headers)
    stripped = raw_line.strip()
    if not stripped:
        # A blank line (e.g. trailing newline at end of file) has no fields to parse
        continue
    fields = stripped.split('\t|\t')
    key = fields[1]
    if key in seen:
        continue
    seen.add(key)
    tax_lineages.append(fields[-1])

# Check the count BEFORE touching the frame: writing row-by-row with .loc would
# silently append new rows if the report held more distinct keys than species.
assert len(tax_lineages) == len(species), (
    f'expected {len(species)} lineages, found {len(tax_lineages)} in the report'
)
species['tax'] = tax_lineages

In [52]:
# Display the species table to inspect the `tax` column
species

Unnamed: 0,id,name,tax
0,GCA_905340225.1,Abrostola tripartita,938171 254364 95186 7100 37570 104431 37567 41...
1,GCF_001949145.1,crown-of-thorns starfish,133434 133433 133432 41166 41243 7588 7587 133...
2,GCA_016904835.1,rifleman,57068 57067 38633 9126 8825 8782 436492 436491...
3,GCA_021347895.1,spiny chromis,80966 80965 30863 1489909 1489908 1489872 1233...
4,GCF_904848185.1,yellowfin seabream,8177 8176 8169 1489931 1489922 1489872 123369 ...
...,...,...,...
719,GCA_020796205.1,Ziziphus jujuba var. spinosa,714518 326968 72171 325284 3608 3744 91835 712...
720,GCF_000696155.1,Zootermopsis nevadensis,136037 7502 127821 127820 7501 1912919 1049657...
721,GCF_011800845.1,common lizard,8524 141678 162266 8522 1329975 1329976 132991...
722,GCA_907165275.2,Zygaena filipendulae,287375 287110 287187 115354 104435 104430 3756...


In [53]:
# Save data as csv file
# index=False keeps the positional row index out of the written file.
tax_csv_path = 'dat/tax.csv'
species.to_csv(tax_csv_path, index=False)

## Generate taxonomy distance matrix
`dat/K_tax.npy`

In [63]:
# Build matrix of taxonomy-overlap (similarity)
# Entry [i, j] is the number of distinct taxon ids shared by the lineages of
# species i and j. Each lineage is parsed into a set once up front, rather than
# being re-split and re-hashed for every pair inside the double loop.
lineage_sets = [set(lineage.split()) for lineage in species.tax]
n_species = len(lineage_sets)

K_tax = np.full((n_species, n_species), np.nan)
for i in trange(n_species):
    ids_i = lineage_sets[i]
    for j in range(i, n_species):
        # The overlap is symmetric, so one computation fills both triangles
        K_tax[i, j] = K_tax[j, i] = len(ids_i & lineage_sets[j])

# Transform into distance matrix
# K_tax.max(1) is the per-row maximum, which is the diagonal (a lineage's
# overlap with itself), so the resulting diagonal is zero.
# NOTE(review): the 1-D maximum broadcasts along columns, so entry [i, j]
# becomes max(row j) - overlap(i, j). The result is therefore NOT symmetric
# when lineages have different lengths. Kept as-is because consumers of
# dat/K_tax.npy may rely on the current values -- confirm whether a symmetric
# distance was intended.
K_tax = K_tax.max(1) - K_tax

# Save matrix
np.save('dat/K_tax.npy', K_tax)

  0%|          | 0/724 [00:00<?, ?it/s]