# Taxonomy Classification Data Analysis
This notebook analyzes the taxonomy dataset to guide classifier development decisions.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from collections import Counter
import sys
# Make the project's `src/reformat_data` directory importable from this notebook.
# The path is relative, so it resolves against the kernel's working directory.
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if '../src/reformat_data' not in sys.path:
    sys.path.append('../src/reformat_data')

# Plot appearance: start from matplotlib's ggplot style sheet, then layer
# seaborn's "whitegrid" theme on top with fonts scaled by 1.2.
# NOTE(review): seaborn's theme resets most of the rcParams that the ggplot
# sheet sets — confirm the ggplot call is still wanted.
plt.style.use('ggplot')
sns.set(style="whitegrid", font_scale=1.2)

# Display settings: never truncate columns of wide frames; render up to 100 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

## 1. Load and Explore the Dataset

In [2]:
# Read the merged taxonomy CSV, keeping every column as a string
# (presumably so identifier-like fields such as taxID keep their exact text — confirm).
data_path = '../data/merged/final_taxonomy.csv'
df = pd.read_csv(filepath_or_buffer=data_path, dtype=str)
print(f"Dataset shape: {df.shape}")
# Preview the first five rows as the cell's rich output
df.head(5)

Dataset shape: (3221455, 12)


Unnamed: 0,seqID,taxID,scientific_name,sequence,superkingdom_name,kingdom_name,phylum_name,class_name,order_name,family_name,genus_name,species_name
0,ManCurSeq_DUFA-COLR-000000001555472-H00000001,1010000010,Bordaia karnka,TTTATCATCAAATATTGCTCATGCTGGTAGATCAGTAGATCTAGCT...,Eukaryota,Metazoa,Arthropoda,Insecta,Lepidoptera,Hepialidae,Bordaia,Bordaia karnka
1,ManCurSeq_DUFA-COLR-000000001555472-H00000002,1010000010,Bordaia karnka,TTTATCATCAAATATTGCTCATGCTGGTAGATCAGTAGATTTAGCT...,Eukaryota,Metazoa,Arthropoda,Insecta,Lepidoptera,Hepialidae,Bordaia,Bordaia karnka
2,ManCurSeq_DUFA-COLR-000000001555333-H00000001,1010000011,Amata pr.,ACTTTCATCTAATATTGCTCATAGAGGAAGTTCAGTTGATTTAGCT...,Eukaryota,Metazoa,Arthropoda,Insecta,Lepidoptera,Erebidae,Amata,Amata pr.
3,ManCurSeq_DUFA-COLR-000000001560174-H00000001,1010000012,Lophoptera pr.,ACTTTCATCTAATATTGCACATGGAGGAAGATCAGTTGATTTAGCA...,Eukaryota,Metazoa,Arthropoda,Insecta,Lepidoptera,Euteliidae,Lophoptera,Lophoptera pr.
4,ManCurSeq_DUFA-COLR-000001000000019-H00000001,1010000013,Abyssoninoe scopa,GCTAGCTAGAAACATGGCCCATGCAGGCCCATCTGTAGATTTAGCA...,Eukaryota,Metazoa,Annelida,Polychaeta,Eunicida,Lumbrineridae,Abyssoninoe,Abyssoninoe scopa


In [4]:
# Headline counts: total rows in the table, and the number of distinct
# taxID values (missing values are not counted).
print(f"Total number of sequences: {df.shape[0]}")
print(f"Unique taxonomic IDs: {df['taxID'].nunique(dropna=True)}")

Total number of sequences: 3221455
Unique taxonomic IDs: 252200
