# Species Mismatch Analysis
This notebook identifies species labels from the SpeciesNet class counts that are not expected to occur in Chicago, Illinois, by comparing them against a GBIF-derived species list. It also generates statistics and visualizations for the mismatched species.

In [None]:
# Import Required Libraries
from itertools import zip_longest

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pytaxize import scicomm

In [2]:
# Load the per-class counts table (columns include simple_class, count and %)
# and preview its first five rows.
species_data = pd.read_csv(filepath_or_buffer='../runs/full-no-geo/simple_class_counts.csv')
species_data.head(5)

Unnamed: 0.1,Unnamed: 0,simple_class,count,%
0,1,blank,88396,84.2
1,2,bird,6077,5.8
2,3,human,4095,3.9
3,4,western pond turtle,1886,1.8
4,5,mallard,816,0.78


In [41]:
# The taxonomy release is a semicolon-delimited text file without a header row,
# so the column names are supplied explicitly.
columns = ['uuid', 'class', 'order', 'family', 'genus', 'species', 'common_name']
df_taxa = pd.read_csv("taxonomy_release.txt", sep=";", header=None, names=columns)

# Preview the first rows to check that the columns were parsed as intended.
display(df_taxa.head())

Unnamed: 0,uuid,class,order,family,genus,species,common_name
0,00049ff0-2ffa-4d82-8cf3-c861fbbfa9d5,mammalia,rodentia,muridae,rattus,,rattus species
1,000e4049-11cd-4630-afd6-ea16a908d5ff,mammalia,cetartiodactyla,bovidae,gazella,gazella,mountain gazelle
2,000f61aa-c02a-46f4-b7a7-81fe76a9212f,mammalia,carnivora,canidae,lycaon,,lycaon species
3,001795ae-1963-47f2-91cc-9dd627643a06,mammalia,cetartiodactyla,bovidae,nesotragus,,nesotragus species
4,00339477-70ab-42aa-9a4f-ed2cca9a028f,aves,musophagiformes,musophagidae,tauraco,schuettii,black-billed turaco


In [46]:
# Order the taxonomy from the broadest rank to the narrowest; rows with a
# missing rank sort after the populated ones (pandas' default NaN position).
rank_columns = ['class', 'order', 'family', 'genus', 'species']
df_sorted = df_taxa.sort_values(by=rank_columns)
display(df_sorted.head())

Unnamed: 0,uuid,class,order,family,genus,species,common_name
59,04eda76f-c0e7-4e9e-85c3-5b1542db2915,amphibia,anura,bufonidae,rhinella,marina,cane toad
337,17b8145a-a164-4059-a68b-47b8b7438428,amphibia,anura,ranidae,rana,,rana species
1168,5351aa59-81ba-4c9f-b453-f7a2830ae892,amphibia,anura,ranidae,,,true frogs
2063,96632fbc-d0d0-4880-9df7-f747f6f5ec11,amphibia,anura,,,,frogs
499,23a6f03b-b3d0-471b-a67d-88f10cb64e59,amphibia,,,,,amphibian


In [42]:
# Try the gbq download
df = pd.read_csv("gbq_gbif_liketaxa_release_v2.csv")

# Build the comparison frame in a single chain:
#   1. lower-case every string cell (non-string values pass through unchanged);
#   2. rename the incoming `species` column, which holds the full scientific
#      name, to `scientific_name`;
#   3. add a new `species` column holding only the last whitespace-separated
#      token of that name.
df_filtered = (
    df.map(lambda value: value.lower() if isinstance(value, str) else value)
    .rename(columns={'species': 'scientific_name'})
    .assign(species=lambda frame: frame['scientific_name'].str.split().str[-1])
)

display(df_filtered.head())

Unnamed: 0,class,order,family,genus,scientific_name,taxonkey,species
0,aves,piciformes,picidae,melanerpes,melanerpes carolinus,2478106,carolinus
1,amphibia,anura,bufonidae,anaxyrus,anaxyrus americanus,2422872,americanus
2,aves,charadriiformes,charadriidae,charadrius,charadrius semipalmatus,2480295,semipalmatus
3,aves,passeriformes,passerellidae,ammospiza,ammospiza leconteii,2491080,leconteii
4,aves,charadriiformes,scolopacidae,tringa,tringa melanoleuca,2481720,melanoleuca


In [43]:
# Left-join the GBIF rows onto the taxonomy table using the full rank hierarchy
# as the key, pulling in `common_name` wherever the hierarchy lines up.
# NOTE: pandas matches null key values against each other, so rows that are
# missing the same ranks on both sides are paired as well.
taxa_keys = ['class', 'order', 'family', 'genus', 'species']
merged_df = df_filtered.merge(
    df_taxa[taxa_keys + ['common_name']],
    how='left',
    on=taxa_keys,
)

# True where the join found a common name for the GBIF row.
has_common_name = merged_df['common_name'].notna()

print("Rows with matching common names:")
display(merged_df[has_common_name].head())

print("\nRows without matching common names:")
display(merged_df[~has_common_name].head())

# Summarise how many rows did / did not pick up a common name.
total_matches = has_common_name.sum()
print(f"\nTotal matches found: {total_matches}")
print(f"Total rows without matches: {len(merged_df) - total_matches}")

Rows with matching common names:


Unnamed: 0,class,order,family,genus,scientific_name,taxonkey,species,common_name
0,aves,piciformes,picidae,melanerpes,melanerpes carolinus,2478106,carolinus,red-bellied woodpecker
4,aves,charadriiformes,scolopacidae,tringa,tringa melanoleuca,2481720,melanoleuca,greater yellowlegs
11,aves,passeriformes,turdidae,catharus,catharus guttatus,2490810,guttatus,hermit thrush
12,aves,anseriformes,anatidae,anas,anas platyrhynchos,9577769,platyrhynchos,mallard
13,aves,passeriformes,laniidae,lanius,lanius ludovicianus,2492870,ludovicianus,loggerhead shrike



Rows without matching common names:


Unnamed: 0,class,order,family,genus,scientific_name,taxonkey,species,common_name
1,amphibia,anura,bufonidae,anaxyrus,anaxyrus americanus,2422872,americanus,
2,aves,charadriiformes,charadriidae,charadrius,charadrius semipalmatus,2480295,semipalmatus,
3,aves,passeriformes,passerellidae,ammospiza,ammospiza leconteii,2491080,leconteii,
5,aves,passeriformes,tyrannidae,contopus,contopus cooperi,5229744,cooperi,
6,aves,passeriformes,estrildidae,lonchura,lonchura oryzivora,2493591,oryzivora,



Total matches found: 391
Total rows without matches: 534


In [47]:
# Keep only the rows that received a common name from the merge and write them
# to disk (the row index is written as well, as before).
df_matched = merged_df.dropna(subset=['common_name'])

df_matched.to_csv('gbq_to_taxa_common_matched.csv')

In [48]:
# Collect the common name of every matched row (duplicates included); the
# speciesnet labels are compared against this list in the following cells.
chicago_species = df_matched.loc[:, 'common_name'].to_list()

print(chicago_species)

['red-bellied woodpecker', 'greater yellowlegs', 'hermit thrush', 'mallard', 'loggerhead shrike', 'northern goshawk', 'bat', 'hairy woodpecker', 'moorhen', 'héron à dos vert', 'red-winged blackbird', 'cinnamon teal', 'canada goose', 'great crested flycatcher', 'spotted sandpiper', 'palm warbler', 'northern bobwhite', 'russet-backed thrush', 'zosterops species', 'house mouse', 'north american river otter', 'russet-backed thrush', 'palm warbler', 'wood thrush', 'western tanager', 'western grebe', 'house finch', "wilson's warbler", 'northern goshawk', 'house wren', "wilson's warbler", 'killdeer', 'swamp sparrow', 'american crow', 'red-winged blackbird', 'aythya species', 'common starling', 'grey wolf', 'black-crowned night-heron', 'great horned owl', 'red-winged blackbird', 'setophaga species', 'domestic guineafowl', 'palm warbler', 'broad-winged hawk', 'marsh wren', 'american redstart', 'bald eagle', 'moorhen', 'busard saint-martin', 'moorhen', 'chukar', 'american black duck', 'gadwall',

In [49]:
# Compare Species Lists
# `chicago_species` is a plain list that contains repeated names, so testing
# membership against it rescans the list for every label. A set built once
# gives the same answers with constant-time lookups.
species_list = species_data['simple_class'].tolist()
chicago_species_set = set(chicago_species)
mismatched_species = [species for species in species_list if species not in chicago_species_set]

# Matched labels are whatever remains after removing the mismatches.
matched_count = len(species_list) - len(mismatched_species)
print(f"Total mismatched species: {len(mismatched_species)}")
print(f"Total matched species: {matched_count}")

Total mismatched species: 49
Total matched species: 31


In [50]:
# Share of speciesnet labels that are / are not on the Chicago list, expressed
# as percentages of the number of labels (not of the detection counts).
total_species = len(species_list)
mismatched_count = len(mismatched_species)

mismatched_percentage = (mismatched_count / total_species) * 100
matched_percentage = ((total_species - mismatched_count) / total_species) * 100

print(f"Percentage of mismatched species: {mismatched_percentage:.2f}%")
print(f"Percentage of matched species: {matched_percentage:.2f}%")

Percentage of mismatched species: 61.25%
Percentage of matched species: 38.75%


In [51]:
# List the matching vs mismatching species
# Build the lookup set once and reuse it for both partitions: membership tests
# against the raw `chicago_species` list (which holds repeated names) would
# rescan that list for every label in `species_list`.
chicago_lookup = set(chicago_species)
matching_species = [species for species in species_list if species in chicago_lookup]
mismatched_species = [species for species in species_list if species not in chicago_lookup]

print(f"Matching species ({len(matching_species)}): {matching_species}")
print(f"Mismatched species ({len(mismatched_species)}): {mismatched_species}")

Matching species (31): ['blank', 'bird', 'mallard', 'american coot', 'northern raccoon', 'great blue heron', 'vehicle', 'eastern cottontail', 'wood duck', 'brown rat', 'canada goose', 'domestic cat', 'muskrat', 'coyote', 'american beaver', 'eastern gray squirrel', 'american robin', 'california quail', 'domestic horse', 'sylvilagus species', 'white-crowned sparrow', 'snowy egret', 'horned lark', 'north american river otter', 'eastern fox squirrel', 'song sparrow', 'american badger', 'anatidae family', 'rattus species', 'red fox', 'eastern chipmunk']
Mismatched species (49): ['human', 'western pond turtle', 'anseriformes order', 'reptile', 'domestic dog', 'wild turkey', 'domestic cattle', 'white-tailed deer', 'central american agouti', 'mammal', 'nutria', 'wild boar', 'crocodile', 'common tapeti', "tome's spiny rat", 'ocellated turkey', 'rodent', 'branta species', 'collared peccary', 'eastern red forest rat', 'gambian rat', 'owl', 'bushy-tailed woodrat', 'madagascar crested ibis', 'spott

In [56]:
# Side-by-side review table of matching and non-matching labels.
# The two lists usually differ in length, so zip_longest (imported in the
# notebook's top imports cell) pads the shorter column with pd.NA.
review_df = pd.DataFrame(
    list(zip_longest(matching_species, mismatched_species, fillvalue=pd.NA)),
    columns=['matching', 'non_matching (speciesnet!gbif)']
)

# Use the notebook's rich display, like the other DataFrame cells, instead of
# a plain-text print.
display(review_df)


                      matching non_matching (speciesnet!gbif)
0                        blank                          human
1                         bird            western pond turtle
2                      mallard             anseriformes order
3                american coot                        reptile
4             northern raccoon                   domestic dog
5             great blue heron                    wild turkey
6                      vehicle                domestic cattle
7           eastern cottontail              white-tailed deer
8                    wood duck        central american agouti
9                    brown rat                         mammal
10                canada goose                         nutria
11                domestic cat                      wild boar
12                     muskrat                      crocodile
13                      coyote                  common tapeti
14             american beaver               tome's spiny rat
15      

In [59]:
# Some mismatched labels are expected anyway and should not be counted as
# mismatches, so they are dropped from the list before the stats are built.
to_remove = ['human', 'domestic dog', 'mammal', 'rodent']  # example items to remove, edit as needed
mismatched_species = [label for label in mismatched_species if label not in to_remove]

# Pull the count and percentage columns for the remaining mismatched labels.
is_mismatched = species_data['simple_class'].isin(mismatched_species)
mismatched_stats = species_data.loc[is_mismatched, ['simple_class', 'count', '%']]
display(mismatched_stats)

Unnamed: 0,simple_class,count,%
3,western pond turtle,1886,1.8
5,anseriformes order,588,0.56
7,reptile,150,0.14
13,wild turkey,68,0.065
14,domestic cattle,64,0.061
17,white-tailed deer,41,0.039
18,central american agouti,38,0.036
23,nutria,22,0.021
26,wild boar,18,0.017
27,crocodile,18,0.017


In [None]:
# TODO: For each species on this list that should not have been found in Chicago, determine the list of two-letter country codes where it does occur.

