# AI Observer - Get the tables we need
- Aggregate Synonyms.
- Merge ranks 1-3 into 4 -> full species table.
- Merge ranks 1-8 into 9 -> full genus table.

In [1]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import pandas as pd
from pathlib import Path
import copy
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import image as mpimg
import os
import numpy as np
import swifter

In [3]:
# Input locations, given relative to the notebook's working directory.
MUSH_DIR = Path("../resources/")
IMG_DIR = Path("../images/320")

# Both exports are read as tab-separated files despite the .csv extension.
names_df, obs_df = (
    pd.read_csv(MUSH_DIR / csv_name, sep="\t")
    for csv_name in ("names.csv", "full_observations.csv")
)

## Tables

- `obs_df`: Observations.
    - o.text_name -- the mushroom's "name"; see rank.
    - rank -- species = 4, subspecies/variety/name = 3/2/1, subgenera and up are 5-15, group is special = 16
    - o.name_id -- id of record in names table, see other file
    - o.thumb_image_id -- id of first image: just useful for telling you at a glance if the observation has any images
    - o.when -- date of observation... may or may not actually be fresh, some are perennials, some are dried or found at a grocery store(!)
    - o.vote_cache -- 1.5 = 50% score.
    - o.lat/long/alt -- latitude, longitude (decimal degrees, negative = S / W), altitude (meters)
    - o.where -- english description of location: "blah, blah, County Co., State, Country" -- can probably rely on County, State and Country
    - l.north/south/east/west -- if obs has no lat/long/alt then revert to the n/s/e/w edges of this bounding box, but note not all boxes will be small enough to be useful
    - l.high/low -- again, bounding elevation, but the vast majority are left empty
    - o.is_collection_location -- nominally set to false if found at a grocery store, herbarium, fungal fair, etc.
    - o.notes -- fairly free-form, included just for completeness

- `names_df`: Ontology.
    - id -- record id in names table
    - text_name -- This is what you've been using.
    - author -- Mostly useless, but in cases where there are multiple entries for a single text_name, it can help
    - rank -- species = 4, subspecies/variety/name = 3/2/1, subgenera and up are 5-15, group is special = 16
    - deprecated -- the current scientific or community consensus usually chooses a single member of a set of synonyms, all others are "deprecated"
    - synonym_id -- random number for equivalence classes: all names with same synonym_id are considered synonyms of each other
    - correct_spelling_id -- some names are just misspellings, not real synonyms, but synonym_id will be set, too, so you can probably safely ignore this
- `img_df`: image id to observation id.


In [5]:
# Show the last rows of both tables, observations first, then the names ontology.
display(obs_df.tail(), names_df.tail())

Unnamed: 0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes
366993,403505,Russula,1158263.0,22603,2019-09-07,0.852738,,,,"Emerald Lake State Park, Dorset, Vermont, USA",43.3255,43.2264,-72.9239,-73.0758,,,1,---\n:Other: Golden brown cap. White stipe and...
366994,403506,Lichen,1158268.0,5645,2020-03-05,0.825645,,,,"Logan Canyon, Utah, USA",41.7485,41.7319,-111.778,-111.81,,,1,---\n:Other: Found on River Birch.\n
366995,403507,Lichen,1158270.0,5645,2020-03-05,-0.825749,,,,"Logan Canyon, Utah, USA",41.7485,41.7319,-111.778,-111.81,,,1,"---\n:Other: Found on River Birch, bright lime..."
366996,403508,Cortinarius,1158274.0,20650,2020-01-29,0.835899,34.011,-119.802,,"Santa Cruz Island, Santa Barbara Co., Californ...",34.0808,33.9608,-119.522,-119.926,,,1,---\n:Collector's_Name: Joanne Schwartz\n:Subs...
366997,403509,Hypomyces lactifluorum,1158280.0,158,2019-09-07,2.55822,,,,"Emerald Lake State Park, Dorset, Vermont, USA",43.3255,43.2264,-72.9239,-73.0758,,,1,---\n:Other: Coniferous forest\n


Unnamed: 0,id,text_name,author,rank,deprecated,synonym_id,correct_spelling_id
104774,109640,Callistosporiaceae,"Vizzini, Consiglio, M. Marchetti & P. Alvarado",10,0,,
104775,109641,Pseudolaccaria fellea,"(Peck) Vizzini, Matheny, Consiglio & M. Marchetti",4,0,5508.0,
104776,109642,Mortierella gamsii,,4,0,,
104777,109643,Panaeolus axfordii,"Y. Hu, S.C. Karunarathna, P.E. Mortimer & J.C. Xu",4,0,,
104778,109644,Gasteromycetis,,9,0,,


In [49]:
# All names flagged as deprecated.
names_df.loc[names_df["deprecated"].eq(1)]

Unnamed: 0,id,text_name,author,rank,deprecated,synonym_id,correct_spelling_id
5,6,Xerocomus zelleri,(Murrill) Snell,4,1,505.0,
7,8,Xerocomus dryophilus,(Thiers) Singer,4,1,4404.0,
8,9,Xerocomus chrysenteron,(Bull.) Quél.,4,1,504.0,
9,10,Volvariella gloiocephala,(DC.) Boekhout & Enderle,4,1,593.0,
12,13,Myxomycota,,13,1,5178.0,
...,...,...,...,...,...,...,...
104546,109404,Acarospora peliocypha,(Wahlenb.) Th. Fr.,4,1,9047.0,
104710,109568,Agaricales,sensu strictissimo,11,1,6973.0,
104728,109591,Hypocrea lutea,(Tode) Petch,4,1,9273.0,
104755,109621,Mycena umbrina,A.H. Sm.,4,1,9277.0,


In [51]:
# Every member of one synonym group (id 505), used as a worked example.
names_df.loc[names_df["synonym_id"].eq(505)]

Unnamed: 0,id,text_name,author,rank,deprecated,synonym_id,correct_spelling_id
5,6,Xerocomus zelleri,(Murrill) Snell,4,1,505.0,
613,654,Boletus zelleri,Murrill,4,1,505.0,
28139,29865,Boletus,Zerelli Murill,9,1,505.0,654.0
29831,31695,Xerocomellus zelleri,(Murrill) Klofac,4,0,505.0,
38863,40863,Boletus zelleri,(Murrill) Murrill,4,1,505.0,
39335,41365,Xerocomellus zelleri,"(Murrill) Klofac, Öst. Z. Pilzk. 20: 39 (2011)",4,1,505.0,31695.0


In [7]:
# Summary statistics over every column (numeric and non-numeric) of both tables.
display(
    obs_df.describe(include="all"),
    names_df.describe(include="all"),
)

Unnamed: 0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes
count,366998.0,366998,349386.0,366998.0,366998,363083.0,69137.0,69137.0,68445.0,366998,366998.0,366998.0,366998.0,366998.0,50851.0,48691.0,366998.0,366043
unique,,16363,,,6579,,,,,19998,,,,,,,,148925
top,,Agaricales,,,2014-10-11,,,,,"Cali, Valle del Cauca, Colombia",,,,,,,,--- {}\n
freq,,6385,,,626,,,,,6671,,,,,,,,192511
mean,209063.609194,,564694.9,13404.151513,,1.807381,31.931695,-73.118909,658.323413,,35.375851,34.303878,-78.127391,-79.86844,951.08269,624.207847,0.980594,
std,115566.773848,,338741.2,17536.782169,,0.69149,20.147464,56.847431,924.502257,,17.216964,18.774903,55.49474,53.442877,975.989255,698.856352,0.137948,
min,1.0,,1.0,1.0,,-2.60801,-90.0,-180.0,-6487.0,,-87.6437,-89.6437,-169.045,-179.0,0.0,-86.0,0.0,
25%,110395.25,,267492.2,691.0,,1.439555,30.5154,-98.5142,115.0,,35.5727,35.0458,-120.528,-121.394,220.0,150.0,1.0,
50%,211910.5,,560429.0,4169.0,,1.71899,37.9269,-83.5531,280.0,,39.3784,39.2153,-83.4902,-83.74005,650.0,200.0,1.0,
75%,309297.75,,859151.2,22777.0,,2.50852,43.164,-76.6008,1415.0,,43.057,42.8286,-76.3284,-76.5175,1800.0,1500.0,1.0,


Unnamed: 0,id,text_name,author,rank,deprecated,synonym_id,correct_spelling_id
count,104779.0,104779,84174,104779.0,104779.0,25309.0,1604.0
unique,,102437,25728,,,,
top,,Cortinarius,Singer,,,,
freq,,7,893,,,,
mean,54770.988585,,,4.505636,0.195554,4407.791418,15300.733167
std,31552.562159,,,1.841703,0.396629,2574.985826,19786.073827
min,1.0,,,1.0,0.0,1.0,1.0
25%,27813.5,,,4.0,0.0,2298.0,1060.0
50%,54875.0,,,4.0,0.0,4342.0,5374.5
75%,82020.5,,,4.0,0.0,6464.0,22794.0


## Aggregate synonyms
Keep the name where deprecated = 0.

In [40]:
def pref_name(id_):
    """Return the preferred text_name for the names_df record with the given id.

    If the record belongs to a synonym group, the text_name of the first
    non-deprecated member of that group (in names_df row order) is returned.
    If the record has no synonym_id, or its group contains no non-deprecated
    member, the record's own text_name is returned.

    Reads the module-level ``names_df`` table.

    Raises:
        IndexError: if no record in names_df has ``id == id_``.
    """
    record = names_df.loc[names_df["id"] == id_]
    group_id = record["synonym_id"].values[0]

    if not pd.isna(group_id):
        accepted = names_df.loc[
            (names_df["synonym_id"] == group_id) & (names_df["deprecated"] == 0),
            "text_name",
        ].values
        if len(accepted) > 0:
            return accepted[0]

    # No synonym group, or every synonym is deprecated: keep the record's own name.
    return record["text_name"].values[0]

In [59]:
# pref_name depends only on name_id and scans names_df on every call, so resolve
# each distinct name_id once and broadcast the result to all observations,
# instead of calling it once per observation row.
preferred_by_name_id = {
    name_id: pref_name(name_id) for name_id in obs_df["name_id"].unique()
}
obs_df["preferred_name"] = obs_df["name_id"].map(preferred_by_name_id)

HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=24.0, style=ProgressStyle(description_wi…




In [60]:
# Distinct name counts before and after synonym aggregation (missing values count as one).
obs_df["text_name"].nunique(dropna=False), obs_df["preferred_name"].nunique(dropna=False)

(16363, 14566)

In [61]:
# List the columns to confirm preferred_name was appended.
obs_df.columns

Index(['id', 'text_name', 'thumb_image_id', 'name_id', 'when', 'vote_cache',
       'lat', 'long', 'alt', 'where', 'north', 'south', 'east', 'west', 'high',
       'low', 'is_collection_location', 'notes', 'preferred_name'],
      dtype='object')

In [62]:
# Spot-check one observation of name_id 6 to see which preferred_name it received.
obs_df.loc[obs_df["name_id"].eq(6)].head(1)

Unnamed: 0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,preferred_name
6,7,Xerocomus zelleri,7.0,6,2005-01-07,2.49991,,,,"Santa Cruz, Santa Cruz Co., California, USA",37.0243,36.9466,-121.987,-122.088,100.0,0.0,0,---\n:Other: |-\n Seen at the 2005 Santa Cruz...,Xerocomellus zelleri


In [63]:
# Cache the table with preferred names on disk. index=False keeps the row index
# out of the file, so reading it back with a plain pd.read_csv does not create a
# spurious "Unnamed: 0" column (same convention as the rank-4 export).
obs_df.to_csv(MUSH_DIR / "full_observations_with_preferred_name.csv", index=False)

In [None]:
# Reload the cached table written by the previous cell (presumably so the slow
# preferred-name step can be skipped on later runs — this cell has no execution count).
# NOTE(review): read_csv is called with default arguments, so if the file was
# written with its row index, that index comes back as an extra "Unnamed: 0" column.
obs_df = pd.read_csv(MUSH_DIR / "full_observations_with_preferred_name.csv")

## Ranks

In [65]:
# Attach the names_df record of each observation's name_id (default inner join);
# names_df columns whose names clash with obs_df columns get a "_y" suffix.
obs_df = obs_df.merge(
    names_df,
    left_on="name_id",
    right_on="id",
    suffixes=("", "_y"),
)

In [66]:
# List the columns after the merge; clashing names_df columns carry the "_y" suffix.
obs_df.columns

Index(['id', 'text_name', 'thumb_image_id', 'name_id', 'when', 'vote_cache',
       'lat', 'long', 'alt', 'where', 'north', 'south', 'east', 'west', 'high',
       'low', 'is_collection_location', 'notes', 'preferred_name', 'id_y',
       'text_name_y', 'author', 'rank', 'deprecated', 'synonym_id',
       'correct_spelling_id'],
      dtype='object')

In [67]:
# Discard the names_df columns that came along with the merge, leaving rank as
# the only addition. The result is rebound rather than mutated with inplace=True,
# so the frame produced by the merge cell is never modified behind the reader's back.
obs_df = obs_df.drop(
    columns=[
        "id_y",
        "text_name_y",
        "author",
        "deprecated",
        "synonym_id",
        "correct_spelling_id",
    ]
)

In [68]:
# List the columns after the drop; rank should be the only column kept from names_df.
obs_df.columns

Index(['id', 'text_name', 'thumb_image_id', 'name_id', 'when', 'vote_cache',
       'lat', 'long', 'alt', 'where', 'north', 'south', 'east', 'west', 'high',
       'low', 'is_collection_location', 'notes', 'preferred_name', 'rank'],
      dtype='object')

In [69]:
# Peek at the first rows of the merged table, which now carries each observation's rank.
obs_df.head()

Unnamed: 0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,preferred_name,rank
0,1,Xylaria polymorpha group,1.0,2,2004-07-13,1.92335,,,,"North Carolina, USA",36.5273,34.7416,-75.8496,-83.7598,,,1,---\n:Other: Photographed at a rest stop betwe...,Xylaria polymorpha group,16
1,4033,Xylaria polymorpha group,6169.0,2,2006-09-03,,,,,"Cook Co., Illinois, USA",42.1542,41.4685,-87.526,-88.269,290.0,177.0,1,"---\n:Other: ""Although quite common in the Eas...",Xylaria polymorpha group,16
2,4438,Xylaria polymorpha group,7065.0,2,2005-09-10,,,,,"Elgin Co., Ontario, Canada",42.876,42.4701,-80.8044,-81.8179,,,1,--- {}\n,Xylaria polymorpha group,16
3,8276,Xylaria polymorpha group,15324.0,2,2008-06-29,1.78762,,,,"Elora Gorge Conservation Area, Elora, Ontario,...",43.6825,43.6537,-80.4309,-80.4606,400.0,300.0,1,--- {}\n,Xylaria polymorpha group,16
4,10184,Xylaria polymorpha group,18949.0,2,2008-08-12,1.65447,,,,"Kinns Rd. Park, Clifton Park, New York, USA",42.8892,42.8823,-73.8065,-73.8123,,,1,"---\n:Other: ""Growing on decaying wood, probab...",Xylaria polymorpha group,16


In [70]:
# Non-null counts of every column per rank; the id column gives the number of observations per rank.
obs_df.groupby("rank").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,preferred_name
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,456,456,444,456,456,455,64,64,57,456,456,456,456,456,69,66,456,455,456
2,4356,4356,4142,4356,4356,4295,506,506,478,4356,4356,4356,4356,4356,599,583,4356,4349,4356
3,663,663,613,663,663,661,103,103,106,663,663,663,663,663,90,95,663,661,663
4,217657,217657,203767,217657,217657,214305,37537,37537,34822,217657,217657,217657,217657,217657,26886,25788,217657,217215,217657
5,203,203,202,203,203,203,41,41,41,203,203,203,203,203,11,10,203,203,203
6,884,884,863,884,884,884,179,179,179,884,884,884,884,884,93,84,884,878,884
7,7182,7182,7078,7182,7182,7182,1427,1427,1438,7182,7182,7182,7182,7182,698,625,7182,7149,7182
8,2155,2155,2040,2155,2155,2155,303,303,294,2155,2155,2155,2155,2155,233,216,2155,2146,2155
9,91527,91527,89507,91527,91527,91075,18187,18187,18997,91527,91527,91527,91527,91527,14040,13408,91527,91173,91527
10,7988,7988,7932,7988,7988,7987,2318,2318,2582,7988,7988,7988,7988,7988,1815,1746,7988,7973,7988


### Merge ranks 1, 2, 3 into rank=4

In [76]:
# Per-name non-null counts for the rank-1 observations.
obs_df.loc[obs_df["rank"].eq(1)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Agaricus silvicola,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Amanita americitrina,1,1,0,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Amanita citrina,2,2,2,2,2,2,1,1,1,2,2,2,2,2,0,0,2,2,2
Amanita gemmata,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Amanita lavendula,46,46,43,46,46,46,1,1,0,46,46,46,46,46,5,5,46,46,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Stropharia rugosoannulata,28,28,28,28,28,28,2,2,2,28,28,28,28,28,1,1,28,28,28
Suillus viscidus,3,3,3,3,3,3,0,0,0,3,3,3,3,3,0,0,3,3,3
Tapinella panuoides,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Tremella mesenterica,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1


In [71]:
# Reduce rank-1 names to their first two whitespace-separated words ("Genus species").
rank_1_rows = obs_df["rank"] == 1
obs_df.loc[rank_1_rows, "preferred_name"] = obs_df.loc[rank_1_rows, "preferred_name"].swifter.apply(
    lambda full_name: " ".join(full_name.split()[:2])
)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=456.0, style=ProgressStyle(description…




In [77]:
# Per-name non-null counts for the rank-2 observations.
obs_df.loc[obs_df["rank"].eq(2)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Agaricus porphyrocephalus,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Agaricus silvicola,2,2,1,2,2,2,0,0,0,2,2,2,2,2,0,0,2,2,2
Agaricus xanthodermus,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Agrocybe pediades,5,5,5,5,5,5,0,0,0,5,5,5,5,5,0,0,5,5,5
Albatrellus ovinus,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Tylopilus felleus,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Tylopilus porphyrosporus,2,2,2,2,2,2,0,0,0,2,2,2,2,2,0,0,2,2,2
Usnea fragilescens,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Volvariella bombycina,7,7,7,7,7,7,1,1,1,7,7,7,7,7,0,0,7,7,7


In [75]:
# Reduce rank-2 names to their first two whitespace-separated words ("Genus species").
rank_2_rows = obs_df["rank"] == 2
obs_df.loc[rank_2_rows, "preferred_name"] = obs_df.loc[rank_2_rows, "preferred_name"].swifter.apply(
    lambda full_name: " ".join(full_name.split()[:2])
)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=4356.0, style=ProgressStyle(descriptio…




In [78]:
# Per-name non-null counts for the rank-3 observations.
obs_df.loc[obs_df["rank"].eq(3)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Agaricus argenteus subsp. annetteae,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Agaricus sylvaticus subsp. occidentalis,2,2,2,2,2,2,0,0,0,2,2,2,2,2,0,0,2,2,2
Alectoria sarmentosa subsp. vexillifera,2,2,1,2,2,2,1,1,1,2,2,2,2,2,0,0,2,2,2
Amanita amerimuscaria subsp. guessowii,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Amanita muscaria,10,10,10,10,10,10,1,1,1,10,10,10,10,10,0,0,10,10,10
Amanita muscaria subsp. flavivolvata,479,479,455,479,479,477,47,47,55,479,479,479,479,479,71,73,479,477,479
Bacidia laurocerasi subsp. idahoensis,2,2,2,2,2,2,1,1,1,2,2,2,2,2,1,1,2,2,2
Bryoria trichodes,1,1,0,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Caloplaca marina subsp. americana,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1
Calvatia cyathiformis subsp. cyathiformis,2,2,2,2,2,2,0,0,0,2,2,2,2,2,0,0,2,2,2


In [79]:
# Reduce rank-3 names to their first two whitespace-separated words ("Genus species").
rank_3_rows = obs_df["rank"] == 3
obs_df.loc[rank_3_rows, "preferred_name"] = obs_df.loc[rank_3_rows, "preferred_name"].swifter.apply(
    lambda full_name: " ".join(full_name.split()[:2])
)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=663.0, style=ProgressStyle(description…




In [80]:
# Relabel every rank 1, 2 or 3 observation as species level (rank 4).
below_species_rows = obs_df["rank"].isin([1, 2, 3])
obs_df.loc[below_species_rows, "rank"] = 4

In [84]:
# Spot-check the last rows after folding ranks 1-3 into rank 4.
obs_df.tail()

Unnamed: 0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,preferred_name,rank
366993,403226,Neonectria ditissima,1157348.0,109634,2019-04-23,0.853495,,,,"Coal Center, Pennsylvania, USA",40.0736,40.0667,-79.8948,-79.9064,,,1,--- {}\n,Neonectria ditissima,4
366994,403227,Neonectria ditissima,1157360.0,109634,2019-04-23,0.853497,,,,"Coal Center, Pennsylvania, USA",40.0736,40.0667,-79.8948,-79.9064,,,1,--- {}\n,Neonectria ditissima,4
366995,403232,Lepra pustulata,1157388.0,67233,2020-03-01,1.35942,,,,"Panola Mountain State Park, Rockdale Co., Geor...",33.6459,33.622,-84.1308,-84.1825,,,1,---\n:Other: On _Quercus_ bark in full sun.\n,Lepra pustulata,4
366996,403281,Hypochnicium albostramineum,1157571.0,50525,2020-02-26,2.5793,40.0006,-83.0426,248.0,"Carmack Woods, Columbus, Ohio, USA",40.0024,39.9999,-83.0401,-83.0439,,,1,---\n:Other: 'Growing on the underside of a ha...,Hypochnicium albostramineum,4
366997,403394,Gasteromycetis,1157940.0,109644,2020-03-02,0.866425,3.4188,-76.6014,1657.0,"Vereda El Faro, Cali, Valle del Cauca, Colombia",3.43964,3.43694,-76.5148,-76.5175,1800.0,1500.0,1,--- {}\n,Gasteromycetis,9


In [105]:
# Independent copy holding only the species-level (rank 4) observations.
obs_4_df = obs_df.loc[obs_df["rank"].eq(4)].copy()

In [120]:
# Export the species table without the row index.
obs_4_df.to_csv(MUSH_DIR.joinpath("full_observations_rank4.csv"), index=False)

### Look at ranks 5-16

In [86]:
# Per-name non-null counts for the rank-16 observations.
obs_df.loc[obs_df["rank"].eq(16)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Agaricineae,46,46,46,46,46,46,19,19,19,46,46,46,46,46,5,6,46,46,46
Agaricus approximans group,4,4,4,4,4,4,2,2,0,4,4,4,4,4,0,0,4,4,4
Agaricus arvensis group,27,27,25,27,27,27,5,5,5,27,27,27,27,27,2,2,27,27,27
Agaricus augustus group,18,18,16,18,18,18,4,4,4,18,18,18,18,18,2,2,18,18,18
Agaricus auricolor group,8,8,8,8,8,8,5,5,5,8,8,8,8,8,0,0,8,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Xylaria fissilis group,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Xylaria hypoxylon group,226,226,191,226,226,211,15,15,14,226,226,226,226,226,26,26,226,226,226
Xylaria multiplex group,9,9,9,9,9,9,2,2,2,9,9,9,9,9,3,3,9,9,9
Xylaria obovata group,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [87]:
# Per-name non-null counts for the rank-15 observations.
obs_df.loc[obs_df["rank"].eq(15)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Bacteria,2,2,2,2,2,2,0,0,0,2,2,2,2,2,1,1,2,2,2
Eukarya,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Non-fungal,178,178,178,178,178,178,12,12,19,178,178,178,178,178,21,21,178,176,178
Undetermined,580,580,554,580,580,580,136,136,174,580,580,580,580,580,135,131,580,580,580


In [88]:
# Per-name non-null counts for the rank-14 observations.
obs_df.loc[obs_df["rank"].eq(14)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Animalia,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Fungi,2220,2220,1813,2220,2220,2215,661,661,696,2220,2220,2220,2220,2220,249,237,2220,2218,2220
Plantae,62,62,62,62,62,62,14,14,14,62,62,62,62,62,5,5,62,62,62


In [89]:
# Per-name non-null counts for the rank-13 observations.
obs_df.loc[obs_df["rank"].eq(13)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Ascomycota,328,328,323,328,328,328,80,80,83,328,328,328,328,328,40,39,328,326,328
Basidiomycota,99,99,99,99,99,99,36,36,38,99,99,99,99,99,7,5,99,98,99
Bryophyta,24,24,24,24,24,24,1,1,1,24,24,24,24,24,0,0,24,24,24
Chytridiomycota,2,2,2,2,2,2,0,0,0,2,2,2,2,2,0,0,2,2,2
Deuteromycota,239,239,236,239,239,239,45,45,50,239,239,239,239,239,40,38,239,237,239
Eumycetozoa,269,269,268,269,269,269,16,16,20,269,269,269,269,269,39,37,269,264,269
Glomeromycota,3,3,3,3,3,3,0,0,0,3,3,3,3,3,0,0,3,3,3
Lichen,1608,1608,1594,1608,1608,1608,332,332,301,1608,1608,1608,1608,1608,188,176,1608,1604,1608
Magnoliophyta,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1
Mucoromycota,2,2,2,2,2,2,1,1,1,2,2,2,2,2,1,1,2,2,2


In [90]:
# Per-name non-null counts for the rank-12 observations.
obs_df.loc[obs_df["rank"].eq(12)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Agaricomycetes,707,707,707,707,707,707,151,151,212,707,707,707,707,707,177,171,707,707,707
Agaricomycetidae,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Arachnida,7,7,7,7,7,7,0,0,1,7,7,7,7,7,4,4,7,7,7
Ascomycota,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1
Basidiobolomycetes,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1
Basidiomycota,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1
Dacrymycetes,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Deuteromycota,2,2,2,2,2,2,0,0,0,2,2,2,2,2,0,0,2,2,2
Discomycetes,508,508,497,508,508,508,119,119,127,508,508,508,508,508,81,78,508,505,508
Dothideomycetes,9,9,9,9,9,9,1,1,1,9,9,9,9,9,0,0,9,9,9


In [91]:
# Per-name non-null counts for the rank-11 observations.
obs_df.loc[obs_df["rank"].eq(11)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Agaricales,6385,6385,6369,6385,6385,6385,1979,1979,2348,6385,6385,6385,6385,6385,1835,1784,6385,6371,6385
Amylocorticiales,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Araneae,2,2,2,2,2,2,0,0,1,2,2,2,2,2,0,0,2,2,2
Arthoniales,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Atractiellales,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Auriculariales,3,3,3,3,3,3,2,2,2,3,3,3,3,3,0,0,3,3,3
Boletales,197,197,196,197,197,197,42,42,39,197,197,197,197,197,19,18,197,195,197
Cantharellales,6,6,6,6,6,6,1,1,1,6,6,6,6,6,1,1,6,6,6
Corticiales,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Dacrymycetales,2,2,2,2,2,2,1,1,1,2,2,2,2,2,0,0,2,2,2


In [92]:
# Per-name non-null counts for the rank-10 observations.
obs_df.loc[obs_df["rank"].eq(10)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Acarosporaceae,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1
Agaricaceae,585,585,585,585,585,585,150,150,199,585,585,585,585,585,196,192,585,585,585
Albatrellaceae,5,5,5,5,5,5,0,0,2,5,5,5,5,5,0,0,5,5,5
Albuginaceae,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Amanitaceae,22,22,22,22,22,22,5,5,5,22,22,22,22,22,1,1,22,22,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Umbilicariaceae,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Verrucariaceae,3,3,3,3,3,3,0,0,0,3,3,3,3,3,0,0,3,3,3
Xerocomaceae,2,2,1,2,2,2,2,2,2,2,2,2,2,2,0,0,2,2,2
Xerocomoideae,16,16,16,16,16,16,3,3,2,16,16,16,16,16,0,0,16,16,16


In [93]:
# Per-name non-null counts for the rank-9 (genus) observations.
obs_df.loc[obs_df["rank"].eq(9)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Abortiporus,10,10,9,10,10,10,1,1,1,10,10,10,10,10,0,0,10,10,10
Abrothallus,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1
Abundisporus,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1
Acarospora,71,71,69,71,71,71,27,27,20,71,71,71,71,71,5,6,71,71,71
Acervus,14,14,14,14,14,14,4,4,4,14,14,14,14,14,2,2,14,14,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Xylobotryum,12,12,12,12,12,12,3,3,3,12,12,12,12,12,5,5,12,12,12
Xylodon,17,17,17,17,17,17,5,5,5,17,17,17,17,17,1,1,17,17,17
Xylographa,3,3,3,3,3,3,2,2,2,3,3,3,3,3,0,0,3,3,3
Xylohypha,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1


In [94]:
# Per-name non-null counts for the rank-8 observations.
obs_df.loc[obs_df["rank"].eq(8)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Amanita subgenus Amanita,26,26,26,26,26,26,4,4,4,26,26,26,26,26,1,1,26,26,26
Amanita subgenus Amanitina,3,3,3,3,3,3,2,2,2,3,3,3,3,3,0,0,3,3,3
Amanita subgenus Lepidella,7,7,7,7,7,7,0,0,0,7,7,7,7,7,1,1,7,7,7
Armillaria,4,4,4,4,4,4,0,0,0,4,4,4,4,4,1,1,4,4,4
Cantharellus subgenus Cantharellus,3,3,3,3,3,3,1,1,1,3,3,3,3,3,1,1,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Russula subgenus Polychromidia,3,3,3,3,3,3,0,0,0,3,3,3,3,3,0,0,3,3,3
Russula subgenus Russula,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Thamnomyces subgenus Scopimyces,8,8,8,8,8,8,5,5,7,8,8,8,8,8,7,7,8,8,8
Thamnomyces subgenus Thamnomyces,4,4,4,4,4,4,0,0,0,4,4,4,4,4,0,0,4,4,4


In [95]:
# Per-name non-null counts for the rank-7 observations.
obs_df.loc[obs_df["rank"].eq(7)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Agaricus sect. Agaricus,4,4,4,4,4,4,2,2,2,4,4,4,4,4,0,0,4,4,4
Agaricus sect. Arvenses,56,56,53,56,56,56,18,18,18,56,56,56,56,56,2,2,56,56,56
Agaricus sect. Bivelares,8,8,8,8,8,8,1,1,1,8,8,8,8,8,0,0,8,8,8
Agaricus sect. Chitonioides,2,2,2,2,2,2,0,0,0,2,2,2,2,2,0,0,2,2,2
Agaricus sect. Minores,31,31,31,31,31,31,4,4,4,31,31,31,31,31,0,0,31,31,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Tricholoma sect. Tricholoma,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Tubaria sect. Confragosae,3,3,3,3,3,3,0,0,0,3,3,3,3,3,0,0,3,3,3
Xeromphalina sect. Mutabiles,24,24,24,24,24,24,5,5,5,24,24,24,24,24,1,1,24,24,24
Xeromphalina sect. Xeromphalina,29,29,29,29,29,29,6,6,5,29,29,29,29,29,3,2,29,29,29


In [96]:
# Per-name non-null counts for the rank-6 observations.
obs_df.loc[obs_df["rank"].eq(6)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Agaricus subsect. Hondenses,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Amanita subsect. Amanitella,2,2,2,2,2,2,1,1,0,2,2,2,2,2,0,0,2,2,2
Amanita subsect. Gemmatae,12,12,12,12,12,12,3,3,3,12,12,12,12,12,1,1,12,12,12
Amanita subsect. Gymnopodae,4,4,3,4,4,4,0,0,0,4,4,4,4,4,0,0,4,4,4
Amanita subsect. Limbatulae,12,12,12,12,12,12,0,0,0,12,12,12,12,12,1,1,12,12,12
Amanita subsect. Pantherinae,31,31,31,31,31,31,10,10,8,31,31,31,31,31,5,5,31,31,31
Amanita subsect. Rubrovolvatae,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Amanita subsect. Solitariae,23,23,23,23,23,23,5,5,5,23,23,23,23,23,3,0,23,23,23
Amanita subsect. Vittadiniae,13,13,13,13,13,13,2,2,2,13,13,13,13,13,2,2,13,13,13
Coprinellus subsect. Impatientes,4,4,4,4,4,4,0,0,0,4,4,4,4,4,0,0,4,4,4


In [97]:
# Per-name non-null counts for the rank-5 observations.
obs_df.loc[obs_df["rank"].eq(5)].groupby("preferred_name").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,rank
preferred_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Amanita stirps Bulbosa,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1
Amanita stirps Caesarea,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Amanita stirps Crocea,3,3,3,3,3,3,1,1,1,3,3,3,3,3,0,0,3,3,3
Amanita stirps Daucipes,1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0,1,1,1
Amanita stirps Grossa,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Amanita stirps Hemibapha,8,8,8,8,8,8,4,4,4,8,8,8,8,8,1,1,8,8,8
Amanita stirps Muscaria,58,58,57,58,58,58,12,12,12,58,58,58,58,58,3,3,58,58,58
Amanita stirps Rhopalopus,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1
Amanita stirps Rooseveltensis,1,1,1,1,1,1,0,0,0,1,1,1,1,1,0,0,1,1,1
Amanita stirps Solitaria,2,2,2,2,2,2,0,0,0,2,2,2,2,2,0,0,2,2,2


### Merge ranks 4, 5, 6, 7, and 8 into genus (rank 9)

In [99]:
# Work on an independent (deep) copy so the rank merge below leaves obs_df untouched.
obs_9_df = obs_df.copy(deep=True)

In [100]:
# Collapse every rank 4-8 name to its first whitespace-separated token and
# relabel those rows as rank 9. This is done in a single vectorised pass with
# pandas string methods instead of one swifter.apply pass per rank.
below_genus = obs_9_df["rank"].isin(list(range(4, 9)))
obs_9_df.loc[below_genus, "preferred_name"] = (
    obs_9_df.loc[below_genus, "preferred_name"]
    .str.split()       # whitespace-separated tokens
    .str[:1]           # keep at most the first token
    .str.join(" ")     # back to a plain string ("" when there were no tokens)
)
obs_9_df.loc[below_genus, "rank"] = 9

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=223132.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=203.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=884.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=7182.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=2155.0, style=ProgressStyle(descriptio…




In [101]:
# Sanity check: after the merge above, ranks 4-8 should no longer show up as groups.
obs_9_df.groupby("rank").count()

Unnamed: 0_level_0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,preferred_name
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
9,325083,325083,308656,325083,325083,321215,58347,58347,56412,325083,325083,325083,325083,325083,42719,40875,325083,324229,325083
10,7988,7988,7932,7988,7988,7987,2318,2318,2582,7988,7988,7988,7988,7988,1815,1746,7988,7973,7988
11,12055,12055,12026,12055,12055,12054,3894,3894,4630,12055,12055,12055,12055,12055,3675,3596,12055,12022,12055
12,2309,2309,2293,2309,2309,2309,496,496,637,2309,2309,2309,2309,2309,483,463,2309,2304,2309
13,2615,2615,2592,2615,2615,2615,514,514,499,2615,2615,2615,2615,2615,322,303,2615,2601,2615
14,2283,2283,1876,2283,2283,2278,675,675,710,2283,2283,2283,2283,2283,254,242,2283,2281,2283
15,761,761,735,761,761,761,148,148,193,761,761,761,761,761,157,153,761,759,761
16,13904,13904,13276,13904,13904,13864,2745,2745,2782,13904,13904,13904,13904,13904,1426,1313,13904,13874,13904


In [107]:
# Keep only the rank-9 rows; every other rank is dropped from the table.
obs_9_df = obs_9_df.loc[obs_9_df["rank"] == 9].copy()

In [118]:
# Export the rank-9 rows to CSV, leaving the DataFrame index out of the file.
(
    obs_9_df
    .loc[obs_9_df["rank"] == 9]
    .to_csv(MUSH_DIR / "full_observations_rank9.csv", index=False)
)

In [117]:
# Interactive help lookup for DataFrame.to_csv removed: it was a development
# aid (IPython-only syntax) and has no effect on the analysis.

## Read full observations

In [4]:
# Load the per-rank observation tables from their CSV exports under MUSH_DIR.
obs_4_df = pd.read_csv(MUSH_DIR / "full_observations_rank4.csv")
obs_9_df = pd.read_csv(MUSH_DIR / "full_observations_rank9.csv")

In [6]:
# Summary statistics for the numeric columns of the rank-4 table.
# NOTE(review): the output below shows extreme values (alt min -5809 / max 27000,
# high max 73000) -- presumably data-entry errors; confirm before relying on altitude.
obs_4_df.describe()

Unnamed: 0,id,thumb_image_id,name_id,vote_cache,lat,long,alt,north,south,east,west,high,low,is_collection_location,rank
count,223132.0,208966.0,223132.0,219716.0,38210.0,38210.0,35463.0,223132.0,223132.0,223132.0,223132.0,27644.0,26532.0,223132.0,223132.0
mean,198864.188229,537489.0,12744.412796,1.980732,36.839369,-74.287465,497.080647,38.653647,37.809372,-82.082981,-83.535844,735.207501,379.429698,0.974585,4.0
std,116636.492021,342218.4,17364.038072,0.657988,17.143742,58.976763,860.152415,13.219711,14.944782,51.504321,49.497369,951.251742,542.182307,0.157384,0.0
min,2.0,2.0,3.0,-2.60801,-90.0,-180.0,-5809.0,-87.135,-89.135,-169.045,-179.0,0.0,-86.0,0.0,4.0
25%,98276.0,231307.8,668.0,1.67089,34.5361,-115.306,101.0,37.3749,37.258,-121.951,-122.08,170.0,15.0,1.0,4.0
50%,196054.5,521414.0,4466.0,2.17488,40.4816,-84.43525,256.0,40.3397,40.0027,-84.68995,-84.9692,220.0,173.0,1.0,4.0
75%,299063.25,833446.0,21187.0,2.54892,44.503825,-76.6019,623.5,43.7319,43.3476,-77.131,-77.5277,900.0,450.0,1.0,4.0
max,403509.0,1158280.0,109639.0,2.9374,89.984,175.7706,27000.0,89.0,74.4077,179.0,178.405,73000.0,10000.0,1.0,4.0


## Number of observations per species

In [111]:
def obs_statistics(max_sp, conf, ret_df=False, obs=None):
    """Display the most-observed names and their cumulative share of observations.

    Rows with ``vote_cache > conf`` are counted per ``preferred_name``. The
    ``max_sp`` most frequent names are displayed together with the running
    (cumulative) percentage of all retained observations they account for.

    Parameters
    ----------
    max_sp : int
        Maximum number of names to keep, most observed first.
    conf : float
        Exclusive lower bound applied to ``vote_cache``.
    ret_df : bool, default False
        When True, also return the summary frame; otherwise return None.
    obs : pandas.DataFrame, optional
        Observation table with ``id``, ``preferred_name`` and ``vote_cache``
        columns. Defaults to the notebook-level ``obs_4_df``.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``preferred_name``, ``Observations`` and ``Share of total %``
        (the latter is cumulative), or None when ``ret_df`` is False.
    """
    print(f"Considering {max_sp} species and vote_cache > {conf}...")
    if obs is None:
        obs = obs_4_df
    # Filter and count once, on the id column only; the same counts feed both
    # the ranking and the denominator of the cumulative share.
    per_name = obs[obs.vote_cache > conf].groupby('preferred_name').id.count()
    top = per_name.sort_values(ascending=False)[:max_sp]
    share = 100. * top.cumsum() / per_name.sum()
    df_filter = pd.DataFrame({"preferred_name": top.index, "Observations": top.values,
                              "Share of total %": share.values})
    display(df_filter)
    if ret_df:
        return df_filter

In [114]:
# Rank up to 10000 names among observations whose vote_cache exceeds 1.5.
max_sp, conf = 10000, 1.5
obs_statistics(max_sp, conf)

Considering 10000 species and vote_cache > 1.5...


Unnamed: 0,preferred_name,Observations,Share of total %
0,Amanita muscaria,1374,0.750684
1,Trametes versicolor,1080,1.340742
2,Lepista nuda,790,1.772358
3,Pleurotus ostreatus,730,2.171193
4,Hypholoma fasciculare,662,2.532877
...,...,...,...
9995,Cortinarius umidicola,1,99.522490
9996,Cortinarius uraceus,1,99.523037
9997,Cortinarius variiformis,1,99.523583
9998,Cortinarius venustus,1,99.524130


In [113]:
# Same summary limited to the 1000 most-observed names, then look up one species.
max_sp, conf = 1000, 1.5
df_filter = obs_statistics(max_sp, conf, ret_df=True)
df_filter.loc[df_filter.preferred_name == 'Lepista nuda']

Considering 1000 species and vote_cache > 1.5...


Unnamed: 0,preferred_name,Observations,Share of total %
0,Amanita muscaria,1374,0.750684
1,Trametes versicolor,1080,1.340742
2,Lepista nuda,790,1.772358
3,Pleurotus ostreatus,730,2.171193
4,Hypholoma fasciculare,662,2.532877
...,...,...,...
995,Asterophora parasitica,45,67.522796
996,Sowerbyella rhenana,44,67.546836
997,Lactifluus glaucescens,44,67.570875
998,Lycoperdon nigrescens,44,67.594915


Unnamed: 0,preferred_name,Observations,Share of total %
2,Lepista nuda,790,1.772358


- At 1.65 we get 205 observations at the bottom.