In [1]:
import pandas as pd
from pathlib import Path
import copy
import numpy as np
import os
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
def check_existence(path, df):
    dont_exist = []
    for id_ in df.image_id.values:
        if not os.path.exists(path / f"{id_}.jpg"):
            dont_exist.append(id_)
    return dont_exist


def valid_names(df, min_obs, min_vote):
    count_df = df[df.vote_cache >= min_vote].groupby("preferred_name").vote_cache.count()
    valid = list(count_df[count_df >= min_obs].index)
    unknown = list(set(df[~df.preferred_name.isin(valid)].preferred_name.values))
    return valid, unknown


def training_ds(df, known, unknown, n_unknown=1.0):
    known_df = df[df.preferred_name.isin(known)].copy()
    if type(n_unknown) == int:
        unknown_df = df[df.preferred_name.isin(unknown)].copy().sample(n_unknown)
    elif type(n_unknown) == float and n_unknown >= 0 and n_unknown <= 1:
        unknown_df = df[df.preferred_name.isin(unknown)].copy().sample(frac=n_unknown)
    else:
        return
    unknown_df.preferred_name = "unknown"
    return pd.concat([known_df, unknown_df]).sample(frac=1).reset_index(drop=True)

def train_valid_ds(df, test_size):
    split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=41)
    train_index, test_index = next(split.split(df, df.preferred_name))
    train_df = df.iloc[train_index].copy()
    train_df["is_validation"] = False
    test_df = df.iloc[test_index].copy()
    test_df["is_validation"] = True
    return pd.concat([train_df, test_df]).reset_index(drop=True)

def check_existence(path, df):
    dont_exist = []
    for id_ in df.image_id.values:
        if not os.path.exists(path / f"{id_}.jpg"):
            dont_exist.append(id_)
    return dont_exist

In [3]:
RES_DIR = Path("../resources/")
IMG_DIR = Path("../images/320")
img_df = pd.read_csv(RES_DIR / "images_observations.csv", sep="\t")
obs_4_df = pd.read_csv(RES_DIR / "full_observations_rank4.csv")
obs_9_df = pd.read_csv(RES_DIR / "full_observations_rank9.csv")

## Tables

- `img_df`: Mapping between observations and images.
- For other tables, see `get_needed_tables` notebook.

In [4]:
obs_4_df.tail()

Unnamed: 0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,preferred_name,rank
223127,403219,"Orbilia ""oc2200229""",1157325.0,109633,2020-02-29,2.59985,48.5208,-123.4169,0.0,"Observatory Hill, Victoria, British Columbia, ...",48.5263,48.5167,-123.409,-123.422,220.0,150.0,1,--- {}\n,"Orbilia ""oc2200229""",4
223128,403226,Neonectria ditissima,1157348.0,109634,2019-04-23,0.853495,,,,"Coal Center, Pennsylvania, USA",40.0736,40.0667,-79.8948,-79.9064,,,1,--- {}\n,Neonectria ditissima,4
223129,403227,Neonectria ditissima,1157360.0,109634,2019-04-23,0.853497,,,,"Coal Center, Pennsylvania, USA",40.0736,40.0667,-79.8948,-79.9064,,,1,--- {}\n,Neonectria ditissima,4
223130,403232,Lepra pustulata,1157388.0,67233,2020-03-01,1.35942,,,,"Panola Mountain State Park, Rockdale Co., Geor...",33.6459,33.622,-84.1308,-84.1825,,,1,---\n:Other: On _Quercus_ bark in full sun.\n,Lepra pustulata,4
223131,403281,Hypochnicium albostramineum,1157571.0,50525,2020-02-26,2.5793,40.0006,-83.0426,248.0,"Carmack Woods, Columbus, Ohio, USA",40.0024,39.9999,-83.0401,-83.0439,,,1,---\n:Other: 'Growing on the underside of a ha...,Hypochnicium albostramineum,4


In [5]:
obs_4_df.vote_cache

0         2.706040
1         2.661230
2         2.716170
3         2.547040
4         1.696830
            ...   
223127    2.599850
223128    0.853495
223129    0.853497
223130    1.359420
223131    2.579300
Name: vote_cache, Length: 223132, dtype: float64

In [6]:
img_df.tail()

Unnamed: 0,image_id,observation_id
1073703,1148528,400818
1073704,1148529,400818
1073705,1148530,400818
1073706,1148531,400818
1073707,1148532,400819


In [7]:
obs_4_df.head()

Unnamed: 0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,north,south,east,west,high,low,is_collection_location,notes,preferred_name,rank
0,2,Xylaria magnoliae,2.0,3,2004-07-17,2.70604,,,,"Asheville, North Carolina, USA",35.6509,35.5576,-82.4644,-82.632,650.0,650.0,1,---\n:Other: Seen in the exhibit at the 2004 N...,Xylaria magnoliae,4
1,8594,Xylaria magnoliae,16148.0,3,2007-08-08,2.66123,,,,"Ouida Plantation, West Feliciana Parish, Louis...",39.4596,39.4135,-91.0325,-91.0951,,,1,---\n:Other: |2-\n\n\n [admin - Sat Aug 14 02...,Xylaria magnoliae,4
2,8598,Xylaria magnoliae,16155.0,3,2008-08-01,2.71617,,,,"Walhalla Fish Hatchery, Oconee Co., South Caro...",34.9828,34.9801,-83.0743,-83.077,,,1,---\n:Other: |-\n Beautiful fresh material th...,Xylaria magnoliae,4
3,98309,Xylaria magnoliae,231063.0,3,2012-06-22,2.54704,,,,"Big Thicket National Preserve, Polk Co., Texas...",30.6196,30.4564,-94.3015,-94.3952,,,1,---\n:Other: Found in Big Sandy Creek Unit.\n,Xylaria magnoliae,4
4,135841,Xylaria magnoliae,335785.0,3,2013-06-08,1.69683,35.3537,-84.0583,1158.0,"Falls Branch Falls Trail, Cherokee National Fo...",35.3731,35.3457,-84.0529,-84.0812,,,1,"---\n:Other: ""On fallen magnolia cones\\r\\n""\n",Xylaria magnoliae,4


In [8]:
full_4_df = pd.merge(obs_4_df, img_df, left_on="id", right_on="observation_id")
full_9_df = pd.merge(obs_9_df, img_df, left_on="id", right_on="observation_id")

In [9]:
len(full_4_df), len(full_9_df)

(646782, 952884)

In [51]:
valid4, unknown4 = valid_names(obs_4_df, 165, 2.0)
valid9, unknown9 = valid_names(obs_9_df, 10, 1.5)

In [52]:
len(valid4), len(unknown4), len(valid9), len(unknown9)

(100, 11610, 974, 1401)

In [53]:
training4 = training_ds(obs_4_df, valid4, unknown4, n_unknown=0.0)
training9 = training_ds(obs_9_df, valid9, unknown9)

In [54]:
len(training4), len(obs_4_df), len(training4[training4.preferred_name == "unknown"]),\
    len(training9), len(obs_9_df), len(training9[training9.preferred_name == "unknown"])

(42474, 223132, 0, 325083, 325083, 4518)

In [55]:
ds4 = train_valid_ds(training4, 0.2)
ds9 = train_valid_ds(training9, 0.2)

In [56]:
len(ds4[ds4.is_validation]) / (len(ds4[~ds4.is_validation]) + len(ds4[ds4.is_validation]))

0.20000470876300797

In [33]:
len(ds4[(ds4.is_validation) & (ds4.preferred_name == "Amanita constricta")])\
    / (len(ds4[(~ds4.is_validation) & (ds4.preferred_name == "Amanita constricta")])
       + len(ds4[(ds4.is_validation) & (ds4.preferred_name == "Amanita constricta")]))

ZeroDivisionError: division by zero

In [57]:
ds4.head()

Unnamed: 0,id,text_name,thumb_image_id,name_id,when,vote_cache,lat,long,alt,where,...,south,east,west,high,low,is_collection_location,notes,preferred_name,rank,is_validation
0,273528,Amanita calyptroderma,728029.0,159,2016-11-26,1.61993,-14.4985,-18.4351,-4497.0,"Hood Mountain Regional Park, Santa Rosa, Calif...",...,38.4403,-122.575,-122.578,,,1,---\n:Other: Solitary under live oak\n,Amanita calyptroderma,4,False
1,296795,Leotia lubrica,806494.0,188,2017-09-04,2.50613,46.6596,-88.7104,243.0,"Ottawa National Forest, Michigan, USA",...,46.0151,-87.9339,-90.4185,,,1,--- {}\n,Leotia viscosa,4,False
2,27696,Dacrymyces palmatus,63240.0,276,2009-11-01,2.55514,,,,"Forest near Elgin St., Pembroke, Ontario, Canada",...,45.7883,-77.131,-77.1485,170.0,150.0,1,"---\n:Other: On rotting wood in Zone 11, not a...",Dacrymyces chrysospermus,4,False
3,161391,Coprinellus micaceus,,300,2014-03-14,2.23138,,,,University of California Santa Cruz (UC Santa ...,...,36.9773,-122.048,-122.07,,,1,--- {}\n,Coprinellus micaceus,4,False
4,355297,Flammulina velutipes,993633.0,271,2019-01-14,2.61444,,,,"Henry Cowell Redwoods State Park, Santa Cruz C...",...,37.0101,-122.034,-122.123,,,1,---\n:Other: On Umbellularia californica.\n,Flammulina velutipes,4,False


In [58]:
ds4.columns

Index(['id', 'text_name', 'thumb_image_id', 'name_id', 'when', 'vote_cache',
       'lat', 'long', 'alt', 'where', 'north', 'south', 'east', 'west', 'high',
       'low', 'is_collection_location', 'notes', 'preferred_name', 'rank',
       'is_validation'],
      dtype='object')

In [59]:
len(ds9[ds9.preferred_name=="unknown"]) / len(ds9), len(ds4[ds4.preferred_name=="unknown"]) / len(ds4)

(0.01389798912893015, 0.0)

In [60]:
full_ds4 = pd.merge(ds4, img_df, left_on="id", right_on="observation_id")
full_ds9 = pd.merge(ds9, img_df, left_on="id", right_on="observation_id")

In [61]:
dont_exist4 = check_existence(IMG_DIR, full_ds4)
dont_exist4

[774103,
 774109,
 1110795,
 774174,
 774071,
 774110,
 1093694,
 1056894,
 1144307,
 975942,
 1090324,
 1125200,
 774142,
 934830,
 940212,
 1097068,
 1035980,
 945883]

In [62]:
dont_exist9 = check_existence(IMG_DIR, full_ds9)
dont_exist9

[951393,
 1136230,
 1102603,
 1111805,
 1008623,
 773965,
 773966,
 773967,
 1073022,
 773875,
 773876,
 773877,
 773878,
 773879,
 773880,
 774128,
 774129,
 774131,
 774135,
 774137,
 1102786,
 996723,
 1039869,
 1071396,
 1020182,
 774104,
 774105,
 774106,
 774107,
 774108,
 1074363,
 1031819,
 1086936,
 774112,
 774113,
 774114,
 774115,
 773920,
 773921,
 773922,
 773923,
 1146346,
 1069310,
 1028909,
 774136,
 773909,
 773910,
 773911,
 773912,
 941968,
 1091210,
 1137425,
 1121736,
 947367,
 1125200,
 1028738,
 1002323,
 1093694,
 945883,
 773898,
 1048734,
 1066783,
 1109084,
 996725,
 1136990,
 1035980,
 1020045,
 774174,
 1018491,
 773861,
 773862,
 773863,
 773864,
 945692,
 1110795,
 774162,
 774171,
 774172,
 774173,
 970524,
 774147,
 774151,
 774153,
 774154,
 962251,
 1085659,
 774156,
 773870,
 773871,
 773872,
 773873,
 773874,
 774175,
 774176,
 956884,
 1134440,
 774097,
 774098,
 774099,
 1111938,
 773801,
 773802,
 773803,
 774146,
 774054,
 774055,
 774059,
 774

In [63]:
#full_ds4[~full_ds4.image_id.isin(dont_exist4)].to_csv(RES_DIR / "ds4_min10obs.csv", index=False)
full_ds4[~full_ds4.image_id.isin(dont_exist4)].to_csv(RES_DIR / "ds4_100mostPopSpecies.csv", index=False)
full_ds9[~full_ds9.image_id.isin(dont_exist9)].to_csv(RES_DIR / "ds9_min10obs.csv", index=False)