# DSA Dataset
The Danish Svampe Atlas has 85k training images, which could definitely supplement the MO dataset, which may end up having around 300k clean images. This dataset is already clean, which is a plus, the only work would be in getting the species to match up with the MO dataset. Let's make that happen here. 

In [5]:
import os
import json
import pickle
import numpy as np
import pandas as pd

In [6]:
base_dir = os.environ['HOME'] + '/python/Mushroom_Classifier/'
tables_dir = base_dir + 'DSA_info/'

#loading metadata information
with open(tables_dir + 'train.json') as json_file:
    training = json.load(json_file)
with open(tables_dir + 'val.json') as json_file:
    validation = json.load(json_file)


I'm going to make a pandas dataframe for this information, it'll make life much easier I think

In [7]:
'''given an ID, it links observation table to images table to 
species category table. sort of like a SQL join'''
def make_entry(ID, TABLE):
    out = []
    out.append(ID)
    
    #get the species name of the observation
    annotation_index = \
        [i for i, w in enumerate(TABLE['annotations']) if w['id']==ID]
    annotation_index = annotation_index[0]
    category_id = TABLE['annotations'][annotation_index]['category_id']
    name = [w['name'] for w in TABLE['categories'] \
                      if w['id'] == category_id]
    assert len(name) == 1
    out.append(str(name[0]))
    
    image_id = TABLE['annotations'][annotation_index]['image_id']
    rimage = [w for w in TABLE['images'] if w['id'] == image_id]
    out.append(rimage[0]['height'])
    out.append(rimage[0]['width'])
    out.append(str(rimage[0]['file_name']))
    return out

In [None]:
table = []
headers = ['id', 'name', 'height', 'width', 'file_name']

#training dataset
for i, w in enumerate([w['id'] for w in training['annotations']]):
    if i%1000 == 0:
        print('on observation %i/%i' %(i,len(training['annotations'])))
    table.append(make_entry(w, training))

#validation dataset
for i, w in enumerate([w['id'] for w in validation['annotations']]):
    if i%1000 == 0:
        print('on observation %i/%i' %(i,len(validation['annotations'])))
    table.append(make_entry(w, validation))
    
#making a dataframe from a table of tables
new_table = pd.DataFrame(table, columns=headers)

## Linking to MO Taxa
OK, we're done with this part. Next we're going to link these observations with MO taxa information to see if we'll keep them in the dataset.

In [23]:
def text_name_to_gs(text_name):
    fine_labels = {}
    split_name = text_name.split(' ')
    if len(split_name) == 1:
        fine_labels['Genus'] = split_name[0]
    elif len(split_name) == 2:
        fine_labels['Genus'] = split_name[0]
        fine_labels['Species'] = split_name[1]
    elif len(split_name) == 3:
        if split_name[2] == 'group':
            fine_labels['Genus'] = split_name[0]
            fine_labels['Species'] = split_name[1]     
        elif split_name[1] == 'subgenus':
            #technically not correct
            fine_labels['Genus'] = split_name[0]
            fine_labels['Species'] = split_name[2] 
    elif len(split_name) > 3:
        fine_labels['Genus'] = split_name[0]
        if split_name[1] != 'subgenus':
            fine_labels['Species'] = split_name[1]  
        else:
            fine_labels['Species'] = split_name[2]
    return fine_labels

In [None]:
#load MO taxa
taxa = pickle.load(open(base_dir + 'MO_tables/taxa_dict.p'))
new_table = pickle.load(open(tables_dir + 'train_val_df.p','rb'))

def name_in_taxa(name):
    name_list = text_name_to_gs(name)
    
    tmp = np.nan
    for k,w in taxa.iteritems():
        if w.get('Genus') == name_list.get('Genus') \
        and w.get('Species') == name_list.get('Species'):
            tmp = k
            break     
    return tmp

#adding taxa number
new_table['in_taxa'] = new_table.name.apply(lambda x: name_in_taxa(x))
new_table.in_taxa[new_table.in_taxa.isna()] = 0
new_table['in_taxa'] = new_table['in_taxa'].astype('int32',copy=True)

#saving dataframe to disk
pickle.dump(new_table, open(tables_dir + 'train_val_df.p', 'wb'))

In [31]:
new_table = pickle.load(open(tables_dir + 'train_val_df.p','rb'))
display(new_table)

Unnamed: 0,id,name,height,width,file_name,in_taxa
0,43829,Leucoagaricus leucothites,732,1300,images/16390_Leucoagaricus_leucothites/OBL2010...,444
1,44039,Leucoagaricus leucothites,788,1400,images/16390_Leucoagaricus_leucothites/OBL2010...,444
2,44255,Leucoagaricus leucothites,788,1400,images/16390_Leucoagaricus_leucothites/OBL2010...,444
3,44614,Leucoagaricus leucothites,425,567,images/16390_Leucoagaricus_leucothites/FDE2010...,444
4,45510,Leucoagaricus leucothites,577,770,images/16390_Leucoagaricus_leucothites/FDE2010...,444
5,45649,Leucoagaricus leucothites,577,770,images/16390_Leucoagaricus_leucothites/FDE2010...,444
6,53554,Leucoagaricus leucothites,800,953,images/16390_Leucoagaricus_leucothites/FDE2017...,444
7,53745,Leucoagaricus leucothites,800,776,images/16390_Leucoagaricus_leucothites/FDE2017...,444
8,44714,Leucoagaricus leucothites,768,719,images/16390_Leucoagaricus_leucothites/BWP2010...,444
9,44851,Leucoagaricus leucothites,768,885,images/16390_Leucoagaricus_leucothites/BWP2010...,444


How many of the species in the DSA dataset do we have the full taxa information for?

In [32]:
Species = list(set(new_table.name))
species_in_taxa = [name_in_taxa(w) for w in Species]
species_in_taxa = [1 for w in species_in_taxa if w != None]
print('Ratio of species in MO taxa %i/%i'\
                      %(sum(species_in_taxa), len(Species)))

Ratio of species in MO taxa 1392/1392


## Standarize Dataset to MO Format
Mostly done with this dataset, what we need to do is to standarize to the same df that we have for the MO dataset. We are missing location, in_location, vote_cache, and when picture was taken. 

In [15]:
#time per observation!
%timeit create_df_from_observation(20, tmp_obs)

100 loops, best of 3: 13.6 ms per loop


In [33]:
columns = ['id','location_id','in_location','vote_cache','when',
           'Domain', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family',
           'Genus', 'Species', 'Image', 'Data_Source']

def create_df_from_observation(cid, tmp_obs):
    
    df = pd.DataFrame(np.zeros([1, len(columns)]), columns=columns)

    df.loc[:,'id'] = cid
    df.loc[:,'location_id'] = -1
    df.loc[:,'in_location'] = -1
    df.loc[:,'vote_cache'] = 3
    df.loc[:,'when'] = ''
    df.loc[:,'Image'] = tmp_obs.file_name.values[0]
    df.loc[:,'Data_Source'] = 'DSA'
    
    if tmp_obs['in_taxa'].values[0]:

        #taxa information
        for taxa_subfield in ['Domain','Kingdom','Phylum',
                              'Class','Order','Family','Genus','Species']:
            df.loc[:, taxa_subfield] = \
                            taxa[int(tmp_obs.in_taxa)].get(taxa_subfield)
    else:
        GS_dict = text_name_to_gs(tmp_obs.name.values[0])
        for taxa_subfield in ['Domain','Kingdom','Phylum',
                              'Class','Order','Family']:
            df.loc[:, taxa_subfield] = ''
        df.loc[:, 'Genus'] = GS_dict.get('Genus')
        df.loc[:, 'Species'] = GS_dict.get('Species')
        
    return df



In [16]:
#Estimated time for this process
secs_per_obs = 0.014
print('Total estimated minutes: %1.2f' %((secs_per_obs * new_table.shape[0]) / (60)))

Total estimated minutes: 20.94


In [35]:
J = 0
comp_df = []

for i, (cid, tmp_obs) in enumerate(new_table.iloc[J:new_table.shape[0]].groupby('id')):
    if i%1000 == 0:
        print('on observation %i/%i' %(i,new_table.shape[0]))
    tmp = create_df_from_observation(cid, tmp_obs)
    comp_df.append(tmp)
    J = i
        
df = pd.concat(comp_df)
df.loc[:,'location_id'] = df.location_id.astype('int32')
df.loc[:,'in_location'] = df.in_location.astype('int16')

on observation 0/89760
on observation 1000/89760
on observation 2000/89760
on observation 3000/89760
on observation 4000/89760
on observation 5000/89760
on observation 6000/89760
on observation 7000/89760
on observation 8000/89760
on observation 9000/89760
on observation 10000/89760
on observation 11000/89760
on observation 12000/89760
on observation 13000/89760
on observation 14000/89760
on observation 15000/89760
on observation 16000/89760
on observation 17000/89760
on observation 18000/89760
on observation 19000/89760
on observation 20000/89760
on observation 21000/89760
on observation 22000/89760
on observation 23000/89760
on observation 24000/89760
on observation 25000/89760
on observation 26000/89760
on observation 27000/89760
on observation 28000/89760
on observation 29000/89760
on observation 30000/89760
on observation 31000/89760
on observation 32000/89760
on observation 33000/89760
on observation 34000/89760
on observation 35000/89760
on observation 36000/89760
on observation

Unnamed: 0,id,location_id,in_location,vote_cache,when,Domain,Kingdom,Phylum,Class,Order,Family,Genus,Species,Image,Data_Source
0,1,-1,-1,3.0,,,,,,,,Abortiporus,biennis,images/10000_Abortiporus_biennis/FVL2009PIC655...,DSA
0,3,-1,-1,3.0,,,,,,,,Achroomyces,disciformis,images/10025_Achroomyces_disciformis/LDJ2009PI...,DSA
0,8,-1,-1,3.0,,,,,,,,Abortiporus,biennis,images/10000_Abortiporus_biennis/FVL2009PIC490...,DSA
0,9,-1,-1,3.0,,,,,,,,Agaricus,altipes,images/10052_Agaricus_altipes/PEL2010PIC323258...,DSA
0,12,-1,-1,3.0,,,,,,,,Achroomyces,disciformis,images/10025_Achroomyces_disciformis/LDJ2009PI...,DSA
0,15,-1,-1,3.0,,,,,,,,Agaricus,altipes,images/10052_Agaricus_altipes/PEL2010PIC574673...,DSA
0,16,-1,-1,3.0,,Eukarya,Fungi,Basidiomycota,Agaricomycetes,Agaricales,Agaricaceae,Agaricus,augustus,images/10057_Agaricus_augustus/MHA2009PIC46158...,DSA
0,19,-1,-1,3.0,,,,,,,,Achroomyces,disciformis,images/10025_Achroomyces_disciformis/LDJ2009PI...,DSA
0,23,-1,-1,3.0,,Eukarya,Fungi,Basidiomycota,Agaricomycetes,Agaricales,Agaricaceae,Agaricus,bisporus,images/10061_Agaricus_bisporus/LDJ2009PIC84491...,DSA
0,25,-1,-1,3.0,,Eukarya,Fungi,Basidiomycota,Agaricomycetes,Agaricales,Agaricaceae,Agaricus,bitorquis,images/10062_Agaricus_bitorquis/JAH2009PIC6497...,DSA


In [36]:
for taxa_field in ['Domain','Phylum','Class','Order','Family','Genus','Species']:
    print('working on: %s' %taxa_field)
    df.loc[df[taxa_field].apply(lambda x: x==None), taxa_field] = ''
    df.loc[df[taxa_field].apply(lambda x: not isinstance(x,str)), taxa_field] = ''

working on: Domain
working on: Phylum
working on: Class
working on: Order
working on: Family
working on: Genus
working on: Species


In [37]:
pickle.dump(df, open(tables_dir + 'finished_df_DSA.p', 'wb'))

And we're done.