## Create a LOKI dataset for ML examples using image descriptors

In [1]:
import pandas as pd

Load original tsv files were exported from ecotaxa. 
Files should be unpacked from zip and converted to UTF-8.

_Original data might be available on request._

In [2]:
st58 = pd.read_csv("../data/loki_ps93-2_58_ecotaxa_export_368_20191016_1142.tsv",sep="\s+")
st58.shape

(47143, 71)

In [3]:
st80 = pd.read_csv("../data/loki_ps93-2_80_ecotaxa_export_362_20191016_1143.tsv",sep="\s+")
st80.shape

(43057, 71)

In [4]:
st82 = pd.read_csv("../data/loki_ps93-2_82_ecotaxa_export_376_20191016_1143.tsv",sep="\s+")
st82.shape

(41689, 71)

Combine in a single dataframe and remove the temporary dataframes

In [5]:
df = st58.append(st80).append(st82)
del st58, st80, st82

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


### Inspect data

In [6]:
df.shape

(131889, 72)

In [7]:
df.describe()

Unnamed: 0,acq_id,acq_instrument,object_annotation_date,object_annotation_time,object_area,object_area_px,object_bottom_depth,object_chlorophyll_a,object_conductivity,object_convexity,...,object_speed_in_water,object_speed_over_ground,object_structure,object_temperature,object_temperature_oxsens,object_time,object_width,process_id,sample_dataportal_descriptor,sample_id
count,0.0,0.0,107210.0,107210.0,131889.0,131889.0,131889.0,131889.0,131889.0,131889.0,...,131889.0,131889.0,131889.0,131889.0,131889.0,131889.0,131889.0,0.0,0.0,0.0
mean,,,20171320.0,145853.744007,1.449231,11151.37,0.0,0.0,30.547115,42.773784,...,0.0,0.0,55463.07,1.868414,1.868414,94841.985306,0.000237,,,
std,,,3080.546,34284.803386,3.945756,30361.31,0.0,0.0,2.162324,63.046487,...,0.0,0.0,168438.3,1.662257,1.662257,108237.053496,0.014628,,,
min,,,20170330.0,147.0,0.012,90.0,0.0,0.0,0.0028,0.0,...,0.0,0.0,97.64569,-1.586,-1.586,0.0,0.0,,,
25%,,,20170410.0,122614.0,0.141,1082.0,0.0,0.0,29.808,10.564158,...,0.0,0.0,3717.976,0.721,0.721,1500.0,0.0,,,
50%,,,20170520.0,143526.0,0.298,2290.0,0.0,0.0,30.37,23.759928,...,0.0,0.0,11893.63,1.392,1.392,3200.0,0.0,,,
75%,,,20170610.0,164950.0,1.564,12031.0,0.0,0.0,31.548,53.607914,...,0.0,0.0,53963.72,2.896,2.896,215700.0,0.0,,,
max,,,20190600.0,235459.0,185.025,1423707.0,0.0,0.0,34.828,1790.0,...,0.0,0.0,6998445.0,6.96,6.96,235900.0,1.744,,,


### Use only validated entries

In [8]:
df = df[df["object_annotation_status"]=="validated"]

In [9]:
df.shape

(107210, 72)

In [10]:
df.describe()

Unnamed: 0,acq_id,acq_instrument,object_annotation_date,object_annotation_time,object_area,object_area_px,object_bottom_depth,object_chlorophyll_a,object_conductivity,object_convexity,...,object_speed_in_water,object_speed_over_ground,object_structure,object_temperature,object_temperature_oxsens,object_time,object_width,process_id,sample_dataportal_descriptor,sample_id
count,0.0,0.0,107210.0,107210.0,107210.0,107210.0,107210.0,107210.0,107210.0,107210.0,...,107210.0,107210.0,107210.0,107210.0,107210.0,107210.0,107210.0,0.0,0.0,0.0
mean,,,20171320.0,145853.744007,1.720011,13234.93,0.0,0.0,30.635638,47.266207,...,0.0,0.0,65820.32,1.944157,1.944157,91627.342599,0.000254,,,
std,,,3080.546,34284.803386,4.327322,33297.34,0.0,0.0,2.063504,68.469328,...,0.0,0.0,185162.8,1.7212,1.7212,108061.156003,0.015495,,,
min,,,20170330.0,147.0,0.014,104.0,0.0,0.0,0.0028,0.0,...,0.0,0.0,97.64569,-1.586,-1.586,0.0,0.0,,,
25%,,,20170410.0,122614.0,0.144,1108.0,0.0,0.0,29.792,10.996203,...,0.0,0.0,4017.427,0.709,0.709,1400.0,0.0,,,
50%,,,20170520.0,143526.0,0.393,3024.5,0.0,0.0,30.391,26.237192,...,0.0,0.0,17400.85,1.445,1.445,3000.0,0.0,,,
75%,,,20170610.0,164950.0,2.026,15587.75,0.0,0.0,31.673,61.594619,...,0.0,0.0,68777.2,3.033,3.033,215700.0,0.0,,,
max,,,20190600.0,235459.0,185.025,1423707.0,0.0,0.0,34.828,1790.0,...,0.0,0.0,6998445.0,6.96,6.96,235900.0,1.744,,,


### Take only columns of interest

In [11]:
columns = set([ 'object_annotation_hierarchy', 'object_area','object_area_px', 
        'object_conductivity', 'object_convexity', 'object_dr._haardt_fluorescence_channel_a', 
        'object_form', 'object_fourier_descriptor_01', 'object_fourier_descriptor_02',
        'object_fourier_descriptor_03', 'object_fourier_descriptor_04',
        'object_fourier_descriptor_05', 'object_fourier_descriptor_06',
        'object_fourier_descriptor_07', 'object_fourier_descriptor_08',
        'object_fourier_descriptor_09', 'object_fourier_descriptor_10',
        'object_graymean', 'object_haul', 'object_hu_moment_1',
        'object_hu_moment_2', 'object_hu_moment_3', 'object_hu_moment_4',
        'object_hu_moment_5', 'object_hu_moment_6', 'object_hu_moment_7',
        'object_index', 'object_kurtosis', 'object_lenght', 
        'object_skewness',  'object_structure',
        'object_width'])

Take only those columns

In [12]:
df = df[columns]

In [13]:
df.shape

(107210, 32)

### Take only a subset of taxa

In [14]:
taxonList =set(["Siphonophorae","Polychaeta","Ostracoda","Amphipoda","Poecilostomatoida",
           "Cyclopoida","Calanoida","Naupli","bubble","pellet","egg"])

In [15]:
result = list()
label = list()
for objPath in df.object_annotation_hierarchy:
    taxa = set(objPath.split(">"))
    tmp = taxa.intersection(taxonList)
    result.append(len(tmp)>0)
  #  label.append()
    if (len(tmp)>0):
        label.append(tmp.pop())

In [16]:
df = df[result]
df["taxon"]=label

remove column 'object_annotation_hierarchy'

In [17]:
df = df.drop(columns=['object_annotation_hierarchy'])

### Inspect dataset

In [18]:
df.groupby("taxon").count()

Unnamed: 0_level_0,object_hu_moment_1,object_skewness,object_dr._haardt_fluorescence_channel_a,object_index,object_hu_moment_4,object_fourier_descriptor_07,object_kurtosis,object_fourier_descriptor_10,object_graymean,object_fourier_descriptor_06,...,object_fourier_descriptor_08,object_area,object_structure,object_width,object_area_px,object_hu_moment_7,object_hu_moment_3,object_fourier_descriptor_01,object_hu_moment_2,object_hu_moment_5
taxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Amphipoda,194,194,194,194,194,194,194,194,194,194,...,194,194,194,194,194,194,194,194,194,194
Calanoida,42101,42101,42101,42101,42101,42101,42101,42101,42101,42101,...,42101,42101,42101,42101,42101,42101,42101,42101,42101,42101
Cyclopoida,4108,4108,4108,4108,4108,4108,4108,4108,4108,4108,...,4108,4108,4108,4108,4108,4108,4108,4108,4108,4108
Ostracoda,2772,2772,2772,2772,2772,2772,2772,2772,2772,2772,...,2772,2772,2772,2772,2772,2772,2772,2772,2772,2772
Poecilostomatoida,20348,20348,20348,20348,20348,20348,20348,20348,20348,20348,...,20348,20348,20348,20348,20348,20348,20348,20348,20348,20348
Polychaeta,412,412,412,412,412,412,412,412,412,412,...,412,412,412,412,412,412,412,412,412,412
Siphonophorae,333,333,333,333,333,333,333,333,333,333,...,333,333,333,333,333,333,333,333,333,333
bubble,1091,1091,1091,1091,1091,1091,1091,1091,1091,1091,...,1091,1091,1091,1091,1091,1091,1091,1091,1091,1091
egg,870,870,870,870,870,870,870,870,870,870,...,870,870,870,870,870,870,870,870,870,870
pellet,216,216,216,216,216,216,216,216,216,216,...,216,216,216,216,216,216,216,216,216,216


In [19]:
set(df["taxon"])

{'Amphipoda',
 'Calanoida',
 'Cyclopoida',
 'Ostracoda',
 'Poecilostomatoida',
 'Polychaeta',
 'Siphonophorae',
 'bubble',
 'egg',
 'pellet'}

Take a random subsample (because of imbalance bias), e.g. 100, 150, max 194.

In [20]:
randomSubsample194 = pd.DataFrame()
for taxon in taxonList:
    try:
        randomSubsample194 = randomSubsample194.append(df[df["taxon"]==taxon].sample(194))
    except: 
        print(f'Problems with taxon {taxon}')

Problems with taxon Naupli


In [21]:
randomSubsample194.shape

(1940, 32)

In [22]:
randomSubsample194.describe()

Unnamed: 0,object_hu_moment_1,object_skewness,object_dr._haardt_fluorescence_channel_a,object_index,object_hu_moment_4,object_fourier_descriptor_07,object_kurtosis,object_fourier_descriptor_10,object_graymean,object_fourier_descriptor_06,...,object_fourier_descriptor_08,object_area,object_structure,object_width,object_area_px,object_hu_moment_7,object_hu_moment_3,object_fourier_descriptor_01,object_hu_moment_2,object_hu_moment_5
count,1940.0,1940.0,1940.0,1940.0,1940.0,1940.0,1940.0,1940.0,1940.0,1940.0,...,1940.0,1940.0,1940.0,1940.0,1940.0,1940.0,1940.0,1940.0,1940.0,1940.0
mean,0.314089,2.057216,3.309652,22861.859278,0.0112421,5987.312935,4.362359,3361.848228,50.909794,8231.797807,...,4764.906092,2.480096,83915.94,0.000511,19083.607216,-0.0001103459,0.01833558,327593.5,0.09123856,0.004882275
std,0.205036,0.981169,1.305472,13916.004404,0.06780173,19071.180709,5.005514,12865.622873,20.935265,26864.010721,...,17799.212764,5.97893,237894.9,0.022522,46005.954443,0.005571791,0.08854366,800419.2,0.3445103,0.09558044
min,0.159282,-0.213897,2.112,172.0,3.07e-12,0.258949,-1.64433,0.240646,14.0,1.414214,...,0.05455,0.015,127.0671,0.0,112.0,-0.2056782,3.31e-08,6344.181,2.35e-08,-0.02198885
25%,0.198951,1.285803,2.15875,10228.5,1.13e-05,58.620011,0.38631,33.88925,34.0,90.721752,...,50.731584,0.142,3480.864,0.0,1094.5,-4.8025e-08,0.0001069267,26983.88,0.007497438,-6.1975e-16
50%,0.26105,2.022146,2.494,23068.5,0.000278628,217.651781,3.252919,120.418247,46.0,306.961471,...,170.418962,0.361,8052.769,0.0,2780.5,2.485e-18,0.001170362,49208.58,0.03009888,1.465e-08
75%,0.343967,2.730349,5.121,35031.75,0.003515267,2117.748102,6.755998,1071.927307,66.0,2994.889437,...,1667.575588,1.91075,54632.3,0.0,14703.75,3.875e-08,0.008213711,204675.3,0.07018951,1.045e-05
max,3.280619,6.047925,5.121,47420.0,1.752102,264862.1366,38.697178,290856.1741,132.0,397520.9899,...,398180.6931,57.7,3401337.0,0.992,443986.0,0.05477922,1.961855,7664478.0,10.47525,3.247265
