# Analyze Annotations

In this notebook, we examine the annotation data. 

In [None]:
import json
import matplotlib.pyplot as plt 
import numpy as np

import os
import pandas as pd
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [409]:
# Directory that holds the processed annotation exports.
output_path = '../data/processed/'

In [410]:
# Only annotation steps 1 and 2 are loaded here.
step1_csv = os.path.join(output_path, 'annotation_data_step_1.csv')
step2_csv = os.path.join(output_path, 'annotation_data_step_2.csv')

df1 = pd.read_csv(step1_csv).drop(columns='Unnamed: 0')
df2 = pd.read_csv(step2_csv)

# Drop the step 2 rows whose corrected label is one of the excluded labels.
excluded_labels = ['buiten', 'binnen', 'no_description_found']
df2 = df2[~df2['corrected_label'].isin(excluded_labels)]

df_all = pd.concat([df1, df2])

## Categories with fewer than 50 images 

In [412]:
# Labels that occur fewer than 50 times across both steps.
label_counts = df_all['corrected_label'].value_counts()
extra_training = label_counts[label_counts < 50].index.tolist()

## Top-1 and top-5 accuracy per label (step 1 vs. step 2)

In [413]:
# Per-label top-1 and top-5 accuracy for annotation step 1 and step 2.
# No binnen/buiten filtering happens in this cell; df1 and df2 are used as they are.
step1_top1_acc = df1.groupby('corrected_label')['in_top_1'].mean()
step2_top1_acc = df2.groupby('corrected_label')['in_top_1'].mean()

step1_top5_acc = df1.groupby('corrected_label')['in_top_5'].mean()
step2_top5_acc = df2.groupby('corrected_label')['in_top_5'].mean()

# Right joins: every label present in step 2 is kept; a label without step 1
# rows gets NaN in the 'step1' column.
top_1_acc = pd.merge(step1_top1_acc.rename('step1'), step2_top1_acc.rename('step2'),
                     left_index=True, right_index=True, how='right').round(3)

top_5_acc = pd.merge(step1_top5_acc.rename('step1'), step2_top5_acc.rename('step2'),
                     left_index=True, right_index=True, how='right').round(3)

# diff = step1 - step2, so a negative diff means the accuracy went UP in step 2.
accuracy_diff = top_5_acc['step1'] - top_5_acc['step2']

# Vectorised replacement for the row-by-row loop. Changes within +/- 0.05 count
# as 'no change'; a NaN diff (label missing in step 1) fails both comparisons
# and therefore also ends up as 'no change'.
comparison = np.select(
    [accuracy_diff < -0.05, accuracy_diff > 0.05],
    ['increase', 'decrease'],
    default='no change',
).tolist()

top_5_acc['state'] = comparison
top_5_acc['diff'] = accuracy_diff

top_5_acc.to_csv('top_5_acc.csv')

## The 25 labels with the lowest step 2 top-5 accuracy

In [414]:
# Show the 25 labels with the lowest step 2 top-5 accuracy.
top_5_acc.sort_values(by='step2').iloc[:25]

Unnamed: 0_level_0,step1,step2,state,diff
corrected_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
boksen,0.889,0.0,decrease,0.889
casino,,0.0,no change,
stemlokaal,,0.0,no change,
tennis,0.889,0.0,decrease,0.889
biljart,,0.0,no change,
spoorweg,,0.0,no change,
kermis,0.286,0.0,decrease,0.286
rugby,0.857,0.0,decrease,0.857
helikopter,,0.0,no change,
vuilnisbelt,,0.0,no change,


In [415]:
# Add the 25 labels with the lowest step 2 accuracy to the extra-training labels (duplicates removed).
lowest_acc = top_5_acc.sort_values(by='step2').iloc[:25].index.tolist()
extra_training = list(set([*extra_training, *lowest_acc]))

## Decreased accuracy

In [416]:
# Labels whose top-5 accuracy was marked as a 'decrease' from step 1 to step 2.
decreased_acc = top_5_acc.loc[top_5_acc['state'].eq('decrease')].index.tolist()

## Error Analysis

In [None]:
# Corrected labels of the step 2 rows whose top-1 prediction was 'zwaan'.
df2.loc[df2['top_1'] == 'zwaan', 'corrected_label'].value_counts()

zwaan     6
eend      6
vogels    2
Name: corrected_label, dtype: int64

In [None]:
# Top-1 predictions for the step 2 rows whose corrected label is 'zwaan'.
df2.loc[df2['corrected_label'] == 'zwaan', 'top_1'].value_counts()

dieren_overig    6
zwaan            6
standbeeld       2
roeien           2
strand           2
vijver_plas      2
straat           2
voetbal          2
schaatsen        2
Name: top_1, dtype: int64

In [None]:
# Corrected labels of the step 2 rows whose top-1 prediction was 'trein'.
df2.loc[df2['top_1'] == 'trein', 'corrected_label'].value_counts()

trein_station           14
trein                   12
bouwplaats               4
trein_ongeluk            2
boten                    2
no_description_found     2
huisje                   2
Name: corrected_label, dtype: int64

In [None]:
# Top-1 predictions for the step 2 rows whose corrected label is 'trein'.
df2.loc[df2['corrected_label'] == 'trein', 'top_1'].value_counts()

trein            12
bus_truck         6
ziekenhuis        4
trein_station     4
fabriek           4
trein_ongeluk     2
keuken            2
bouwplaats        2
brug              2
portret           2
optocht           2
demonstratie      2
gebouw            2
Name: top_1, dtype: int64

In [None]:
# Step 2 rows predicted as 'bus_truck' whose corrected label is 'trein'.
predicted_bus_truck = df2['top_1'].eq('bus_truck')
corrected_trein = df2['corrected_label'].eq('trein')
df2.loc[predicted_bus_truck & corrected_trein]

Unnamed: 0,predictions,top_1,top_5,title,image_id,scan_id,too_difficult,unusable,annotated_on,annotator_id,annotated_label,checked_on,validator_id,corrected_label,toelichting,gebruiker_id,agreement,in_top_1,in_top_5
3944,"{'cafe': 0.03, 'trein': 0.04, 'auto': 0.09, 'v...",bus_truck,"['cafe', 'trein', 'auto', 'vergaderruimte', 'b...",NL-HlmNHA_1478_15535B00_01,55748,25756,0,0,2022-04-19 14:49:36,2676,bus_truck,2022-05-13 19:32:25,17916,trein,,,disagree,0,1
3945,"{'cafe': 0.03, 'trein': 0.04, 'auto': 0.09, 'v...",bus_truck,"['cafe', 'trein', 'auto', 'vergaderruimte', 'b...",NL-HlmNHA_1478_15535B00_01,55748,25756,0,0,2022-04-19 15:17:54,23849,trein,2022-05-13 19:32:25,17916,trein,,,disagree,0,1
5602,"{'trein_station': 0.05, 'straat': 0.06, 'verga...",bus_truck,"['trein_station', 'straat', 'vergaderruimte', ...",NL-HlmNHA_1478_15659B00_01,59429,29437,0,0,2022-04-26 16:05:18,13212,trein,2022-05-19 21:45:48,17916,trein,,,agree,0,0
5603,"{'trein_station': 0.05, 'straat': 0.06, 'verga...",bus_truck,"['trein_station', 'straat', 'vergaderruimte', ...",NL-HlmNHA_1478_15659B00_01,59429,29437,0,0,2022-04-26 17:03:21,21038,trein,2022-05-19 21:45:48,17916,trein,,,agree,0,0
41643,"{'straat': 0.04, 'auto': 0.04, 'mensen_op_een_...",bus_truck,"['straat', 'auto', 'mensen_op_een_boot', 'gebo...",NL-HlmNHA_1478_33985K01_10,54009,24017,0,1,2022-04-16 11:44:27,21351,trein,2022-05-10 09:53:54,9686,trein,Treinwagon in onderhoud,9686.0,agree,0,0
41644,"{'straat': 0.04, 'auto': 0.04, 'mensen_op_een_...",bus_truck,"['straat', 'auto', 'mensen_op_een_boot', 'gebo...",NL-HlmNHA_1478_33985K01_10,54009,24017,0,1,2022-04-16 15:50:09,23655,trein,2022-05-10 09:53:54,9686,trein,Treinwagon in onderhoud,9686.0,agree,0,0


In [None]:
# Corrected labels of the step 2 rows whose top-1 prediction was 'kerstmis'.
df2.loc[df2['top_1'] == 'kerstmis', 'corrected_label'].value_counts()

kerstmis    12
Name: corrected_label, dtype: int64

In [None]:
# Top-1 predictions for the step 2 rows whose corrected label is 'kerstmis'.
df2.loc[df2['corrected_label'] == 'kerstmis', 'top_1'].value_counts()

gebouw                    14
kerstmis                  12
portret                    8
straat                     8
winkel_binnen              8
muziek_optreden            6
uitreiking_huldiging       6
borden_gevelsteen          4
sinterklaas                4
toren                      4
plattegrond                2
kerk_buiten                2
theater                    2
akker                      2
woonwijk                   2
bibliotheek_boekwinkel     2
klaslokaal                 2
etalage                    2
bloemen                    2
speech                     2
brug                       2
standbeeld                 2
etende_mensen              2
mensenmassa                2
Name: top_1, dtype: int64

In [None]:
# Top-1 predictions for the step 2 rows whose corrected label is 'kermis'.
df2.loc[df2['corrected_label'] == 'kermis', 'top_1'].value_counts()

circus                24
markt                 10
mensenmassa            6
zeepkistenrace         4
mensen_op_een_boot     4
optocht                4
gymnastiek             4
kerk_buiten            2
auto                   2
portret                2
kamperen               2
fietsende_mensen       2
etalage                2
theater                2
demonstratie           2
brand                  2
klaslokaal             2
Name: top_1, dtype: int64

In [None]:
# Corrected labels of the step 2 rows whose top-1 prediction was 'tram'.
df2.loc[df2['top_1'] == 'tram', 'corrected_label'].value_counts()

straat           2
trein_station    2
winkel_binnen    2
Name: corrected_label, dtype: int64

In [None]:
# Semicolon-separated metadata export.
metadata_csv = '../data/intermediary/metadata-with-connected-files-2022-05-23-csv.csv'
meta_df = pd.read_csv(metadata_csv, sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# One column per '|'-separated entry of 'Catalogus kaart'.
catalogue_cards = meta_df['Catalogus kaart']
categories = catalogue_cards.str.split('|', expand=True)

In [None]:
# Number of distinct non-missing values in the first split column.
len(categories[0].dropna().unique())

1401

In [None]:
# Keep only the first split column, without missing values, in lower case.
first_category = categories[0]
categories = first_category.dropna().str.lower()

In [None]:
# Keep only the metadata rows that have a 'Catalogus kaart' value.
meta_df = meta_df[meta_df['Catalogus kaart'].notna()]

In [None]:
# Re-split 'Catalogus kaart' on '|' into one column per entry.
catalogue_cards = meta_df['Catalogus kaart']
categories = catalogue_cards.str.split('|', expand=True)

In [None]:
# How often each 'Invoernummer VH' value occurs.
invoernummer_vh = meta_df['Invoernummer VH']
invoernummer_vh.value_counts()

00602A_0001    1
Name: Invoernummer VH, dtype: int64

In [None]:
# Metadata rows whose 'Catalogus kaart' contains 'dansen' (case-insensitive).
# na=False treats rows without a 'Catalogus kaart' value as non-matching, so the
# boolean mask never contains NaN and the cell does not depend on the dropna
# cell having been run first.
# The relevant id column is 'Invoernummer onderwerpskaarten'.
meta_df[meta_df['Catalogus kaart'].str.lower().str.contains('dansen', na=False)]

Unnamed: 0,uuid,Invoernummer onderwerpskaarten,Beschrijving,Catalogus kaart,Clusteronderwerp,Catalogus kaart scan,"Plaats, Straat",Persoon,Datum,Gemaakt in VeleHanden,Code,Logboek pagina ID,Deelcollectie,Invoernummer VH,Negatiefvel - Logboeknummer,Negatiefvel - Kaartnummer,Negatiefvel - Serie naam,Aantal foto's
76265,148d0f3d-5d1e-c64b-d73c-7c530391334a,115325|115324,Dansproject Velserbeek festival,"Dansen|Festival, festiviteiten en manifestaties",,NHA012000004_D_0028|NHA012000006_F_0053,Velsen-Zuid,,01-09-1985,,NL-HlmNHA_1478_25470K,,K,,NL-HlmNHA_1478_25470K,00,Kleinbeeld 1985,6
104591,1e324a3d-b6d5-b2b0-3dcd-8baa16da7408,76,Kinder ballet Maud Kool,Dansen,,NHA012000004_D_0033,,,01-12-1945,,NL-HlmNHA_1478_00294G,,G,,NL-HlmNHA_1478_00294C|NL-HlmNHA_1478_00294G|NL...,00|00_01|00_01,C Kleur|G 2015|L,1
104595,ec9d94ba-5cf5-45fc-a74a-7dab7df1efbd,77,Kinder ballet Maud Kool,Dansen,,NHA012000004_D_0033,,,01-12-1945,,NL-HlmNHA_1478_00295G,,G,,NL-HlmNHA_1478_00295C|NL-HlmNHA_1478_00295G|NL...,00|00_01|00_01,C Kleur|G 2015|L,1
104695,4a479ea3-084d-d3ef-ccd7-c85d19efc307,1610|1611,Bloemenkoning bij Volkstam Basken / Doelen,Bloemenvak|Volksdansen,,NHA012000002_B_0407|NHA012000021_V_0353,,,07-08-1948,,NL-HlmNHA_1478_00324A,,A,,NL-HlmNHA_1478_00324A|NL-HlmNHA_1478_00324L,00|00_01,6x6 1948|L,0
107414,60999a1b-0c4e-0148-86b1-76caae88aaf3,1334,Krelagegebouw,Volksdansen,,NHA012000021_V_0353,"Haarlem, Leidsevaart",,30-05-1948,,NL-HlmNHA_1478_01215B,,B,,NL-HlmNHA_1478_01215B,00_01,B 1935-1958,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277374,f29dacac-4663-884c-2667-13a56f03fd62,64217,"Dansen voor bejaarden, IJm.",Dansen,,NHA012000004_D_0029,,,12-11-1975,,NL-HlmNHA_1478_14190K,,K,,NL-HlmNHA_1478_14190K,00,Kleinbeeld 1975,6
277445,f2f01c4b-e075-4c32-449b-c9f0013bf834,23054|23055,Spaanse dansen openlucht Bloemendaal,Dansen,,NHA012000004_D_0034,"Bloemendaal, Hoge Duin en Daalseweg",,19-06-1962,,NL-HlmNHA_1478_01111K,,K,,NL-HlmNHA_1478_01111K|NL-HlmNHA_1478_01111K,01|02,Kleinbeeld 1962|Kleinbeeld 1962,71
277793,f50113b3-f6fa-de9a-d8e0-b7f04e037f28,77124,Volksdansen Heemskerk,Volksdansen,,NHA012000021_V_0354,,,14-07-1978,,NL-HlmNHA_1478_16924K,,K,,,,,0
279137,fdcfb042-4e9c-ffb7-4033-7356d0680a53,31245,Westfriese Dansgroep bij strand IJmuiden,Dansen,,NHA012000004_D_0031,,,07-08-1965,,NL-HlmNHA_1478_03449K,,K,,NL-HlmNHA_1478_03449K,00,Kleinbeeld 1965,10


## Finding new data

In [None]:
dfused1 = pd.read_csv('../data/intermediary/VHproject2deel1.csv')
dfused2 = pd.read_csv('../data/intermediary/VHproject2deel2.csv')

# ignore_index=True gives the combined frame a unique index. Without it both
# parts keep their own 0..n labels, and any later index-on-index merge pairs
# every row with all rows that share its label (a many-to-many join).
df_used = pd.concat([dfused1, dfused2], ignore_index=True)

# Reload the metadata from disk; this replaces any filtered meta_df left over
# from the exploration cells above.
meta_df = pd.read_csv('../data/intermediary/metadata-with-connected-files-2022-05-23-csv.csv', delimiter=';')


  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Add the labels whose accuracy decreased, drop duplicates, and keep the result
# sorted so that the saved file and every later loop over the labels are
# deterministic (the original called sorted() but discarded its result).
extra_training = sorted(set(extra_training + decreased_acc))

print(len(extra_training))

# One label per line.
np.savetxt("extra_training_labels.csv",
           extra_training,
           delimiter =", ",
           fmt ='% s')


51

In [None]:
# Translate annotation labels into the search terms used on the metadata.
# Each key is replaced by the listed term(s); an empty list drops the label.
# Labels that are not listed are kept unchanged.
# NOTE(review): the original comment also mentioned removing
# 'historisch toneelstuk', but no code ever removed it - confirm whether it
# should be dropped as well.
LABEL_SEARCH_TERMS = {
    'amfitheater': ['openluchttheater'],
    'dansende_mensen': ['dansen'],
    'mensen_op_een_boot': ['visserij', 'boot'],
    'kerk_binnen': ['kerk'],
    'ongeluk_brancard': ['brancard'],
    # The original loop tested for 'schaken_damen' but removed 'schaken_dammen',
    # so 'schaken' and 'dammen' were never added. Both spellings are mapped.
    'schaken_dammen': ['schaken', 'dammen'],
    'schaken_damen': ['schaken', 'dammen'],
    'tafel_tennis': ['tafeltennis'],
    'trein_ongeluk': ['treinongeluk', 'treinstation'],
    'vlag_hijsen': ['vlag hijsen'],
    'sport_overig': [],
    'mensen_op_trap': [],
}

# Build a new list instead of appending to / removing from the list that is
# being iterated (removing during iteration skips the next element). Unlike
# list.remove(), this does not raise when a label is absent, so the cell can
# be re-run safely.
remapped_labels = []
for label in extra_training:
    for term in LABEL_SEARCH_TERMS.get(label, [label]):
        if term not in remapped_labels:
            remapped_labels.append(term)

extra_training = remapped_labels

In [None]:
# Lower-case both text columns so that the label search is case-insensitive.
meta_df['Beschrijving'] = meta_df['Beschrijving'].str.lower()
# regex=False: '|' is replaced as a literal character. The default of `regex`
# differs between pandas versions, and '|' is the regex alternation operator.
meta_df['Catalogus kaart'] = meta_df['Catalogus kaart'].str.lower().str.replace("|", " ", regex=False)

In [None]:
# Strip the first and last character of 't3_entry_number' (presumably the
# surrounding brackets - TODO confirm) and split the rest on ',' into one
# column per number.
# NOTE(review): split(',') keeps any whitespace that follows a comma - confirm
# the resulting values match the metadata card numbers exactly.
df_numbers = df_used['t3_entry_number'].str[1:-1].str.split(',', expand=True)
df_numbers = pd.merge(df_numbers, df_used['t3_vh_entry_number'], left_index=True, right_index=True)

# Collect every distinct value of every column in first-seen order. The set
# replaces the list-membership test that made the original loop quadratic;
# the resulting list is the same.
seen_numbers = set()
used_numbers = []
for col in df_numbers:
    for val in df_numbers[col].unique():
        if val not in seen_numbers:
            seen_numbers.add(val)
            used_numbers.append(val)

In [None]:
def get_card_numbers(df):
    """Collect the card numbers referenced by the rows of ``df``.

    For each row the '|'-separated entries of 'Invoernummer onderwerpskaarten'
    are added one by one. When that field is missing, the row's
    'Invoernummer VH' value is used instead; rows where both are missing
    contribute nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Invoernummer onderwerpskaarten' and
        'Invoernummer VH'.

    Returns
    -------
    list
        Card numbers in row order (duplicates are kept).
    """
    card_numbers = []
    for _, row in df.iterrows():
        subject_numbers = row['Invoernummer onderwerpskaarten']

        # pd.isna also catches None and NaN objects other than np.nan itself,
        # which an `is np.nan` identity check misses.
        if pd.isna(subject_numbers):
            fallback = row['Invoernummer VH']
            # Skip the row rather than append a NaN "card number".
            if not pd.isna(fallback):
                card_numbers.append(fallback)
        elif isinstance(subject_numbers, str):
            # A string without '|' splits into a single-element list.
            card_numbers.extend(subject_numbers.split('|'))
        else:
            # Non-string value (e.g. a number parsed by read_csv): keep as is.
            card_numbers.append(subject_numbers)
    return card_numbers

def find_numbers(meta_df, used_numbers, label):
    """Find the not-yet-used card numbers of the records that mention ``label``.

    A record matches when ``label`` occurs as a whole word (regex ``\\b``
    boundaries) in its 'Catalogus kaart' or 'Beschrijving' column. ``label``
    is inserted into the regular expression as-is, so regex syntax in it is
    interpreted. The search is case-sensitive.

    Parameters
    ----------
    meta_df : pandas.DataFrame
        Metadata with the columns 'Catalogus kaart' and 'Beschrijving', plus
        the columns read by ``get_card_numbers``.
    used_numbers : iterable
        Card numbers to exclude from the result (values must be hashable).
    label : str
        Word or phrase to search for.

    Returns
    -------
    list
        Card numbers of the matching records that are not in ``used_numbers``.
    """
    # Non-capturing group: matches exactly like a capturing one, but does not
    # trigger the pandas warning about match groups in str.contains.
    query = r'\b(?:{})\b'.format(label)
    in_catalogue = meta_df['Catalogus kaart'].str.contains(query, na=False, regex=True)
    in_description = meta_df['Beschrijving'].str.contains(query, na=False, regex=True)
    selection = meta_df.loc[in_catalogue | in_description]

    print(f'found {selection.shape[0]} records')

    numbers = get_card_numbers(selection)
    print(f'found {len(numbers)} card numbers')

    # filter out used numbers (set lookup instead of scanning a list per number)
    used = set(used_numbers)
    numbers = [number for number in numbers if number not in used]
    print(f'found {len(numbers)} unused card numbers')
    print('\n')
    print('--------')

    return numbers

In [None]:
# For every extra-training label, look up the unused card numbers that mention it.
numbers_needed = dict()

for training_label in extra_training:
    print(training_label)
    numbers_needed.update({training_label: find_numbers(meta_df, used_numbers, training_label)})

In [None]:
# Total number of card numbers found over all labels.
cards_ = sum(len(found_numbers) for found_numbers in numbers_needed.values())

In [None]:
# One row per label, with its list of card numbers in the second column.
label_rows = [(label_name, found_numbers) for label_name, found_numbers in numbers_needed.items()]
df = pd.DataFrame(label_rows, columns=['label', 'list_of_numbers'])

In [None]:
# Save the result as a CSV table and as a JSON mapping of label -> card numbers.
df.to_csv('numbers_needed.csv')

with open("numbers_needed.json", "w") as json_file:
    json_file.write(json.dumps(numbers_needed))