In [1]:
import pandas as pd
import requests
import ipywidgets as widgets
from IPython.display import display, clear_output
import os

### Load kikuyugrass observations from iNaturalist

In [2]:
#read the exported iNaturalist observations; the file carries an 'Unnamed: 0' column,
#presumably a row index written by an earlier to_csv call -- TODO confirm
data = pd.read_csv('./inaturalist_observations.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,quality_grade,uuid,observed_on_date,observed_on_day,observed_on_month,observed_on_year,id,positional_accuracy,public_positional_accuracy,...,observation_photo_6_url,observation_photo_7_id,observation_photo_7_original_dimensions,observation_photo_7_url,observation_photo_8_id,observation_photo_8_original_dimensions,observation_photo_8_url,observation_photo_9_id,observation_photo_9_original_dimensions,observation_photo_9_url
0,0,needs_id,cf6f9a84-0996-4ded-b2e0-a7bd12d67217,2024-07-18,18.0,7.0,2024.0,230179086,4.0,4.0,...,,,,,,,,,,
1,1,needs_id,39f544ee-5509-49ea-a296-19c825217ee1,2024-07-17,17.0,7.0,2024.0,230006077,35.0,35.0,...,,,,,,,,,,
2,2,needs_id,28751c0a-3cd3-4f64-9058-75e564f328f8,2024-07-15,15.0,7.0,2024.0,229533712,68.0,68.0,...,,,,,,,,,,
3,3,research,03159809-00bc-4852-bbd6-2aab8038a6ba,2024-06-24,24.0,6.0,2024.0,229078028,4.0,4.0,...,,,,,,,,,,
4,4,needs_id,e73c596e-d6d4-4980-afaf-be9268e70964,2024-07-10,10.0,7.0,2024.0,228696199,,,...,,,,,,,,,,


In [3]:
#keep only observations located to within 1000 m; rows with a missing
#positional_accuracy fail the comparison and are dropped as well
data = data.loc[data['positional_accuracy'].le(1000)]

In [4]:
#list the available columns (the *_url photo columns are what the review step displays)
data.columns

Index(['Unnamed: 0', 'quality_grade', 'uuid', 'observed_on_date',
       'observed_on_day', 'observed_on_month', 'observed_on_year', 'id',
       'positional_accuracy', 'public_positional_accuracy', 'description',
       'captive', 'uri', 'geojson', 'location', 'place_guess', 'taxon_name',
       'taxon_min_species_taxon_id', 'preferred_common_name',
       'observation_photo_license_code', 'observation_photo_0_id',
       'observation_photo_0_original_dimensions', 'observation_photo_0_url',
       'observation_photo_1_id', 'observation_photo_1_original_dimensions',
       'observation_photo_1_url', 'observation_photo_2_id',
       'observation_photo_2_original_dimensions', 'observation_photo_2_url',
       'observation_photo_3_id', 'observation_photo_3_original_dimensions',
       'observation_photo_3_url', 'observation_photo_4_id',
       'observation_photo_4_original_dimensions', 'observation_photo_4_url',
       'observation_photo_5_id', 'observation_photo_5_original_dimensions',
 

### Review images and annotate whether each observation is kikuyugrass

In [5]:
#resume from the labels saved by an earlier session, or start with an empty table
labeled_data = (
    pd.read_csv('./labeled_data.csv')
    if os.path.exists('./labeled_data.csv')
    else pd.DataFrame(columns=['uuid', 'label'])
)

In [6]:
#radio button labels default to grey, which is hard to read; force them to black.
#this CSS snippet is injected through a widgets.HTML element when the labeling UI is displayed
radio_button_style = '''
<style>
    .widget-radio-box label {
        color: black !important;
    }
</style>
'''

In [7]:
def review_observations(df, labeled_df):
    """Interactively label the observations in ``df`` and persist every label.

    One observation is shown at a time: all of its photos, its date and
    location, a radio-button selector and a Submit button. Each submitted
    label is appended to ``labeled_df`` (in place) and the whole frame is
    rewritten to ``./labeled_data.csv``, so an interrupted session can be
    resumed later. Observations whose ``uuid`` already appears in
    ``labeled_df`` are skipped. When nothing is left to review,
    "Labeling complete" is printed and ``labeled_df`` is displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations to review. Reads the ``uuid``, ``observed_on_date`` and
        ``place_guess`` columns; every column whose name ends in ``_url`` and
        holds a string is treated as a photo URL.
    labeled_df : pandas.DataFrame
        Existing labels. New rows are written as ``[uuid, label]``, so the
        frame is expected to have exactly those two columns. Mutated in place.

    Returns
    -------
    None
    """
    from html import escape  #local import: only needed to build the HTML snippets

    #uuids that already have a label, so a resumed session skips them
    labeled_uuids = set(labeled_df['uuid']) if 'uuid' in labeled_df.columns else set()

    def fetch_image(url):
        """Return an Image widget for ``url``, or an HTML note if the download fails."""
        try:
            #a timeout keeps one unresponsive photo server from hanging the session
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as err:
            return widgets.HTML(value=f"<i>Could not load image: {escape(str(err))}</i>")
        return widgets.Image(value=response.content)

    #show images for an observation
    def show_images(row):
        clear_output()
        image_widgets = []
        for col in df.columns:
            #missing photo slots are NaN, so the isinstance check also filters those out
            if col.endswith('_url') and isinstance(row[col], str):
                #swap the thumbnail size in the url for the large rendition
                large_image_url = row[col].replace('square', 'large')
                image_widgets.append(fetch_image(large_image_url))

        #display metadata (escaped, since it is free text from the source data)
        info = widgets.HTML(
            value=f"<b>Date:</b> {escape(str(row['observed_on_date']))}<br>"
                  f"<b>Location:</b> {escape(str(row['place_guess']))}"
        )
        display(widgets.VBox([widgets.HBox(image_widgets), info]))

        #labeling widget
        label = widgets.RadioButtons(
            options=['kikuyugrass', 'something else', 'unsure'],
            description='Label:',
            disabled=False,
            style={'description_width': 'initial'}
        )

        button = widgets.Button(description="Submit")

        #save after each submit
        def on_button_click(b):
            #ignore repeat clicks that arrive before the widgets are replaced; otherwise
            #the same observation is saved twice and the next one is skipped unlabeled
            if b.disabled:
                return
            b.disabled = True
            labeled_df.loc[len(labeled_df)] = [row['uuid'], label.value]
            labeled_df.to_csv('./labeled_data.csv', index=False)
            labeled_uuids.add(row['uuid'])
            review_next_observation()

        button.on_click(on_button_click)
        display(widgets.HTML(radio_button_style))
        display(widgets.VBox([label, button]))

    #position of the next row of df to consider
    index = 0

    #pick up where previous session left off
    def review_next_observation():
        nonlocal index
        while index < len(df):
            row = df.iloc[index]
            index += 1
            if row['uuid'] not in labeled_uuids:
                show_images(row)
                break
        else:
            #loop ran off the end without finding an unlabeled observation
            print("Labeling complete")
            display(labeled_df)

    review_next_observation()

#start (or resume) the interactive labeling session; labels are appended to labeled_data in place
review_observations(data, labeled_data)

VBox(children=(HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x…

HTML(value='\n<style>\n    .widget-radio-box label {\n        color: black !important;\n    }\n</style>\n')

VBox(children=(RadioButtons(description='Label:', options=('kikuyugrass', 'something else', 'unsure'), style=D…

Labeling complete


Unnamed: 0,uuid,label
0,cf6f9a84-0996-4ded-b2e0-a7bd12d67217,kikuyugrass
1,39f544ee-5509-49ea-a296-19c825217ee1,kikuyugrass
2,28751c0a-3cd3-4f64-9058-75e564f328f8,kikuyugrass
3,03159809-00bc-4852-bbd6-2aab8038a6ba,kikuyugrass
4,6a732b02-dbb9-41b1-ae34-5897048e10ee,something else
...,...,...
659,b3ce775c-5926-44a4-83dd-9857ffb88e80,kikuyugrass
660,09060305-8008-42ec-8cbd-8294feacf5a0,kikuyugrass
661,cf429eb7-02ed-46da-9475-1107f617dbc5,kikuyugrass
662,3ba6f834-81ca-4e72-ad95-410a51145068,kikuyugrass


### Merge labels to dataframe

In [8]:
#attach the manual label to every observation; observations without a label keep NaN
merged_data = data.merge(labeled_data, how='left', on='uuid')

In [9]:
#tally each label as a raw count and as a share of the labeled rows
#(value_counts ignores NaN, so unlabeled observations are not part of the denominator)
pd.DataFrame({
    'count': merged_data['label'].value_counts(),
    'proportion': merged_data['label'].value_counts(normalize=True),
})
#about 54% of the labeled observations were confidently identified as kikuyugrass

Unnamed: 0_level_0,count,proportion
label,Unnamed: 1_level_1,Unnamed: 2_level_1
kikuyugrass,358,0.539157
something else,159,0.239458
unsure,147,0.221386


In [10]:
#keep only the observations that were confirmed as kikuyugrass during review
cleaned_data = merged_data.loc[merged_data['label'].eq('kikuyugrass')]

### Save CSV

In [11]:
#index=False: after the merge and filter the row index is only a leftover row position,
#and writing it would add another unnamed column to the file (labeled_data.csv is
#saved the same way)
cleaned_data.to_csv("./inaturalist_observations_cleaned.csv", index=False)