## Description
This notebook either creates a new CSV file with a mobilitydata_labelled column, or loads an existing one if it already exists.
It provides a UI to label each dataset entry, displaying the German title and description. Entries can be labeled, skipped, or the session can be aborted.
When "Abort and Save" is pressed, the current labeling progress is saved back to the CSV file.

#### Improvements to consider
- None for now, except possibly reusing the same notebook for other languages 

The code was created with the assistance of ChatGPT-4.

In [1]:
import html
import os
import random

import pandas as pd
from IPython.display import display, clear_output
import ipywidgets as widgets

# CSV that is read as the source when the labelling file is first created.
inputdata_file = 'data/02_extracted_keywords_data.csv'
# CSV that carries the "mobilitydata_labelled" column; it is loaded for labelling
# and overwritten when the session is saved.
labelling_file ='data/03_labelled_data.csv'

In [2]:
# Create a new CSV with an added labelling column, unless the target file already exists

def copy_csv(original_path, new_path):
    """Create the labelling CSV from the source CSV, at most once.

    Reads ``original_path``, adds an empty ``mobilitydata_labelled`` column and
    writes the result to ``new_path`` without the dataframe index. Nothing is
    written when the source file is missing or the target file already exists.
    In every case a one-line status message is printed and ``None`` is returned.
    """
    if not os.path.exists(original_path):
        status = f"Original file not found: {original_path}"
    elif os.path.exists(new_path):
        status = f"Target file already exists: {new_path}"
    else:
        # Load, append the (still empty) label column, and persist in one chain.
        (
            pd.read_csv(original_path, low_memory=False)
            .assign(mobilitydata_labelled=None)
            .to_csv(new_path, index=False)
        )
        status = f"File copied and modified: {new_path}"
    print(status)

# Create the labelling file from the input file; does nothing if the labelling file already exists
copy_csv(inputdata_file, labelling_file)

Target file already exists: data/03_labelled_data.csv


In [3]:
# Load the labelling CSV into a dataframe
df = pd.read_csv(labelling_file, low_memory=False)

# A label column that is still completely empty is parsed as float64 (all NaN).
# Writing True/False into a float64 column is an incompatible-dtype assignment,
# which pandas deprecated (PDEP-6) and is slated to reject. An object column
# can hold True, False and missing values side by side, so cast once here.
df["mobilitydata_labelled"] = df["mobilitydata_labelled"].astype(object)

# Columns shown to the person labelling, and the font used to render them
title_column = "dataset_title_DE"
description_column = "dataset_description_DE"
font_family = "Arial"

# Widgets
output = widgets.Output()          # area where the current row is rendered
label_buttons = widgets.HBox()     # button row; filled once the buttons exist
question_text = widgets.HTML("<b>Is this data set mobility data?</b>")
progress = widgets.Label()         # "Labelled: x / y" counter

# Session state shared by the callbacks below
current_index = None   # index label of the row currently on screen
aborted = False        # set to True once the session has been saved and closed

# Get next index
def get_next_index():
    """Pick a random row that has no label yet.

    Returns:
        An index label of ``df`` whose ``mobilitydata_labelled`` value is
        missing, or ``None`` when no such row is left.
    """
    is_open = df["mobilitydata_labelled"].isnull()
    candidates = is_open[is_open].index.tolist()
    return random.choice(candidates) if candidates else None

# Label and move on
def label_and_next(value):
    """Store ``value`` as the label of the row on screen, then show another row.

    Nothing is stored when no row is currently selected, i.e. when
    ``current_index`` is ``None``.
    """
    # ``current_index`` is only read here, so no ``global`` declaration is needed.
    row_label = current_index
    if row_label is not None:
        df.at[row_label, "mobilitydata_labelled"] = value
    show_next()

# Skip
def skip(_):
    """Show another row without labelling the current one.

    Click handler of the "Skip" button; its argument is ignored. The skipped
    row stays unlabelled, so it can be drawn again later.
    """
    show_next()

# Abort
def abort(_):
    """Save the labelling progress to ``labelling_file`` and close the UI.

    Click handler of the "Abort and Save" button; its argument is ignored.
    """
    global aborted
    # Write first: if saving fails, ``aborted`` is still False and the widgets
    # are still on screen, so the session can continue and saving can be retried.
    df.to_csv(labelling_file, index=False)  # write dataframe back to the CSV file
    aborted = True
    container.children = []  # remove all UI elements
    
# Render one field of the current row
def _show_field(column, value, extra_style=""):
    """Display a bold caption for ``column`` followed by a bordered box with ``value``.

    ``value`` comes straight from the CSV, so it is HTML-escaped before being
    placed into the widget; otherwise characters such as ``<`` or ``&`` in the
    text would be interpreted as markup. ``extra_style`` is appended to the
    box's inline CSS.
    """
    box_html = (
        f'<div style="font-family:{font_family}; font-size:12px; '
        "border:1px solid #ccc; padding:10px; background-color:#f9f9f9; "
        f'{extra_style}">'
        f"{html.escape(str(value))}"
        "</div>"
    )
    display(widgets.HTML(f"<b>{column}:</b>"))
    display(widgets.HTML(box_html))


# Show current row
def show_next():
    """Display a randomly chosen unlabelled row and refresh the progress label.

    Does nothing once the session has been aborted. Otherwise it updates
    ``current_index``, redraws the output area with the row's title and
    description (or a completion message when no unlabelled row is left), and
    sets the progress counter.
    """
    global current_index
    if aborted:
        return
    current_index = get_next_index()
    with output:
        clear_output(wait=True)
        if current_index is None:
            print("All rows have been labelled. Congratulations, you have successfully put too much time into labelling :)")
            # display(df)  # optional
        else:
            row = df.loc[current_index]
            print(f"Row {current_index}\n")

            # Title: bold, with spacing below the box
            if title_column in row:
                _show_field(
                    title_column,
                    row[title_column],
                    "font-weight:bold; margin-bottom:10px;",
                )

            # Description: fixed-height box that scrolls for long texts
            if description_column in row:
                _show_field(
                    description_column,
                    row[description_column],
                    "height:120px; overflow-y:auto;",
                )

    # Progress update must be *outside* the output block
    progress.value = f"Labelled: {df['mobilitydata_labelled'].notnull().sum()} / {len(df)}"

# Buttons: the two label choices, skip, and save-and-quit
btn_yes, btn_no, btn_skip, btn_abort = (
    widgets.Button(description=caption, button_style=style)
    for caption, style in (
        ("Yes", "success"),
        ("No", "danger"),
        ("Skip", "info"),
        ("Abort and Save", "warning"),
    )
)

# Click handlers; the clicked button passed by ipywidgets is not needed
btn_yes.on_click(lambda _button: label_and_next(True))
btn_no.on_click(lambda _button: label_and_next(False))
btn_skip.on_click(skip)
btn_abort.on_click(abort)

label_buttons.children = (btn_yes, btn_no, btn_skip, btn_abort)

# Container for all widgets: row display, question, buttons, progress counter
container = widgets.VBox(children=[output, question_text, label_buttons, progress])

# Start UI
display(container)
show_next()

VBox(children=(Output(), HTML(value='<b>Is this data set mobility data?</b>'), HBox(children=(Button(button_st…