In [1]:
from sqlalchemy.orm import Session
from sqlalchemy.orm import joinedload
from tqdm.auto import tqdm

import src.db.models.bert_data as bm
import src.db.models.doccano as dm
from src.db.connect import make_engine

In [2]:
# Name of the doccano project whose annotations are copied into the bert DB.
PROJECT = "PBert v.2 Task 1"

# The DB is copied to a local instance for development.
# Set DEV = True to target that copy instead of "prod"; this only works if
# config.ini contains connection settings for DB_dev.
DEV = False
db_ending = "_dev" if DEV else ""

db_engine = make_engine(f"DB{db_ending}")
db_session = Session(db_engine)

# Plain string literal: there is nothing to interpolate in this name.
doccano_engine = make_engine("DOCCANO")
doccano_session = Session(doccano_engine)

# Shown as the cell output so the target server can be checked before any write.
db_engine.url


postgresql+psycopg2://postgres:***@193.196.39.254/next

In [3]:
# Query every example of the current doccano project that has been confirmed
# by at least one coder, eagerly loading its labels and its state rows.
doccano_examples = (
    doccano_session.query(dm.ExamplesExample)
    .options(
        joinedload(dm.ExamplesExample.labels),
        joinedload(dm.ExamplesExample.state),
    )
    .join(dm.ExamplesExample.project)
    .filter(
        dm.ProjectsProject.name == PROJECT,
        # EXISTS sub-select: keep only examples that have at least one state
        # row. This alone expresses the "confirmed at least once" condition,
        # so no extra inner join on `state` is needed (such a join would only
        # repeat the same restriction and multiply the fetched rows).
        dm.ExamplesExample.state.any(),
    )
)

# Doccano label text -> name of the boolean attribute on bm.Label that records
# whether the coder assigned that label to the sample.
LABEL_COLUMNS = {
    "Anti-Elitismus": "pop_antielite",
    "Volkszentriertheit": "pop_pplcentr",
    "Weniger Macht Eliten": "souv_eliteless",
    "Mehr Macht Volk": "souv_pplmore",
    "Links": "ideol_left",
    "Rechts": "ideol_right",
    "unsicher": "unsure",
}

# Encapsulate everything in one transaction: if no exception is raised the
# context manager commits on exit, otherwise it rolls back.
with db_session.begin():
    # iterate over all samples from batch / project
    for example in tqdm(doccano_examples.all()):
        # The bert_data sample id is stored in the example's metadata; it is
        # the same for every state of this example, so look it up once.
        sample_id = example.meta["id"]

        # iterate over all confirmed statuses of the example (one per coder)
        for state in example.state:
            username = state.confirmed_by.username
            timestamp = state.confirmed_at

            # texts of all labels this user assigned to the example
            doccano_labels = {
                label.label.text for label in example.labels if label.user.username == username
            }

            # reuse the existing (username, sample_id) row, create it otherwise
            bert_label = (
                db_session.query(bm.Label)
                .filter(
                    bm.Label.username == username,
                    bm.Label.sample_id == sample_id,
                )
                .one_or_none()
            )

            if not bert_label:
                bert_label = bm.Label(username=username, sample_id=sample_id)

            # overwrite the row with the current state from doccano
            bert_label.time_labeled = timestamp

            # each flag is True exactly when the user assigned that label
            for label_text, column in LABEL_COLUMNS.items():
                setattr(bert_label, column, label_text in doccano_labels)

            # add changes to transaction
            db_session.add(bert_label)


  0%|          | 0/2331 [00:00<?, ?it/s]