In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules (such as
# the project's `src` package) are re-imported before each cell runs, so
# edits to them take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [None]:
from collections import Counter

import pandas as pd
import spacy
from sqlalchemy.orm import Session
from sqlalchemy.orm import joinedload
from tqdm import tqdm

import src.db.models.doccano as m
from src.db.connect import make_engine
from src.db.sample import Sample

In [None]:
# Notebook-wide setup: database session, pandas display limits, target project.
engine = make_engine("DOCCANO")
session = Session(bind=engine)

# Allow long text cells and long tables (up to 512 chars / rows) in output.
for display_option in ("display.max_colwidth", "display.max_rows"):
    pd.set_option(display_option, 512)

# Name of the Doccano project whose examples are analysed below.
project = "PBert AnnoTask 5"

In [None]:
# Prepare the spaCy pipeline that is handed to `Sample` below.
# The listed components are switched off; whatever else the German model
# ships (e.g. the parser) stays active.
# NOTE(review): presumably only sentence splitting is needed downstream —
# confirm against `Sample`.

nlp = spacy.load("de_core_news_md")
# `Language.disable_pipes` is deprecated in spaCy v3; `select_pipes` is the
# supported replacement and likewise returns the list of disabled components.
nlp.select_pipes(
    disable=["tagger", "morphologizer", "lemmatizer", "attribute_ruler", "ner"]
)

['tagger', 'morphologizer', 'lemmatizer', 'attribute_ruler', 'ner']

In [None]:
# Fetch the examples of the selected project and wrap each one in a `Sample`.
# Labels and states are eager-loaded with the examples in the same query.
eager_loads = (
    joinedload(m.ExamplesExample.labels),
    joinedload(m.ExamplesExample.state),
)

q = (
    session.query(m.ExamplesExample)
    .options(*eager_loads)
    .join(m.ExamplesExample.project)
    .filter(m.ProjectsProject.name == project)
    # Keep only examples that have at least one related `state` row, i.e.
    # (per the original note) samples confirmed by at least one person.
    # Remove this filter to also collect unconfirmed samples.
    .filter(m.ExamplesExample.state.any())
)

# `q.count()` issues a separate COUNT query so tqdm can show a total.
samples = [Sample(row, nlp) for row in tqdm(q, total=q.count(), smoothing=0)]

100%|██████████| 1535/1535 [00:02<00:00, 757.46it/s]


# Percentage coded


In [None]:
# Distribution of samples by the number of coders that labelled them.
# The coder count is kept as an integer: the previous `str(...)` cast made
# `sort_values("coders")` sort lexicographically, which would misorder the
# table as soon as a sample had ten or more coders ("10" < "2").
c = Counter(len(sample.user_labels) for sample in samples)
d = pd.DataFrame(c.most_common(), columns=["coders", "samples"])
d = d.sort_values("coders", ascending=False)
# Cumulative share (in %) of samples, accumulated from the highest coder
# count downwards, i.e. samples labelled by at least that many coders.
d["cumsum"] = (d["samples"].cumsum() / d["samples"].sum() * 100).round(2)
d

Unnamed: 0,coders,samples,cumsum
5,6,7,0.46
1,5,484,31.99
0,4,579,69.71
2,3,323,90.75
3,2,122,98.7
4,1,20,100.0


# Coded by


In [None]:
# Tally, per coder, the number of samples they confirmed.
c = Counter()
for sample in samples:
    c.update(user for user in sample.confirmed_by)

d = pd.DataFrame(c.most_common(), columns=["coder", "samples"])
# Progress relative to 1200 samples.
# NOTE(review): 1200 is presumably the per-coder annotation target — confirm.
d["% done"] = (d["samples"] / 1200 * 100).round(2)
d.sort_values(by="samples", ascending=False)

Unnamed: 0,coder,samples,% done
0,schadt,1201,100.08
1,grabsch,1200,100.0
2,coudry,1200,100.0
3,riedel,1200,100.0
4,richter,1200,100.0
5,erhard,9,0.75
6,remer,1,0.08


# Samples with 'NICHT ZUTREFFEND'


In [None]:
# List every sample where at least one coder chose 'NICHT ZUTREFFEND' and
# report their share among all samples that carry labels from any coder.
count = 0  # samples with at least one 'NICHT ZUTREFFEND' label set
any_coded = 0  # samples with labels from at least one coder
for sample in samples:
    if sample.user_labels:
        any_coded += 1
    if any("NICHT ZUTREFFEND" in labels for labels in sample.user_labels.values()):
        print(sample.example.id, sample.user_labels)
        count += 1

# Guard the division: with no coded samples the ratio is reported as 0 instead
# of raising ZeroDivisionError.
ratio = count / any_coded if any_coded else 0.0
print("-" * 50 + f"\nRatio 'NICHT ZUTREFFEND': {ratio:.2%}")

2529 {'erhard': {'NICHT ZUTREFFEND'}, 'grabsch': {'NICHT ZUTREFFEND'}, 'coudry': {'none'}, 'riedel': {'NICHT ZUTREFFEND'}, 'schadt': {'NICHT ZUTREFFEND'}}
2547 {'erhard': {'NICHT ZUTREFFEND'}, 'grabsch': {'NICHT ZUTREFFEND'}, 'richter': {'none'}, 'coudry': {'none'}, 'schadt': {'NICHT ZUTREFFEND'}}
2565 {'erhard': {'NICHT ZUTREFFEND'}, 'grabsch': {'none'}, 'richter': {'none'}, 'coudry': {'none'}, 'riedel': {'none'}, 'schadt': {'NICHT ZUTREFFEND'}}
2530 {'richter': {'none'}, 'grabsch': {'NICHT ZUTREFFEND'}, 'coudry': {'none'}, 'riedel': {'none'}, 'schadt': {'NICHT ZUTREFFEND'}}
2640 {'richter': {'none'}, 'riedel': {'none'}, 'coudry': {'none'}, 'grabsch': {'none'}, 'schadt': {'NICHT ZUTREFFEND'}}
2740 {'richter': {'NICHT ZUTREFFEND'}, 'grabsch': {'none'}, 'coudry': {'none'}, 'riedel': {'none'}, 'schadt': {'NICHT ZUTREFFEND'}}
2770 {'richter': {'NICHT ZUTREFFEND'}, 'riedel': {'none'}, 'coudry': {'none'}, 'schadt': {'NICHT ZUTREFFEND'}}
2910 {'richter': {'NICHT ZUTREFFEND'}, 'coudry': {'non