# Vorstellung des Datensatzes

<img src="resources/db_schema_abd.png" alt="Drawing" style="height:800px"/>

Tabellen, die mit 'abd_' beginnen, sind selbsterstellte Tabellen, die die Rohdaten erweitern

1. ABD_THESES
- beschreibt eine Tabelle, in der Dissertationen und Habilitationen aus der DNB- bzw. ProQuest-Datenbank enthalten sind
- Die in diesen Datenbanken enthaltenen Personen wurden in den WoS Rohdaten gesucht und passende Matches extrahiert.
    - Matches im Sinne von: 
        1. Person muss (in etwa) denselben Namen haben
        2. Person muss in einem gewissen Zeitraum um die Dissertation an der Promotionsuniversität veröffentlicht haben
        
- Zu diesen Matches werden auf Basis der Metainformationen (Publons-ID, ORCID-ID) weitere Personenzeilen aus den Rohdaten zugeordnet (s. Autorendisambiguierung)

2. ABD_AUTHORS
- enthält alle disambiguierten Personen aus 1. sowie alle Koautoren

In [None]:
# imports & setup
import pandas as pd
import sqlalchemy as sa

import abd
from abd.dataset.models import (
    Author,
    ABDAuthor,
    Item,
    ItemAuthorInstitution,
    Institution,
    ABDInstitution,
    ABDThesis,
)

In [None]:
# Open the prepared dataset. `engine` is what the cells below pass to
# `pd.read_sql`; `s` is the object every query is built from via `s.query(...)`.
# NOTE(review): presumably a SQLAlchemy engine/session pair — confirm in `abd.load`.
engine, s = abd.load.dataset()

[15:13:18.487] Loaded dataset: /mnt/nvme_storage/git/abd/tmp/wos_b_2020_abd_ver-5.2.2.db


## Rohdaten


In [None]:
def _row_count(pk_column, **criteria):
    """Return a subquery that counts `pk_column`.

    Keyword arguments, if any, are forwarded to `filter_by` to narrow the
    rows that are counted.
    """
    counting = s.query(sa.func.count(pk_column))
    if criteria:
        counting = counting.filter_by(**criteria)
    return counting.subquery()


# One count subquery per table of interest.
n_items = _row_count(Item.pk_items)
n_authors = _row_count(Author.pk_authors)
n_abd_authors = _row_count(ABDAuthor.id)
n_institutions = _row_count(Institution.pk_institutions)
n_abd_institutions = _row_count(ABDInstitution.id)

# Theses are counted per source database, restricted to doctype "DISS".
n_pq_theses = _row_count(ABDThesis.id, dataset="ProQuest", doctype="DISS")
n_dnb_theses = _row_count(ABDThesis.id, dataset="DNB", doctype="DISS")

# Combine all counts into a single SELECT with one labelled scalar column each.
query = s.query(
    *(
        counter.as_scalar().label(column_name)
        for column_name, counter in (
            ("n_items", n_items),
            ("n_authors", n_authors),
            ("n_abd_authors", n_abd_authors),
            ("n_institutions", n_institutions),
            ("n_abd_institutions", n_abd_institutions),
            ("PQ Theses", n_pq_theses),
            ("DNB Theses", n_dnb_theses),
        )
    )
)

-

In [None]:
# The query yields a single wide row; flip it so each count becomes its own
# row, then give the two resulting columns readable headers.
(
    pd.read_sql(query.statement, engine)
    .transpose()
    .reset_index()
    .rename(columns={"index": "Tabelle", 0: "Länge"})
)

Unnamed: 0,Tabelle,Länge
0,n_items,2510618
1,n_authors,4388830
2,n_abd_authors,3664564
3,n_institutions,2965715
4,n_abd_institutions,35951
5,PQ Theses,29104
6,DNB Theses,8323


In [None]:
# Number of disambiguated authors per (country, discipline) sample.
# Rows whose disc_sample is "coauthor" are excluded. The count column is left
# unlabelled here; the display cell renames the generated `count_1` column.
query = (
    s.query(
        ABDAuthor.cntry_sample,
        ABDAuthor.disc_sample,
        sa.func.count(ABDAuthor.id),
    )
    .filter(ABDAuthor.disc_sample != "coauthor")
    .group_by(ABDAuthor.cntry_sample, ABDAuthor.disc_sample)
)

In [None]:
# Order the groups by country, then discipline, and give the count column a
# readable header.
(
    pd.read_sql(query.statement, engine)
    .sort_values(by=["cntry_sample", "disc_sample"])
    .rename(columns={"count_1": "N Autoren"})
)

Unnamed: 0,cntry_sample,disc_sample,N Autoren
0,DEU,biochemistry,2187
1,DEU,physics,5417
2,DEU,psychology,719
3,USA,biochemistry,7119
4,USA,physics,11561
5,USA,psychology,10424


## Probleme (Autorendisambiguierung)

In [None]:
# Fetch the single disambiguated author used as the running example below.
s.query(ABDAuthor).filter(ABDAuthor.id == 254623).one()

ABDAuthor(id=254623, firstname=Hans-Joachim , lastname=Elmers, fullname=Elmers, Hans-Joachim , orig_sample_id=None, gender=None, disc_sample=coauthor, cntry_sample=None, prof_since=None)

In [None]:
# All raw `Author` rows joined to the disambiguated author 254623.
# The join starts from ABDAuthor, but only the Author columns are selected.
query = (
    s.query(ABDAuthor)
    .join(Author)
    .filter(ABDAuthor.id == 254623)
    .with_entities(Author)
)
pd.read_sql(sql=query.statement, con=engine)

Unnamed: 0,pk_authors,fullname,middlename,author_group,role,orcid_id,orcid_id_tr,r_id,r_id_tr,author_id,firstname,lastname
0,2593200,"Elmers, HJ",,,author,,,,,254623,H. -J.,Elmers
1,7979166,"Elmers, Hans-Joachim",,,researcher_id,0000-0002-2525-9954,,D-6729-2011,,254623,Hans-Joachim,Elmers
2,14008505,"Elmers, H",,,author,,,D-6729-2011,,254623,H,Elmers
3,14529075,"Elmers, HJ",,,author,,,D-6729-2011,,254623,Hans J.,Elmers
4,19132237,"Elmers, HJ",,,author,,,D-6729-2011,,254623,H. J.,Elmers
5,21636595,"Elmers, HJ",,,author,,,,,254623,Hans Joachim,Elmers
6,24304346,"Elmers, HJ",,,author,,,,,254623,H. J.,Elmers
7,24390980,"Elmers, Hans-Joachim",,,researcher_id,,,D-6729-2011,,254623,Hans-Joachim,Elmers
8,27704426,"Elmers, HJ",,,author,,,D-6729-2011,,254623,Hans-Joachim,Elmers
9,28261982,"Elmers, HJ",,,author,,,,,254623,H-J,Elmers


<img src="resources/elmers_01.png" alt="Drawing" style="width:1050px"/>

In [None]:
def _items_with_author_row(author_pk):
    """Return a query for the items linked to the raw author row `author_pk`.

    The result is grouped by the item primary key.
    """
    return (
        s.query(Item)
        .join(ItemAuthorInstitution)
        .filter(ItemAuthorInstitution.fk_authors == author_pk)
        .group_by(Item.pk_items)
    )


# Two of the raw rows listed above for author 254623: the plain "author" row
# (2593200) and the "researcher_id" row (7979166).
query1 = _items_with_author_row(2593200)
query2 = _items_with_author_row(7979166)

# Items on which both raw rows occur.
pd.read_sql(query1.intersect(query2).statement, engine)

Unnamed: 0,items_pk_items,items_pubyear,items_pubtype,items_doctype,items_d_author_cnt,items_d_ref_cnt,items_fk_sources
0,315008636,2016,Journal,Article,14,49,97451
1,25442924902,2009,Journal,Article,12,36,8370
2,20630132174,2017,Journal,Article,7,20,95039
3,15011657812,2012,Journal,Article,8,29,97451


<img src="resources/elmers_02.png" alt="Drawing" style="width:750px"/>

In [None]:
# Every raw author row attached to item 15011657812 (one of the shared items
# found above), listed alphabetically by full name.
query = (
    s.query(Item)
    .join(ItemAuthorInstitution)
    .join(Author)
    .filter(Item.pk_items == 15011657812)
    .with_entities(Author)
)

(
    pd.read_sql(query.statement, engine)
    .sort_values(by="fullname")
)

Unnamed: 0,pk_authors,fullname,middlename,author_group,role,orcid_id,orcid_id_tr,r_id,r_id_tr,author_id,firstname,lastname
11,29279647,"Chadov, S",,,author,,,,,268690,S.,Chadov
8,23677790,"Chadov, Stanislav",,,researcher_id,0000-0002-1160-1835,,P-3018-2014,,268690,Stanislav,Chadov
1,2593200,"Elmers, HJ",,,author,,,,,254623,H. -J.,Elmers
5,7979166,"Elmers, Hans-Joachim",,,researcher_id,0000-0002-2525-9954,,D-6729-2011,,254623,Hans-Joachim,Elmers
9,24281925,"Felser, C",,,author,,,,,56114,C.,Felser
0,150669,"Felser, Claudia",,,researcher_id,0000-0002-8200-2063,,A-5779-2009,,56114,Claudia,Felser
6,16302873,"Jorge, EA",,,author,,,,,2138489,E. Arbelo,Jorge
10,25118251,"Jourdan, M",,,author,,,,,289708,M.,Jourdan
7,18530443,"Jourdan, Martin",,,researcher_id,0000-0001-6785-0518,,D-8506-2016,,289708,Martin,Jourdan
14,32133706,"Klaui, M",,,author,,,,,212282,M.,Klaeui


<img src="resources/elmers_03.png" alt="Drawing" style="width:1070px"/>

In [None]:
# ToDo: Beispiel für einen Artikel, auf dem Autor + ResearcherID gemeinsam stehen

In [None]:
# ToDo: Vom o. g. Autor: zeige, wie die Sachen in mehrere IDs aufgeteilt sind

In [None]:
# ToDo: Beschreibe unseren Ansatz der Autorendisambiguierung