# Erstellung eines Datensatzes

- Wir erheben nun einen Beispiel-Datensatz
- Alle Artikel von 25 ausgewählten Journals der Psychologie
- Zeitraum: 2015-2020

- Dieser wird morgen in den Sessions verwendet
- Um Zugang zu dem hier erstellten Datensatz zu erhalten, folgen Sie den Anweisungen in der README.md in diesem Repo

In [None]:
# Journal titles to include in the example data set. They are written in
# lower case because the query filter compares them against
# func.lower(Source.sourcetitle).
sources = [
    "clinical psychology review",
    "current directions in psychological science",
    "developmental review",
    "educational psychologist",
    "educational psychology review",
    "international review of sport and exercise psychology",
    "journal of abnormal psychology",
    "journal of applied psychology",
    "journal of consumer psychology",
    "journal of occupational health psychology",
    "journal of organizational behavior",
    "journal of personality and social psychology",
    "journal of the learning sciences",
    "leadership quarterly",
    "neuroscience and biobehavioral reviews",
    "personality and social psychology review",
    "personnel psychology",
    "perspectives on psychological science",
    "psychological bulletin",
    "psychological methods",
    "psychological review",
    "psychological science",
    "psychological science in the public interest",
    "social issues and policy review",
    "trends in cognitive sciences",
]

In [None]:
# Sanity check: number of journal titles in the list.
len(sources)

25

In [None]:
from src.connect import create_wos_session

# Engine and session for the source database: `s` builds the ORM queries,
# `engine` is handed to pd.read_sql when the rows are exported.
engine, s = create_wos_session()

In [None]:
from sqlalchemy import func
from src.models import Abstract, Author, Item, ItemAuthorInstitution, Source

# Shared starting point for all counts and exports in this notebook:
# sources joined to their items, item/author/institution links and authors,
# plus abstracts via an outer join so items without an abstract are kept.
base_query = (
    s.query(Source)
    .join(Item)
    .join(ItemAuthorInstitution)
    .join(Author)
    .join(Abstract, isouter=True)
    # publication years 2015 through 2020 (BETWEEN is inclusive)
    .filter(Item.pubyear.between(2015, 2020))
    # document type "Article" only
    .filter(Item.doctype == "Article")
    # case-insensitive match against the selected journal titles
    .filter(func.lower(Source.sourcetitle).in_(sources))
)

In [None]:
# Number of distinct authors matched by the base query.
base_query.with_entities(Author.pk_authors).distinct().count()

16546

In [None]:
# Number of distinct items (articles) matched by the base query.
base_query.with_entities(Item.pk_items).distinct().count()

5629

In [None]:
# Number of distinct item/author/institution link rows matched by the base query.
base_query.with_entities(ItemAuthorInstitution.pk_itm_auth_inst).distinct().count()

32385

In [None]:
# Number of distinct abstracts. Abstract is outer-joined in base_query, so
# items without an abstract produce a NULL key; those rows are excluded
# explicitly so that NULL is not counted as a distinct value.
base_query.filter(Abstract.pk_abstracts.isnot(None)).with_entities(
    Abstract.pk_abstracts
).distinct().count()

5557

In [None]:
# Number of distinct sources (journals) matched by the base query.
base_query.with_entities(Source.pk_sources).distinct().count()

24

In [None]:
# Check which requested journals have no matching record.
# First collect the (lower-cased) titles that the query actually returned ...
found_titles = base_query.with_entities(Source.sourcetitle).distinct().all()
new = {title.lower() for (title,) in found_titles}

# ... then show the requested titles that are not among them.
set(sources).difference(new)

{'trends in cognitive sciences'}

## Erstelle SQLite DB

- hier wird pathlib verwendet.
- großartiges Paket aus der Standardbibliothek für den Umgang mit Pfaden (plattformübergreifend)
- eingängige API
- Dokumentation dazu [hier](https://docs.python.org/3/library/pathlib.html)

In [None]:
import src

# Location of the SQLite file that will hold the example data set.
sqlite_path = src.PATH / "data/example.db"

# Make sure the target directory exists; SQLite cannot create the database
# file inside a missing directory.
sqlite_path.parent.mkdir(parents=True, exist_ok=True)

# Start from a clean file: the export cells below use if_exists="append", so
# re-running the notebook against an old file would insert the rows again.
if sqlite_path.is_file():
    sqlite_path.unlink()

In [None]:
from src.connect import create_sqlite_session

# Engine and session for the SQLite target file; `sqlite_engine` is the
# connection that the DataFrame.to_sql exports below write to.
sqlite_engine, sqlite_s = create_sqlite_session(sqlite_path)

In [None]:
from src.models import Base

# Create the tables registered on Base.metadata in the SQLite database, so
# the exports below can append into already existing tables.
Base.metadata.create_all(bind=sqlite_engine)

### Author

In [None]:
import pandas as pd

# Read every distinct author row matched by the base query from the source
# database and append it to the "authors" table in SQLite.
author_stmt = base_query.with_entities(Author).distinct().statement
df = pd.read_sql(author_stmt, engine)
df.to_sql(name="authors", con=sqlite_engine, if_exists="append", index=False)

### Item

In [None]:
# Read every distinct item row matched by the base query from the source
# database and append it to the "items" table in SQLite.
item_stmt = base_query.with_entities(Item).distinct().statement
df = pd.read_sql(item_stmt, engine)
df.to_sql(name="items", con=sqlite_engine, if_exists="append", index=False)

### Source

In [None]:
# Read every distinct source row matched by the base query from the source
# database and append it to the "sources" table in SQLite.
source_stmt = base_query.with_entities(Source).distinct().statement
df = pd.read_sql(source_stmt, engine)
df.to_sql(name="sources", con=sqlite_engine, if_exists="append", index=False)

### ItemAuthorInstitution

In [None]:
# Read every distinct item/author/institution link row matched by the base
# query and append it to the "items_authors_institutions" table in SQLite.
link_stmt = base_query.with_entities(ItemAuthorInstitution).distinct().statement
df = pd.read_sql(link_stmt, engine)
df.to_sql(
    name="items_authors_institutions",
    con=sqlite_engine,
    if_exists="append",
    index=False,
)

### Abstracts

In [None]:
# Unfortunately DISTINCT() cannot be applied to CLOB columns, hence this
# workaround: select only the distinct primary keys first, then fetch the
# full abstract rows whose key is in that set.

# Distinct abstract keys reachable through the base query (used as subquery).
unique_pk_abstracts = base_query.with_entities(Abstract.pk_abstracts).distinct()

# Full abstract rows restricted to those keys; no DISTINCT needed here.
query = s.query(Abstract).filter(Abstract.pk_abstracts.in_(unique_pk_abstracts))

# Copy the result into the "abstracts" table of the SQLite database.
df = pd.read_sql(query.statement, engine)
df.to_sql("abstracts", sqlite_engine, if_exists="append", index=False)

# Ein Schmankerl für die Nerds

In [None]:
def explain_query(query, detail="TYPICAL"):
    """Print the query execution plan of a SQLAlchemy query on an Oracle DB.

    The query is compiled with its parameters inlined, handed to
    ``EXPLAIN PLAN FOR`` and the stored plan is read back through
    ``DBMS_XPLAN.DISPLAY`` and printed line by line.

    Relies on the notebook-level ``engine`` (to compile the statement) and
    session ``s`` (to execute it). Side effect: ``ALTER SESSION SET
    current_schema = WOS_B_2020`` is executed on ``s`` and not reverted.

    Args:
        query: Object exposing ``.statement`` (e.g. ``base_query``).
        detail: Format passed to ``DBMS_XPLAN.DISPLAY``. Possible values
            (in ascending detail order): BASIC, SERIAL, TYPICAL, ALL.

    Returns:
        None. The plan is written to stdout.
    """
    # Local import keeps this cell self-contained.
    from sqlalchemy import text

    # Inline the bound values so the statement can be prefixed with
    # EXPLAIN PLAN FOR and executed as plain SQL.
    raw_query = str(
        query.statement.compile(
            engine,
            compile_kwargs={
                "literal_binds": True,
            },
        )
    )

    # SQL strings are wrapped in text(): passing a bare str to
    # Session.execute() is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
    # NOTE: text() interprets ":name" as a bind parameter, so an inlined
    # literal containing such a pattern would need escaping.
    s.execute(text("ALTER SESSION SET current_schema = WOS_B_2020"))
    s.execute(text("EXPLAIN PLAN FOR " + raw_query))

    # The format is sent as a bound parameter rather than interpolated into
    # the SQL text, so quoting in `detail` cannot break the statement.
    out = s.execute(
        text("SELECT * FROM TABLE(DBMS_XPLAN.DISPLAY(format => :fmt))"),
        {"fmt": detail},
    ).fetchall()

    print("\n".join(str(row) for (row,) in out))

In [None]:
# Execution plan of the base query at the lowest detail level.
explain_query(base_query, detail="BASIC")

Plan hash value: 3576923419
 
------------------------------------------------------------------------------
| Id  | Operation                               | Name                       |
------------------------------------------------------------------------------
|   0 | SELECT STATEMENT                        |                            |
|   1 |  HASH JOIN                              |                            |
|   2 |   NESTED LOOPS                          |                            |
|   3 |    NESTED LOOPS OUTER                   |                            |
|   4 |     NESTED LOOPS                        |                            |
|   5 |      TABLE ACCESS FULL                  | SOURCES                    |
|   6 |      TABLE ACCESS BY INDEX ROWID BATCHED| ITEMS                      |
|   7 |       INDEX RANGE SCAN                  | FK_ITEMS_SOURCES           |
|   8 |     INDEX RANGE SCAN                    | IDX_ABSTRACTS_FK_ITEMS     |
|   9 |    TABLE ACCES

In [None]:
# Same plan request with the next higher detail level.
explain_query(base_query, detail="SERIAL")

Plan hash value: 364409137
 
----------------------------------------------------------------------------------------------------------------------
| Id  | Operation                               | Name                       | Rows  | Bytes | Cost (%CPU)| Time     |
----------------------------------------------------------------------------------------------------------------------
|   0 | SELECT STATEMENT                        |                            |  9603 |  1828K| 32436   (1)| 00:00:04 |
|   1 |  NESTED LOOPS                           |                            |  9603 |  1828K| 32436   (1)| 00:00:04 |
|   2 |   NESTED LOOPS                          |                            |  9603 |  1772K| 22832   (1)| 00:00:03 |
|   3 |    NESTED LOOPS OUTER                   |                            |  1724 |   294K| 15936   (1)| 00:00:02 |
|   4 |     NESTED LOOPS                        |                            |  1724 |   281K| 12487   (1)| 00:00:02 |
|*  5 |      TABLE 