In [None]:
from lbnlp.models.rolodex import print_models_info

print_models_info()

In [None]:
# Import the load function from the model package
from lbnlp.models.load.matscholar_2020v1 import load

# Load the "ner" model; the tagging cell further down calls ner_model.tag_doc on each abstract body.
ner_model = load("ner")

In [None]:
#!cde data download

From [entities database](https://figshare.com/articles/dataset/Entities_database/8184413) readme:

> Each document is indexed by it's digital object identifier (DOI) which may be used to find the original article. Each document contains the following entity types: material (MAT), sample descriptor (DSC), symmetry/phase label (SPL), property (PRO), application (APL), synthesis method (SMT), and characterization method (CMT).

See also published [entity normalization](https://figshare.com/articles/dataset/Entity_Normalization/8184365) json.

The tags use the so-called “inside-outside-beginning” tagging scheme ([Inside–outside–beginning (tagging) - Wikipedia](https://en.m.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging))) for multi-token entities. B- means beginning, I- means inside, O means outside. So, for example, B-MAT followed by I-MAT means it’s a 2-token MAT entity.

In [None]:
from glob import glob, iglob
import os

# Header and body files sit under the same dated snapshot directory; expand "~" once
# and derive both glob patterns from that root.
_snapshot_root = os.path.expanduser("~/Dropbox/diary/21/07/marda-dd-abstracts-data/2021-07-14")
paths_headers = _snapshot_root + "/headers/*/*.txt"
paths_bodies = _snapshot_root + "/bodies/*/*.txt"

In [None]:
len(glob(paths_headers, recursive=True))

In [None]:
len(glob(paths_bodies, recursive=True))

In [None]:
from pathlib import Path
import re

# Matches "10.<4-9 digits>", a run of whitespace/slash/dot separators, then the rest of
# the DOI up to the next whitespace.
doi_to_line_end = re.compile(r"10\.\d{4,9}[\s\/\.]+[^\s]+")

no_dois = []
doi_raw_to_pathkey = {}

for headerfile in iglob(paths_headers, recursive=True):
    # Key shared by a header and its body: the last two path components, with the
    # "_header.txt" suffix removed.
    pathkey = "/".join(Path(headerfile.replace("_header.txt", "")).parts[-2:])
    with open(headerfile, encoding='utf-8') as f:
        stripped_lines = [raw.strip() for raw in f]
    for text in stripped_lines:
        hit = doi_to_line_end.search(text)
        if hit:
            # Only the first DOI-looking string in the header is used.
            doi_raw_to_pathkey[hit.group(0)] = pathkey
            break
    else:
        # No line of this header contained a DOI.
        no_dois.append(pathkey)

In [None]:
len(no_dois)

In [None]:
# Same shape as the pattern that extracted the raw DOIs, but with the registrant prefix
# and the suffix captured separately. Splitting the raw string on every "." or "/"
# breaks as soon as the suffix itself contains one (e.g. "10.1016/j.x.2019.01").
doi_parts = re.compile(r"10\.(\d{4,9})[\s/.]+(\S+)")

# Normalized DOI ("10.<prefix>/<suffix>") -> path key of the file it was found in.
doi_to_pathkey = {}

for doi_raw, pathkey in doi_raw_to_pathkey.items():
    parts = doi_parts.fullmatch(doi_raw)
    if parts is None:
        raise ValueError(f"unrecognized raw DOI: {doi_raw!r}")
    pref, suff = parts.groups()
    doi_to_pathkey[f"10.{pref}/{suff}"] = pathkey

In [None]:
len(doi_to_pathkey)

In [None]:
# import requests
# from tqdm.notebook import tqdm

# for doi in tqdm(dois):
#     assert requests.head(f"https://doi.org/{doi}", allow_redirects=True).status_code == 200, doi

In [None]:
# Invert the DOI -> path key mapping so body files can be looked up by their path key.
pathkey_to_doi = {key: known_doi for known_doi, key in doi_to_pathkey.items()}

doi_to_bodypath = {}

for bodypath in iglob(paths_bodies, recursive=True):
    # Same key construction as for headers: last two path components, suffix removed.
    stem = bodypath.replace("_body_sent.txt", "")
    pathkey = "/".join(Path(stem).parts[-2:])
    matched_doi = pathkey_to_doi.get(pathkey)
    if matched_doi is not None:
        doi_to_bodypath[matched_doi] = bodypath

In [None]:
# Number of DOIs for which a body file was found.
print(len(doi_to_bodypath))
# Sanity check: as many DOIs have a body file as were collected from the headers.
assert len(doi_to_pathkey) == len(doi_to_bodypath)

In [None]:
import concurrent.futures

from pathlib import Path
import re

from tqdm.notebook import tqdm

doi_to_tags = {}

def get_tags(item):
    """Tag one abstract body and report the outcome keyed by DOI.

    Args:
        item: a ``(doi, bodypath)`` pair; ``bodypath`` is read as UTF-8 text.

    Returns:
        ``{doi: tags}`` on success, where ``tags`` is whatever ``ner_model.tag_doc``
        returns, or ``{"error": {doi: message}}`` if reading or tagging failed.
    """
    doi, bodypath = item

    try:
        # The read is inside the ``try`` so that one missing or undecodable file is
        # recorded as an error instead of aborting the whole batch.
        with open(bodypath, encoding='utf-8') as f:
            doc = f.read()
        return {doi: ner_model.tag_doc(doc)}
    except Exception as e:
        return {"error": {doi: str(e)}}

# DOI -> error message for every abstract that could not be tagged.
errors = {}

# takes ~ 1 hour on my laptop. -DW
# The context manager closes the progress bar even if the loop is interrupted.
with tqdm(total=len(doi_to_bodypath)) as pbar:
    for item in doi_to_bodypath.items():
        data = get_tags(item)
        if "error" in data:
            errors.update(data["error"])
        else:
            doi_to_tags.update(data)
        pbar.update(1)

In [None]:
len(doi_to_tags)

In [None]:
len(errors)

In [None]:
errors

In [None]:
import json

from monty.json import MontyEncoder

# Write the tagging results to disk as indented JSON. MontyEncoder is passed as the
# encoder class (presumably to serialize values plain `json` cannot — TODO confirm).
with open(os.path.expanduser("~/Dropbox/diary/21/07/doi_to_tags.json"), "w") as f:
    json.dump(doi_to_tags, f, cls=MontyEncoder, indent=2)

In [None]:
import json

# Read the saved tags back from disk, replacing the in-memory `doi_to_tags`.
with open(os.path.expanduser("~/Dropbox/diary/21/07/doi_to_tags.json")) as f:
    doi_to_tags = json.load(f)

In [None]:
from enum import Enum

class EntityType(str, Enum):
    """Category codes accepted for an entry's `cat` field.

    Mixes in `str`, so each member compares equal to its string value. The first
    seven meanings are taken from the entities database readme quoted in this notebook.
    """
    MAT = "MAT"  # material
    DSC = "DSC"  # sample descriptor
    SPL = "SPL"  # symmetry/phase label
    PRO = "PRO"  # property
    APL = "APL"  # application
    SMT = "SMT"  # synthesis method
    CMT = "CMT"  # characterization method
    # Not sure what PUT and PVL are. Including here for data validation only.
    PUT = "PUT"
    PVL = "PVL"

In [None]:
from typing import Optional

from pydantic import BaseModel

class Entry(BaseModel):
    """One named entity found in one sentence of an abstract."""
    doi: str # digital object identifier (DOI) of the abstract
    ne: str # named entity text (tokens joined with single spaces)
    cat: EntityType # entity category
    idx_s: int # sentence index from start (first sentence => 0)
    n_sents: int # number of sentences in this abstract
    raw_s: Optional[str] = "" # raw sentence string (" ".join of the sentence's tokens); defaults to ""

In [None]:
json.loads(Entry(doi="3", ne="a", cat="MAT", idx_s=0, n_sents=1).json())

From [entities database](https://figshare.com/articles/dataset/Entities_database/8184413) readme:

> Each document is indexed by it's digital object identifier (DOI) which may be used to find the original article. Each document contains the following entity types: material (MAT), sample descriptor (DSC), symmetry/phase label (SPL), property (PRO), application (APL), synthesis method (SMT), and characterization method (CMT).

See also published [entity normalization](https://figshare.com/articles/dataset/Entity_Normalization/8184365) json.

The tags use the so-called “inside-outside-beginning” tagging scheme ([Inside–outside–beginning (tagging) - Wikipedia](https://en.m.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging))) for multi-token entities. B- means beginning, I- means inside, O means outside. So, for example, B-MAT followed by I-MAT means it’s a 2-token MAT entity.

In [None]:
from pprint import pprint

from tqdm.notebook import tqdm

def sentence_entries(doi, idx_s, n_sents, sentence):
    """Collect the named entities of one tagged sentence from its IOB labels.

    Args:
        doi: DOI of the abstract the sentence belongs to.
        idx_s: zero-based index of the sentence within the abstract.
        n_sents: number of sentences in the abstract.
        sentence: sequence of ``(token, label)`` pairs; ``label`` is ``"O"`` or
            ``"<qualifier>-<category>"`` with qualifier ``"B"`` or ``"I"``.

    Returns:
        List of entry dicts with keys ``doi``, ``ne``, ``cat``, ``idx_s``, ``n_sents``
        and ``raw_s``. Only the first entry of the sentence carries the raw sentence
        text in ``raw_s``; later entries get ``""``.
    """
    raw_sentence = " ".join(token for token, _ in sentence)
    found = []
    current = None  # entity being assembled, if any
    for token, label in sentence:
        if label == "O":
            if current is not None:
                found.append(current)
                current = None
            continue
        # Split on the first "-" only, so a category containing "-" stays intact.
        qualifier, category = label.split("-", 1)
        # An "I-" tag with no open entity (should be impossible in theory, but occurs
        # in practice) starts a new entity just like a "B-" tag.
        if qualifier == "B" or (qualifier == "I" and current is None):
            if current is not None:
                # A new entity begins right after another one with no "O" in between;
                # keep the finished one instead of overwriting it.
                found.append(current)
            current = {
                "doi": doi,
                "ne": token,
                "cat": category,
                "idx_s": idx_s,
                "n_sents": n_sents,
                "raw_s": "" if found else raw_sentence,
            }
        elif qualifier == "I":
            current["ne"] += f" {token}"
    if current is not None:
        # The sentence ended while an entity was still open.
        found.append(current)
    return found


entries = []

for doi, tags in tqdm(list(doi_to_tags.items())):
    n_sents = len(tags)
    for idx, sentence in enumerate(tags):
        entries.extend(sentence_entries(doi, idx, n_sents, sentence))

In [None]:
import csv

from tqdm.notebook import tqdm

import csv

# newline='' is what the csv module requires for files handed to its writers (otherwise
# platforms that translate "\n" emit an extra "\r" per row); UTF-8 is set explicitly
# so non-ASCII text is written the same way regardless of the locale.
with open('marda-dd-abstracts-matscholar.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ["ne", "cat", "doi", "n_sents", "idx_s", "raw_s"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for e in tqdm(entries):
        # Building an Entry validates the dict (including the category) before writing.
        entry = Entry(**e)
        if entry.cat not in (EntityType.PUT, EntityType.PVL):
            # json (de)serialization so that category is a string, not a Python <enum 'EntityType'>
            writer.writerow(json.loads(entry.json()))

In [None]:
!wc -l marda-dd-abstracts-matscholar.csv

In [None]:
!du -h marda-dd-abstracts-matscholar.csv

In [None]:
!head marda-dd-abstracts-matscholar.csv

In [None]:
!zip marda-dd-abstracts-matscholar.zip marda-dd-abstracts-matscholar.csv

In [None]:
from pymongo import MongoClient

# No connection arguments are given, so MongoClient uses its defaults.
client = MongoClient()
dbname = "marda_dd_abstracts"
# NOTE: destructive — any existing database with this name is dropped before the
# collections are (re)populated by the cells that follow.
client.drop_database(dbname)
db = client[dbname]

In [None]:
# Keep every entity except those in the PVL and PUT categories.
# NOTE(review): `raw_s` is only filled in on the first entity created for a sentence, so
# when that entity is PVL/PUT the sentence's raw text is not present in `entry_docs`.
entry_docs = [e for e in entries if e["cat"] not in ("PVL", "PUT")]

In [None]:
from toolz import keyfilter

def pick(whitelist, d):
    """Filter `d` through toolz `keyfilter`, keeping only keys contained in `whitelist`."""
    def is_allowed(key):
        return key in whitelist

    return keyfilter(is_allowed, d)

In [None]:
# One document per distinct (doi, n_sents) pair among the kept entities.
abstract_keys = {(entity["doi"], entity["n_sents"]) for entity in entry_docs}
docs_abstracts = [dict(zip(("doi", "n_sents"), key)) for key in abstract_keys]
db.abstracts.insert_many(docs_abstracts)
db.abstracts.create_index("doi")

In [None]:
entry_docs[0]

In [None]:
# Only the first entity created for a sentence carries `raw_s`, and that entity may be
# one of the PVL/PUT entities filtered out of `entry_docs`. Take the raw text from the
# unfiltered `entries`, restricted to sentences that still have at least one kept
# entity, so those sentences are not silently missing from the collection.
kept_sentence_keys = {(e["doi"], e["idx_s"]) for e in entry_docs}
docs_sentences = [
    {"doi": doi, "idx_s": idx_s, "raw_s": raw_s}
    for doi, idx_s, raw_s in
    {
        (e["doi"], e["idx_s"], e["raw_s"])
        for e in entries
        if e.get("raw_s") and (e["doi"], e["idx_s"]) in kept_sentence_keys
    }
]
db.sentences.insert_many(docs_sentences)
db.sentences.create_index([("doi", 1), ("idx_s", 1)])

In [None]:
# One document per distinct (doi, idx_s, ne, cat) combination; going through a set
# collapses exact duplicates within the same sentence.
tagging_fields = ("doi", "idx_s", "ne", "cat")
unique_taggings = {tuple(e[field] for field in tagging_fields) for e in entry_docs}
docs_taggings = [dict(zip(tagging_fields, values)) for values in unique_taggings]
db.taggings.insert_many(docs_taggings)

In [None]:
db.taggings.count_documents({})

In [None]:
# Run a $sortByCount aggregation over the `ne` field of db.taggings and materialize
# the returned cursor into a list.
results = list(db.taggings.aggregate([
    {"$sortByCount": "$ne"}
]))

In [None]:
results[:10]

In [None]:
from glob import glob
import json

# Entity string -> its "most_common" form, merged across all normalization files.
entity_map = {}

# Sorted so that, if two files define the same key, the winner does not depend on the
# order in which the filesystem happens to list the files.
for path in sorted(glob("matscholar_entity_normalization/*.json")):
    with open(path, encoding="utf-8") as f:
        doc = json.load(f)
    entity_map.update({k: v["most_common"] for k, v in doc.items()})

In [None]:
from toolz import assoc, dissoc

# Copy every tagging without its Mongo "_id" and replace `ne` with its normalized form
# when `entity_map` has one (otherwise the original string is kept).
docs_taggings_normalized = []
for tagging in db.taggings.find():
    without_id = dissoc(tagging, "_id")
    normalized_ne = entity_map.get(tagging["ne"], tagging["ne"])
    docs_taggings_normalized.append(assoc(without_id, "ne", normalized_ne))
db.taggings_normalized.insert_many(docs_taggings_normalized)

In [None]:
db.taggings_normalized.count_documents({})

In [None]:
# Same $sortByCount aggregation as before, now over db.taggings_normalized; this rebinds
# `results`. The cells below read the "_id" and "count" keys of each result.
results = list(db.taggings_normalized.aggregate([
    {"$sortByCount": "$ne"}
]))

In [None]:
len(results)

In [None]:
len([r for r in results if r["count"] >= 1000])

In [None]:
len([r for r in results if r["count"] >= 500])

In [None]:
# A bare `len()` raises TypeError; count the entities with at least 100 occurrences,
# continuing the >= 1000 / >= 500 series and matching the report written next.
len([r for r in results if r["count"] >= 100])

In [None]:
# Entities with at least 100 occurrences, in the order they appear in `results`.
gte100 = [r for r in results if r["count"] >= 100]
n_gte100 = len(gte100)
# One line per entity: "<rank>/<total>: (<count>)  <entity>". UTF-8 is set explicitly
# so non-ASCII entity text is written the same way regardless of the locale.
with open("matscholar_taggings_normalized_gte100_occurrences.txt", "w", encoding="utf-8") as f:
    for rank, r in enumerate(gte100, start=1):
        f.write(f'{rank:03}/{n_gte100}: ({r["count"]:5})  {r["_id"]}\n')


In [None]:
# Count record for "resolution"; the None default avoids StopIteration if it is absent.
next((r for r in results if r["_id"] == "resolution"), None)

In [None]:
# Count record for "pitch"; the None default avoids StopIteration if it is absent.
next((r for r in results if r["_id"] == "pitch"), None)

In [None]:
# Count record for "line width"; the None default avoids StopIteration if it is absent.
next((r for r in results if r["_id"] == "line width"), None)

In [None]:
# Count record for "LWR"; the None default avoids StopIteration if it is absent
# (for example because normalization mapped it to another form).
next((r for r in results if r["_id"] == "LWR"), None)

In [None]:
# Count record for "line width roughness"; the None default avoids StopIteration if it is absent.
next((r for r in results if r["_id"] == "line width roughness"), None)

In [None]:
# Count record for "LER"; the None default avoids StopIteration if it is absent
# (for example because normalization mapped it to another form).
next((r for r in results if r["_id"] == "LER"), None)

In [None]:
# Count record for "line edge roughness"; the None default avoids StopIteration if it is absent.
next((r for r in results if r["_id"] == "line edge roughness"), None)

In [None]:
!mongoexport -d marda_dd_abstracts -c abstracts -o ~/marda_dd_abstracts.abstracts.jsonl

In [None]:
!mongoexport -d marda_dd_abstracts -c sentences -o ~/marda_dd_abstracts.sentences.jsonl

In [None]:
!mongoexport -d marda_dd_abstracts -c taggings -o ~/marda_dd_abstracts.taggings.jsonl

In [None]:
!mongoexport -d marda_dd_abstracts -c taggings_normalized -o ~/marda_dd_abstracts.taggings_normalized.jsonl

In [None]:
!gzip -kvf ~/marda_dd_abstracts.*.jsonl