In [1]:
import operator 
import re

from collections.abc import Iterable
from collections.abc import Generator
from collections.abc import Callable

from typing import Any

import contractions
import pandas as pd

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from stop_words import get_stop_words

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from catalog import load_catalog

In [2]:
# Load the catalog through the project-local `catalog` module. The cells below
# pass it to traverse_catalog as an iterable of nested dict nodes.
catalog = load_catalog()

In [3]:
def get_title(node: dict) -> str:
  """Return the title stored under ``node['data']['title']``."""
  data = node['data']
  return data['title']


def format_title_leveled(title: str, depth: int) -> str:
  """Prefix ``title`` with ``depth - 1`` spaces so nesting shows as indentation."""
  indent = ' ' * (depth - 1)
  return f"{indent}{title}"


def get_title_callback(node: dict, *args) -> str:
  """Adapter giving ``get_title`` a callback-style signature.

  Any extra positional arguments (such as the depth passed by
  ``traverse_catalog``) are accepted and ignored.
  """
  title = get_title(node)
  return title

In [4]:
def traverse_catalog(
  catalog:   Iterable[dict],
  callback:  Callable[[dict, int], Any] | None = None,
  max_depth: int | None = None,
) -> Generator[str, None, None]:    

  def visit(node: dict, depth: int = 1):
    yield callback(node, depth)

    if max_depth is not None and depth >= max_depth:
      return

    children = node.get('children')
    if children:
      depth += 1
      for child in children:
        yield from visit(child, depth)    
  
  for tree in catalog:
    yield from visit(tree) 

In [5]:
def count_chars(strings: Iterable[str]) -> dict[str, int]:
  """Count how often each character occurs across all of ``strings``.

  Keys appear in order of first occurrence.
  """
  counts: dict[str, int] = {}

  for text in strings:
    for ch in text:
      counts[ch] = counts.get(ch, 0) + 1

  return counts

In [6]:
# Character frequencies over every raw catalog title (all depths).
df_chars_original = pd.DataFrame.from_records(
  data    = list(count_chars(traverse_catalog(catalog, callback=get_title_callback)).items()),
  columns = ('char', 'n'),
)

# Show only characters outside A-Z / a-z, most frequent first.
df_chars_original[
  ~(
      df_chars_original['char'].between('A', 'Z')
    | df_chars_original['char'].between('a', 'z')
  )
].sort_values('n', ascending=False)

Unnamed: 0,char,n
8,,5700
17,&,1032
43,:,271
37,",",269
42,-,117
61,0,115
50,(,91
51,),91
21,/,80
58,1,50


In [7]:
# Fragments deleted outright from a lower-cased title (first pass of clean_title).
REMOVE_REGEX_PATTENS_1 = re.compile("(%s)" % "|".join([
  r"[&,()]",
  r"\betc\b",
  r"\beg\b",
  # Dots are escaped so only the literal abbreviation matches; unescaped, the
  # pattern also deleted words such as "edge" or "eggs". A trailing \b can never
  # match between "." and a space, hence the negative lookahead instead.
  r"\be\.g\.(?!\w)",
  # Date-like fragments such as "c 1500 bce", "1500/1600 ce", "c 1500"
  # (presumably "circa" dates; verify against the catalog).
  r"\bc \d+(/\d+)? (ce|bce)\b",
  r"\b\d+(/\d+)? (ce|bce)\b",
  r"\bc \d+(/\d+)?\b",
  # Trailing run of non-word characters.
  r"\W+$",
]))

# Separators replaced by a single space: slashes, colons, and a dash that has
# whitespace on both sides.
SPACE_REGEX_PATTENS = re.compile("(%s)" % "|".join((
  r"[/:]",
  r"\s+\-\s+",
)))

# (compiled pattern, replacement) pairs applied in order by clean_title.
REPLACE_REGEX_PATTERNS = tuple(
  (re.compile(pattern), replacement)
  for pattern, replacement in (
    (r"\belt\b",    "english language teaching"),
    (r"\bya\b",     "young adult"),
    (r"\br'n'b\b",  "rnb"),
  )
)

# Apostrophes still present after the contraction pass in clean_title.
REMOVE_REGEX_PATTENS_2 = re.compile("(%s)" % "|".join((
  r"[']",
)))


# Union of NLTK's English stop-word list and the `stop_words` package's 'en' list.
STOP_WORDS = set(stopwords.words("english")).union(get_stop_words('en'))

def is_stop_word(s: str) -> bool:
  """Return True when ``s`` is a member of ``STOP_WORDS`` (exact match)."""
  return s in STOP_WORDS


# Hyphens and underscores both separate the parts of a compound token.
SPLIT_TOKEN_REGEX = re.compile(r"[-_]")

def split_token(s: str) -> list[str]:
  """Split ``s`` on every hyphen or underscore.

  Returns a list, which callers rely on when they take ``len()`` of the result.
  A token without separators comes back as a one-element list; leading,
  trailing or doubled separators produce empty strings.
  """
  return SPLIT_TOKEN_REGEX.split(s)


def clean_title(s: str) -> str:
  """Normalise a catalog title into a space-separated string of tokens.

  Steps, in order: lower-case; delete the ``REMOVE_REGEX_PATTENS_1`` fragments;
  turn ``SPACE_REGEX_PATTENS`` separators into spaces; apply the
  ``REPLACE_REGEX_PATTERNS`` substitutions; expand contractions; delete the
  ``REMOVE_REGEX_PATTENS_2`` characters; tokenize; drop stop words and any token
  equal to the previously kept one.
  """
  text = REMOVE_REGEX_PATTENS_1.sub("", s.lower())
  text = SPACE_REGEX_PATTENS.sub(" ", text)

  for regex, replacement in REPLACE_REGEX_PATTERNS:
    text = regex.sub(replacement, text)

  text = REMOVE_REGEX_PATTENS_2.sub("", contractions.fix(text))

  kept = []
  for word in word_tokenize(text):
    if is_stop_word(word):
      continue
    # Compare against the last *kept* token, so a repeat separated only by
    # stop words is still collapsed.
    if kept and kept[-1] == word:
      continue
    kept.append(word)

  return " ".join(kept)

In [8]:
# Preview cleaned titles as an indented outline, three levels deep at most.
for line in traverse_catalog(
  catalog,
  max_depth = 3,
  callback  = lambda node, depth: format_title_leveled(clean_title(get_title(node)), depth),
):
  print(line)

medical
 medicine
  medical profession
  medical equipment techniques
  public health preventive medicine
  health systems services
  medicolegal issues
  medical sociology
  history medicine
 pre-clinical medicine basic sciences
  anatomy
  physiology
  human reproduction growth development
  medical genetics
 clinical internal medicine
  medical diagnosis
  diseases disorders
  cardiovascular medicine
  musculoskeletal medicine
  haematology
  endocrinology
  gastroenterology
  hepatology
  dermatology
  respiratory medicine
  rheumatology
  neurology clinical neurophysiology
  otorhinolaryngology ent
  ophthalmology
  renal medicine nephrology
  urology urogenital medicine
  gynaecology obstetrics
  paediatric medicine
  geriatric medicine
  gene therapy
 branches medicine
  anaesthetics
  palliative medicine
  dentistry
  pathology
  pharmacology
  psychiatry
  clinical psychology
  accident emergency medicine
  nuclear medicine
  medical imaging
  forensic medicine
  environmental

In [9]:
# Character frequencies over the cleaned titles (all depths).
df_chars_clean = pd.DataFrame.from_records(
  data = list(count_chars(traverse_catalog(
    catalog,
    callback = lambda node, _depth: clean_title(get_title(node)),
  )).items()),
  columns = ('char', 'n'),
)

# Side-by-side counts: n_x = raw titles, n_y = cleaned titles. Characters that
# appear on only one side get 0 on the other.
df_chars_cmp = (
  df_chars_original
  .merge(df_chars_clean, how='outer', on='char')
  .fillna(0)
  .astype({"n_y": int})
  .assign(n_diff=lambda df: df['n_y'] - df['n_x'])
)

# Show only characters outside A-Z / a-z, ordered by their raw frequency.
df_chars_cmp[
  ~(
      df_chars_cmp['char'].between('A', 'Z')
    | df_chars_cmp['char'].between('a', 'z')
  )
].sort_values('n_x', ascending=False)

Unnamed: 0,char,n_x,n_y,n_diff
8,,5700,4212,-1488
17,&,1032,0,-1032
43,:,271,0,-271
37,",",269,0,-269
42,-,117,111,-6
61,0,115,11,-104
50,(,91,0,-91
51,),91,0,-91
21,/,80,0,-80
58,1,50,4,-46


In [10]:
# Character frequencies over every WordNet lemma name.
df_chars_wn = pd.DataFrame.from_records(
  data    = list(count_chars(wn.all_lemma_names()).items()),
  columns = ('char', 'n'),
)

# Show only characters outside A-Z / a-z, most frequent first.
df_chars_wn[
  ~(
      df_chars_wn['char'].between('A', 'Z')
    | df_chars_wn['char'].between('a', 'z')
  )
].sort_values('n', ascending=False)

Unnamed: 0,char,n
10,_,76399
2,-,7655
39,',1303
0,.,603
16,1,274
15,0,169
1,2,151
13,4,108
14,5,105
11,3,97


In [11]:
# Histogram: number of parts (split on "-" / "_") -> how many WordNet lemma
# names have that many parts. Single-part lemmas are not counted.
wn_complex_lemmas_stats = dict()

for lemma in wn.all_lemma_names():
  chunks_n = len(split_token(lemma))
  if chunks_n <= 1:
    continue
  wn_complex_lemmas_stats[chunks_n] = wn_complex_lemmas_stats.get(chunks_n, 0) + 1

df_chunks_wn = pd.DataFrame.from_records(
  data    = list(wn_complex_lemmas_stats.items()),
  columns = ('chunk_len', 'n'),
)
df_chunks_wn.sort_values('n', ascending=False)

Unnamed: 0,chunk_len,n
0,2,57758
1,3,9406
3,4,1751
2,5,342
5,6,89
4,7,27
6,8,24
7,9,11


In [12]:
# Group WordNet lemma names with five or more parts by their part count.
too_complex_lemmas: dict[int, set] = {}

for lemma in wn.all_lemma_names():
  chunks_n = len(split_token(lemma))
  if chunks_n < 5:
    continue
  too_complex_lemmas.setdefault(chunks_n, set()).add(lemma)

# Longest compounds first; lemmas are listed alphabetically within a group.
for chunks_n, group in sorted(too_complex_lemmas.items(), reverse=True):
  lemmas = ', '.join(sorted(group))
  print(f"{chunks_n}: {lemmas}")

9: abul-walid_mohammed_ibn-ahmad_ibn-mohammed_ibn-roshd, american_federation_of_labor_and_congress_of_industrial_organizations, cooper_union_for_the_advancement_of_science_and_art, first_epistle_of_paul_the_apostle_to_the_corinthians, first_epistle_of_paul_the_apostle_to_the_thessalonians, international_islamic_front_for_jihad_against_jews_and_crusaders, popular_front_for_the_liberation_of_palestine-general_command, prayer_of_azariah_and_song_of_the_three_children, second_epistle_of_paul_the_apostle_to_the_corinthians, second_epistle_of_paul_the_apostle_to_the_thessalonians, united_nations_office_for_drug_control_and_crime_prevention
8: abu_ali_al-husain_ibn_abdallah_ibn_sina, al-jama'a_al-islamiyyah_al-muqatilah_bi-libya, armenian_secret_army_for_the_liberation_of_armenia, baron_friedrich_wilhelm_ludolf_gerhard_augustin_von_steuben, baronne_anne_louise_germaine_necker_de_steal-holstein, blood-oxygenation_level_dependent_functional_magnetic_resonance_imaging, church_of_jesus_christ_of_

In [13]:
# Baseline: share of distinct cleaned-title tokens that have at least one
# WordNet synset when looked up as-is.
title_tokens_original       = set()
title_tokens_original_hit_n = 0

for title in traverse_catalog(
  catalog,
  callback = lambda node, _depth: clean_title(get_title(node)),
):
  for token in word_tokenize(title):
    if token in title_tokens_original:
      continue
    title_tokens_original.add(token)
    if wn.synsets(token):
      title_tokens_original_hit_n += 1

title_tokens_original_n = len(title_tokens_original)
title_tokens_original_hit_ratio = title_tokens_original_hit_n / title_tokens_original_n
# Only the counts are needed from here on; release the token set.
title_tokens_original.clear()

print("original title tokens hit ratio: {:.3f}% ({:} / {:})".format(
  title_tokens_original_hit_ratio * 100,
  title_tokens_original_hit_n,
  title_tokens_original_n,
))

original title tokens hit ratio: 93.191% (2409 / 2585)


In [16]:
# Shared normalisers, reused by the cells below.
lemmatize = WordNetLemmatizer().lemmatize
stem      = PorterStemmer().stem

# Count how many distinct keys remain when each lemma name is split on "-" / "_",
# its parts are lemmatized (and then stemmed), and the parts are glued back together.
wn_n_original = 0
wn_lemmatized = set()
wn_stemmed    = set()

for wn_n_original, lemma in enumerate(wn.all_lemma_names(), start=1):
  lemmatized = [lemmatize(part) for part in split_token(lemma)]
  stemmed    = [stem(part) for part in lemmatized]

  wn_lemmatized.add("".join(lemmatized))
  wn_stemmed.add("".join(stemmed))

wn_n_lemmatized = len(wn_lemmatized)
wn_n_stemmed    = len(wn_stemmed)

# Only the sizes are needed afterwards; release the sets.
wn_lemmatized.clear()
wn_stemmed.clear()

In [17]:
# How much each normalisation shrinks the set of distinct lemma keys,
# both as an absolute count and as a share of the original.
wn_n_lemmatized_delta       = wn_n_original - wn_n_lemmatized
wn_n_lemmatized_delta_ratio = wn_n_lemmatized_delta / wn_n_original

wn_n_stemmed_delta       = wn_n_original - wn_n_stemmed
wn_n_stemmed_delta_ratio = wn_n_stemmed_delta / wn_n_original

print(f"wn N original: {wn_n_original}")
print(f"wn N lemmatized: {wn_n_lemmatized}, Δ: {wn_n_lemmatized_delta_ratio*100:.3f}% ({wn_n_lemmatized_delta})")
print(f"wn N stemmed: {wn_n_stemmed}, Δ: {wn_n_stemmed_delta_ratio*100:.3f}% ({wn_n_stemmed_delta})")

wn N original: 147306
wn N lemmatized: 145946, Δ: 0.923% (1360)
wn N stemmed: 127513, Δ: 13.437% (19793)


In [18]:
# Lookup table: key -> set of WordNet lemma names that produce it. Each lemma is
# registered under three keys: itself, its lemmatized parts joined together, and
# its stemmed parts joined together.
wn_lemmas_clean = dict()

for lemma in wn.all_lemma_names():
  lemmatized = [lemmatize(part) for part in split_token(lemma)]
  stemmed    = [stem(part) for part in lemmatized]

  for key in (lemma, "".join(lemmatized), "".join(stemmed)):
    wn_lemmas_clean.setdefault(key, set()).add(lemma)

In [19]:
# Match cleaned-title tokens against the normalised WordNet lookup table.
title_tokens_clean_hit = set()
title_tokens_clean_not_hit = set()

for title in traverse_catalog(
  catalog,
  callback = lambda node, _depth: clean_title(get_title(node)),
):
  for token in word_tokenize(title):
    lemmatized = [lemmatize(part) for part in split_token(token)]
    stemmed    = [stem(part) for part in lemmatized]

    # Candidate keys from most to least specific: raw, lemmatized, stemmed.
    candidates = (token, "".join(lemmatized), "".join(stemmed))

    # Already counted as a hit under any of its forms.
    if any(candidate in title_tokens_clean_hit for candidate in candidates):
      continue

    # Record the first form the lookup table knows; otherwise the raw token is a miss.
    matched = next(
      (candidate for candidate in candidates if candidate in wn_lemmas_clean),
      None,
    )
    if matched is None:
      title_tokens_clean_not_hit.add(token)
    else:
      title_tokens_clean_hit.add(matched)

  # TODO: consider subtokens as separate words
  # TODO: consider 2grams/3grams
  # TODO: consider subwords

title_tokens_clean_hit_n = len(title_tokens_clean_hit)
title_tokens_clean_not_hit_n = len(title_tokens_clean_not_hit)
title_tokens_clean_n = title_tokens_clean_hit_n + title_tokens_clean_not_hit_n
title_tokens_clean_hit_ratio = title_tokens_clean_hit_n / title_tokens_clean_n

print("clean title tokens hit ratio: {:.3f}% ({:} / {:})".format(
  title_tokens_clean_hit_ratio * 100,
  title_tokens_clean_hit_n,
  title_tokens_clean_n,
))

clean title tokens hit ratio: 94.118% (2208 / 2346)


In [160]:
# Iterator over the missed tokens in alphabetical order, for stepping through
# them one at a time in the next cell.
it = iter(sorted(title_tokens_clean_not_hit))

In [170]:
# Advance to the next missed token and show it with its lookup-table entry.
# NOTE(review): each execution consumes one item from `it`, so the displayed
# value depends on how many times this cell has been run; it will not
# reproduce under Restart & Run All.
v = next(it)
v, wn_lemmas_clean.get(v)

('biomechanics', None)