In [47]:
import pandas as pd
import numpy as np
from cytoolz import *
from tqdm.auto import tqdm
tqdm.pandas()

In [48]:
# Read the existing term list: each line of terms.txt becomes a list of its
# whitespace-separated tokens. The context manager closes the file handle,
# which a bare open() inside the comprehension would leave open.
with open('terms.txt') as f:
    terms = [line.split() for line in f]

In [63]:
terms

[['part', '-', 'of', '-', 'speech', 'tagging'],
 ['word', '-', 'to', '-', 'word'],
 ['part', '-', 'of', '-', 'speech'],
 ['state', '-', 'ofthe', '-', 'art'],
 ['tree', '-', 'to', '-', 'string'],
 ['-', 'fold', 'cross', '-', 'validation'],
 ['end', '-', 'to', '-', 'end'],
 ['state', '-', 'of', '-', 'theart'],
 ['sequence', '-', 'to', '-', 'sequence'],
 ['context', '-', 'free', 'grammar'],
 ['right', '-', 'hand', 'side'],
 ['log', '-', 'linear', 'model'],
 ['-', 'fold', 'cross', 'validation'],
 ['multi', '-', 'document', 'summarization'],
 ['inter', '-', 'annotator', 'agreement'],
 ['semi', '-', 'supervised', 'learning'],
 ['multi', '-', 'task', 'learning'],
 ['pre', '-', 'trained', 'word'],
 ['natural', 'language', 'processing'],
 ['high', '-', 'level'],
 ['low', '-', 'level'],
 ['machine', 'translation', 'system'],
 ['first', '-', 'order'],
 ['sentence', '-', 'level'],
 ['predicate', '-', 'argument'],
 ['natural', 'language', 'generation'],
 ['point', 'of', 'view'],
 ['part', 'of', 'sp

In [50]:
# Load the micusp.parquet dataset straight from S3. storage_options is passed
# through to the S3 filesystem layer; 'anon': True asks for anonymous access,
# so no AWS credentials are required.
df = pd.read_parquet('s3://ling583/micusp.parquet', storage_options={'anon':True})

In [51]:
df.head()

Unnamed: 0,filename,text
0,micusp/BIO.G0.15.1.html,"New York City, 1908: different colors of skin..."
1,micusp/BIO.G1.04.1.html,\tThe fish-tetrapod transition has been calle...
2,micusp/BIO.G3.03.1.html,\tIntracellular electric fields are of great ...
3,micusp/BIO.G0.11.1.html,Environmental stresses to plants have been st...
4,micusp/BIO.G1.01.1.html,\tThe recurrent cholera pandemics have been re...


In [6]:
len(df)

788

In [54]:
import spacy

In [55]:
# en_core_web_sm is an English pipeline built on data from the web.
# "sm" denotes the small model; larger models are available, but this task
# only needs tokenization and part-of-speech tags.
# Excluded components:
#   parser          - finds the syntactic structure of sentences
#   ner             - Named Entity Recognizer; pulls out names of people and places
#   lemmatizer      - strips inflection and morphology from words to find their root
#   attribute_ruler - rule-based assignment of extra token attributes
#                     (NOTE(review): earlier note said "gender of pronouns and more" - confirm)
nlp = spacy.load('en_core_web_sm', exclude=['parser', 'ner', 'lemmatizer', 'attribute_ruler'])

In [56]:
# These are the components that are left after the exclusions:
#   tok2vec (token-to-vector) - produces the token representations that the tagger consumes
#   tagger                    - tags each token with its part of speech
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fe57df6a1d0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fe57debf310>)]

In [57]:
from spacy.matcher import Matcher

In [58]:
# Build a rule-based matcher that shares the vocabulary of the pipeline loaded above.
matcher = Matcher(nlp.vocab)

# Candidate-term pattern, written over fine-grained part-of-speech tags ('TAG'):
#   JJ = adjective, NN = noun, IN = preposition, HYPH = hyphen.
# Inside a token spec the key 'IN' means "the value is one of this list"; it is
# unrelated to the preposition tag 'IN'. 'OP' is a quantifier that works like a
# regular expression operator ('*' = zero or more times).
term_pattern = [
    {'TAG': {'IN': ['JJ', 'NN']}},                            # first token: adjective or noun
    {'TAG': {'IN': ['JJ', 'NN', 'IN', 'HYPH']}, 'OP': '*'},   # any number of adj/noun/prep/hyphen
    {'TAG': 'NN'},                                            # last token: noun
]
matcher.add('Term', [term_pattern])
# In short: a noun/adjective, followed by any number of
# adjectives/nouns/prepositions/hyphens, ending with a noun.

In [59]:
# `doc` is not defined anywhere in the visible notebook, so this cell raised a
# NameError on a fresh kernel (Restart & Run All). Build it here from one essay.
# NOTE(review): the recorded output ('tetrapod', 'transition') suggests the sample
# was row 1 ("The fish-tetrapod transition ..."); confirm the intended row.
doc = nlp(df['text'].iloc[1])
spans = matcher(doc, as_spans=True)

In [60]:
tuple(tok.norm_ for tok in spans[0])

('tetrapod', 'transition')

In [61]:
def get_candidates(text):
    """Return all candidate terms found in ``text``.

    The text is tokenized and tagged by ``nlp``; every span that ``matcher``
    finds in the result is converted to a tuple of its tokens' normalized
    strings (``norm_``). The return value is a list of those tuples.
    """
    tagged = nlp(text)
    found = []
    for span in matcher(tagged, as_spans=True):
        found.append(tuple(tok.norm_ for tok in span))
    return found

In [62]:
# Run candidate extraction on the text of every row (progress_apply shows a
# tqdm progress bar), then flatten the per-document lists into one list of
# candidate tuples with concat.
candidates = list(concat(df['text'].progress_apply(get_candidates)))

  0%|          | 0/788 [00:00<?, ?it/s]

In [64]:
from collections import defaultdict, Counter

In [65]:
# Frequency table grouped by candidate length:
#   freqs[k]       -> Counter of all candidates that are k tokens long
#   freqs[k][cand] -> number of times that candidate occurred
freqs = defaultdict(Counter)
for candidate, count in Counter(candidates).items():
    freqs[len(candidate)][candidate] = count

In [66]:
freqs.keys()

dict_keys([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

In [67]:
from nltk import ngrams

In [68]:
def get_subterms(term):
    """Yield every contiguous sub-sequence of ``term`` that is shorter than it.

    Sub-sequence lengths run from ``len(term) - 1`` down to 2; for each length
    the n-grams are produced by ``ngrams`` in left-to-right order. Nothing is
    yielded for terms of length 2 or less.
    """
    for width in reversed(range(2, len(term))):
        for gram in ngrams(term, width):
            yield gram

In [69]:
from math import log2

In [70]:
def c_value(F, theta):
    """Score candidate terms with a C-value measure and keep those above ``theta``.

    Parameters
    ----------
    F : mapping of int -> Counter
        Candidate frequencies grouped by length: ``F[k][term]`` is the number
        of occurrences of the ``k``-token tuple ``term``. ``F`` is only read;
        it may be a plain dict or a defaultdict and is never modified.
    theta : number
        Threshold; a candidate is accepted only if its score is strictly
        greater than ``theta``.

    Returns
    -------
    Counter
        Maps each accepted term to its score
        ``log2(len(term)) * (freq - discount)``, where ``discount`` is the mean
        frequency of the already-accepted longer terms that contain it
        (0 when there are none).
    """
    # Scores of the terms accepted so far
    termhood = Counter()

    # For each shorter candidate, the frequencies of accepted longer terms containing it
    longer = defaultdict(list)

    # k is the sequence length; longest first, so containers are scored before their parts
    for k in sorted(F, reverse=True):
        for term, freq in F[k].items():
            nested_in = longer.get(term)
            # No accepted longer term contains this one -> no discount
            discount = sum(nested_in) / len(nested_in) if nested_in else 0
            c = log2(k) * (freq - discount)
            if c > theta:
                termhood[term] = c
                for subterm in get_subterms(term):
                    # .get() rather than F[...]: indexing a defaultdict would insert an
                    # empty Counter for lengths with no candidates (mutating the caller's
                    # data), and indexing a plain dict would raise KeyError.
                    if subterm in F.get(len(subterm), ()):
                        longer[subterm].append(freq)
    return termhood

In [74]:
# Score every candidate and keep those whose C-value is strictly above 51.
# NOTE(review): 51 is an unexplained cut-off - record how it was chosen.
new_terms = c_value(freqs, theta=51)

In [75]:
# The 20 highest-scoring terms. Columns: C-value, raw frequency, term tokens.
for t, c in new_terms.most_common(20):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

  282.00  282 other hand
  264.00  264 health care
  252.00  126 part - time faculty
  206.00  206 same time
  177.52  112 long - term
  169.00  169 high school
  167.00  167 body color
  155.33   98 self - esteem
  146.00  146 wing venation
  138.00  138 eye color
  137.00  137 domestic violence
  120.46   76 decision - making
  112.53   71 low - income
  111.00  111 renewable energy
  103.02   65 quality of life
  103.02   65 state of nature
  103.02   65 spell - caster
  103.02   65 community violence exposure
  101.00  101 wild type
   97.00   97 civil society


In [76]:
# The 20 lowest-scoring terms that still cleared the threshold (tail takes the
# last 20 of the descending list). Columns: C-value, raw frequency, term tokens.
for t, c in tail(20, new_terms.most_common()):
    print(f'{c:8.2f} {freqs[len(t)][t]:4d} {" ".join(t)}')

   61.00   61 mental health
   60.23   86 well - being
   57.00   57 child care
   57.00   57 egg donation
   56.00   28 private for - profit
   56.00   28 supra - individual realm
   55.47   35 large - scale
   55.00   55 storm water
   55.00   55 kinetic energy
   55.00   55 sex education
   54.00   54 grip force
   54.00   54 physical activity
   54.00  119 community violence
   53.89   34 client - provider
   53.89   34 recurrent breast cancer
   53.00   53 social interaction
   53.00   53 professional development
   52.30   33 nation - state
   52.00   52 wide range
   52.00   52 front end


---

**Remove non-specific terms**

---

In [80]:
# Save the accepted terms to their own file: one term per line, tokens joined by spaces
with open('terms2.txt', 'w') as out:
    out.writelines(' '.join(term) + '\n' for term in new_terms)

In [81]:
# Read the saved file back in as a list of token lists, the same shape as `terms`.
# Note that this rebinds new_terms from the Counter returned by c_value to a list.
# The context manager closes the file handle, which a bare open() inside the
# comprehension would leave open.
with open('terms2.txt') as f:
    new_terms = [line.split() for line in f]

In [84]:
# Loop through the original terms and write out every one that is NOT in the
# new terms list. Membership is checked against a set of tuples, which gives
# constant-time lookups instead of scanning the whole list for each term and
# works whether new_terms holds lists or tuples.
excluded = {tuple(t) for t in new_terms}
with open('terms-final.txt', 'w') as f:
    for term in terms:
        if tuple(term) not in excluded:
            print(' '.join(term), file=f)