In [None]:
import re
from collections import Counter

import click
import pandas as pd
import spacy
from tqdm import tqdm
# Limit how many rows pandas renders when a DataFrame is displayed as cell output.
pd.set_option('display.max_rows', 50)

In [None]:
# Notebook configuration: every value a reader might tune lives in this cell.

# CSV read with pd.read_csv to obtain the raw text.
in_file = '../OHCumulativeAug10.csv'
# CSV that the filtered noun-chunk counts are written to.
out_file = 'oh_noun_chunks.csv'
# CSVs whose 'chunk' columns are unioned into a set of chunks excluded from the output.
filter_files = ('mi_filtered.csv', 'mo_filtered.csv', 'wi_filtered.csv')
# Name of the spaCy pipeline passed to spacy.load.
model = 'en_core_web_lg'
# Columns of the input CSV whose values are pooled into the list of texts to parse.
text_col = ['text', 'title']
# A noun chunk must occur at least this many times to be kept.
min_count = 5
# True: keep only tokens that pass token_is_noun; False: keep any token that passes token_is_word.
nouns_only = True

In [None]:
def token_is_word(token):
    """Determine whether a token is a word.

    A token counts as a word when it is not a stop word, is not punctuation,
    and its text is not empty or whitespace-only.

    Args:
        token: Object exposing ``is_stop``, ``is_punct`` and ``text``
            (e.g. a spaCy token).

    Returns:
        bool: ``True`` if the token is a word, ``False`` otherwise.
    """
    # similar to: https://stackoverflow.com/a/41425016
    # bool() makes the result a real boolean; without it the `and` chain
    # would hand back the stripped text itself (or '' for blank tokens).
    return not token.is_stop and not token.is_punct and bool(token.text.strip())

def token_is_noun(token):
    """Report whether a token passes ``token_is_word`` and is tagged as a noun.

    The word check runs first; when it fails, its falsy result is returned
    as-is. Otherwise the result is whether ``token.pos_`` equals ``'NOUN'``.
    """
    word_check = token_is_word(token)
    if not word_check:
        # Hand back the falsy value unchanged, exactly as `and` would.
        return word_check
    return token.pos_ == 'NOUN'

In [None]:
# Read the input CSV and pool every non-missing value of the configured text
# columns into one flat list of strings.
df = pd.read_csv(in_file)
texts = []
for col in text_col:
    # Drop missing cells before the string cast: astype(str) would otherwise
    # turn NaN into the literal text 'nan', which would then be parsed and
    # counted like a real word.
    texts.extend(df[col].dropna().astype(str))

In [None]:
# Load the spaCy pipeline named by `model`; `nlp` is then called on each text.
nlp = spacy.load(model)

In [None]:
# Parse every text twice: the first pass finds named entities, the second pass
# runs on the text with those entities removed and yields noun chunks and tokens.
tokens = []    # lower-cased lemmas of every token accepted by filter_fn
entities = []  # entity spans found in the first pass (their .text / .label_ are read later)
chunks = []    # space-joined lower-cased lemmas of each noun chunk's accepted tokens
filter_fn = token_is_noun if nouns_only else token_is_word
for text in tqdm(texts):
    raw_doc = nlp(text)

    # Build the entity-free text in its own variable instead of reassigning
    # the loop variable.
    stripped_text = text
    for ent in raw_doc.ents:
        entities.append(ent)
        # Remove every standalone occurrence of the entity text. The
        # lookarounds stop the match from firing inside a longer word
        # (a plain str.replace of "Ohio" would turn "Ohioans" into "ans").
        # Lookarounds are used rather than \b so entity texts that start or
        # end with punctuation (e.g. "U.S.") are still matched.
        ent_pattern = r'(?<!\w)' + re.escape(ent.text) + r'(?!\w)'
        stripped_text = re.sub(ent_pattern, '', stripped_text)

    filtered_doc = nlp(stripped_text)

    for chunk in filtered_doc.noun_chunks:
        # Skip chunks whose syntactic root is a stop word.
        if chunk.root.is_stop:
            continue
        # .strip() keeps chunk lemmas normalised the same way as `tokens`.
        filtered_chunk = [
            token.lemma_.lower().strip() for token in chunk if filter_fn(token)
        ]
        if filtered_chunk:
            chunks.append(' '.join(filtered_chunk))

    tokens.extend(
        token.lemma_.lower().strip() for token in filtered_doc if filter_fn(token)
    )

In [None]:
# Collect the 'chunk' column of every filter file into a single set of
# chunks to exclude.
sieve = set()
for filename in filter_files:
    filtered_frame = pd.read_csv(filename)
    sieve.update(filtered_frame['chunk'])

In [None]:
# Tabulate noun chunks seen at least `min_count` times, drop those listed in
# `sieve`, write the result to `out_file`, and display it.
chunk_counts = Counter(chunks)
chunks_df = pd.DataFrame(
    [
        {'chunk': chunk, 'count': count}
        for chunk, count in chunk_counts.most_common()
        if count >= min_count
    ],
    # Explicit columns keep the frame well-formed when nothing reaches
    # min_count; without them the empty frame has no 'chunk' column and the
    # filter on the next line raises KeyError.
    columns=['chunk', 'count'],
)
chunks_df = chunks_df[~chunks_df['chunk'].isin(sieve)]
chunks_df.to_csv(out_file, index=False)
chunks_df

In [None]:
# Tabulate the 250 most frequent token lemmas and display them.
token_counts = Counter(tokens)
top_tokens = token_counts.most_common(250)
tokens_df = pd.DataFrame(
    [{'token': lemma, 'count': n} for lemma, n in top_tokens]
)
tokens_df

In [None]:
# Tabulate the 250 most frequent texts among entities labelled 'LOC' and display them.
loc_texts = (ent.text for ent in entities if ent.label_ == 'LOC')
entity_counts = Counter(loc_texts)
top_entities = entity_counts.most_common(250)
entities_df = pd.DataFrame(
    [{'entity': name, 'count': n} for name, n in top_entities]
)
entities_df