In [1]:
import pandas as pd
from ast import literal_eval
from hearstPatterns.hearstPatterns import HearstPatterns
from Hearst import PatternMatcher
import re
import spacy
import time

In [2]:
#read in data and set dictionary values as dictionaries
df = pd.read_csv('full_dataframe.csv', index_col=0)
df['doc_type'].replace('admin%20instructions', 'admin instructions', inplace=True)
df['acronyms'] = df['acronyms'].apply(literal_eval)
df['glossary'] = df['glossary'].apply(literal_eval)
df['cleaned_text_list'] = df['cleaned_text_list'].apply(literal_eval)

In [3]:
df.head()

Unnamed: 0,doc_type,file_name,raw_text,cleaned_text,cleaned_text_list,url,acronyms,glossary
0,admin instructions,AI08_2016.txt,"Administrative Instruction 8, December 16, 201...","Administrative Instruction 8, December 16, 201...","[Administrative Instruction 8, December 16, 20...",https://mikeanders.org/data/Ontologies/DoD/Cor...,"{'LEO': 'Law Enforcement Officer', 'HRD': 'Hum...","{'day': ' A calendar day', 'furlough': ' A tem..."
1,admin instructions,AI120_2017.txt,"Administrative Instruction 120, May 9, 2017\r\...","Administrative Instruction 120, May 9, 2017\r\...","[Administrative Instruction 120, May 9, 2017, ...",https://mikeanders.org/data/Ontologies/DoD/Cor...,"{'FMD': 'Financial Management Directorate', 'P...",{'assisted acquisition': ' Defined in Subpart ...
2,admin instructions,a001p.txt,"Administrative Instruction 1, October 19, 2006...","Administrative Instruction 1, October 19, 2006...","[Administrative Instruction 1, October 19, 200...",https://mikeanders.org/data/Ontologies/DoD/Cor...,"{'DTS': 'DoD Telecommunications System', 'AI':...",{}
3,admin instructions,a0115p.txt,"Administrative Instruction 115, August 13, 201...","Administrative Instruction 115, August 13, 201...","[Administrative Instruction 115, August 13, 20...",https://mikeanders.org/data/Ontologies/DoD/Cor...,"{'AI': 'administrative instruction', 'NCR': 'N...",{'9 GLOSSARY PART I': ' ABBREVIATIO...
4,admin instructions,a023p.txt,"Administrative Instruction No. 23, December 20...","Administrative Instruction No. 23, December 20...","[Administrative Instruction No. 23, December 2...",https://mikeanders.org/data/Ontologies/DoD/Cor...,"{'WHS': 'Washington Headquarters Services', 'A...",{}


## OG Hearst Pattern

In [4]:
# initialize hearst pattern class
hp = HearstPatterns(extended = True)
# for more hearst patterns beyond the OG six, use below:
# hp = HearstPatterns(extended = True)

In [5]:
# show how it works
test = 'There are many great instruments such as violin and guitar.'
print(hp.find_hyponyms(test))

[('violin', 'great instrument'), ('guitar', 'great instrument')]


## Mike Hearst Pattern Edit

In [6]:
pm = PatternMatcher()

In [7]:
print(pm.find_hyponyms(test))

[('violin', 'great instrument'), ('guitar', 'great instrument')]


## Iterate Through Corpus

In [8]:
# iterate through cleaned text in our corpus
all_hyponyms = []
count = 0
for text in df['cleaned_text_list']:
    count += 1
    for sent in text:
        hyponyms = pm.find_hyponyms(sent)
        if len(hyponyms) != 0:
            for hyponym in hyponyms:
                all_hyponyms.append(hyponym)
    if count % 5 == 0:
        print("Through {} documents".format(count))
corpus_hyponyms = set(all_hyponyms)

Through 5 documents
Through 10 documents
Through 15 documents
Through 20 documents
Through 25 documents
Through 30 documents
Through 35 documents
Through 40 documents
Through 45 documents
Through 50 documents
Through 55 documents
Through 60 documents
Through 65 documents
Through 70 documents
Through 75 documents
Through 80 documents
Through 85 documents
Through 90 documents
Through 95 documents
Through 100 documents
Through 105 documents
Through 110 documents
Through 115 documents
Through 120 documents
Through 125 documents
Through 130 documents
Through 135 documents
Through 140 documents
Through 145 documents
Through 150 documents
Through 155 documents
Through 160 documents
Through 165 documents
Through 170 documents
Through 175 documents
Through 180 documents
Through 185 documents
Through 190 documents
Through 195 documents
Through 200 documents
Through 205 documents
Through 210 documents
Through 215 documents
Through 220 documents
Through 225 documents
Through 230 documents
Through 

In [16]:
print(len(corpus_hyponyms))

5433


In [21]:
corpus_hyponyms

{('equipment', 'asset'),
 ('patch', 'classified ISs'),
 ('program review decision', 'DoD Component ERA and BRAC account programming'),
 ('Joint Staff policy', 'the DoD'),
 ('specially train air and ground crew', 'pesticide application capability'),
 ('laboratory support', 'related service'),
 ('program review decision', 'programming'),
 ('letterhead', 'Assure printing service'),
 ('sexual misconduct', 'the DEP'),
 ('RMF security', 'authorization'),
 ('the development', 'documentation'),
 ('disposal', 'cycle'),
 ('aluminum', 'metal'),
 ('the cause', 'ship transfer'),
 ('height', 'physical characteristic'),
 ('OPCON', 'relation'),
 ('solicitation', 'activity'),
 ('part cleaning', 'work function'),
 ('administrative appeal', 'classification'),
 ('an Environmental', 'the facility'),
 ('the human right verification information', 'nomination'),
 ('configuration guideline', 'information'),
 ('family housing', 'the DoD Components'),
 ('tenant activity', 'each commodity'),
 ('countermeasure', '

In [42]:
hyponym_df = pd.DataFrame(corpus_hyponyms, columns = {'hypernym2', 'hyponym'})

In [47]:
hyponym_df = hyponym_df.rename(columns={'hypernym2': 'hypernym'})

In [48]:
hyponym_df.to_csv("hyponyms_less.csv")