# Import Required Libraries

In [2]:
import pandas as pd
import sys
import spacy
from spacy.matcher import Matcher
import networkx as nx
import itertools
from tqdm import tqdm

# Utilities

## Create Dataframe of Entity Pairs and Sentences

In [3]:
def analyze(sentences_json_filename, sampled_dest=None):
    df = pd.read_json(sentences_json_filename, lines=True)
    mapping = {}
    for _, row in df.iterrows():
        mentions = [x['text'] for x in row['entityMentions']]
        mentions = list(set(mentions))  # there can be multiple mentions of the same entity in the sentence
        if len(mentions) < 2:
            continue
        pairs = list(itertools.combinations(mentions, 2))
        sent = ' '.join(row['tokens'])
        for pair in pairs:
            pair_key = str(set(pair))
            sents = mapping.get(pair_key, [])
            sents.append(sent)
            mapping[pair_key] = sents

    print(len(mapping))
    lst = []
    for key, val in mapping.items():
        lst.append({"pair": key, "mentions": val, "mention_ct": len(val)})
    df = pd.DataFrame(lst)
    df = df.sort_values(by='mention_ct', ascending=False)
    if sampled_dest is not None:
        df.head(200).to_csv(sampled_dest, index=None)
    return df

## Create Dataframe of sentences only

In [4]:

def load(sentences_filename):
    nlp = spacy.load('en_core_web_sm', disable = ['ner'])
    with open(sentences_filename, 'r') as f:
        lines = f.readlines()
        lines = [l.strip() for l in lines]
        lines = [l for l in lines if len(l) > 0]
        print(len(lines))
        sents = []
        for line in tqdm(lines):
            doc = nlp(line)
            sentences = [sent.text for sent in doc.sents]
            sents += sentences
    df = pd.DataFrame(sents, columns=['mentions'])
    return df

## Learn patterns from matched rows

In [5]:
def learn_patterns(all_mentions, src, tgt, nlp=None):
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')
    
    src_matcher = Matcher(nlp.vocab)
    src_pattern = [{"LOWER": t} for t in src.split(' ')]
    src_matcher.add("src", [src_pattern])

    tgt_matcher = Matcher(nlp.vocab)
    tgt_pattern = [{"LOWER": t} for t in tgt.split(' ')]
    tgt_matcher.add("tgt", [tgt_pattern])
    
    patterns = {}
    for mention in all_mentions:
        doc = nlp(mention)
        src_matches = src_matcher(doc)
        if len(src_matches) == 0:
            # print('src not matched')
            continue
        tgt_matches = tgt_matcher(doc)
        if len(tgt_matches) == 0:
            # print('tgt not matched')
            continue
        src_match = src_matches[0]
        tgt_match = tgt_matches[0]
        src_span = doc[src_match[1]: src_match[2]]
        tgt_span = doc[tgt_match[1]: tgt_match[2]]
        
        if len(spacy.util.filter_spans([src_span, tgt_span])) != 2: # distinct_spans
            print('overlapping spans')
            continue
        
        src_root = src_span.root
        tgt_root = tgt_span.root
        
        #  print(mention)
        edges = []
        for token in doc:
            for child in token.children:
                edges.append(('{}-{}'.format(token.lower_,token.i), '{}-{}'.format(child.lower_,child.i))) 
        
        graph = nx.Graph(edges) 
        path = None
        source = '{}-{}'.format(src_root.lower_, src_root.i)
        target = '{}-{}'.format(tgt_root.lower_, tgt_root.i)
        if nx.has_path(graph, source=source, target=target):
            path = nx.shortest_path(graph, source=source, target=target)
        #  print(path)
        if path is not None:
            for t in src_span:
                n = '{}-{}'.format(t.lower_, t.i)  
                if n not in path:
                    path.append(n)
            for t in tgt_span:
                n = '{}-{}'.format(t.lower_, t.i)
                if n not in path:
                    path.append(n)
            path_nodes = {}
            for p in path:
                t, i = p.rsplit('-', 1)
                i = int(i)
                if i in range(src_match[1], src_match[2]):
                    t = '<src>'
                elif i in range(tgt_match[1], tgt_match[2]):
                    t = '<tgt>'
                path_nodes[i] = t
            path_nodes = sorted(path_nodes.items(), key=lambda x: x[0])
            pattern = ' '.join([p[1] for p in path_nodes])
            patterns[pattern] = patterns.get(pattern, 0) + 1
    patterns = {k:v for k,v in patterns.items() if v > 1}
    patterns = sorted(patterns.items(), key=lambda x: x[1], reverse=True)
    return patterns

## Find patterns from Entity Pair Dataframe

In [6]:
def get_patterns_extracted_mentions(df, src, tgt, nlp=None):
    relevant_rows = df[df.apply(lambda x: src in eval(x['pair']) and tgt in eval(x['pair']), axis=1)]
    if len(relevant_rows) == 0:
        print('No mentions found')
        return []
    
    mentions = relevant_rows['mentions'].tolist()
    all_mentions = list(itertools.chain(*mentions))
    all_mentions = list(set(all_mentions))
    print('mentions found: {}'.format(len(all_mentions)))
    return learn_patterns(all_mentions, src, tgt, nlp)
    

## Find patterns from raw sentences

In [7]:
def get_patterns_raw(df, src, tgt, nlp=None):
    relevant_rows = df[df.apply(lambda x: src in x['mentions'] and tgt in x['mentions'], axis=1)]
    if len(relevant_rows) == 0:
        print('No mentions found')
        return []
        
    all_mentions = relevant_rows['mentions'].tolist()
    all_mentions = list(set(all_mentions))
    print('mentions found: {}'.format(len(all_mentions)))
    return learn_patterns(all_mentions, src, tgt, nlp)

# Indeed Ans dataset

In [9]:
filename = '/home/ubuntu/users/nikita/src/HiExpan/data/indeeda/intermediate/sentences.json'
ans_df = analyze(filename)

48908


In [10]:
sentences = '/home/ubuntu/users/nikita/src/HiExpan/data/indeeda/source/corpus.txt'
ans_sent_df = load(sentences)

  0%|          | 12/318786 [00:00<44:54, 118.30it/s]

318786


100%|██████████| 318786/318786 [31:17<00:00, 169.78it/s]


In [11]:
nlp = spacy.load('en_core_web_sm')

## Hypernym-Hyponym

In [171]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'part-time', 'full-time', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'part-time', 'full-time', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 111
[]


In [169]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'urine', 'swab', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'urine', 'swab', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 61
[('<tgt> <src>', 10), ('<tgt> <src> test', 9), ('<src> <tgt>', 8), ('<src> test <tgt>', 2), ('<src> test <tgt> blood test', 2), ('<src> <tgt> drug test', 2)]


In [170]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'morning shift', 'night shift', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'morning shift', 'night shift', nlp)
print(raw_patterns)

Patterns from extracted mentions
mentions found: 13
[('<src> <src> <tgt> <tgt>', 7)]


Patterns from raw text
mentions found: 16
[('<src> <src> <tgt> <tgt>', 5)]


In [172]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'dental', 'health insurance', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'dental', 'health insurance', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 19
[('<src> <tgt> <tgt>', 4), ('<tgt> <tgt> <src>', 2)]


In [173]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, '401k', 'paid vacation', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, '401k', 'paid vacation', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 9
[('<src> <tgt> <tgt>', 2)]


## Part-of

In [174]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'interview', 'orientation', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'interview', 'orientation', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 201
[('<src> <tgt>', 33), ('<tgt> <src>', 3), ('after <src> do <tgt>', 2), ('had <src> <tgt>', 2)]


In [177]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'interview', 'long', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'interview', 'long', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 231
[('<src> was <tgt>', 8), ('<tgt> <src>', 7), ('<src> process is <tgt>', 7), ('<tgt> take call for <src>', 3), ('<tgt> take for <src>', 3), ('<src> process was <tgt>', 3), ('<tgt> <src> process', 2), ('<tgt> take get <src>', 2), ('<src> is <tgt>', 2), ('<src> <tgt>', 2), ('<tgt> <src> hiring process', 2), ('<tgt> for <src>', 2)]


In [178]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'interview', 'training', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'interview', 'training', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 131
[('<src> <tgt>', 15), ('<src> 6+hours of <tgt>', 2)]


In [180]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'drug test', 'saliva', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'drug test', 'saliva', nlp)
print(raw_patterns)

Patterns from extracted mentions
mentions found: 39
[('<tgt> <src> <src>', 23), ('<src> <src> is <tgt> test', 2), ('<src> <src> is <tgt> swab', 2)]


Patterns from raw text
mentions found: 41
[('<tgt> <src> <src>', 19), ('<src> <src> is <tgt> test', 2), ('<src> <src> is <tgt> swab', 2)]


In [186]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'drug test', 'position', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'drug test', 'position', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 299
[('<src> <src> for <tgt>', 32), ('<tgt> require <src> <src>', 3), ('<tgt> <src> <src>', 3), ('<src> <src> required for <tgt>', 3), ('<src> <src> applying for <tgt>', 2), ('<tgt> have take <src> <src>', 2), ('<src> <src> needed for <tgt>', 2), ('<tgt> have <src> <src>', 2)]


In [187]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'drug test', 'location', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'drug test', 'location', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 135
[('<src> <src> at <tgt>', 9), ('<tgt> <src> <src>', 5), ('<src> <src> <tgt>', 3), ('<tgt> did <src> <src>', 2), ('do <src> <src> at <tgt>', 2), ('<tgt> do <src> <src>', 2)]


In [181]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'benefits', '401k', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'benefits', '401k', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 52
[('<src> <tgt>', 11), ('<tgt> <src>', 8), ('<src> of <tgt>', 3), ('<tgt> plan <src>', 2)]


In [188]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'benefits', 'position', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'benefits', 'position', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 58
[('<tgt> with <src>', 4), ('<tgt> had <src>', 2), ('<tgt> with pay <src>', 2), ('<tgt> <src>', 2)]


In [183]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'pay', 'schedule', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'pay', 'schedule', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 169
[('<src> <tgt>', 61), ('<tgt> <src>', 4)]


## Others (YS)

In [16]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'they', '401k', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'they', '401k', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 39
[('<src> offer <tgt>', 8), ('<src> provide <tgt>', 4), ('<src> have <tgt>', 3), ('<src> provide <tgt> benefits', 2)]


In [17]:
print('Patterns from extracted mentions')
extracted_patterns = get_patterns_extracted_mentions(ans_df, 'they', 'weekly', nlp)
print(extracted_patterns)
print()
print()
print('Patterns from raw text')
raw_patterns = get_patterns_raw(ans_sent_df, 'they', 'weekly', nlp)
print(raw_patterns)

Patterns from extracted mentions
No mentions found
[]


Patterns from raw text
mentions found: 245
[('<src> pay <tgt>', 66), ('<src> paid <tgt>', 41), ('<src> get <tgt>', 4), ('<src> are <tgt>', 3), ('<src> do <tgt>', 2), ('<src> paid <tgt> weekly', 2)]
