# Page Similarity then Sentence Contradictions

In [1]:
import spacy
import pandas as pd
import numpy as np

In [2]:
# Load one organization's corpus (DoD Issuances) and add `fulltext`, a single
# string per document made by joining its per-page texts with spaces.
df = (
    pd.read_json(
        "data/02. Data Sets/DoD Issuances/contradictions_datasets_dod_issuances.zip",
        orient="records",
        compression="infer",
    )
    .assign(fulltext=lambda frame: frame["text_by_page"].str.join(" "))
)
d0 = df.iloc[0]  # first document; the cells below use it as the working example

In [3]:
# Show the `url` field of the record at position 16.
df.iloc[16].url

'https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/140025/140025_vol1408.PDF?ver=vMpiDaPFuvIp_30LRxh7OQ%3d%3d'

In [8]:
# Planned pipeline:
# Text by page
# --> Page embeddings
# --> Cosine similarity between all pages
# --> For the most similar page pairs:
# --> Split each page into sentences and run a neural contradiction model on the sentence pairs

In [9]:
# Display the first record (row 0 of `df`) to see which fields are available.
d0

file_name                                                        100
title                                             General Provisions
num                                                              100
id                 920aeda50488770c12ced1c8d6a7b99be703194f494ce7...
corpus                                                 dod_issuances
source_page_url        https://www.esd.whs.mil/DD/DoD-Issuances/DTM/
url                https://www.esd.whs.mil/Portals/54/Documents/D...
type                                                             pdf
n_pages                                                            7
word_count                                                      1200
text_by_page       [othe he Department of Defense INSTRUCTION NUM...
fulltext           othe he Department of Defense INSTRUCTION NUMB...
Name: 0, dtype: object

In [11]:
# Download the small English spaCy pipeline that `spacy.load('en_core_web_sm')` needs.
# NOTE(review): `!python` runs whichever interpreter is first on PATH, which may
# not be the kernel's own environment -- confirm before relying on this cell.
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m686.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [14]:
# CREATE PAGE EMBEDDINGS
# Run every page of the example document through the spaCy pipeline; the
# `.vector` of each resulting Doc is what later cells use as the page embedding.
# NOTE(review): `en_core_web_sm` presumably ships without static word vectors,
# so `.vector` may be derived from the pipeline's internal tensors rather than
# true word embeddings -- confirm, and consider a model that includes vectors.
nlp = spacy.load('en_core_web_sm')
pages_docs = list(nlp.pipe(d0.text_by_page))
p0 = pages_docs[0]  # Doc for the first page, kept for inspection

In [15]:
# Inspect the vector spaCy produced for the first page.
p0.vector

array([-6.05834313e-02, -2.95698136e-01, -2.07711205e-01,  8.23469460e-02,
        2.36313403e-01,  5.06467700e-01,  2.05171406e-01,  4.43163455e-01,
        1.81785971e-01,  3.95017751e-02, -5.70735056e-03,  1.00936845e-01,
       -5.25057316e-01, -2.93719620e-01, -1.69553727e-01,  1.13907084e-01,
       -2.86470741e-01, -2.29209615e-03, -1.14279449e-01, -1.91979527e-01,
       -2.62902439e-01, -5.65309860e-02,  2.34655797e-01,  3.48747894e-02,
        5.39177842e-02,  4.39624824e-02,  4.77332860e-01,  3.26507628e-01,
        3.44411194e-01,  1.06265292e-01, -1.71855748e-01, -2.26183355e-01,
        2.85653621e-01, -2.93031940e-03,  9.21946857e-03,  1.70437545e-01,
        4.04932678e-01,  1.59191862e-01, -3.29580277e-01, -1.52372420e-01,
       -3.07646275e-01, -1.49332851e-01, -2.64494300e-01,  8.02048668e-02,
       -1.43276259e-01,  9.34018567e-02,  9.10365209e-02, -6.10388666e-02,
        5.06417342e-02,  1.72732212e-02, -2.81670779e-01, -1.64969474e-01,
        4.03340638e-01, -

In [16]:
# COSINE SIMILARITY BETWEEN ALL PAGES (JUST IN THIS ONE DOC TO START)
from sklearn.metrics.pairwise import cosine_similarity

# One vector per page, in page order.
page_vectors = [page.vector for page in pages_docs]
# Square matrix of pairwise similarities: sim[i, j] compares page i with page j.
sim = cosine_similarity(page_vectors)

In [17]:
# Display the page-by-page similarity matrix (entry [i, j] compares pages i and j).
sim

array([[0.99999994, 0.82639676, 0.71216565, 0.784781  , 0.9315864 ,
        0.8354922 , 0.8292345 ],
       [0.82639676, 0.99999994, 0.56158495, 0.55448574, 0.87179685,
        0.85670906, 0.8897875 ],
       [0.71216565, 0.56158495, 0.99999994, 0.8281613 , 0.6483461 ,
        0.6298679 , 0.6682044 ],
       [0.784781  , 0.55448574, 0.8281613 , 1.0000002 , 0.73446554,
        0.5874811 , 0.6206418 ],
       [0.9315864 , 0.87179685, 0.6483461 , 0.73446554, 0.9999997 ,
        0.8175531 , 0.840251  ],
       [0.8354922 , 0.85670906, 0.6298679 , 0.5874811 , 0.8175531 ,
        0.99999994, 0.89020133],
       [0.8292345 , 0.8897875 , 0.6682044 , 0.6206418 , 0.840251  ,
        0.89020133, 1.        ]], dtype=float32)

In [40]:
# FIND MOST SIMILAR PAGES
k = 3  # Top 3 most similar page combinations

# Enumerate the strict lower triangle so that every unordered page pair is
# considered exactly once and a page is never paired with itself.
pair_rows, pair_cols = np.tril_indices(sim.shape[0], k=-1)
pair_scores = sim[pair_rows, pair_cols]

# Rank only the genuine pairs. Ranking a zero-filled copy of the full matrix
# would let the masked cells (score 0) outrank real pairs whose similarity is
# negative, and would fail or return masked cells when k exceeds the number
# of available pairs.
n_selected = max(0, min(k, pair_scores.size))
ranked = np.argsort(pair_scores, kind="stable")
best = ranked[len(ranked) - n_selected:]

# (row, col) tuples of plain ints with row > col, ordered from the least to
# the most similar of the selected pairs (the best pair comes last).
top_k_pagepairs = [(int(pair_rows[i]), int(pair_cols[i])) for i in best]

top_k_pagepairs

[(6, 1), (6, 5), (4, 0)]

In [42]:
# # FOR ALL PAGES IN THE TOP PAIRS, SPLIT INTO SENTENCES
# wanted_page_indices = np.unique(top_k_pagepairs)
# wanted_pages = 

array([0, 1, 4, 5, 6])

In [43]:
# FOR ALL PAGE PAIRS, SPLIT INTO SENTENCES
from spacy.lang.en import English
nlp_sentencizer = English()
nlp_sentencizer.add_pipe('sentencizer')

# Sentencize each distinct page only once: a page can appear in several of the
# top pairs, and re-running the sentencizer on it for every pair is wasted work.
page_sentences = {}
for pair in top_k_pagepairs:
    for idx in pair:
        if idx not in page_sentences:
            page_sentences[idx] = [
                sent.text
                for sent in nlp_sentencizer(d0.text_by_page[idx]).sents
            ]

# Map each page pair to a two-item list: the sentence list of the first page,
# then the sentence list of the second page. Each entry gets its own copy so
# that editing one pair's sentences cannot affect another pair.
pair_sentences = {
    pair: [list(page_sentences[idx]) for idx in pair]
    for pair in top_k_pagepairs
}

# TODO: Incorporate preliminary cleaning, e.g. drop sentence fragments that are
# not longer than a cutoff (the original sketch used 30 characters).
    


In [44]:
# Sentences for page pair (6, 1): a two-item list holding one list of sentence
# strings per page, in the same order as the pair.
pair_sentences[(6,1)]

[['DoDI 1400.25-V100, December 1996 7 ENCLOSURE 2 e. Implementing procedures and programs may be issued at the operating level.',
  'f. The DUSD(CPP) shall issue DoD Manuals as necessary to provide detailed procedural, operational, or administrative material on specific program areas or to provide model programs on subjects that should be uniform for DoD-wide application.',
  '3.',
  'WAIVERS.',
  'Requests for waivers to this Volume or other DoD civilian personnel management issuances authorized by Reference (a) shall be forwarded with full justification through command channels to the DUSD(CPP) for appropriate action.'],
 ['DoDI 1400.25-V100, December 1996 2 (3) Be issued only if necessary to comply with Executive orders, law, or regulation, or to assist civilian personnel offices and human resource offices (CPOs/HROs), managers, supervisors, employees, and their representatives with civilian personnel management issues. (',
  '4) Provide for the optimal delegation of authorities and

In [47]:
# CREATE ALL SENTENCE PAIR COMBINATIONS
import itertools

# Only the (6, 1) page pair is expanded for now; the other entries of
# pair_sentences are not processed in this cell.
first_page_sents, second_page_sents = pair_sentences[(6, 1)]

# Cross product: every sentence of the first page paired with every sentence
# of the second page, as (first_page_sentence, second_page_sentence) tuples.
sentence_combinations = [
    combo for combo in itertools.product(first_page_sents, second_page_sents)
]
print(len(sentence_combinations))

75


In [49]:
### LOAD CONTRADICTION MODEL

# Reference example:
# https://github.com/facebookresearch/anli/blob/main/src/hg_api/interactive.py
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Model identifier passed to `from_pretrained` for both the tokenizer and the model.
hg_model_hub_name = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"

# Will take a moment to download
# Both objects are module-level globals; `evaluate` uses them as its defaults.
tokenizer = AutoTokenizer.from_pretrained(hg_model_hub_name)
model = AutoModelForSequenceClassification.from_pretrained(hg_model_hub_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/703 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [50]:
### DEFINE CONTRADICTION FUNCTION
def evaluate(premise, hypothesis, tokenizer=tokenizer, model=model, max_length=256):
    """Score one premise/hypothesis sentence pair with the NLI model.

    Args:
        premise: First sentence of the pair.
        hypothesis: Second sentence, encoded together with the premise.
        tokenizer: Tokenizer used to encode the pair. Defaults to the
            module-level ``tokenizer`` that exists when this function is defined.
        model: Sequence-classification model that scores the encoded pair.
            Defaults to the module-level ``model`` that exists when this
            function is defined.
        max_length: Maximum number of tokens kept for the encoded pair; longer
            inputs are truncated.

    Returns:
        A list of class probabilities (softmax over the model's logits) for
        the single input pair, indexed by label id. For the model loaded in
        this notebook the labels are::

            "id2label": {
                "0": "entailment",
                "1": "neutral",
                "2": "contradiction"
            }
    """
    # Encode the pair straight to batched (batch_size == 1) integer tensors.
    # This avoids building float tensors from the token ids and casting them
    # back to long, and uses the tokenizer's __call__ API instead of the
    # deprecated encode_plus.
    encoded = tokenizer(premise, hypothesis,
                        max_length=max_length,
                        truncation=True,
                        return_token_type_ids=True,
                        return_tensors="pt")

    # Inference only: disable autograd so no computation graph is recorded
    # (saves memory and time on every call; the probabilities are unchanged).
    with torch.no_grad():
        # remember bart doesn't have 'token_type_ids', drop that argument if
        # you are using bart.
        outputs = model(encoded["input_ids"],
                        attention_mask=encoded["attention_mask"],
                        token_type_ids=encoded["token_type_ids"],
                        labels=None)

    # outputs[0] holds the logits; take the only row of the batch.
    predicted_probability = torch.softmax(outputs[0], dim=1)[0].tolist()
    return predicted_probability


In [51]:
### COMPUTE CONTRADICTION PROBABILITIES FOR ALL SENTENCE PAIRS
# (currently only the sentence pairs of a single page pair)
from tqdm import tqdm

# One probability list per sentence pair, as returned by evaluate(), in the
# same order as sentence_combinations.
outputs = [
    evaluate(combo[0], combo[1])
    for combo in tqdm(sentence_combinations)
]


100%|██████████| 75/75 [00:16<00:00,  4.42it/s]


In [53]:
# Tabulate the per-pair probabilities: one row per sentence pair, one column
# per NLI label.
label_columns = ['entailment', 'neutral', 'contradiction']
scores = pd.DataFrame(data=outputs, columns=label_columns)
scores.head()

Unnamed: 0,entailment,neutral,contradiction
0,0.007259,0.933814,0.058927
1,0.019067,0.909037,0.071896
2,0.002028,0.967357,0.030614
3,0.003969,0.981052,0.014978
4,0.00585,0.965512,0.028638


In [26]:
# FIND MOST SIMILAR PAGES
k = 3 # top k most similar (not used by the single-best lookup in this cell)

# Zero out the diagonal and the upper triangle, then locate the largest
# remaining similarity and convert its flat position back to (row, col).
lower_triangle = np.tril(sim, -1)
flat_best = lower_triangle.argmax()
most_similar_idx = np.unravel_index(flat_best, sim.shape)

In [27]:
# (row, col) position in `sim` of the most similar pair of distinct pages.
most_similar_idx

(4, 0)