# DoD Contradictions

This notebook runs the same pipeline as `pipeline.py`, applied to the full DoD Issuances dataset. It was run in an Azure Databricks environment.

In [0]:
# A little hack to get tqdm progress bars working.
#
# In the Databricks notebook, tqdm auto-detects a notebook environment, but the
# plain-text (std) progress bar is what we actually want. So we alias the
# tqdm.auto and tqdm.autonotebook entries in sys.modules to the tqdm.std
# module. This cell sits before the other imports so that any later
# `from tqdm.auto import ...` resolves to the std implementation.
#
# NOTE: importlib.import_module is used instead of __import__ because
# __import__('tqdm.std') returns the top-level `tqdm` package rather than the
# `tqdm.std` submodule.
import importlib
import sys

_tqdm_std = importlib.import_module('tqdm.std')
sys.modules['tqdm.auto'] = _tqdm_std
sys.modules['tqdm.autonotebook'] = _tqdm_std

## Imports and Configuration

In [0]:
from src import loading, processing, custom_preprocessors, scoring
from functools import partial
from typing import Dict, List, Tuple

from haystack.schema import Document
import numpy as np
from pandas import DataFrame

In [0]:
# --- Input ---------------------------------------------------------------
# Name of the table the dataset is loaded from.
DATASET_TABLE = "team_hdsiprodigies.contradictions_datasets_dod_issuances"
# Number of documents to run through the pipeline (a falsy value such as None
# disables subsetting).
SUBSET_SIZE = None

# --- Sentence cleaning applied while building chunks ----------------------
CHUNK_CLEANING_TOC_PERIOD_THRESHOLD = 5
CHUNK_CLEANING_LENGTH_MINIMUM = 15
CHUNK_CLEANING_LENGTH_MAXIMUM = 1000

# --- Chunk construction (passed as split_length / split_overlap) ----------
CHUNK_LENGTH = 8
CHUNK_OVERLAP = 2

# --- Pre-selection of similar chunks --------------------------------------
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
MAX_CHUNK_SIMILARITY_THRESHOLD = 0.87
CHUNK_SIMILARITY_TOP_N = 500

# --- Selection of sentences as contradiction candidates -------------------
SENTENCE_CLEANING_LENGTH_MINIMUM = 40
CANDIDATE_SELECTION_TOP_K = 1000

# --- Output locations for saved artifacts ---------------------------------
SAVED_CHUNKS_FILEPATH = "/dbfs/FileStore/desired-chunks.pkl"
SAVED_CONTRADICTION_SCORES_FILEPATH = "/dbfs/FileStore/contradiction-scores.csv"
SAVED_CANDIDATES_FILEPATH = "/dbfs/FileStore/candidates.csv"


In [0]:
# -------------------------------- LOADING ------------------------------- #
# Load the table named by DATASET_TABLE through the project's loading helper.
# NOTE(review): the result is used as a pandas DataFrame further down
# (.apply / .iloc) -- presumably the helper converts from Spark; confirm.
df = loading.load_dataset_from_pyspark(DATASET_TABLE)


In [0]:
# ------------------------------ PREPROCESSING --------------------------- #
# Subset first so that pages of documents we are about to discard are never
# cleaned. The copy gives us an independent frame, so adding the 'fulltext'
# column below does not write into a slice of the loaded frame.
if SUBSET_SIZE:
    df = df.iloc[:SUBSET_SIZE].copy()

# Build one 'fulltext' value per document from its per-page text.
df['fulltext'] = df.text_by_page.apply(processing.clean_and_combine_pages)
docs = processing.convert_frame_to_haystack(df)

# A chunk is constructed using a sliding window. It will be N sentences long
# if there are that many sentences remaining in the document. The next chunk
# will include the last K sentences of the previous chunk if a previous
# chunk exists. Chunks will not span documents.
#
# The sentence cleaner is pre-bound with the chunk-level cleaning thresholds
# so the preprocessor can call it as its split_cleaner.
chunk_sentence_cleaning_func = partial(processing.clean_sentence_splits,
    toc_period_threshold = CHUNK_CLEANING_TOC_PERIOD_THRESHOLD,
    length_minimum = CHUNK_CLEANING_LENGTH_MINIMUM,
    length_maximum = CHUNK_CLEANING_LENGTH_MAXIMUM,
)
chunker = custom_preprocessors.SplitCleanerPreProcessor(
    language='en',
    split_by='sentence',
    split_cleaner=chunk_sentence_cleaning_func,
    split_length=CHUNK_LENGTH,
    split_overlap=CHUNK_OVERLAP,
    split_respect_sentence_boundary=False, # incompatible with 'passage' or 'sentence'
)
doc_chunks = chunker.process(docs)

In [0]:
# Chunk count straight out of the chunker, before remove_identical_chunks runs.
len(doc_chunks)

In [0]:
# Rebind doc_chunks to the output of remove_identical_chunks (presumably the
# same chunks with exact duplicates dropped -- confirm in src/processing).
doc_chunks = processing.remove_identical_chunks(doc_chunks)

In [0]:
# Chunk count after processing.remove_identical_chunks.
len(doc_chunks)

In [0]:
# --------------------------- CHUNK SIMILARITY --------------------------- #
# The embedding model is chosen via EMBEDDING_MODEL_NAME; the available
# options are listed at https://www.sbert.net/docs/pretrained_models.html
embeddings = processing.compute_chunk_embeddings(
    chunks=doc_chunks, model_name=EMBEDDING_MODEL_NAME, show_progress_bar=True
)

# Attach each embedding to the chunk at the same position.
for doc_chunk, chunk_embedding in zip(doc_chunks, embeddings):
    doc_chunk.embedding = chunk_embedding

In [0]:
# Compute similarity scores over doc_chunks (which carry their embeddings at
# this point). The result is passed as `scores` to the top-N pair selection.
similarity_matrix = processing.compute_chunk_similarity(doc_chunks)

In [0]:
# Select the CHUNK_SIMILARITY_TOP_N most similar chunk pairs, while getting rid
# of pairs whose similarity score is too high. The threshold
# (MAX_CHUNK_SIMILARITY_THRESHOLD) was tuned by trial and error to remove
# similar chunks that were just common header/disclaimer text.
# The result is consumed below as an iterable of (index, index) pairs into
# doc_chunks.
top_n_pair_indices = processing.get_top_n_similar_chunk_pair_indices(
    scores=similarity_matrix,
    n=CHUNK_SIMILARITY_TOP_N,
    max_similarity_threshold=MAX_CHUNK_SIMILARITY_THRESHOLD,
)

In [0]:
# Translate each (index, index) pair into the matching (chunk id, chunk id) pair.
similar_chunk_id_pairs = [
    (doc_chunks[first].id, doc_chunks[second].id)
    for first, second in top_n_pair_indices
]

# Every distinct chunk index that takes part in at least one selected pair.
desired_indices = np.unique(np.asarray(top_n_pair_indices).ravel())

# NOTE: Keeping the desired chunks in their own id -> chunk mapping should let
# us drop the full doc_chunks list from memory, which may be needed to make
# room for the contradiction scoring model. Doing so would require additional
# code here.
desired_chunks = {}
for i in desired_indices:
    selected_chunk = doc_chunks[i]
    desired_chunks[selected_chunk.id] = selected_chunk

# Sentence-level cleaner: only the minimum-length setting is overridden.
sentence_cleaning_func = partial(
    processing.clean_sentence_splits,
    length_minimum=SENTENCE_CLEANING_LENGTH_MINIMUM,
)
desired_chunk_sentences = processing.split_chunks_to_sentences(
    chunks=desired_chunks.values(),
    split_cleaner=sentence_cleaning_func,
)

# Attach to every desired chunk the sentences produced for it.
for desired_chunk, chunk_sentences in zip(
    desired_chunks.values(), desired_chunk_sentences
):
    desired_chunk.sentences = chunk_sentences

In [0]:
# Number of distinct chunks that appear in at least one selected similar pair.
len(desired_chunks)

In [0]:
# ------------------------- CONTRADICTION SCORING ------------------------ #
# Load the tokenizer and model; both are handed to the scoring call that
# follows.
tokenizer, contradiction_model = scoring.load_contradiction_model()

In [0]:
# Score the selected chunk pairs with the contradiction model. `chunks` is the
# id -> chunk mapping (each chunk already carries its .sentences) and
# `chunk_id_pairs` names which chunks are compared against each other.
# NOTE(review): presumably sentences are scored pairwise across the two chunks
# of each pair -- confirm in src/scoring.
contradiction_scores = scoring.compute_sentence_contradiction_scores(
    chunks=desired_chunks,
    chunk_id_pairs=similar_chunk_id_pairs,
    tokenizer=tokenizer,
    model=contradiction_model,
)

In [0]:
# Save chunks and contradiction scores so that we can look at more than just the
# top k candidates in the future if we desire. The scores are written with the
# same CSV helper that is used for the candidates; the chunks are pickled.
loading.save_candidates_csv(contradiction_scores, SAVED_CONTRADICTION_SCORES_FILEPATH)
loading.save_chunks_pickle(desired_chunks, SAVED_CHUNKS_FILEPATH)

# To download a saved file locally, modify the URL of this page so that it
# looks like
# https://adb-...azuredatabricks.net/files/FILENAME_UNDER_FileStore_DIRECTORY/?o=...
# keeping the id the same in the address and o= parameter. The browser will
# then prompt you to save the file!

In [0]:
# Keep the CANDIDATE_SELECTION_TOP_K entries picked by the scoring helper from
# the contradiction scores.
candidates = scoring.get_top_k_contradictive_candidates(
    contradiction_scores=contradiction_scores,
    k=CANDIDATE_SELECTION_TOP_K,
)
# Combine the selected candidates with information from desired_chunks; the
# result is used below as a DataFrame (.head() / .iterrows()).
candidate_info = scoring.retrieve_candidate_info(
    candidates=candidates,
    chunks=desired_chunks
)

In [0]:
# Persist the candidate table to SAVED_CANDIDATES_FILEPATH.
loading.save_candidates_csv(candidate_info, SAVED_CANDIDATES_FILEPATH)

In [0]:
# Preview the first few rows of the candidate table.
candidate_info.head()

Unnamed: 0,chunk_A.id,chunk_A.content,chunk_A.meta.corpus,chunk_A.meta.title,chunk_A.meta.file_name,chunk_A.meta.url,chunk_A.meta.id,chunk_A.meta._split_id,sentence_A,chunk_B.id,chunk_B.content,chunk_B.meta.corpus,chunk_B.meta.title,chunk_B.meta.file_name,chunk_B.meta.url,chunk_B.meta.id,chunk_B.meta._split_id,sentence_B,entailment,neutral,contradiction
0,a49093d345ebb00f1112e4082e976508,This data element is also known as Pay Entry Base Date. Format: YYYYMMDD. If...,dod_issuances,Automated Extract of Active Duty Military Personnel Records,DoDI 1336.05,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/133605p.pdf?v...,64966f2897774248e1a0baa7cbc55457f366d28f9cc02c8f537541706ad76214,32,22 292-294 Accession Program Source 292 a. Enlisted Accession Program Source...,e2a990ab455acef9aa6c415dab3c51f8,A Induction B Voluntary enlistment in a regular component C Voluntary enlist...,dod_issuances,Automated Extract of Active Duty Military Personnel Records,DoDI 1336.05,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/133605p.pdf?v...,64966f2897774248e1a0baa7cbc55457f366d28f9cc02c8f537541706ad76214,33,"Applicable only to commissioned officers, other than commissioned warrant of...",4.5e-05,0.000343,0.999613
1,c1feed26381cb1db40cf683aa8621e34,This instruction is available on the Directives Division Website at https://...,dod_issuances,"Military Officer Actions Requiring Presidential, Secretary of Defense, or Un...",DoDI 1320.04,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/132004p.pdf,3bc7f996335f339c23b84e1e6276aefc2f41cdf103bd541c9ac1043c0d631f09,3,"This instruction is effective January 3, 2014.",fb9ba053bced31cbee8703ba7edd427f,This Instruction is approved for public release and is available on the Dire...,dod_issuances,Voluntary Education Programs,DoDI 1322.25,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/132225p.pdf,0c1b0c20b979b9075e85264650aa4341a319a2c4c677f89045bd183bfc1d6f48,6,"This Instruction is effective March 15, 2011.",4.6e-05,0.000415,0.999538
2,31a0f2e22eee6ad67b2d84f89cd42e96,SUMMARY OF CHANGE 2. This administrative change updates: a. The title of the...,dod_issuances,Intelligence Support to the Defense Critical Infrastructure Program (DCIP),DoDI 3020.51,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/302051p.pdf?v...,70a017956d0d0962694db3ae7b44100fc64072f69ed56f6a3fa2c2b08dc35490,3,This administrative change updates: a. The title of the Under Secretary of D...,106946c53f00fd46e35bca5f47b5b5fa,This instruction is available on the Directives Division Website at https://...,dod_issuances,DoD Cryptologic Training,DoDI 3305.09,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/330509p.pdf?v...,15858382aaf9d25d3ed4dab9bba37c1aca2fbed3468410eb799b83875ad00eaf,3,This administrative change updates the title of the Under Secretary of Defen...,0.000126,0.000366,0.999508
3,a1bdd1d1e8e667ce28733f212889c882,Requires a 4-year service agreement: 2 years on active duty plus 2 years in ...,dod_issuances,Defense Manpower Data Center Domain Values for Military Personnel Data Extracts,DoDM 1336.05,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodm/133605m.PDF?v...,fb16fb843de46b5fbc89e4857e25ee5649efdc877e397d788fe609562b36241f,80,Requires a 3-year active duty service agreement.,9395790f7231ae64937e2b505771dada,Requires a 4-year active duty service agreement. D5 $150 Effective 1 August ...,dod_issuances,Defense Manpower Data Center Domain Values for Military Personnel Data Extracts,DoDM 1336.05,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodm/133605m.PDF?v...,fb16fb843de46b5fbc89e4857e25ee5649efdc877e397d788fe609562b36241f,85,Requires a 6-year active duty service agreement.,5.8e-05,0.00044,0.999502
4,ead7ad04914e16e33dd8baa9b5f63cfd,Requires a 4-year service agreement: 2 years on active duty plus 2 years in ...,dod_issuances,Defense Manpower Data Center Domain Values for Military Personnel Data Extracts,DoDM 1336.05,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodm/133605m.PDF?v...,fb16fb843de46b5fbc89e4857e25ee5649efdc877e397d788fe609562b36241f,70,Requires a 3-year active duty service agreement.,865b34886db2f78504fbf37cade6a066,Requires a 4-year active duty service agreement. K5 $750 Effective 1 August ...,dod_issuances,Defense Manpower Data Center Domain Values for Military Personnel Data Extracts,DoDM 1336.05,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodm/133605m.PDF?v...,fb16fb843de46b5fbc89e4857e25ee5649efdc877e397d788fe609562b36241f,97,Requires a 6-year active duty service agreement.,5.8e-05,0.00044,0.999502


In [0]:
# Walk through the first 100 candidate rows and print each one in a readable
# form, prefixed with its row label.
first_hundred = candidate_info[:100]
for row_label, row in first_hundred.iterrows():
    print(f"({row_label})")
    scoring.pretty_print_candidate(row)
    print('\n\n')