# DoD Contradictions

This notebook demonstrates the same pipeline as `pipeline.py`, applied to the full DoD Issuances dataset. It was run in an Azure Databricks environment.

In [None]:
# A little hack to get tqdm progress bars working.
#
# In the Databricks notebook, tqdm auto-detects a notebook environment, but the
# plain-text (std) progress bar is what we actually want. So we alias the
# tqdm.auto and tqdm.autonotebook module names to the tqdm.std module. This
# must run before anything imports those names.
#
# NOTE: __import__('tqdm.std') returns the top-level `tqdm` package, not the
# `tqdm.std` submodule, so importlib.import_module is used to obtain the
# submodule itself.
import importlib
import sys

_tqdm_std = importlib.import_module('tqdm.std')
sys.modules['tqdm.auto'] = _tqdm_std
sys.modules['tqdm.autonotebook'] = _tqdm_std

## Imports and Configuration

In [None]:
from src import loading, processing, custom_preprocessors, scoring
from functools import partial
from typing import Dict, List, Tuple

from haystack.schema import Document
import numpy as np
from pandas import DataFrame

In [None]:
# --- Input data ---
# Name of the table the documents are loaded from.
DATASET_TABLE = 'team_hdsiprodigies.contradictions_datasets_dod_issuances'
# Number of documents to send through the pipeline. A falsy value (None)
# disables subsetting, so every loaded document is used.
SUBSET_SIZE = None

# --- Sentence cleaning applied while building chunks ---
CHUNK_CLEANING_TOC_PERIOD_THRESHOLD = 5
CHUNK_CLEANING_LENGTH_MINIMUM = 15
CHUNK_CLEANING_LENGTH_MAXIMUM = 1_000

# --- Sliding-window chunking ---
# Sentences per chunk, and how many sentences consecutive chunks share.
CHUNK_LENGTH = 8
CHUNK_OVERLAP = 2

# --- Pre-selection of similar chunk pairs ---
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
MAX_CHUNK_SIMILARITY_THRESHOLD = 0.87
CHUNK_SIMILARITY_TOP_N = 500

# --- Selection of sentences as contradiction candidates ---
SENTENCE_CLEANING_LENGTH_MINIMUM = 40
CANDIDATE_SELECTION_TOP_K = 100

# --- Output locations ---
# All artifacts share one directory; each path is that directory plus a file name.
_OUTPUT_DIRECTORY = "/dbfs/FileStore"
SAVED_CHUNKS_FILEPATH = f"{_OUTPUT_DIRECTORY}/desired-chunks.pkl"
SAVED_CONTRADICTION_SCORES_FILEPATH = f"{_OUTPUT_DIRECTORY}/contradiction-scores.csv"
SAVED_CANDIDATES_FILEPATH = f"{_OUTPUT_DIRECTORY}/candidates.csv"


In [None]:
# -------------------------------- LOADING ------------------------------- #
# Load the table named by DATASET_TABLE into `df`. The preprocessing cell
# indexes `df` with `.iloc` and reads its `text_by_page` column.
df = loading.load_dataset_from_pyspark(DATASET_TABLE)


In [None]:
# ------------------------------ PREPROCESSING --------------------------- #
# Subset *before* cleaning so that page cleaning only runs on the documents
# that will actually be used. The copy keeps the column assignment below from
# writing into a slice of the loaded frame.
if SUBSET_SIZE:
    df = df.iloc[:SUBSET_SIZE].copy()

# Build one `fulltext` value per document from its per-page text.
df['fulltext'] = df.text_by_page.apply(processing.clean_and_combine_pages)
docs = processing.convert_frame_to_haystack(df)

# A chunk is constructed using a sliding window. It will be N sentences long
# if there are that many sentences remaining in the document. The next chunk
# will include the last K sentences of the previous chunk if a previous
# chunk exists. Chunks will not span documents.
#
# Here N is CHUNK_LENGTH and K is CHUNK_OVERLAP. The sentence cleaner is bound
# to the CHUNK_CLEANING_* parameters and passed to the chunker as split_cleaner.
chunk_sentence_cleaning_func = partial(
    processing.clean_sentence_splits,
    toc_period_threshold=CHUNK_CLEANING_TOC_PERIOD_THRESHOLD,
    length_minimum=CHUNK_CLEANING_LENGTH_MINIMUM,
    length_maximum=CHUNK_CLEANING_LENGTH_MAXIMUM,
)
chunker = custom_preprocessors.SplitCleanerPreProcessor(
    language='en',
    split_by='sentence',
    split_cleaner=chunk_sentence_cleaning_func,
    split_length=CHUNK_LENGTH,
    split_overlap=CHUNK_OVERLAP,
    split_respect_sentence_boundary=False, # incompatible with 'passage' or 'sentence'
)
doc_chunks = chunker.process(docs)

In [None]:
# Number of chunks produced by the chunker, before identical chunks are removed.
len(doc_chunks)

In [None]:
# Replace the chunk list with the result of processing.remove_identical_chunks.
# NOTE(review): this rebinds `doc_chunks`, so the pre-deduplication list is no
# longer reachable under that name after this cell runs.
doc_chunks = processing.remove_identical_chunks(doc_chunks)

In [None]:
# Number of chunks remaining after identical chunks were removed.
len(doc_chunks)

In [None]:
# --------------------------- CHUNK SIMILARITY --------------------------- #
# Model names usable as EMBEDDING_MODEL_NAME are listed at
# https://www.sbert.net/docs/pretrained_models.html
embeddings = processing.compute_chunk_embeddings(
    chunks=doc_chunks, model_name=EMBEDDING_MODEL_NAME, show_progress_bar=True
)
# Store every embedding on its chunk; zip pairs chunks and embeddings by position.
for doc_chunk, chunk_embedding in zip(doc_chunks, embeddings):
    doc_chunk.embedding = chunk_embedding

In [None]:
# Similarity scores for the chunks; the result is passed as `scores` to the
# top-n pair selection in the next cell.
# NOTE(review): presumably reads the `embedding` attribute attached to each
# chunk in the previous cell — confirm in processing.compute_chunk_similarity.
similarity_matrix = processing.compute_chunk_similarity(doc_chunks)

In [None]:
# Get rid of chunks that have similarity scores that are too high. This
# value was fine-tuned by trial and error to remove similar chunks that were
# just common header/disclaimer text.
#
# The result is iterated as (index, index) pairs in the next cell, where each
# index is used to look up a chunk in `doc_chunks`.
top_n_pair_indices = processing.get_top_n_similar_chunk_pair_indices(
    scores=similarity_matrix,
    n=CHUNK_SIMILARITY_TOP_N,
    max_similarity_threshold=MAX_CHUNK_SIMILARITY_THRESHOLD,
)

In [None]:
# Translate each pair of chunk positions into a pair of chunk ids.
similar_chunk_id_pairs = []
for first_index, second_index in top_n_pair_indices:
    similar_chunk_id_pairs.append(
        (doc_chunks[first_index].id, doc_chunks[second_index].id)
    )

# Every distinct chunk position that appears in at least one selected pair.
# np.unique flattens its input when no axis is given.
desired_indices = np.unique(np.array(top_n_pair_indices))

# NOTE: Storing our desired chunks to a new variable should allow us to
# dispose of the full set of doc_chunks from memory, which we may need for
# the contradiction scoring model! If we need this, additional code will be
# required here.
desired_chunks = {}
for chunk_index in desired_indices:
    selected_chunk = doc_chunks[chunk_index]
    desired_chunks[selected_chunk.id] = selected_chunk

# Sentence-level cleaning only sets the minimum length here.
sentence_cleaning_func = partial(
    processing.clean_sentence_splits,
    length_minimum=SENTENCE_CLEANING_LENGTH_MINIMUM,
)
desired_chunk_sentences = processing.split_chunks_to_sentences(
    chunks=desired_chunks.values(),
    split_cleaner=sentence_cleaning_func,
)

# Enrich our desired chunks with their sentences, paired by position.
for selected_chunk, chunk_sentences in zip(
    desired_chunks.values(), desired_chunk_sentences
):
    selected_chunk.sentences = chunk_sentences

In [None]:
# Number of distinct chunks that take part in at least one selected pair.
len(desired_chunks)

In [None]:
# ------------------------- CONTRADICTION SCORING ------------------------ #
tokenizer, contradiction_model = scoring.load_contradiction_model()

In [None]:
# Score the selected chunk pairs. `chunks` is the id -> chunk dict built above
# and `chunk_id_pairs` lists the (id, id) pairs to compare.
# NOTE(review): the function name suggests scoring is done sentence-by-sentence
# using each chunk's `sentences` attribute — confirm in
# scoring.compute_sentence_contradiction_scores.
contradiction_scores = scoring.compute_sentence_contradiction_scores(
    chunks=desired_chunks,
    chunk_id_pairs=similar_chunk_id_pairs,
    tokenizer=tokenizer,
    model=contradiction_model,
)

In [None]:
# Save chunks and contradiction scores so that we can look at more than just the
# top k candidates in the future if we desire.
#
# The scores are written to SAVED_CONTRADICTION_SCORES_FILEPATH and the chunk
# dict to SAVED_CHUNKS_FILEPATH.
# NOTE(review): the chunk file is a pickle (per the helper name) — only load it
# back from a trusted location.
loading.save_candidates_csv(contradiction_scores, SAVED_CONTRADICTION_SCORES_FILEPATH)
loading.save_chunks_pickle(desired_chunks, SAVED_CHUNKS_FILEPATH)

In [None]:
# Select k candidates from the scores, with k = CANDIDATE_SELECTION_TOP_K.
# NOTE(review): presumably the k highest contradiction scores — confirm in
# scoring.get_top_k_contradictive_candidates.
candidates = scoring.get_top_k_contradictive_candidates(
    contradiction_scores=contradiction_scores,
    k=CANDIDATE_SELECTION_TOP_K,
)
# Join the candidates with their chunk details. The result is used as a
# DataFrame below (`.head()`, `.iterrows()`).
# NOTE(review): SAVED_CANDIDATES_FILEPATH is defined in the configuration cell
# but is not used anywhere in the visible notebook, so `candidate_info` is never
# written to disk here — confirm whether that is intended.
candidate_info = scoring.retrieve_candidate_info(
    candidates=candidates,
    chunks=desired_chunks
)

In [None]:
# Preview the first rows of the candidate table.
candidate_info.head()

Unnamed: 0,chunk_A.id,chunk_A.content,chunk_A.meta.corpus,chunk_A.meta.title,chunk_A.meta.file_name,chunk_A.meta.url,chunk_A.meta.id,chunk_A.meta._split_id,sentence_A,chunk_B.id,chunk_B.content,chunk_B.meta.corpus,chunk_B.meta.title,chunk_B.meta.file_name,chunk_B.meta.url,chunk_B.meta.id,chunk_B.meta._split_id,sentence_B,entailment,neutral,contradiction
0,4662527fcb253c676136151e3d43a84c,"Department of Defense DIRECTIVE NUMBER 2000.13 March 11, 2014 Incorporating ...",dod_issuances,Civil Affairs,DoDD 2000.13,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/200013p.pdf?v...,8dab2540c02b0e1d15a5df4c92ea8b24a1f3a182bafe6e400d066f1aa8dd1b05,0,This directive reissues DoD Directive (DoDD) 2000.13 (Reference (a)) to upda...,7c2ae16323b2d90e1f4f745ed18ff981,"Department of Defense DIRECTIVE NUMBER 5100.01 December 21, 2010 Incorporati...",dod_issuances,Functions of the Department of Defense and its Major Components,DoDD 5100.01,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/510001p.pdf?v...,11566778fb6fa013db6324e8b06a0fe46434decd74fe8b0f7ab5ecc5a2768de5,0,This Directive: a.Reissues DoD Directive (DoDD) 5100.1 (Reference (a)).,0.000195,0.000257,0.999548
1,6ab31688679de8b8bb17a31dc39149c1,"Department of Defense INSTRUCTION NUMBER 1400.25, Volume 2013 April 17, 2012...",dod_issuances,Civilian Air Traffic Controllers (ATCs),331,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/140025/140025_vol3...,510e7ebd00a47c9e8e528837a0b31bf446893fb4fa915f0d890b25f7b6ded1b3,0,"Department of Defense INSTRUCTION NUMBER 1400.25, Volume 2013 April 17, 2012...",1135f6e1821c099878a633aa752b7595,"Department of Defense INSTRUCTION NUMBER 1400.25, Volume 451 November 4, 201...",dod_issuances,Awards,451,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/140025/140025_vol4...,4610810fc70811563a89c65e5d27c0bb90a713bda321546cf456ad1f7f6b7b0e,0,"Department of Defense INSTRUCTION NUMBER 1400.25, Volume 451 November 4, 201...",5.5e-05,0.0004,0.999545
2,c1feed26381cb1db40cf683aa8621e34,This instruction is available on the Directives Division Website at https://...,dod_issuances,"Military Officer Actions Requiring Presidential, Secretary of Defense, or Un...",DoDI 1320.04,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/132004p.pdf,3bc7f996335f339c23b84e1e6276aefc2f41cdf103bd541c9ac1043c0d631f09,3,"This instruction is effective January 3, 2014.",fb9ba053bced31cbee8703ba7edd427f,This Instruction is approved for public release and is available on the Dire...,dod_issuances,Voluntary Education Programs,DoDI 1322.25,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/132225p.pdf,0c1b0c20b979b9075e85264650aa4341a319a2c4c677f89045bd183bfc1d6f48,6,"This Instruction is effective March 15, 2011.",4.6e-05,0.000415,0.999538
3,e3051a4941aa3bd11e5313c0a5bb7a27,7.RELEASABILITY. Cleared for public release. This AI is available on the DoD...,dod_issuances,Personnel and Data Management Information Reporting Policies and Procedures ...,AI 101,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/ai/a101p.pdf?ver=g...,fc52dc5371508acdfa0d379ded8ae6860ab00925a4d7ddab784487ca3a38d25d,3,"This change is administrative and, in accordance with the February 1, 2022 W...",325e3fd7eb7b5c95d448e3f36e74c08,"This change is administrative and, in accordance with the March 21, 2022 Was...",dod_issuances,Department of Defense Spirit of Hope (SOH) Award,DoDI 1005.14,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/100514p.pdf?v...,60aaa18fd0c8ca82996701c28047a19cd51e6ad37e1eb443939b199f47929170,4,"This change is administrative and, in accordance with the March 21, 2022 Was...",0.000181,0.000288,0.999531
4,9062666acb355dd4158fa24e98a803b7,Cleared for public release. This instruction is available on the Directives ...,dod_issuances,Educational Requirements for Appointment to a Grade Above First Lieutenant o...,DoDI 1215.17,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/121517p.pdf,2b3c68840aaf3c4371f14c34fd73a5efaf842c6a68df6be1aafe3c2742a73383,3,"This instruction is effective January 29, 2014.",11634d82384910858041f035b890ecf1,Cleared for public release. This instruction is available on the Directives ...,dod_issuances,Defense Courier Operations (DCO),DoDI 5200.33,https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/520033p.pdf?v...,2d30f1adffc42cfbc2d0a54bc6cd4be8306b7d6f9dff40fbc73c6567524fb516,4,"This instruction is effective June 30, 2011.",4.4e-05,0.000443,0.999513


In [None]:
# Print each candidate: its row label in parentheses, the formatted candidate,
# then blank lines as a separator.
for row_label, candidate_row in candidate_info.iterrows():
    print(f"({row_label})")
    scoring.pretty_print_candidate(candidate_row)
    print('\n\n')