# Basic Similarity Pipeline

In [25]:
import pandas as pd
import spacy
import numpy as np

# Load the report corpus for one organization (NIFA) from a zipped JSON file of records
df = pd.read_json(
    "data/02. Data Sets/NIFA/contradictions_datasets_nifa_reports.zip",
    orient='records',
    compression='infer',
)

# Feature engineering: join each row's `text_by_page` entries with single spaces
# into one `fulltext` string (assumes every entry is a list of page strings -- TODO confirm)
df['fulltext'] = df['text_by_page'].str.join(' ')

In [3]:
# Load spaCy's small English pipeline.
nlp = spacy.load(name='en_core_web_sm')

# Switch on the "senter" (SentenceRecognizer) component as well.
# NOTE(review): the dependency parser stays active too, and it also assigns sentence
# boundaries -- confirm whether both components are really wanted here.
nlp.enable_pipe("senter")

# Show the (name, component) pairs of the active pipeline.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fdf5f9c1c60>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fdf5f9c1de0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fdf5fabaab0>),
 ('senter', <spacy.pipeline.senter.SentenceRecognizer at 0x7fdf5f9c1900>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fdf5f9f38c0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fdf5f7cb000>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fdf5fabad50>)]

In [73]:
# Parse only the first 10 full-text reports; nlp.pipe streams the texts through the pipeline
docs = [parsed for parsed in nlp.pipe(df['fulltext'].iloc[:10])]
len(docs)

10

In [74]:
# Keep handles on the first two parsed documents
d0, d1 = docs[0], docs[1]

In [75]:
# Pairwise cosine similarity between the document-level vectors
from sklearn.metrics.pairwise import cosine_similarity

# One vector per parsed document.
# NOTE(review): `en_core_web_sm` ships without static word vectors, so `.vector` is
# presumably derived from the pipeline's context tensors -- confirm this is good enough.
vectors = [parsed.vector for parsed in docs]

# Square matrix: entry [i, j] compares docs[i] with docs[j]
sim = cosine_similarity(vectors)
sim

array([[0.9999998 , 0.90701205, 0.45180175, 0.9628214 , 0.92784864,
        0.96166295, 0.963816  , 0.9551968 , 0.53328365, 0.93974704],
       [0.90701205, 1.        , 0.61923206, 0.91946745, 0.9413703 ,
        0.90639913, 0.93182683, 0.9383831 , 0.761079  , 0.9522818 ],
       [0.45180175, 0.61923206, 0.99999994, 0.46749824, 0.5520142 ,
        0.46073174, 0.487793  , 0.52700514, 0.81139284, 0.59623885],
       [0.9628214 , 0.91946745, 0.46749824, 1.        , 0.9816299 ,
        0.94153774, 0.94554734, 0.98668694, 0.60547584, 0.9699452 ],
       [0.92784864, 0.9413703 , 0.5520142 , 0.9816299 , 0.9999999 ,
        0.9203201 , 0.9271325 , 0.98271453, 0.69292617, 0.9785352 ],
       [0.96166295, 0.90639913, 0.46073174, 0.94153774, 0.9203201 ,
        1.0000001 , 0.98662245, 0.95091724, 0.56642133, 0.9402379 ],
       [0.963816  , 0.93182683, 0.487793  , 0.94554734, 0.9271325 ,
        0.98662245, 1.0000002 , 0.95057523, 0.6135747 , 0.94574773],
       [0.9551968 , 0.9383831 , 0.5270051

In [76]:
# Get similarity scores between all sentences of a subset of the documents

# Every second document among the first three, i.e. docs[0] and docs[2]
desired_docs = docs[:3:2]

# Flatten the selected documents into a single list of sentence spans
sents = [sent for parsed in desired_docs for sent in parsed.sents]
vectors_sents = [sent.vector for sent in sents]

sim_sents = cosine_similarity(vectors_sents)
print(sim_sents.shape)

# Zero out the diagonal entries (each sentence compared with itself)
np.fill_diagonal(sim_sents, 0)
sim_sents

(820, 820)


array([[0.        , 0.65993017, 0.69959444, ..., 0.67886394, 0.6814903 ,
        0.67463917],
       [0.65993017, 0.        , 0.51466626, ..., 0.35471618, 0.35768148,
        0.36253226],
       [0.69959444, 0.51466626, 0.        , ..., 0.4233155 , 0.42655492,
        0.4140238 ],
       ...,
       [0.67886394, 0.35471618, 0.4233155 , ..., 0.        , 0.99887544,
        0.99858356],
       [0.6814903 , 0.35768148, 0.42655492, ..., 0.99887544, 0.        ,
        0.9981862 ],
       [0.67463917, 0.36253226, 0.4140238 , ..., 0.99858356, 0.9981862 ,
        0.        ]], dtype=float32)

In [77]:
# Locate the pair of *distinct* sentences with the lowest cosine similarity.
# The diagonal (a sentence paired with itself) is masked with +inf on a copy, so a
# self-pair can never be chosen as the minimum -- with a zero-filled diagonal that
# would happen whenever every off-diagonal score is non-negative.
dissim_candidates = sim_sents.copy()
np.fill_diagonal(dissim_candidates, np.inf)

# argmin works on the flattened matrix; unravel_index maps it back to (row, col)
most_dissimilar_idx = np.unravel_index(dissim_candidates.argmin(), dissim_candidates.shape)
print(most_dissimilar_idx)

# Print the text of both sentences in the pair
for i in most_dissimilar_idx:
    print(sents[i].text)

(298, 345)
Appraisals.
To obtain prior approval, the grantee must submit the following written information regarding each contract to the Authorized Departmental Officer as soon as the contractor has been selected:


In [78]:
# Locate the pair of *distinct* sentences with the highest cosine similarity.
# The diagonal (a sentence paired with itself) is masked with -inf on a copy, so a
# self-pair can never be chosen as the maximum -- with a zero-filled diagonal that
# would happen whenever no off-diagonal score is positive.
sim_candidates = sim_sents.copy()
np.fill_diagonal(sim_candidates, -np.inf)

# argmax works on the flattened matrix; unravel_index maps it back to (row, col)
most_similar_idx = np.unravel_index(sim_candidates.argmax(), sim_candidates.shape)
print(most_similar_idx)

# Print the text of both sentences in the pair
for i in most_similar_idx:
    print(sents[i].text)

(817, 818)
Agriculture, Natural Resources and Gourneau, Haven, 6 Biological Engineering Hamley, Mark R., 17 Antonio, Thomas M., 7 Kowalkowski, Brian, 4 Carr, Charlene, 7, 20 Yarlott, David, 9 Crebs, Douglas, 16 Dahlberg, Steve, 19 Community Development Duafala, Carrie Ann, 3 Agnew, Wanda, 18 Dupuis, Virgil, 13 Brower, Pearl, 7 Etter, Steven, 10 Dupuis, Virgil, 13 Hafer, James, 3 Gourneau, Haven, 6 Hamley, Mark R., 17 Halvorson, Gary, 15, 20 Henry, Leslie Rae, 12 Hamley, Mark R., 17 Guinn, Mandy, 18 Henry, Leslie Rae, 12 Kinley, Sharon, 11 Kinley, Sharon, 11 Lichtenberg, Janene, 13 Kowalczak, Courtney, 6 Marlow, Amber, 8 Lindquist, Cynthia, 3 Mongoh, Mafany Ndiva, 15 Marlow, Amber, 8 Quijada-Mascarenas, Adrian, 17 Mongoh, Mafany Ndiva, 15 Sells, Angeline B., 16 Red Owl, Sherry, 14 St. Pierre, Mary Ruth, 16 Sineway, Carla, 12 Woodard, Janyce, 9 St. Pierre, Nathaniel, 16 Yanni, Steve, 2 Yanni, Steve, 2 Animals and Their Systems Community Gardening and Daye, Germaine, 10 Horticulture Duafa