This sheets answers the question of: out of the best models, how similar are the top X% of the results.  The best models are mpnet_base_v2, roberta, and scispacy, since all three of these models have the greatest z-scores from the noise.  See Noise_to_Related_Claims_Histogram_DrugLabelsandPatent.ipynb.

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
zip_path='/content/drive/MyDrive/Colab Notebooks/zip/db2file.zip'
!cp "{zip_path}" .
!cp "/content/drive/MyDrive/Colab Notebooks/requirements.txt" .
!unzip -q db2file.zip
!rm db2file.zip
!rm -r en_core_sci_lg-0.4.0.zip
!cp "/content/drive/MyDrive/Colab Notebooks/zip/en_core_sci_lg-0.4.0.zip" .
!unzip -q en_core_sci_lg-0.4.0.zip
!rm en_core_sci_lg-0.4.0.zip

Mounted at /content/drive
rm: cannot remove 'en_core_sci_lg-0.4.0.zip': No such file or directory


In [None]:
!pip install -r '/content/requirements.txt'

Collecting beautifulsoup4==4.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |██▉                             | 10kB 23.2MB/s eta 0:00:01[K     |█████▋                          | 20kB 15.8MB/s eta 0:00:01[K     |████████▌                       | 30kB 14.0MB/s eta 0:00:01[K     |███████████▎                    | 40kB 12.7MB/s eta 0:00:01[K     |██████████████▏                 | 51kB 7.9MB/s eta 0:00:01[K     |█████████████████               | 61kB 8.3MB/s eta 0:00:01[K     |███████████████████▉            | 71kB 8.6MB/s eta 0:00:01[K     |██████████████████████▋         | 81kB 9.6MB/s eta 0:00:01[K     |█████████████████████████▌      | 92kB 8.7MB/s eta 0:00:01[K     |████████████████████████████▎   | 102kB 7.7MB/s eta 0:00:01[K     |███████████████████████████████▏| 112kB 7.7MB/s eta 0:00:01[K     |██████████████████████████████

In [None]:
import random
import os
random.seed(30)
db2files = "/content/db2file/"
NDA_list=[f for f in os.listdir(db2files)]

In [None]:
def get_lines_in_file(file_name):
  if os.path.exists(file_name):
    f = open(file_name, "rb")
    return_list = [str(line.decode('unicode_escape')) for line in f if line.decode('unicode_escape').strip()]
    f.close()
    return return_list
  else:
    return []

In [None]:
def flat_list(lst):
  return [item for sublist in lst for item in sublist]

def get_additions(NDA, additions_folder_name):
  """ 
  Return all additions as a list for the set-id with most additions for a NDA 
  excluding the first addition.
  Parameters:
      NDA (string): NDA dir
      additions_folder_name (string): either 'just_additions' or 'additions_with_context'
  """
  if additions_folder_name not in ['just_additions', 'additions_with_context']:
    print(f"Parameter {additions_folder_name} not in ['just_additions', 'additions_with_context']")
    return []
  NDA_dir=db2files+str(NDA)+'/'
  set_id_dirs=[f for f in os.listdir(NDA_dir)]
  try:
    set_id_dirs.remove('patents')
  except ValueError:
    pass
  additions_list=[]
  for set_id_dir in set_id_dirs:
    additions_dir=NDA_dir+set_id_dir+'/'+additions_folder_name+'/'
    if os.path.exists(additions_dir):
      additions_files=sorted([additions_dir+f for f in os.listdir(additions_dir)])[1:]
      additions_list_tmp=flat_list([get_lines_in_file(file) for file in additions_files])
      if len(additions_list_tmp)> len(additions_list):
        additions_list=additions_list_tmp
  return additions_list

def get_patent_claims(NDA, patents_folder_name):
  """Return a list of patents claims for a NDA
  Parameters:
      NDA (string): NDA dir
      patents_folder_name (string): either 'patents' or 'patents_longhand'
  """
  patent_dir=db2files+str(NDA)+'/'+patents_folder_name+'/'
  if os.path.exists(patent_dir):
    patent_files=[patent_dir+f for f in os.listdir(patent_dir)]
    return flat_list([get_lines_in_file(file) for file in patent_files])
  return []
    

In [None]:
random_NDA_list=random.sample(NDA_list, int(len(NDA_list)*.33))
print(len(NDA_list), len(random_NDA_list), random_NDA_list[1])

1606 529 202895-21976


In [None]:
# narrow 1/3 of random data to NDA with patents and NDAs with additions.  If either is missing, we cannot check quality of additions to related patents.
random_NDA_list=[x for x in random_NDA_list if get_patent_claims(x, 'patents') and get_additions(x, 'additions_with_context')]

In [None]:
print(len(random_NDA_list))

292


In [None]:
def return_match(NDA, additions_folder_name, patent_folder_name, scoring_method_list, cutoff_percentage=.1):
  """
  This method returns {"method_name":{"additions_len": X, "claims_len": X, "matches": [[additions_num, claim_num],] }}
  Parameters:
    random_NDA_list (list): list of NDA numbers
    additions_folder_name (string): either 'patents' or 'patents_longhand'
    patent_folder_name (string): either 'just_additions' or 'additions_with_context'
    scoring_method_list (list): list of [[function that is use to score similarity, optional_scoring_method_field],..]
  """

  # scoring_method_result_dict={ "scoring_method.__name__": {"additions_len": X, "claims_len": X, "matches":[(addition, claim), }}
  scoring_method_result_dict={}

  for i in range(len(scoring_method_list)):
    scoring_method=scoring_method_list[i][0]
    optional_scoring_method_field=scoring_method_list[i][1]
    optional_scoring_method_field_name=scoring_method_list[i][2]

    claims = get_patent_claims(NDA, patent_folder_name)
    additions = get_additions(NDA, additions_folder_name)

    matrix=scoring_method(additions, claims, optional_scoring_method_field)
    # get top 10 percent of each row/addition of the matrix.
    top_10 = top_10_percent_matrix(matrix, cutoff_percentage)
    key = optional_scoring_method_field_name if optional_scoring_method_field else scoring_method.__name__
    scoring_method_result_dict[key]={
        "claims_len":len(claims),
        "matches": top_10,
    }
  return scoring_method_result_dict

import math



In [None]:
def top_10_percent_matrix(matrix, cutoff_percentage):
  """ Return a matrix: ie. [[claim_num_index, claim_num_index],] for top 10% of each match.
        The rows represents additions.

  Parameters:
    matrix (list of list): a matrix where the rows represents additions, and the columns, claims for all additions to all claims
  """
  return_matrix_of_index=[]
  for row in matrix:
    length_10percent=math.ceil(len(row)*cutoff_percentage)
    # get indices of top 10 in max_scores_list
    indexes=sorted(range(len(row)), key=lambda i: row[i], reverse=True)[:length_10percent]
    return_matrix_of_index.append(indexes)
  return return_matrix_of_index


In [None]:
from sentence_transformers import SentenceTransformer, util

def scoring_method_bert(additions, claims, model):
  """ Returns a list of [[additions_claim, claim_num],]
  """
  # Compute embedding for both lists
  additions_embeddings = model.encode(
      additions,
      convert_to_tensor=True,
  )
  claims_embeddings = model.encode(
      claims,
      convert_to_tensor=True,
  )
  # Compute cosine-similarity for every additions to every claim
  cosine_scores = util.pytorch_cos_sim(
      additions_embeddings, claims_embeddings
  ).tolist()
  return cosine_scores


In [36]:
import spacy
gpu = spacy.prefer_gpu()
print('GPU:', gpu)
!pip install -U spacy[cuda101]
_N_PROCESS = 1
_en_core_sci_lg_nlp = spacy.load("/content/en_core_sci_lg-0.4.0/en_core_sci_lg/en_core_sci_lg-0.4.0")
!python -m spacy download en_core_web_trf
nlp_en = spacy.load('en_core_web_trf')

GPU: True
Requirement already up-to-date: spacy[cuda101] in /usr/local/lib/python3.7/dist-packages (3.0.6)
2021-05-05 19:04:14.015027: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [37]:

def preprocess_with_spacy_nlp(text_list, steps, nlp=nlp_en):
    """
    This method can remove punctuation,
    Parameters:
        text_list (list): list of strings
        steps (list): one of ["punct", "lemma", "stopwords"]
    """
    # make a copy of text_lis
    return_list = text_list
    if any(item in ["punct", "lemma", "stopwords"] for item in steps):
        # 'lemmatizer' required 'tagger' and 'attribute_ruler'
        nlp_list = list(
            nlp.pipe(
                return_list,
                disable=["tok2vec", "ner"],
                n_process=_N_PROCESS,
            )
        )
        return_list = [
            " ".join(
                [
                    token.lemma_ if "lemma" in steps else token.text
                    for token in doc
                    if (
                        (
                            ("punct" in steps and not token.is_punct)
                            or "punct" not in steps
                        )
                        and (
                            ("stopwords" in steps and not token.is_stop)
                            or "stopwords" not in steps
                        )
                    )
                ]
            )
            for doc in nlp_list
        ]
    return return_list


def similarity_matrix(embed_A_list, embed_B_list):
    """
    This method returns a matrix such as:
        [[X, X, X],
        [X, X, X]]
    wherein each row represents the similarity measurement between an embedding
    from embed_A_list to each of the embeddings in embed_B_list.

    Parameters:
        embed_A_list (list): list of NLP object generated by spaCy
        embed_B_list (list): list of NLP object generated by spaCy to be
                             compared to embed_A
    """
    matrix = [[0] * len(embed_B_list) for y in range(len(embed_A_list))]
    for i in range(len(embed_A_list)):
        for j in range(len(embed_B_list)):
            matrix[i][j] = embed_A_list[i].similarity(embed_B_list[j])
    return matrix


def scoring_method_spacy(additions, claims, nlp=nlp_en):
  """ Scores with spaCy; return a matrix of similar scores
  """
  additions=preprocess_with_spacy_nlp(additions, ["punct", "lemma", "stopwords"], nlp)
  claims=preprocess_with_spacy_nlp(claims, ["punct", "lemma", "stopwords"], nlp)
  # Compute embedding for both lists
  # tokenization only requires tok2vec
  disabled_list = ["tagger", "attribute_ruler", "lemmatizer", "parser", "ner"]
  additions_embeddings = list(
      nlp.pipe(
          additions, disable=disabled_list, n_process=_N_PROCESS
      )
  )
  claims_embeddings = list(
      nlp.pipe(
          claims, disable=disabled_list, n_process=_N_PROCESS
      )
  )
  # Compute cosine-similarity for every additions to every claim
  return similarity_matrix(additions_embeddings, claims_embeddings)


In [38]:
_device = None

model_mpnet_base_v2 = SentenceTransformer("stsb-mpnet-base-v2", device=_device)
model_mpnet_base_v2.zero_grad()
# Will limit size since CUDA runs out of memory
model_mpnet_base_v2.max_seq_length=512

model_roberta_base_v2 = SentenceTransformer("stsb-roberta-base-v2", device=_device)
model_roberta_base_v2.zero_grad()
# Will limit size since CUDA runs out of memory
model_roberta_base_v2.max_seq_length=512



In [44]:

scoring_method_list=[
                     [scoring_method_bert, model_mpnet_base_v2, "model_mpnet_base_v2"],
                     [scoring_method_bert,model_roberta_base_v2, "model_roberta_base_v2"],
                     [scoring_method_spacy,nlp_en, "en_core_web_trf"],
                     [scoring_method_spacy,_en_core_sci_lg_nlp, "_en_core_sci_lg_nlp"],]

NDA=random_NDA_list[3]

# return_match_result={"method_name":{"claims_len": X, "matches": [[claim_index, claim_index],] }}, where matches are index/specific line in the patent file when all files are joined together
return_match_result=return_match(NDA, "additions_with_context", "patents", scoring_method_list, cutoff_percentage=.1)
print(return_match_result)




{'model_mpnet_base_v2': {'claims_len': 176, 'matches': [[101, 112, 109, 164, 22, 142, 39, 51, 0, 114, 118, 110, 103, 175, 113, 115, 111, 159], [92, 78, 68, 65, 154, 79, 97, 64, 34, 91, 60, 70, 49, 81, 61, 87, 82, 50], [101, 112, 109, 142, 22, 164, 39, 51, 0, 114, 118, 115, 110, 175, 111, 103, 113, 159], [131, 91, 61, 65, 96, 86, 76, 31, 81, 23, 62, 119, 78, 35, 27, 82, 60, 85], [128, 129, 170, 130, 42, 138, 168, 167, 86, 87, 139, 26, 165, 140, 23, 35, 27, 55], [101, 109, 112, 22, 142, 39, 0, 164, 51, 110, 111, 114, 115, 175, 118, 75, 159, 73], [142, 0, 22, 109, 101, 39, 112, 164, 51, 175, 111, 114, 2, 144, 159, 75, 162, 143], [0, 142, 101, 112, 39, 109, 51, 22, 164, 175, 111, 114, 143, 75, 110, 35, 96, 159]]}, 'model_roberta_base_v2': {'claims_len': 176, 'matches': [[101, 109, 112, 142, 118, 115, 0, 103, 22, 39, 51, 164, 75, 175, 117, 143, 59, 1], [87, 97, 92, 54, 82, 62, 84, 94, 52, 51, 99, 30, 55, 38, 59, 40, 53, 89], [101, 109, 142, 112, 118, 115, 0, 22, 103, 39, 164, 117, 51, 75, 1

In [46]:
# Remove en_core_web_trf, since results indicated that there is no vectors
scoring_method_list=[
                     [scoring_method_bert, model_mpnet_base_v2, "model_mpnet_base_v2"],
                     [scoring_method_bert,model_roberta_base_v2, "model_roberta_base_v2"],
                     [scoring_method_spacy,_en_core_sci_lg_nlp, "_en_core_sci_lg_nlp"],]

multi_return_match_result=[]
for i in range(len(random_NDA_list)):
  print(f"processing {str(i)} of {str(len(random_NDA_list))}")
  NDA=random_NDA_list[i]
  multi_return_match_result.append(return_match(NDA, "additions_with_context", "patents", scoring_method_list, cutoff_percentage=.1))

processing 0 of 292
processing 1 of 292
processing 2 of 292
processing 3 of 292
processing 4 of 292
processing 5 of 292
processing 6 of 292
processing 7 of 292
processing 8 of 292
processing 9 of 292
processing 10 of 292
processing 11 of 292
processing 12 of 292
processing 13 of 292
processing 14 of 292
processing 15 of 292
processing 16 of 292
processing 17 of 292
processing 18 of 292
processing 19 of 292
processing 20 of 292
processing 21 of 292
processing 22 of 292
processing 23 of 292
processing 24 of 292
processing 25 of 292
processing 26 of 292
processing 27 of 292
processing 28 of 292
processing 29 of 292
processing 30 of 292
processing 31 of 292
processing 32 of 292
processing 33 of 292
processing 34 of 292
processing 35 of 292
processing 36 of 292
processing 37 of 292
processing 38 of 292
processing 39 of 292
processing 40 of 292
processing 41 of 292
processing 42 of 292
processing 43 of 292
processing 44 of 292
processing 45 of 292
processing 46 of 292
processing 47 of 292
pr

In [52]:
print(multi_return_match_result[0])
import json
json=json.dumps(multi_return_match_result)
f=open("multi_match_result.json", "w")
f.write(json)
f.close()

{'model_mpnet_base_v2': {'claims_len': 121, 'matches': [[58, 31, 71, 63, 59, 56, 32, 50, 52, 54, 55, 15, 67], [58, 31, 63, 53, 56, 71, 55, 32, 51, 59, 50, 54, 60], [63, 58, 53, 51, 0, 31, 56, 54, 15, 33, 60, 71, 52], [58, 53, 51, 31, 56, 54, 71, 52, 59, 55, 32, 63, 50], [84, 2, 119, 75, 105, 114, 104, 117, 116, 3, 115, 15, 46], [84, 2, 114, 104, 111, 119, 102, 116, 107, 110, 117, 81, 115], [58, 31, 71, 63, 53, 32, 67, 15, 59, 51, 60, 0, 56], [30, 28, 25, 95, 58, 31, 27, 0, 63, 53, 26, 91, 108], [74, 72, 63, 20, 104, 102, 33, 15, 92, 60, 58, 51, 57], [49, 104, 5, 102, 111, 92, 74, 84, 116, 43, 72, 80, 34], [117, 104, 116, 49, 119, 111, 74, 102, 39, 92, 15, 114, 33], [104, 102, 15, 33, 119, 4, 74, 114, 31, 60, 63, 0, 58], [104, 102, 107, 58, 114, 15, 74, 110, 33, 31, 119, 113, 120], [20, 57, 60, 19, 92, 91, 39, 75, 105, 58, 79, 109, 50], [74, 72, 104, 102, 15, 33, 0, 107, 120, 90, 63, 103, 20], [104, 102, 58, 15, 110, 112, 74, 72, 33, 31, 107, 63, 87], [58, 31, 91, 92, 63, 51, 60, 90, 87

In [69]:
def mean(lst):
  return sum(lst)/len(lst)

def return_mean_percent_same_for_each_nda(multi_return_match_result):
  # calculate percentage that match
  # percent_same_for_each_nda is a list of percentages of claims that are the same for each addtion for each NDA
  percent_same_for_each_nda=[]
  for NDA in multi_return_match_result:
    # matches = [[[claim_index1,claim_index2,],],] wherein the outermost list represents list for each model, the second outermost list represented the additions
    matches=[]
    model_names=[]
    # breakup dict into two lists
    for model_name, value in NDA.items():
      model_names.append(model_name)
      matches.append(value['matches'])
    if not matches or not matches[0] or not matches[0][0]:
      continue

    models_len=len(matches)
    additions_len=len(matches[0])
    claims_len=len(matches[0][0])

    # additions_percent_same is a list of the percentage of additions that have same claim_indexes across multiple models for each addition in an NDA
    additions_percent_same=[]
    for a in range(additions_len):
      # addition_matches=[[claim_index1,claim_index2,],] wherein each row represents a different model
      addition_matches=[]
      for m in range(models_len):
        addition_matches.append(matches[m][a])
      # count number of claim_index that is the same across multiple models for same addition
      same_claims=addition_matches[0]
      same_claims_orig_len=len(same_claims)
      assert same_claims_orig_len==claims_len
      for i in range(1,len(addition_matches)):
        same_claims=list(set(same_claims).intersection(addition_matches[i]))
      percent_same=len(same_claims)/same_claims_orig_len
      additions_percent_same.append(percent_same)
    
    percent_same_for_each_nda.append(mean(additions_percent_same))
  return mean(percent_same_for_each_nda)

print("Mean percent_same_for_each_nda: ", return_mean_percent_same_for_each_nda(multi_return_match_result))
  

Mean percent_same_for_each_nda:  0.3022827958802241


The above indicates that if three models are used, the top 10% of the results are shared by 30% of the models.  The following indicates if 2 models are using the results are shared by 55% of the two models.

In [74]:
scoring_method_list=[
                     [scoring_method_bert, model_mpnet_base_v2, "model_mpnet_base_v2"],
                     [scoring_method_bert,model_roberta_base_v2, "model_roberta_base_v2"],
                     ]

multi_return_match_result=[]
for i in range(len(random_NDA_list)):
  # print(f"processing {str(i)} of {str(len(random_NDA_list))}")
  NDA=random_NDA_list[i]
  multi_return_match_result.append(return_match(NDA, "additions_with_context", "patents", scoring_method_list, cutoff_percentage=.1))

print("Mean percent_same_for_each_nda: ", return_mean_percent_same_for_each_nda(multi_return_match_result))

Mean percent_same_for_each_nda:  0.5502248546898962


If a single model is use, the following should report 1, since all matches are the same to itself.

In [75]:
scoring_method_list=[
                     [scoring_method_bert, model_mpnet_base_v2, "model_mpnet_base_v2"],
                     ]

multi_return_match_result=[]
for i in range(len(random_NDA_list)):
  # print(f"processing {str(i)} of {str(len(random_NDA_list))}")
  NDA=random_NDA_list[i]
  multi_return_match_result.append(return_match(NDA, "additions_with_context", "patents", scoring_method_list, cutoff_percentage=.1))

print("Mean percent_same_for_each_nda: ", return_mean_percent_same_for_each_nda(multi_return_match_result))

Mean percent_same_for_each_nda:  1.0


If the cutoff percentage increases, then the number of matching claims should increase.

In [79]:
scoring_method_list=[
                      [scoring_method_bert, model_mpnet_base_v2, "model_mpnet_base_v2"],
                     [scoring_method_bert,model_roberta_base_v2, "model_roberta_base_v2"],
                     [scoring_method_spacy,_en_core_sci_lg_nlp, "_en_core_sci_lg_nlp"]
                     ]

multi_return_match_result=[]
for i in range(len(random_NDA_list)):
  # print(f"processing {str(i)} of {str(len(random_NDA_list))}")
  NDA=random_NDA_list[i]
  multi_return_match_result.append(return_match(NDA, "additions_with_context", "patents", scoring_method_list, cutoff_percentage=.2))

print("Mean percent_same_for_each_nda: ", return_mean_percent_same_for_each_nda(multi_return_match_result))

Mean percent_same_for_each_nda:  0.38887973947123505


In [78]:
scoring_method_list=[
                      [scoring_method_bert, model_mpnet_base_v2, "model_mpnet_base_v2"],
                     [scoring_method_bert,model_roberta_base_v2, "model_roberta_base_v2"],
                     [scoring_method_spacy,_en_core_sci_lg_nlp, "_en_core_sci_lg_nlp"]
                     ]

multi_return_match_result=[]
for i in range(len(random_NDA_list)):
  # print(f"processing {str(i)} of {str(len(random_NDA_list))}")
  NDA=random_NDA_list[i]
  multi_return_match_result.append(return_match(NDA, "additions_with_context", "patents", scoring_method_list, cutoff_percentage=.3))

print("Mean percent_same_for_each_nda: ", return_mean_percent_same_for_each_nda(multi_return_match_result))

Mean percent_same_for_each_nda:  0.45128598488193755
