In [211]:
import json
import re
import string
import sys

In [212]:
#Extract epitope residues 
class EpitopeExtractRegex:
    """Extract epitope residue positions from the claims text of a patent file.

    Typical call order (each step stores its result on the instance):
    ``loadJson`` -> ``findRelevantSentence`` -> ``extractWords`` ->
    ``extractSeqID`` -> ``seqIDtoDict`` -> ``fillEpitopeDict`` ->
    ``epitopeDictToJson``.

    Attributes set in ``__init__``:
        antigen, patentID, patentJson: the constructor arguments, stored as-is.
        epitopeDictionary: result dict with the keys "antigen", "patentID",
            "residuesPositionsMarked" (list of int) and "seqID".
        bindingPattern: compiled claim-sentence regexes, one per supported
            patent wording.
        outputJson: output file name, "extracted" + patentID + ".json".
    """

    # A residue token is an optional one-letter prefix followed by digits
    # ("E10", "237"), optionally a range to a second such item ("123-132",
    # "S153-S381"). Anything else ("and", "of", "PCSK9", "95%") is not a residue.
    _RESIDUE_TOKEN = re.compile(r'[A-Za-z]?(\d+)(?:-[A-Za-z]?(\d+))?')

    #Init the class
    def __init__(self, antigen, patentID, patentJson):
        """Store the inputs and prepare the result dictionary and regexes.

        Args:
            antigen: antigen name copied into the result dictionary.
            patentID: patent identifier; also used to build the output file name.
            patentJson: path of the patent file read by ``loadJson``.
        """
        self.antigen = antigen
        self.patentID = patentID
        self.patentJson = patentJson
        self.keyList = ["antigen","patentID","residuesPositionsMarked","seqID"]
        self.epitopeDictionary = dict.fromkeys(self.keyList)
        self.epitopeDictionary["antigen"] = antigen
        self.epitopeDictionary["patentID"] = patentID
        self.epitopeDictionary["residuesPositionsMarked"] = []

        #US9574011
        self.bindingString = r'''([^.]*?The invention claimed is:(.*)antibody specifically binds an epitope comprising residues[^.]*\.)'''
        #US8829165
        self.bindingString1 = r'''([^.]*?What is claimed is:(.*)antibody binds to at least one of the following residues[^.]*\.)'''
        #US8859741
        self.bindingString2 = r'''([^.]*?What is claimed is:(.*)antibody binds an epitope on(.*)comprising at least one of residues[^.]*\.)'''
        #US8563698
        self.bindingString3 = r'''([^.]*?What is claimed is:(.*)antibody binds to at least one residue within the sequence set forth by residues[^.]*\.)'''
        #US10023654
        self.bindingString4 = r'''([^.]*?We claim:(.*)antibody or antigen-binding fragment of the antibody binds an epitope comprising amino acid residue[^.]*\.)'''

        self.bindingPattern = [re.compile(p) for p in [self.bindingString, self.bindingString1, self.bindingString2, self.bindingString3, self.bindingString4]]
        self.outputJson = ''.join(("extracted", patentID, ".json"))

    #Load patent from json
    def loadJson(self):
        """Read the patent file as raw text with all newlines removed.

        The content is not parsed as JSON; the regexes run on the raw text.

        Returns:
            The file content as a single string (also stored in ``self.data``).

        Raises:
            SystemExit: if the file cannot be opened. The exit status is 1 so
                that a script run reports the failure.
        """
        try:
            self.jsonfile = open(self.patentJson)
        except OSError:
            print ("Could not open/read file:", self.patentJson)
            sys.exit(1)

        with self.jsonfile:
            self.data = self.jsonfile.read().replace('\n', '')
        return self.data

    #Find the relevant sentence
    def findRelevantSentence(self):
        """Locate the claim sentence that lists the epitope residues.

        Every pattern in ``self.bindingPattern`` is tried against ``self.data``;
        when several match, the last matching pattern wins. Each match is a
        tuple of regex groups, and the result is the ``str()`` of those tuples
        joined with commas.

        Returns:
            The joined match text (also stored in ``self.sentenceToEvaluate``).

        Raises:
            ValueError: if none of the patterns matches the loaded text.
        """
        matches = None
        for regex in self.bindingPattern:
            # Search once per pattern and keep the result.
            found = regex.findall(self.data)
            if found:
                matches = found
        if matches is None:
            raise ValueError(
                "No claim sentence describing epitope residues was found in "
                + str(self.patentJson)
            )
        self.sentenceToEvaluate = ','.join(str(v) for v in matches)
        return self.sentenceToEvaluate

    #Preprocess the required string to find positions marked
    def extractWords(self):
        """Split the residue list of the sentence into whitespace-separated tokens.

        The residue list is the text between the first "residues" (or
        "residue", when the whole word "residues" is absent) and the next
        "SEQ ID".

        Returns:
            The list of tokens (also stored in ``self.words``).

        Raises:
            IndexError: if the sentence contains no "residue(s)" marker.
        """
        sentence = self.sentenceToEvaluate
        marker = "residues" if re.search(r'\bresidues\b', sentence) else "residue"
        self.extractedString = sentence.split(marker)[1].split("SEQ ID")[0]

        self.words = self.extractedString.split()
        return self.words

    #Extract Seq ID
    def extractSeqID(self):
        """Extract the sequence identifier that follows "SEQ ID NO:".

        With the "residues" wording the identifier ends at the first "." or ",";
        otherwise it is expected in the parenthesised form "(SEQ ID NO: n).".

        Returns:
            The identifier as a stripped string (also stored in
            ``self.extractedSeqID``).

        Raises:
            IndexError: if the expected "SEQ ID NO:" marker is missing.
        """
        sentence = self.sentenceToEvaluate
        if re.search(r'\bresidues\b', sentence):
            seqID = sentence.split("SEQ ID NO:")[1].split(".")[0].strip()
            seqID = seqID.split(",")[0].strip()
        else:
            seqID = sentence.split("(SEQ ID NO:")[1].split(").")[0].strip()

        self.extractedSeqID = seqID
        return self.extractedSeqID

    #Put Seq ID into dict
    def seqIDtoDict(self):
        """Copy the extracted sequence identifier into the result dictionary."""
        self.epitopeDictionary["seqID"] = self.extractedSeqID

    #Fill the dictionary
    def fillEpitopeDict(self):
        """Convert the tokens in ``self.words`` into integer residue positions.

        Recognised tokens are plain numbers ("237"), numbers with a one-letter
        prefix ("E10"), and inclusive ranges of either ("123-132"). Commas and
        surrounding ``; : . ( ) [ ]`` are ignored, and every other token is
        skipped. The position list is rebuilt on every call, so calling the
        method again does not duplicate entries.
        """
        positions = []
        for rawToken in self.words:
            token = rawToken.replace(',', '').strip(';:.()[]')
            match = self._RESIDUE_TOKEN.fullmatch(token)
            if match is None:
                # Connecting words and other non-residue tokens.
                continue
            first = int(match.group(1))
            last = first if match.group(2) is None else int(match.group(2))
            positions.extend(range(first, last + 1))
        self.epitopeDictionary["residuesPositionsMarked"] = positions

    #Write epitope dictionary to json file
    def epitopeDictToJson(self):
        """Write the result dictionary as JSON to ``self.outputJson``."""
        with open(self.outputJson, 'w') as outputJson:
            json.dump(self.epitopeDictionary, outputJson)
    

In [213]:
# Extractor for patent US9574011 (antigen TFPI); the patent text is read from US9574011.json.
evaluatedPatent011 = EpitopeExtractRegex("TFPI", "US9574011", "US9574011.json")

In [214]:
# Read the patent file as one raw string with newlines removed (the text is not parsed as JSON).
data011 = evaluatedPatent011.loadJson()

In [215]:
# Find the claim sentence listing the epitope residues; the result is the text form of the regex group tuples.
sentenceToParse011 = evaluatedPatent011.findRelevantSentence()

In [216]:
# Display the matched claim text for inspection.
sentenceToParse011

'(\'": "",   "Epitope description 1": "The invention claimed is: A monoclonal antibody that specifically binds the K2 domain (Kunitz domain 2) of TFPI, wherein said antibody specifically binds an epitope comprising residues E10, E11, D12, P13, R17, Y19, T21, Y23, F24, N26, Q28, Q31, C32, E33, R34, K36, and L50 of SEQ ID NO: 2.\', \' A monoclonal antibody that specifically binds the K2 domain (Kunitz domain 2) of TFPI, wherein said \')'

In [217]:
# Tokenize the residue list (text between "residues" and "SEQ ID") into whitespace-separated words.
ourString011 = evaluatedPatent011.extractWords()

In [218]:
# Second call to extractWords(): it recomputes the same token list as the previous cell.
words011 = evaluatedPatent011.extractWords()

In [219]:
# Read the sequence identifier that follows "SEQ ID NO:" in the matched sentence.
extractedSeqID011 = evaluatedPatent011.extractSeqID()

In [220]:
# Display the extracted sequence identifier (a string).
extractedSeqID011

'2'

In [221]:
# Store the extracted sequence identifier under the "seqID" key of the result dictionary.
evaluatedPatent011.seqIDtoDict()

In [222]:
# Convert the residue tokens into integer positions under "residuesPositionsMarked".
evaluatedPatent011.fillEpitopeDict()

In [223]:
# Display the assembled result dictionary.
evaluatedPatent011.epitopeDictionary

{'antigen': 'TFPI',
 'patentID': 'US9574011',
 'residuesPositionsMarked': [10,
  11,
  12,
  13,
  17,
  19,
  21,
  23,
  24,
  26,
  28,
  31,
  32,
  33,
  34,
  36,
  50],
 'seqID': '2'}

In [224]:
# Write the result dictionary to extractedUS9574011.json in the working directory.
evaluatedPatent011.epitopeDictToJson()

In [225]:
# Full extraction pipeline for patent US8829165 (antigen PCSK9).
# Load the raw patent text and find the claim sentence that lists the residues.
evaluatedPatent165 = EpitopeExtractRegex("PCSK9", "US8829165", "US8829165.json")
data165 = evaluatedPatent165.loadJson()
sentenceToParse165 = evaluatedPatent165.findRelevantSentence()
print(sentenceToParse165)


# Tokenize the residue list; extractWords() is called twice and recomputes the same tokens.
ourString165 = evaluatedPatent165.extractWords()
words165 = evaluatedPatent165.extractWords()
# Sequence identifier that follows "SEQ ID NO:".
extractedSeqID165 = evaluatedPatent165.extractSeqID()
print(extractedSeqID165)

# Fill the result dictionary with the SEQ ID and the integer residue positions.
evaluatedPatent165.seqIDtoDict()
evaluatedPatent165.fillEpitopeDict()
print(evaluatedPatent165.epitopeDictionary)

# Write the result to extractedUS8829165.json.
evaluatedPatent165.epitopeDictToJson()

('25"}}},"us-claim-statement":{"_text":"What is claimed is:"},"claims":{"_attributes":{"id":"claims"},"claim":[{"_attributes":{"id":"CLM-00001","num":"00001"},"claim-text":{"_text":"1. An isolated monoclonal antibody, wherein, when bound to PCSK9, the monoclonal antibody binds to at least one of the following residues: S153, I154, P155, R194, D238, A239, I369, S372, D374, C375, T377, C378, F379, V380, or S381 of SEQ ID NO:3, and wherein the monoclonal antibody blocks binding of PCSK9 to LDLR."}},{"_attributes":{"id":"CLM-00002","num":"00002"},"claim-text":{"_text":["2. The isolated monoclonal antibody of ",", wherein the monoclonal antibody binds to at least S153."],"claim-ref":{"_attributes":{"idref":"CLM-00001"},"_text":"claim 1"}}},{"_attributes":{"id":"CLM-00003","num":"00003"},"claim-text":{"_text":["3. The isolated monoclonal antibody of ",", wherein the monoclonal antibody binds to at least I154."],"claim-ref":{"_attributes":{"idref":"CLM-00001"},"_text":"claim 1"}}},{"_attribut

In [226]:
# Full extraction pipeline for patent US8859741 (antigen PCSK9).
# Load the raw patent text and find the claim sentence that lists the residues.
evaluatedPatent741 = EpitopeExtractRegex("PCSK9", "US8859741", "US8859741.json")
data741 = evaluatedPatent741.loadJson()
sentenceToParse741 = evaluatedPatent741.findRelevantSentence()
print(sentenceToParse741)


# Tokenize the residue list; extractWords() is called twice and recomputes the same tokens.
ourString741 = evaluatedPatent741.extractWords()
words741 = evaluatedPatent741.extractWords()
# Sequence identifier that follows "SEQ ID NO:".
extractedSeqID741 = evaluatedPatent741.extractSeqID()
print(extractedSeqID741)

# Fill the result dictionary with the SEQ ID and the integer residue positions.
evaluatedPatent741.seqIDtoDict()
evaluatedPatent741.fillEpitopeDict()
print(evaluatedPatent741.epitopeDictionary)

# Write the result to extractedUS8859741.json.
evaluatedPatent741.epitopeDictToJson()

('": "",   "Date of Patent": "",   "Epitope description 1": "What is claimed is: 1. An isolated monoclonal antibody that binds to PCSK9, wherein the isolated monoclonal antibody binds an epitope on PCSK9 comprising at least one of residues 237 or 238 of SEQ ID NO: 3, and wherein the monoclonal antibody blocks binding of PCSK9 to LDLR.', ' 1. An isolated monoclonal antibody that binds to PCSK9, wherein the isolated monoclonal ', ' PCSK9 ')
3
{'antigen': 'PCSK9', 'patentID': 'US8859741', 'residuesPositionsMarked': [237, 238], 'seqID': '3'}


In [227]:
# Full extraction pipeline for patent US8563698 (antigen PCSK9).
# Load the raw patent text and find the claim sentence that lists the residues.
evaluatedPatent698 = EpitopeExtractRegex("PCSK9", "US8563698", "US8563698.json")
data698 = evaluatedPatent698.loadJson()
sentenceToParse698 = evaluatedPatent698.findRelevantSentence()
print(sentenceToParse698)


# Tokenize the residue list; extractWords() is called twice and recomputes the same tokens.
ourString698 = evaluatedPatent698.extractWords()
words698 = evaluatedPatent698.extractWords()
# Sequence identifier that follows "SEQ ID NO:".
extractedSeqID698 = evaluatedPatent698.extractSeqID()
print(extractedSeqID698)

# Fill the result dictionary; a token such as "123-132" is expanded to every position in the inclusive range.
evaluatedPatent698.seqIDtoDict()
evaluatedPatent698.fillEpitopeDict()
print(evaluatedPatent698.epitopeDictionary)

# Write the result to extractedUS8563698.json.
evaluatedPatent698.epitopeDictToJson()

('": "",   "Date of Patent": "",   "Epitope description 1": "What is claimed is: 1. An isolated monoclonal antibody, wherein, when bound to PCSK9, said monoclonal antibody binds to at least one residue within the sequence set forth by residues 123-132 of SEQ ID NO: 1, and wherein said monoclonal antibody reduces binding between PCSK9 and an EGFa domain of LDLR protein antagonizes PCSK9\'s inhibition of cellular LDL uptake.', ' 1. An isolated monoclonal antibody, wherein, when bound to PCSK9, said monoclonal ')
1
{'antigen': 'PCSK9', 'patentID': 'US8563698', 'residuesPositionsMarked': [123, 124, 125, 126, 127, 128, 129, 130, 131, 132], 'seqID': '1'}


In [228]:
# Full extraction pipeline for patent US10023654 (antigen PCSK9).
# Load the raw patent text and find the claim sentence that lists the residues.
evaluatedPatent654 = EpitopeExtractRegex("PCSK9", "US10023654", "US10023654.json")
data654 = evaluatedPatent654.loadJson()
sentenceToParse654 = evaluatedPatent654.findRelevantSentence()
print(sentenceToParse654)

# Tokenize the residue list; extractWords() is called twice and recomputes the same tokens.
ourString654 = evaluatedPatent654.extractWords()
words654 = evaluatedPatent654.extractWords()

# Sequence identifier that follows "SEQ ID NO:".
extractedSeqID654 = evaluatedPatent654.extractSeqID()
print(extractedSeqID654)

# Fill the result dictionary with the SEQ ID and the integer residue positions.
evaluatedPatent654.seqIDtoDict()
evaluatedPatent654.fillEpitopeDict()
print(evaluatedPatent654.epitopeDictionary)

# Write the result to extractedUS10023654.json.
evaluatedPatent654.epitopeDictToJson()

('": "",   "Epitope description 1": "We claim: 1. An antibody or antigen-binding fragment of an antibody that specifically binds hPCSK9, comprising: a) a heavy chain complementarity determining region 1 (HCDR1) comprising an amino acid sequence that is at least 95% identical to SEQ ID NO: 76; b) a heavy chain complementarity determining region 2 (HCDR2) comprising an amino acid sequence that is at least 95% identical to SEQ ID NO: 78; c) a heavy chain complementarity determining region 3 (HCDR3) comprising an amino acid sequence that is at least 95% identical to SEQ ID NO: 80; d) a light chain complementarity determining region 1 (LCDR1) comprising an amino acid sequence that is at least 95% identical to SEQ ID NO: 84; e) a light chain complementarity determining region 2 (LCDR2) comprising an amino acid sequence that is at least 95% identical to SEQ ID NO: 86; and f) a light chain complementarity determining region 3 (LCDR3) comprising an amino acid sequence that is at least 95% ident