In [84]:
import json
import re
import sys

In [85]:
#Extract epitope residues 
class EpitopeExtractRegex:
    """Extract epitope residue positions from a patent claim sentence.

    The workflow is a sequence of method calls, each storing its result on the
    instance for the next step:

    1. ``loadJson``             -> ``self.data``
    2. ``findRelevantSentence`` -> ``self.sentenceToEvaluate``
    3. ``extractWords``         -> ``self.words``
    4. ``fillEpitopeDict``      -> ``self.epitopeDictionary["residuesPositionsMarked"]``
    5. ``epitopeDictToJson``    -> file named ``self.outputJson``
    """

    def __init__(self, antigen, patentID, patentJson):
        """Store the inputs and prepare the result dictionary and regex.

        Args:
            antigen: Antigen label stored under the "antigen" key of the result.
            patentID: Patent identifier; stored in the result and used to build
                the output file name ("extracted" + patentID + ".json").
            patentJson: Path of the patent file to read in ``loadJson``.
        """
        self.antigen = antigen
        self.patentID = patentID
        self.patentJson = patentJson
        self.keyList = ["antigen","patentID","residuesPositionsMarked"]
        self.epitopeDictionary = dict.fromkeys(self.keyList)
        self.epitopeDictionary["antigen"] = antigen
        self.epitopeDictionary["patentID"] = patentID
        self.epitopeDictionary["residuesPositionsMarked"] = []
        # Matches one sentence (text up to the next '.') that contains the
        # fixed claim wording; the wording is hard-coded for PCSK9.
        self.bindingString = r'''([^.]*?An isolated monoclonal antibody, wherein, when bound to PCSK9, the monoclonal antibody binds to at least one of the following residues:[^.]*\.)'''
        self.bindingPattern = re.compile(self.bindingString)
        self.outputJson = ''.join(("extracted", patentID, ".json"))

    def loadJson(self):
        """Read the patent file and return its text with newlines removed.

        Returns:
            The file content as one string with every '\\n' stripped; also
            stored in ``self.data``.

        Raises:
            SystemExit: If the file cannot be opened (a message is printed
                first). Exit status is 1 because this is an error path.
        """
        try:
            self.jsonfile = open(self.patentJson)
        except OSError:
            print ("Could not open/read file:", self.patentJson)
            sys.exit(1)

        # The context manager closes the handle once the text has been read.
        with self.jsonfile:
            self.data = self.jsonfile.read().replace('\n', '')
        return self.data

    def findRelevantSentence(self):
        """Return every sentence in ``self.data`` matching the claim pattern.

        Requires ``loadJson`` to have been called (or ``self.data`` to be set).

        Returns:
            A list of matched sentences (empty when nothing matches); also
            stored in ``self.sentenceToEvaluate``.
        """
        self.sentenceToEvaluate = re.findall(self.bindingPattern, self.data)
        return self.sentenceToEvaluate

    def extractWords(self):
        """Split the residue list of the matched sentence into words.

        The text kept is what lies after the first "residues:" and before the
        following "SEQ ID" (or the next "residues:", whichever comes first).

        Returns:
            A list of whitespace-separated tokens, or an empty list when no
            sentence was matched; also stored in ``self.words``.
        """
        pieces = ''.join(self.sentenceToEvaluate).split("residues:")
        if len(pieces) < 2:
            # No sentence was found, so there is no residue list to tokenise.
            self.extractedString = ''
            self.words = []
            return self.words

        self.extractedString = pieces[1].split("SEQ ID")[0]
        self.words = self.extractedString.split()
        return self.words

    def fillEpitopeDict(self):
        """Store residue positions from ``self.words`` in the result dictionary.

        Commas are removed from each word. Words that are purely alphabetic
        (e.g. "or", "of") or purely numeric are skipped; every other word has
        its first character dropped (e.g. "S153" -> "153") and is recorded.

        The stored list is rebuilt on every call, so re-running this step does
        not duplicate entries.
        """
        positions = []
        for word in self.words:
            token = word.replace(',','')
            if not token.isalpha() and not token.isdigit():
                positions.append(token[1:])
        # Replace the contents in place so existing references to the list
        # stay valid while repeated calls remain idempotent.
        self.epitopeDictionary["residuesPositionsMarked"][:] = positions

    def epitopeDictToJson(self):
        """Write ``self.epitopeDictionary`` as JSON to ``self.outputJson``."""
        with open(self.outputJson, 'w') as outputJson:
            json.dump(self.epitopeDictionary, outputJson)
    

In [86]:
# Build an extractor for antigen "PCSK9" and patent "8829165", reading from "8829165.json".
evaluatedPatent = EpitopeExtractRegex("PCSK9", "8829165", "8829165.json")

In [87]:
# Read the patent file into one string; loadJson strips newline characters.
data = evaluatedPatent.loadJson()

In [88]:
# Regex-search the loaded text for the claim sentence that lists the bound residues.
sentenceToParse = evaluatedPatent.findRelevantSentence()

In [89]:
# Display the list of matched sentences.
sentenceToParse

[' An isolated monoclonal antibody, wherein, when bound to PCSK9, the monoclonal antibody binds to at least one of the following residues: S153, I154, P155, R194, D238, A239, I369, S372, D374, C375, T377, C378, F379, V380, or S381 of SEQ ID NO:3, and wherein the monoclonal antibody blocks binding of PCSK9 to LDLR.']

In [90]:
# Tokenise the text between "residues:" and "SEQ ID" into whitespace-separated words.
# Despite the name, the returned value is a list of words, not a string.
ourString = evaluatedPatent.extractWords()

In [91]:
# NOTE(review): same call as the previous cell, so this looks redundant — confirm before removing.
words = evaluatedPatent.extractWords()

In [92]:
# Populate epitopeDictionary["residuesPositionsMarked"] from the extracted words.
evaluatedPatent.fillEpitopeDict()

In [93]:
# Display the populated result dictionary.
evaluatedPatent.epitopeDictionary

{'antigen': 'PCSK9',
 'patentID': '8829165',
 'residuesPositionsMarked': ['153',
  '154',
  '155',
  '194',
  '238',
  '239',
  '369',
  '372',
  '374',
  '375',
  '377',
  '378',
  '379',
  '380',
  '381']}

In [94]:
# Write the result dictionary to "extracted8829165.json" (relative path, so the working directory).
evaluatedPatent.epitopeDictToJson()