In [14]:
import re,urllib.request,os,pickle,datetime,sys
from bs4 import BeautifulSoup
    

def get_patent_info(patent):
    patent_lines = download_patent_data(patent)
    return parse_patent_lines(patent,patent_lines)

def download_patent_data(patent):
    patent = get_canonical_name(patent) 
    curr_url = "http://patft1.uspto.gov/netacgi/nph-Parser?patentnumber="+patent
    print(curr_url)
    user_agent = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
    req = urllib.request.Request(
    curr_url, 
    data=None, 
    headers=user_agent
    )
    first = urllib.request.urlopen(req)
    first_lines = first.read().decode('utf-8').split('\n')
    #print(first_lines)
    refresh_line = [line for line in first_lines if "REFRESH" in line][0]
    refresh_url = "http://patft1.uspto.gov"+re.match('.*?URL=(.*?)">',refresh_line).group(1)
    #print(refresh_url)
    patent_req = urllib.request.Request(refresh_url, data=None, headers=user_agent)
    patent_connection = urllib.request.urlopen(patent_req)
    patent_lines = patent_connection.read().decode('utf-8')
    #print(patent_lines)
    return patent_lines
    
def parse_patent_lines(patent,patent_lines):
    patent = get_canonical_name(patent)
    ret_dict = {}
    print (patent_lines)
    soup = BeautifulSoup(patent_lines, 'html.parser')
    grant_index = patent_lines.index('<TD ALIGN="RIGHT" WIDTH="50%"> <B>\n')+1
    grant_date = patent_lines[grant_index].strip()
    file_index = patent_lines.index('  <TR><TD VALIGN="TOP" ALIGN="LEFT" WIDTH="10%">Filed:\n')+2
    file_line = patent_lines[file_index]
    file_date = re.match(".*?<B>(.*?)</B>",file_line).group(1)
    summary_index = grant_index+5
    summary_end_index = patent_lines.index('</font><BR>\n',summary_index)
    summary = patent_lines[summary_index][16:].strip()
    for i in range(summary_index+1,summary_end_index):
        summary += " "+patent_lines[i].strip()
    related_patents_header = '<HR> <CENTER><B>Related U.S. Patent Documents</B></CENTER> <HR> <TABLE WIDTH="100%"> <TR><TD WIDTH="7%"></TD><TD></TD><TD></TD><TD></TD><TD></TD><TD></TD></TR> <TR><TD align="left">\n'
    patent_case_header = "<CENTER><B><I>Parent Case Text</B></I></CENTER>\n"
    if patent_case_header in patent_lines:
        patent_case_index = patent_lines.index(patent_case_header)
        patent_case_end = patent_lines.index("<HR>\n",patent_case_index+2)
        patent_case_text = (" ".join([x.lstrip() for x in patent_lines[patent_case_index+2:patent_case_end]])).replace("<BR><BR>","")
        ret_dict["patent_case_text"] = patent_case_text
    if related_patents_header in patent_lines:
        related_index = patent_lines.index(related_patents_header)
        related_list = re.split("</TR><TR>",patent_lines[related_index+1])[1:-1]
        related_parsed = [[re.match("<TD.*>(.*)",x).group(1) for x in re.split("</TD>",y)[1:5]] for y in related_list]
        related_info = [{"application_number":x[0],"filing_date": find_related_exact_date(ret_dict,x[1]),"patent_number":x[2],"issue_date":x[3]} for x in related_parsed]
        related_info = [keep_non_empty(x) for x in related_info]
        ret_dict["related_info"] = related_info
    pct_file_header = '  <TR><TD VALIGN="TOP" ALIGN="LEFT" WIDTH="10%">PCT Filed:\n'
    if pct_file_header in patent_lines:
        pct_file_index = patent_lines.index(pct_file_header)
        pct_date = patent_lines[pct_file_index+2].strip()
        ret_dict["pct_file_date"] = pct_date
    ret_dict.update({"patent":patent,"file_date":file_date,
                     "grant_date":grant_date,"summary":summary})
    terminal_disclaimer_header = '</TR>  <TR><TD VALIGN="TOP" ALIGN="LEFT" WIDTH="10%" NOWRAP><B>[*]</B> Notice: </TD>\n'
    if terminal_disclaimer_header in patent_lines:
        terminal_disclaimer_index = patent_lines.index(terminal_disclaimer_header)
        terminal_disclaimer_date = " ".join(patent_lines[terminal_disclaimer_index+1].split()[-3:])
        ret_dict["terminal_disclaimer_date"] = terminal_disclaimer_date
    if patent.lower().startswith("re"):
        reissue_index = -1
        for i in xrange(len(patent_lines)):
            if "Reissue of:" in patent_lines[i]:
                reissue_index = i
                break
        if reissue_index > 0:
            reissue_line = patent_lines[reissue_index]
            reissue_tail = re.match(".*?Reissue of:(.*)",reissue_line).group(1)
            reissue_parts = reissue_tail.replace("<TD align=center>","").split("</TD>")
            orig_file_date,orig_patent,orig_grant_date = reissue_parts[2:5]
            ret_dict.update({"orig_patent":orig_patent,"orig_file_date":orig_file_date,"orig_grant_date":orig_grant_date})
        return ret_dict
        #print patent,file_date,grant_date,orig_patent,orig_file_date,orig_grant_date,"--",repr(summary)
    else:        
        return ret_dict
    #print patent,file_date,grant_date,"--",repr(summary)

month_names = ['', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
mon_names = [name[:3].lower() for name in month_names]

def get_numeric_date(date):
    #Could be two formats: "December 9, 1994" or "Mar., 1993"
    list = date.split()
    month = list[0][:3].lower()
    year = int(list[-1])
    if len(list) == 3:
        day = int(list[1].strip(",")) 
    else:
        day = 32
    month = mon_names.index(month)
    return [year,month,day]

#20 year term start date is first of related file date, pct file date or file date.
# If filed  or pct filed before June 8, 1995, 17 years from grant date or 20 years from 20 year term start
#After June 8, 1995, 20 years from first file date.
#If a reissued patent, Use original patent date, not final date.
#If it has a terminal disclaimer, use that instead of calculated date.  
#XXX: Note that this does not handle the possibility that the patent office 
# might have delayed granting the application and it could be extended because of that.
#XXX: Note that if maintenance fees are not paid, the patent might expire early
 
def get_patent_expiration(patent_info):
    reason = ""
    if patent_info.has_key("orig_patent"):
        filed = patent_info["orig_file_date"]
        granted = patent_info["orig_grant_date"]
    else:
        filed = patent_info["file_date"]
        granted = patent_info["grant_date"]
    file_date = get_numeric_date(filed)
    grant_date = get_numeric_date(granted)
    file_plus = file_date[:]
    file_plus[0] += 20
    reason += "file+20: "+str(file_plus)    
    if "related_info" in patent_info and len(patent_info["related_info"]) > 0:
        related_file = sorted([get_numeric_date(x["filing_date"]) 
                               for x in patent_info["related_info"] if "filing_date" in x])
        if len(related_file) > 0:
            related_file_plus = related_file[0]
            related_file_plus[0] += 20
            file_plus = min(related_file_plus,file_plus)
            reason += " related_patent+20:"+str(related_file_plus)
    if "pct_file_date" in patent_info:
        pct_file_date = get_numeric_date(patent_info["pct_file_date"])
        pct_file_plus = pct_file_date[:]
        pct_file_plus[0] += 20
        file_plus = min(file_plus,pct_file_plus)
        reason += " pct_file+20:"+str(pct_file_plus)
    if "override_reason" in patent_info:
        reason += " Override: "+patent_info["override_reason"]
    if file_date < [1995,6,8] or ("pct_file_date" in patent_info and pct_file_date < [1995,6,8]):
        grant_plus = grant_date
        grant_plus[0] += 17
        reason += " grant+17:"+str(grant_plus)
        max_date = max(file_plus,grant_plus)
        max_date,reason = adjust_expiration_date(patent_info,max_date,reason)
        return max_date,reason
    else:
        file_plus,reason = adjust_expiration_date(patent_info,file_plus,reason)
        return file_plus,reason

def adjust_expiration_date(patent,date,reason):
    """Adjusts date based on terminal disclaimers and term extensions"""
    if "terminal_disclaimer_date" in patent:
        new_date = get_numeric_date(patent["terminal_disclaimer_date"])
        reason += " terminal disclaimer date "+patent["terminal_disclaimer_date"]
        return new_date,reason
    if "term_extension" in patent:
        term_extension = patent["term_extension"]
        if term_extension >= 0:
            reason += " term extension "+str(term_extension)+" days "
        if term_extension > 0:
            delta = datetime.timedelta(term_extension)
            if date[2] == 32:
                #If exact date is unknown, it is set to 32
                date = [date[0],date[1],1]
            date_o = datetime.date(date[0],date[1],date[2])
            date_adjusted = date_o + delta
            new_date = [date_adjusted.year,date_adjusted.month,date_adjusted.day]
            return new_date,reason
    return date,reason

def find_related_exact_date(patent,date):
    if len(date) < 4:
        #Bad date, filtered out elsewhere
        return date
    numeric_date = get_numeric_date(date)
    if "patent_case_text" in patent:
        month = mon_names[numeric_date[1]]
        year = str(numeric_date[0])
        case_text = patent["patent_case_text"]
        dates = re.findall(month+".?\s+(\d{1,2}),\s+"+year,case_text.lower())+\
            re.findall("(\d{1,2})\s+"+month+".\s+"+year,case_text.lower())
        if len(dates) == 1:
            #"December 9, 1994"
            month = month_names[numeric_date[1]]
            new_day = dates[0]
            return month+" "+new_day+", "+year
    return date


def get_first_date(patent_info):
    """ Calculates the first relevent date for patent prior art"""
    if patent_info.has_key("orig_patent"):
        filed = patent_info["orig_file_date"]
    else:
        filed = patent_info["file_date"]
    file_date = get_numeric_date(filed)
    first_date = file_date
    if "related_info" in patent_info and len(patent_info["related_info"]) > 0:
        related_file = sorted([get_numeric_date(x["filing_date"]) 
                               for x in patent_info["related_info"] if "filing_date" in x])
        if len(related_file) > 0:
            related_file = related_file[0]
            first_date = min(related_file,first_date)
    if "pct_file_date" in patent_info:
        pct_file_date = get_numeric_date(patent_info["pct_file_date"])
        first_date = min(pct_file_date,first_date)
    return first_date
    

def get_canonical_name(patent):
    if patent.startswith("0"):
        patent = patent[1:]
    patent = patent.replace(",","")
    return patent

def keep_non_empty(dic):
    new_dict = {}
    for key in dic:
        if dic[key] and dic[key] != '':
            new_dict[key] = dic[key]
    return new_dict

def get_patent_term_extension(patent):
    patent = get_canonical_name(patent)
    if CACHE_FILES:
        if patent in term_extension_dict:
            return term_extension_dict[patent]
    req = urllib.request.Request("http://www.google.com/patents?as_pnum="+patent, None, {'User-agent': 'Patent Reader'})
    #opener.addheaders = [('User-agent', 'Patent Reader')]
    #print >> sys.stderr,"Opening "+"http://www.google.com/patents?as_pnum="+patent
    search_data = None                                                                             
    with urllib.request.urlopen(req) as response:
        search_data = response.read()
                                                                                            
    #google_search = opener.open("http://www.google.com/patents?as_pnum="+patent)
    #search_data = google_search.read()
    patent_url = re.search("<a href=\"(http://www.google.com/patents/about\?id=.*?)\">",search_data).group(1)
    google_id = re.search("id=(.*?)&",patent_url).group(1)
    #print >> sys.stderr,"Opening "+"http://www.google.com/patents?printsec=abstract&zoom=4&id="+google_id+"&output=text&pg=PA1"
    patent_text_file = opener.open("http://www.google.com/patents?printsec=abstract&zoom=4&id="+google_id+"&output=text&pg=PA1")
    patent_text = patent_text_file.read()
    patent_search = re.search("35 U.S.C. 154\(b\) by ([0-9]*) days.",patent_text)
    if patent_search:
        days = int(patent_search.group(1))
    else:
        days = -1
    if CACHE_FILES:
        term_extension_dict[patent] = days
        out_file = open(term_extension_file,"w")
        pickle.dump(term_extension_dict,out_file)
        out_file.close()
    return days

In [36]:
p_info = get_patent_info('8,357,371')

http://patft1.uspto.gov/netacgi/nph-Parser?patentnumber=8357371
<HTML>
<HEAD>
<BASE target="_top">
<TITLE>United States Patent: 8357371</TITLE></HEAD>
<!-BUF1=8357371
BUF7=2013
BUF8=546831
BUF9=/1/
BUF51=8
-->
<BODY bgcolor="#FFFFFF">
<A name="top"></A>
<CENTER>
<IMG src="/netaicon/PTO/patfthdr.gif" alt="[US Patent & Trademark Office, Patent Full Text and Image Database]">
<BR>
<TABLE>
<TR><TD align="center">
<A href="/netahtml/PTO/index.html"><IMG src="/netaicon/PTO/home.gif" alt="[Home]" border="0" valign="middle"></A>
<A href="/netahtml/PTO/search-bool.html"><IMG src="/netaicon/PTO/boolean.gif" alt="[Boolean Search]" border="0" valign="middle"></A>
<A href="/netahtml/PTO/search-adv.htm"><IMG border="0" src="/netaicon/PTO/manual.gif" ALT="[Manual Search]" valign="middle"></A>
<A href="/netahtml/PTO/srchnum.htm"><IMG src="/netaicon/PTO/number.gif" alt="[Number Search]" border="0" valign="middle"></A>
<A href="/netahtml/PTO/help/help.htm"><IMG border="0" valign="middle" src="/netaicon/

ValueError: substring not found

In [38]:
patent_lines = download_patent_data('8,357,371')

http://patft1.uspto.gov/netacgi/nph-Parser?patentnumber=8357371


In [39]:
soup = BeautifulSoup(patent_lines, 'html.parser')

In [40]:
tables = soup.findAll("table")

In [41]:
grant_date = tables[2].findAll('tr')[1].findAll('td')[1].text.strip()

In [49]:
fonts = soup.findAll('font')

In [55]:
title = fonts[3].text.strip()

'Methods for treating hypercholesterolemia using antibodies to PCSK9'

In [57]:
abstract = soup.findAll('p')[0].text.strip()

'The present invention provides methods for treating hypercholesterolemia.\n     The methods of the present invention comprise administering to a subject\n     in need thereof a therapeutic composition comprising an anti-PCSK9\n     antibody or antigen-binding fragment thereof.'

In [81]:
rows = soup.findAll('tr')
assignee = None
for r in rows:
    if r.th != None and 'Assignee:' in r.th: 
        assignee = r.th.findNext('td').text.strip()
        break
print(assignee)       

Regeneron Pharmaceuticals, Inc.
 (Tarrytown, 
NY)


In [143]:
center_tags = soup.findAll('center')
claim_header = None
idx = 0
for tag in center_tags:
    if 'Claims' in tag.text:
        claim_header = tag
        end_tag = center_tags[idx + 1]
        break
    idx += 1


In [127]:
print(claim_header.next_sibling.next.next.next.next.next.next.next.next.next
      
     )

<br/>


In [155]:
def extract_text_between_tags(curr_elem, last_tag, text):
    print(curr_elem)
    if curr_elem.string != None and '<' not in curr_elem:
        text += curr_elem.string.strip()
    if curr_elem.next == last_tag:
        return text
    else:
        return extract_text_between_tags(curr_elem.next, last_tag, text)

In [147]:
claims_text = ''
extract_text_between_tags(claim_header.next_sibling, end_tag, claims_text)

'What is claimed is:1.  A method for treating hypercholesterolemia comprising administering to a patient in need thereof a therapeutically effective amount of a pharmaceutical composition\ncomprising an antibody or antigen-binding fragment thereof which specifically binds hPCSK9, wherein the antibody or antigen-binding fragment comprises the heavy and light chain CDRs of a HCVR/LCVR amino acid sequence pair selected from the group\nconsisting of SEQ ID NOs:90/92 and 218/226.2.  The method of claim 1, wherein the antibody or antigen-binding fragment comprises heavy and light chain CDR amino acid sequences having SEQ ID NOs:220, 222, 224, 228, 230 and 232.3.  The method of claim 2, wherein the antibody or antigen-binding fragment comprises an HCVR having the amino acid sequence of SEQ ID NO:218 and an LCVR having the amino acid sequence of SEQ ID NO:226.4.  The method of claim 1, wherein the antibody or antigen-binding fragment comprises heavy and light chain CDR amino acid sequences hav

In [144]:
print(end_tag)

<center><b><i>Description</i></b></center>


In [149]:
start_tag = end_tag
end_tag = center_tags[idx + 2]

In [156]:
desc_text = ''
extract_text_between_tags(start_tag.next_sibling, end_tag, desc_text)



<hr/>


<center>
<a href="http://pdfpiw.uspto.gov/.piw?Docid=08357371&amp;homeurl=http%3A%2F%2Fpatft1.uspto.gov%2Fnetacgi%2Fnph-Parser%3FSect1%3DPTO1%2526Sect2%3DHITOFF%2526d%3DPALL%2526p%3D1%2526u%3D%25252Fnetahtml%25252FPTO%25252Fsrchnum.htm%2526r%3D1%2526f%3DG%2526l%3D50%2526s1%3D8357371.PN.%2526OS%3DPN%2F8357371%2526RS%3DPN%2F8357371&amp;PageNum=&amp;Rtype=&amp;SectionNum=&amp;idkey=NONE&amp;Input=View+first+page"><img alt="[Image]" border="0" src="/netaicon/PTO/image.gif" valign="middle"/></a>
<table>
<tr><td align="center"><a href="https://certifiedcopycenter.uspto.gov/other/patft/view.html?backUrl1=http%3A//patft1.uspto.gov/netacgi/nph-Parser?Sect1%3DPTO1%26Sect2%3DHITOFF%26d%3DPALL%26p%3D1%26u%3D%25252Fnetahtml%25252FPTO%25252Fsrchnum.htm%26r%3D1%26f%3DG%26l%3D50%26s1%3D8357371.PN.%26OS%3DPN%2F8357371&amp;backLabel1=Back%20to%20Document%3A%208357371"><img alt="[View Shopping Cart]" border="0" src="/netaicon/PTO/cart.gif" valign="m
iddle"/></a>
<a href="https://certifiedcopyce

AttributeError: 'NoneType' object has no attribute 'string'