# Patent Data Extraction

## Extract - Patent Data from PatBase API

PatBase is a patent search and analysis platform backed by a patent database with worldwide coverage.
PatBase offers a REST API (under a paid license) that can be used to extract patent data, e.g. priority, publication date, family members, patent status, legal status, assignee, inventor, etc.

PatBase REST API Documentation: http://www.patbase.com/rest/PatBaseRestAPI.pdf

Here I will extract several pieces of patent information with API calls to PatBase.
A license is required for both PatBase UI access and API requests.

In [None]:
# Dependencies
# Standard-library and third-party modules used by the cells below.
import re, json
import requests
import pandas as pd
import codecs
# Project-local config module expected to provide the PatBase credentials.
from config_patbase_2 import user_name, enter_key

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): presumably added to hide the TLS warnings caused by the
# verify=False requests below, but it hides all other warnings too - confirm.
warnings.filterwarnings('ignore')

In [None]:
# Access PatBase API using username and password.
# The credentials are handed over through ``params`` so that requests
# URL-encodes them; interpolating them into the URL by hand breaks the login as
# soon as the password contains a reserved character such as '&', '#', '+' or '%'.
login_url = "https://www.patbase.com/rest/api.php"
login_params = {"method": "login", "userid": user_name, "password": enter_key}

# NOTE(review): verify=False disables TLS certificate verification while the
# password is being sent. Kept as in the original - confirm it is really needed.
# The timeout keeps the cell from hanging forever on an unresponsive server.
session_response = requests.get(login_url, params=login_params, verify=False, timeout=30)
session_response.raise_for_status()

# Extract the cookies from the login response; later requests send them back
cookiejar = session_response.cookies.get_dict()
if 'SessionFarm_GUID' not in cookiejar:
    # Fail here with a clear message instead of a KeyError in a later cell.
    raise RuntimeError(
        "PatBase login did not return a 'SessionFarm_GUID' cookie; "
        "check the credentials and the API license."
    )

# Show the cookie names only: the cookie value is a live session token and
# would otherwise be stored in the notebook output.
print(sorted(cookiejar))

### GetMember
This method is used to get more information on a single member of a family, be it a publication or an
application document. This method requires authorization.
Usage

URL:
https://www.patbase.com/rest/api.php?method=getmember&pn=[Patent Number]&kd=[Kind Code*]&ft=[True/False*]&legal=[True/False*]

All values marked with * are optional, you do not need to include the values in the request if they
are unwanted. 

In [None]:
# Single-patent trial of the GetMember call (example: pn=IN245512),
# authenticated with the session cookie obtained at login.
url = 'https://www.patbase.com/rest/api.php?method=getmember&pn=IN245512'

requests_jar = requests.cookies.RequestsCookieJar()
requests_jar.set(name='SessionFarm_GUID', value=cookiejar['SessionFarm_GUID'])

response = requests.get(url, cookies=requests_jar)

# Round-trip the body through bytes and decode it as 'utf-8-sig' so that a
# leading byte-order mark, if present, is dropped before JSON parsing.
decoded_response = response.text.encode().decode('utf-8-sig')
data = json.loads(decoded_response)

# Pretty-print the parsed payload for inspection
print(json.dumps(data, indent=4))

In [None]:
# Pull the headline fields out of the GetMember payload: the family number and
# the PN / AN / KD entries of the first listed publication.
patbase_family_number = data["Family"]

first_publication = data["Publications"][0]
patent_number = first_publication["PN"]
application_number = first_publication["AN"]
kind_code = first_publication["KD"]

# One value per line, in the order: family, patent no., application no., kind code
for value in (patbase_family_number, patent_number, application_number, kind_code):
    print(value)

In [None]:
# Patent/application numbers to look up
patent_list = ['JP3174742', 'JP3215114', 'JP3236849']

requests_jar = requests.cookies.RequestsCookieJar()
requests_jar.set(name='SessionFarm_GUID', value=cookiejar['SessionFarm_GUID'])

# One GetMember request per patent. From each payload keep the PatBase family
# number plus the PN / AN / KD of the first listed publication.
family_number_list, patent_number_list = [], []
application_number_list, kind_code_list = [], []

for patent in patent_list:
    url = f'https://www.patbase.com/rest/api.php?method=getmember&pn={patent}'
    response = requests.get(url, cookies=requests_jar)

    # Decode as 'utf-8-sig' so a leading byte-order mark is dropped before parsing
    decoded_response = response.text.encode().decode('utf-8-sig')
    data = json.loads(decoded_response)

    family_number_list.append(data["Family"])
    first_publication = data["Publications"][0]
    patent_number_list.append(first_publication["PN"])
    application_number_list.append(first_publication["AN"])
    kind_code_list.append(first_publication["KD"])

# Show the four result lists, one per line
for collected in (family_number_list, patent_number_list, application_number_list, kind_code_list):
    print(collected)

In [None]:
# Single-patent trial of the GetMember call (example: pn=JP3174742),
# authenticated with the session cookie obtained at login.
url = 'https://www.patbase.com/rest/api.php?method=getmember&pn=JP3174742'

requests_jar = requests.cookies.RequestsCookieJar()
requests_jar.set(name='SessionFarm_GUID', value=cookiejar['SessionFarm_GUID'])

response = requests.get(url, cookies=requests_jar)

# Round-trip the body through bytes and decode it as 'utf-8-sig' so that a
# leading byte-order mark, if present, is dropped before JSON parsing.
decoded_response = response.text.encode().decode('utf-8-sig')
data = json.loads(decoded_response)

# Pretty-print the parsed payload for inspection
print(json.dumps(data, indent=4))

### Query

This method is used to run a query on the PatBase Database, authorization is required for this
method.

URL:
https://www.patbase.com/rest/api.php?method=query&query=[PatBaseQuery]


In [None]:
# Single-patent trial of the Query call: search PatBase for PN=JP3174742
# ('%3D' is the URL-encoded '='), authenticated with the login cookie.
q_url = 'https://www.patbase.com/rest/api.php?method=query&query=PN%3DJP3174742'

requests_jar = requests.cookies.RequestsCookieJar()
requests_jar.set(name='SessionFarm_GUID', value=cookiejar['SessionFarm_GUID'])

q_response = requests.get(q_url, cookies=requests_jar, verify=False)

# Decode as 'utf-8-sig' so a leading byte-order mark is dropped before parsing
decoded_q_response = q_response.text.encode().decode('utf-8-sig')
q_results = json.loads(decoded_q_response)

# Pretty-print the parsed payload for inspection
print(json.dumps(q_results, indent=4))

### SearchResultsBIB

This method is used to get the bibliographic information of a search result. This method requires
authorization.

URL:
https://www.patbase.com/rest/api.php?method=searchresultsbib&querykey=[QueryKey]&sortorder=[sortorder*]&from=[from*]&to=[to*]

All values marked with * are optional, you do not need to include the values in the request if they
are unwanted. 

In [None]:
# Single-patent trial of the SearchResultsBIB call: the 'QueryKey' returned by
# the previous Query call selects the result set whose bibliographic data is
# fetched here.
query_key = q_results["QueryKey"]
bib_url = f'https://www.patbase.com/rest/api.php?method=searchresultsbib&querykey={query_key}'

requests_jar = requests.cookies.RequestsCookieJar()
requests_jar.set(name='SessionFarm_GUID', value=cookiejar['SessionFarm_GUID'])

bib_response = requests.get(bib_url, cookies=requests_jar, verify=False)

# Decode as 'utf-8-sig' so a leading byte-order mark is dropped before parsing
decoded_bib_response = bib_response.text.encode().decode('utf-8-sig')
bib_results = json.loads(decoded_bib_response)

# Pretty-print the parsed payload for inspection
print(json.dumps(bib_results, indent=4))

In [None]:
# Test run: extract the data of the granted publication of JP3174742 only.
# Reads `q_results` (Query response) and `bib_results` (SearchResultsBIB
# response) produced by the previous cells.
total_results = int(q_results["Results"])

collect_patent_no = []
collect_kind_code = []
collect_application_no = []
collect_assignee = []

# Walk the families that were actually returned (capped at the reported hit
# count) instead of indexing 0..total_results-1: index-based access raises
# IndexError whenever the response carries fewer families than "Results" says.
families = bib_results["Families"][:total_results] if total_results > 0 else []

for bib_result in families:
    # Every family lists all of its publications; keep only the searched one.
    for publication in bib_result["Publications"]:
        # Granted patents are identified by a kind code that starts with 'B'.
        if publication["PN"] == 'JP3174742' and publication["KD"].startswith('B'):
            collect_patent_no.append(publication["PN"])
            collect_kind_code.append(publication["KD"])
            collect_application_no.append(publication["AN"])
            collect_assignee.append(bib_result['ProbableAssignee'])


print(collect_patent_no)
print(collect_kind_code)
print(collect_application_no)
print(collect_assignee)

### Patent Data Extraction with PatBase API
#### API Call Type: 'Query' and 'SearchResultsBIB'

In [None]:
# The patent/application numbers to request are stored in a CSV file.
# Every column is read as text; the header that carries a trailing space is
# renamed to its clean form.
patent_df = pd.read_csv("patent_list_to_request_KR_2.csv", dtype=str).rename(
    columns={"current_patent_number ": "current_patent_number"}
)

# Search key = value of the `country` column followed by the patent number
number_as_text = patent_df["current_patent_number"].astype(str)
patent_df["country_and_patent_number"] = patent_df["country"] + number_as_text

patent_list = patent_df["country_and_patent_number"].tolist()
print(patent_list)

In [None]:
# Number of patent/application numbers queued for the API requests
# (bare expression: the notebook displays its value as the cell output).
len(patent_list)

In [None]:
# CODE FOR SEARCHING PATENT NUMBER (PATENT HAS BEEN GRANTED)

requests_jar = requests.cookies.RequestsCookieJar()
requests_jar.set('SessionFarm_GUID', cookiejar['SessionFarm_GUID'])

api_url = 'https://www.patbase.com/rest/api.php'

# Two calls are made for every patent in `patent_list`:
#   1. 'Query'            -> number of matching records ("Results") and a "QueryKey"
#   2. 'SearchResultsBIB' -> the bibliographic data behind that "QueryKey"
#
# Outcome per patent:
#   * every matching granted publication adds one row to the lists below;
#   * a failed request/parse adds one placeholder row (patent number + None);
#   * a successful query without a matching granted publication adds no row.

# Lists that receive the extracted data
# (patent number, kind code, application number, PatBase family number, and Assignee)
collect_patent_no = []
collect_kind_code = []
collect_application_no = []
collect_family_no = []
collect_assignee = []

for patent in patent_list:

    try:
        # 'Query' call. params= lets requests URL-encode the value ('=' becomes
        # '%3D'), so a number containing reserved characters cannot corrupt the
        # URL. The timeout keeps one stuck request from blocking the batch.
        q_response = requests.get(
            api_url,
            params={'method': 'query', 'query': f'PN={patent}'},
            cookies=requests_jar,
            verify=False,
            timeout=60,
        )
        decoded_q_response = codecs.decode(q_response.text.encode(), 'utf-8-sig')
        q_results = json.loads(decoded_q_response)

        # The QueryKey gives access to the bibliographic data of the hits
        total_results = int(q_results["Results"])
        query_key = q_results["QueryKey"]

        bib_response = requests.get(
            api_url,
            params={'method': 'searchresultsbib', 'querykey': query_key},
            cookies=requests_jar,
            verify=False,
            timeout=60,
        )
        decoded_bib_response = codecs.decode(bib_response.text.encode(), 'utf-8-sig')
        bib_results = json.loads(decoded_bib_response)

        print(f"Running records for {patent}")

        # One patent number can have several records in PatBase. Walk the
        # families that were actually returned (capped at the reported hit
        # count) rather than indexing by the count, which raises IndexError
        # when fewer families come back than "Results" announces.
        families = bib_results["Families"][:total_results] if total_results > 0 else []

        for bib_result in families:

            # Each record lists all publications of the family, so keep only
            # the publication we searched for.
            for publication in bib_result["Publications"]:

                # Granted patent = kind code starts with 'B'
                if publication["PN"] == patent and publication["KD"].startswith('B'):
                    # Read all five fields BEFORE touching the result lists: a
                    # missing key must never leave the lists with different
                    # lengths (that would break the DataFrame construction).
                    record = (
                        publication["PN"],
                        publication["KD"],
                        publication["AN"],
                        bib_result["Family"],
                        bib_result['ProbableAssignee'],
                    )
                    print("Granted patent exists")
                    collect_patent_no.append(record[0])
                    collect_kind_code.append(record[1])
                    collect_application_no.append(record[2])
                    collect_family_no.append(record[3])
                    collect_assignee.append(record[4])
                    print("----------------------------------------")

    except Exception as error:
        # Best effort per patent: report the problem and keep a placeholder row.
        # `Exception` (not a bare except) so that Ctrl-C / kernel interrupt
        # still stops the batch.
        print(f"Request for {patent} failed: {error!r}")
        collect_patent_no.append(patent)
        collect_kind_code.append(None)
        collect_application_no.append(None)
        collect_family_no.append(None)
        collect_assignee.append(None)

print("PATENT DATA:")
print(collect_patent_no)
print(collect_kind_code)
print(collect_application_no)
print(collect_family_no)
print(collect_assignee)


In [None]:
# CODE FOR SEARCHING APPLICATION NUMBER (PATENT IS CURRENTLY AN APPLICATION / NOT GRANTED YET)

requests_jar = requests.cookies.RequestsCookieJar()
requests_jar.set('SessionFarm_GUID', cookiejar['SessionFarm_GUID'])

api_url = 'https://www.patbase.com/rest/api.php'

# Two calls are made for every number in `patent_list`:
#   1. 'Query'            -> number of matching records ("Results") and a "QueryKey"
#   2. 'SearchResultsBIB' -> the bibliographic data behind that "QueryKey"
#
# Outcome per number:
#   * every matching application publication adds one row to the lists below;
#   * a failed request/parse adds one placeholder row (number + None);
#   * a successful query without a matching application adds no row.

# Lists that receive the extracted data
# (patent number, kind code, application number, PatBase family number, and Assignee)
collect_patent_no = []
collect_kind_code = []
collect_application_no = []
collect_family_no = []
collect_assignee = []

for patent in patent_list:

    try:
        # 'Query' call. params= lets requests URL-encode the value ('=' becomes
        # '%3D'), so a number containing reserved characters cannot corrupt the
        # URL. The timeout keeps one stuck request from blocking the batch.
        q_response = requests.get(
            api_url,
            params={'method': 'query', 'query': f'AN={patent}'},
            cookies=requests_jar,
            verify=False,
            timeout=60,
        )
        decoded_q_response = codecs.decode(q_response.text.encode(), 'utf-8-sig')
        q_results = json.loads(decoded_q_response)

        # The QueryKey gives access to the bibliographic data of the hits
        total_results = int(q_results["Results"])
        query_key = q_results["QueryKey"]

        bib_response = requests.get(
            api_url,
            params={'method': 'searchresultsbib', 'querykey': query_key},
            cookies=requests_jar,
            verify=False,
            timeout=60,
        )
        decoded_bib_response = codecs.decode(bib_response.text.encode(), 'utf-8-sig')
        bib_results = json.loads(decoded_bib_response)

        print(f"Running records for {patent}")

        # One application number can have several records in PatBase. Walk the
        # families that were actually returned (capped at the reported hit
        # count) rather than indexing by the count, which raises IndexError
        # when fewer families come back than "Results" announces.
        families = bib_results["Families"][:total_results] if total_results > 0 else []

        for bib_result in families:

            # Each record lists all publications of the family, so keep only
            # the application we searched for.
            for publication in bib_result["Publications"]:

                # Application = kind code starts with 'A'.
                # NOTE: some countries might use a different kind code for
                # applications (a 'C' check was tried here earlier).
                if publication["AN"] == patent and publication["KD"].startswith('A'):
                    # Read all five fields BEFORE touching the result lists: a
                    # missing key must never leave the lists with different
                    # lengths (that would break the DataFrame construction).
                    record = (
                        publication["PN"],
                        publication["KD"],
                        publication["AN"],
                        bib_result["Family"],
                        bib_result['ProbableAssignee'],
                    )
                    print("Patent Application exists")
                    collect_patent_no.append(record[0])
                    collect_kind_code.append(record[1])
                    collect_application_no.append(record[2])
                    collect_family_no.append(record[3])
                    collect_assignee.append(record[4])
                    print("----------------------------------------")

    except Exception as error:
        # Best effort per number: report the problem and keep a placeholder row.
        # `Exception` (not a bare except) so that Ctrl-C / kernel interrupt
        # still stops the batch.
        print(f"Request for {patent} failed: {error!r}")
        collect_patent_no.append(patent)
        collect_kind_code.append(None)
        collect_application_no.append(None)
        collect_family_no.append(None)
        collect_assignee.append(None)

print("PATENT DATA:")
print(collect_patent_no)
print(collect_kind_code)
print(collect_application_no)
print(collect_family_no)
print(collect_assignee)


In [None]:
# Sanity check before building the DataFrame: print the length of each of the
# five result lists (patent no., kind code, application no., family no., assignee).
for collected in (
    collect_patent_no,
    collect_kind_code,
    collect_application_no,
    collect_family_no,
    collect_assignee,
):
    print(len(collected))

In [None]:
# Put the collected patent data into a pandas DataFrame, one column per list.
# The mapping is deliberately NOT named `dict`: that would shadow the builtin
# `dict` type for every cell executed afterwards.
patent_data_columns = {
    'patent_number': collect_patent_no,
    'kind_code': collect_kind_code,
    'application_number': collect_application_no,
    'patbase_family_number': collect_family_no,
    'assignee': collect_assignee,
}

collected_patent_data = pd.DataFrame(patent_data_columns)
# Bare expression: the notebook renders the DataFrame as the cell output
collected_patent_data

In [None]:
# Save the extracted data to an Excel workbook as a backup
# (column headers are written, the row index is left out).
backup_path = "patbase_patent_data_extraction_13.xlsx"
collected_patent_data.to_excel(backup_path, index=False, header=True)