Takes raw data from json file and extracts family number, application number & date, publication number & date, patent name & abstract, applicant & inventor name, IPC & CPC classes, family members, number of geographical extensions, and oldest and newest family publication. Processed data is returned in a CSV file. 

In [1]:
# Cell 1

import requests
import json
import os
import re
import time
import pickle
import Levenshtein
import pandas as pd
import datetime as dt
from IPython.display import display
from bs4 import BeautifulSoup

Sets the region. Use the country code as specified in [Espacenet](https://worldwide.espacenet.com/patent/help/countrycodes). The country code will automatically be added to the exported files' names.

In [2]:
# Cell 2

# region: country code; it is interpolated into every data path / file name below
region = "TN"

# first and last year of the period; both appear in the exported file names
startYear = 2017
endYear = 2023

# only the last two digits of the end year are kept (as a string), e.g. 2023 -> "23"
endYear = str(endYear)[-2:]

Defines functions to be used.

In [3]:
# Cell 3

# attempts to extract a sequence of one or more consecutive letters from the beginning of the string appNum. 
# if match is found, returns the identified substring; otherwise returns None.
# e.g. areaCodeRegex("EP3722443A1") returns "EP"

def areaCodeRegex(appNum):
    """Return the run of ASCII letters at the very start of ``appNum``.

    e.g. areaCodeRegex("EP3722443A1") returns "EP".

    Returns None when ``appNum`` does not begin with a letter (previously this
    case raised AttributeError because ``.group()`` was called on a failed match).
    """
    match = re.match(r"[A-Za-z]+", appNum)
    return match.group() if match else None


# extracts a string containing only json data (substring between the outermost curly brackets) from 'string'
# if match is found, returns the identified substring; otherwise returns None.
# e.g. extractJSON("lorem ipsum {'EP3722443A1':{'dolor':'sit'}} amet") returns "{'EP3722443A1':{'dolor':'sit'}}"

def extractJSON(string):
    """Return the text between the outermost curly brackets of ``string``.

    If ``string`` contains two "Content-Type: application/json" markers, only
    the section spanning the first two markers is searched, so the JSON body
    of the first part is returned; otherwise the whole string is searched.

    Returns None when no "{...}" span is found (previously this raised
    AttributeError, which also broke the error handlers that call this
    function again to print the extracted text).
    """
    section = re.search(r'Content-Type: application/json(.*?)Content-Type: application/json', string, re.DOTALL)

    # search inside the first JSON part when the markers are present, else everywhere
    haystack = section.group() if section else string

    # greedy match: from the first "{" to the last "}" of the searched text
    braces = re.search(r'{[\s\S]*}', haystack, re.DOTALL)
    return braces.group() if braces else None


# calculates the number of months between two dates formatted as "%Y-%m-%d"
# returns the difference in months
# e.g. calculateMonthDiff("2022-01-15", "2023-07-20") returns 18

def calculateMonthDiff(start_date, end_date):
    """Return the number of calendar months from ``start_date`` to ``end_date``.

    Both arguments are strings formatted as "%Y-%m-%d". Only year and month
    take part in the calculation (the day is ignored); the result is negative
    when ``end_date`` falls in an earlier month than ``start_date``.
    """
    date_format = "%Y-%m-%d"
    first = dt.datetime.strptime(start_date, date_format)
    last = dt.datetime.strptime(end_date, date_format)

    years_apart = last.year - first.year
    months_apart = last.month - first.month
    return years_apart * 12 + months_apart


# removes punctuation and spaces from string. Turns all letters lowercase.
# e.g. normaliseReference("Hello, World!") returns "helloworld"

def normaliseReference(string):
    """Return ``string`` lower-cased with punctuation and whitespace removed.

    e.g. normaliseReference("Hello, World!") returns "helloworld".
    Word characters as defined by the regex class ``\\w`` (letters, digits,
    underscore) are kept.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', string)
    without_whitespace = re.sub(r'\s+', '', without_punctuation)
    return without_whitespace.lower()


# returns a number between 0-1 indicating the similarity between 2 strings. 
# {1: exactly the same, 0: completely different}

def compareReferences(ref1, ref2):
    """Return a similarity score between 0 and 1 for two reference strings.

    The score is ``1 - distance / length of the longer string`` where distance
    is the Levenshtein edit distance: 1 means identical, 0 completely different.

    Returns 0 if either argument is None. Two empty strings count as identical
    (1.0); previously that case raised ZeroDivisionError.
    """
    if ref1 is None or ref2 is None:
        return 0

    longest = max(len(ref1), len(ref2))
    if longest == 0:
        # both strings are empty: nothing differs, and dividing by 0 must be avoided
        return 1.0

    distance = Levenshtein.distance(ref1, ref2)
    return 1 - (distance / longest)

Imports data from pickle file created in `importData.ipynb`.

In [4]:
# Cell 4

# "data" is a list that holds all the patents

# start from an empty list so that "data" is defined even before the file is read
data = []

# NOTE(review): pickle.load executes whatever the pickle contains; only load
# files produced by your own run of importData.ipynb.
with open(os.getcwd()+f'/data/{region}/rawData{region}.pickle', 'rb') as file:
    data = pickle.load(file)

print("\nDone.")


Done.


Sets up empty dataframe.

In [5]:
# Cell 5

# set up dataframe

# empty dataframe; one row per patent is appended later, in this column order
df = pd.DataFrame(
    columns=[
        # identifiers and dates
        "familyNumber",
        "applicationNumber",
        "applicationDate",
        "publicationNumber",
        "publicationDate",
        # descriptive text
        "patentName",
        "abstract",
        # people / organisations
        "applicant",
        "inventor",
        # classifications and their counts
        "IPC",
        "IPC_count",
        "CPC",
        "CPC_count",
        # family information
        "familyMembers (appNum)",
        "geographicalExtensions",
        "familyDates_min",
        "familyDates_max",
        "timeDifference",
    ]
)



Only run the following cell if loading a backup CSV file (e.g. if an error occurs).

In [6]:
# Cell 6

# df = pd.read_csv(os.getcwd()+f'/data/{region}/no_cit_backup_{region}.csv')

The following code iterates through the patents to extract the relevant information and appends it to the dataframe. 

In [7]:
# Cell 7

# variables to track progress

# Iterates through the raw patents in "data", extracts the bibliographic fields
# of each one and appends them as a new row of "df". A backup CSV is written
# every time the progress bar advances.


def _joinNamesWithRegions(names, regions):
    # builds e.g. "John (US); Paul (HK)"; a name without a matching region entry gets no suffix
    if names is None:
        return ""

    labelled = []
    for i, name in enumerate(names):
        suffix = ""
        if regions is not None and i < len(regions):
            suffix = " ("+regions[i]+")"
        labelled.append(name+suffix)

    return "; ".join(labelled)


# request headers for the direct Espacenet lookup (only used when a title or
# publication date is missing). Kept under their own names so that the
# "headers"/"payload"/"querystring" variables of later cells are not overwritten.

lookupPayload = ""
lookupHeaders = {
    "Accept": "multipart/form-data",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Cookie": "_pk_id.16.72ee=e201aa9064b4f0dd.1704417755.; _pk_ref.16.72ee=%5B%22%22%2C%22%22%2C1704421700%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; _pk_ses.16.72ee=1; SameSite=None",
    "EPO-Trace-Id": "iazwu4-e9z380-BBB-000005",
    "Referer": "https://worldwide.espacenet.com/patent/search/family/003734942/publication/ID16772A?q=ID16772A",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "X-EPO-PQL-Profile": "cpci",
    "sec-ch-ua": "\"Chromium\";v=\"104\", \"Not A;Brand\";v=\"99\", \"Google Chrome\";v=\"104\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\""
}

# pattern used to pull the abstract text out of its <p>...</p> wrapper
abstractPattern = r'<p.*?>(.*?)</p>'

# variables to track progress

count = 0
total = len(data)

# prints a baseline (full) progress bar for reference

print("|"*100)

# iterate through list "data"; each patent (data[q]) is represented by a dictionary containing the patent information

for q in range(len(data)):

    # code will start appending data to the end of the dataframe
    # (patents whose position is already covered by existing rows, e.g. rows loaded from a backup, are skipped)
    if q >= len(df):

        # the two sub-dictionaries every field below is read from
        hitFields = data[q]['hits'][0]['fields']
        biblio = data[q]['fields']['biblio'][0]

        # extract simple family number

        familyNumber = data[q]['familyNumber']

        # extract publication number and date

        publicationNumber = hitFields['publications.pn_docdb'][0]

        publicationDate, hasPubDate = "", False

        if hitFields.get('publications.pd'):
            publicationDate = hitFields['publications.pd'][0]
            hasPubDate = True

        # extract application number and date

        applicationNumber, applicationDate = "", ""

        for app in biblio:

            # identifies which application number in the family corresponds with the current publication
            if biblio[app].get('pn_docdb') is not None:
                if [publicationNumber] in list(biblio[app]['pn_docdb'].values()):
                    applicationNumber = app

        if hitFields.get('publications.app_fdate.untouched'):
            applicationDate = hitFields['publications.app_fdate.untouched'][0]

        # extract patent name

        patentName, patentAbstract = "", ""
        hasTitle = False

        if hitFields.get('publications.ti_en') is not None:

            patentName = hitFields['publications.ti_en'][0]
            # an English title counts as "title found"; without this flag every
            # patent with an English title triggered the web lookup below
            hasTitle = True

        else:

            # searches for title in different lang if no English title found
            for w in list(hitFields):
                if re.search(r'publications.ti', w):
                    hasTitle = True
                    patentName = hitFields[w][0]

        if not hasTitle or not hasPubDate:
            # if no title or publication date was found, directly look up the patent info

            lookupUrl = f"https://worldwide.espacenet.com/3.2/rest-services/search/family/{familyNumber}/aggregated/biblio"
            lookupParams = {"lang":"en,de,fr","q":publicationNumber,"qlang":"cql"}

            try:
                response = requests.request("GET", lookupUrl, data=lookupPayload, headers=lookupHeaders, params=lookupParams)
            except requests.exceptions.RequestException:
                time.sleep(1) # one retry after a short pause, as in the citation cells
                response = requests.request("GET", lookupUrl, data=lookupPayload, headers=lookupHeaders, params=lookupParams)

            patInfo = ""
            extracted = None

            try:
                extracted = extractJSON(response.text)
                patInfo = json.loads(extracted)

            except Exception as e:

                # if unable to load json, print patent json and error information
                # ("extracted" is printed as-is so the handler itself cannot raise)

                print("Exception type:", type(e).__name__)
                print("\n\nError by: ", familyNumber)
                print("\n\nJson Text: ")
                print(response.text)
                print("\n\nExtracted Text: ")
                print(extracted)
                print()

            # the looked-up record is indexed by application number; an unknown
            # application number (or an unusable response) yields an empty record
            appInfo = {}
            if isinstance(patInfo, dict):
                appInfo = patInfo.get(applicationNumber) or {}

            if not hasTitle:

                if appInfo.get('ti_en'):
                    patentName = list(appInfo['ti_en'].values())[0]

                elif appInfo.get('ti_ol'):
                    patentName = list(appInfo['ti_ol'].values())[0]

            if not hasPubDate:
                if appInfo.get('pd'):
                    if len(appInfo['pd']) == 1:
                        # dict views cannot be indexed in Python 3, so take the first value via an iterator;
                        # the branch below reads the date as element [0] of the value, so unwrap a list here too
                        onlyValue = next(iter(appInfo['pd'].values()))
                        publicationDate = onlyValue[0] if isinstance(onlyValue, (list, tuple)) else onlyValue
                    else:
                        for x in appInfo['pd'].keys():
                            if x in str(publicationNumber)[-2:]:
                                publicationDate = appInfo['pd'][x][0]

        # extract patent abstract

        if hitFields.get('publications.abs_en') is not None:

            abstractMatch = re.search(abstractPattern, hitFields['publications.abs_en'][0])
            if abstractMatch is not None:
                patentAbstract = abstractMatch.group(1)

        else:

            # searches for abstract in a different language if no English abstract found

            for w in list(hitFields):
                if re.search(r'publications.abs', w):
                    abstractMatch = re.search(abstractPattern, hitFields[w][0])
                    if abstractMatch is not None:
                        patentAbstract = abstractMatch.group(1)

        # extract simple family member application numbers
            # defines "familyMembers" as list of simple family members (e.g. "US201514658322A; US43008803A")

        familyMembers = "; ".join(biblio)

        # calculate number of geographical extensions based off list of family members
            # (distinct leading letter codes of the application numbers)

        geographicalExtensions = len({code for code in map(areaCodeRegex, biblio) if code is not None})

        # extract dates of the oldest and youngest patent in the family
            # defines "familyDates_min" and "familyDates_max" as the earliest and latest publication dates in the family
            # calculates difference in months between the two dates, stored in variable "timeDifference"

        familyDates = []

        for app in biblio:
            # a family member without a 'pd' entry is skipped instead of raising KeyError
            for d in (biblio[app].get('pd') or {}).values():
                dateParts = list(map(int, d[0].split("-")))
                familyDates.append(dt.date(dateParts[0], dateParts[1], dateParts[2]))

        if familyDates:
            familyDates_min = min(familyDates).strftime("%Y-%m-%d")
            familyDates_max = max(familyDates).strftime("%Y-%m-%d")
            timeDifference = calculateMonthDiff(familyDates_min, familyDates_max)
        else:
            # no publication date in the whole family: leave the fields blank
            # rather than writing placeholder dates to the CSV
            familyDates_min, familyDates_max, timeDifference = "", "", ""

        # extract ipc classes
            # defines "ipc" as list of classifications (e.g. C12Q1/68; C12Q1/6827)
            # defines "ipcNum" as the number of ipc classifications

        ipcClasses = hitFields.get('publications.ipc_icai')
        if ipcClasses is None:
            ipcClasses = []

        ipc = "; ".join(ipcClasses)
        ipcNum = len(ipcClasses)

        # extract cpc classes
            # defines "cpc" as string of classifications (e.g. "C08K13/02 (EP); C08K3/20 (EP)")
            # defines "cpcNum" as the number of cpc classifications
            # ('publications.ci_cpci' entries come first, followed by 'publications.ca_cpci' entries)

        cpcClasses = []
        for cpcField in ('publications.ci_cpci', 'publications.ca_cpci'):
            if hitFields.get(cpcField) is not None:
                cpcClasses.extend(hitFields[cpcField])

        cpc = "; ".join(cpcClasses)
        cpcNum = len(cpcClasses)

        # extract applicant names + region
            # defines "applicant" as a string of applicants (e.g. "John (US); Paul (HK)")

        applicant = _joinNamesWithRegions(hitFields.get('publications.pa'), hitFields.get('publications.pac'))

        # extract inventor names + region
            # defines "inventor" as a string of inventors (e.g. "John (US); Paul (HK)")

        inventor = _joinNamesWithRegions(hitFields.get('publications.in'), hitFields.get('publications.inc'))

        # append info to new row in dataframe

        df.loc[q] = [familyNumber, applicationNumber, applicationDate, publicationNumber, publicationDate, patentName, patentAbstract, applicant, inventor, ipc, ipcNum, cpc, cpcNum, familyMembers, geographicalExtensions, familyDates_min, familyDates_max, timeDifference]

    # tracks progress

    count += 1
    if (int(count/total*100) > int((count-1)/total*100)):
        print("|", end="")

        # periodically saves a backup file to data folder

        if q >= len(df)-1:
            df.to_csv(os.getcwd()+f'/data/{region}/backup_{region}.csv',index=False,encoding='utf-8-sig')

print("\nDone.")

||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||


||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Done.


Add new columns to the dataframe to hold citation information. Initialise values to be empty.

In [8]:
# Cell 8

# sort dataframe

# sort by family and application date, then drop repeated
# (family, application, publication) combinations, keeping the last one.
# The index is renumbered 0..n-1 afterwards: the citation cells compare the
# index against "start" as if it were the row number (which is also what a
# dataframe reloaded from the backup CSV has), and without the reset the
# index would still carry the pre-sort labels.

df = (
    df.sort_values(by=['familyNumber','applicationDate'])
      .drop_duplicates(subset=['familyNumber','applicationNumber','publicationNumber'],keep='last')
      .reset_index(drop=True)
)


# initialise new columns for citation data

df['backwardCitationsESpaceRaw'] = 0
df['backwardCitationsESpaceA'] = 0
df['backwardCitationsESpaceB'] = 0

df['backwardCitationsGoogle'] = 0
df['forwardCitationsGoogle'] = 0
df['publicationNumberGoogle'] = ""


# Saves the dataframe as a csv file in the /data folder. File name can be adjusted below.

df.to_csv(os.getcwd()+f'/data/{region}/{startYear}-{endYear}Patents_{region}.csv', index=False, encoding='utf-8-sig')

Loads dataframe from backup file (only run if using a backup).

In [9]:
# Cell 9

# df = pd.read_csv(os.getcwd()+f'/data/{region}/backup_{region}.csv')

Find family citations -- Espacenet 

Defines parameters to be used in API call. No parameters to change.

In [10]:
# Cell 10

# query parameters for the Espacenet call; "citationFields" is sent as a single
# comma-separated string, assembled here from one field name per line
querystring = {
    "citationFields": ",".join([
        "publications.ti_*",
        "publications.ct",
        "publications.pn_docdb",
        "publications.pd",
        "oprid_full.untouched",
        "opubd_full.untouched",
        "publications.abs_*",
        "publications.in",
        "publications.inc",
        "publications.pa",
        "publications.pac",
        "publications.app_fdate.untouched",
        "publications.famn",
        "publications.ci_cpci",
        "publications.ca_cpci",
        "publications.cl_cpci",
        "publications.ipc_icai",
        "publications.ipc_ican",
    ]),
    "lang": "en,de,fr",
    "qlang": "cql",
}

# the GET requests carry no body
payload = ""

# request headers passed along with every Espacenet call
headers = {
    "Accept": "multipart/form-data",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Cookie": "_pk_id.16.72ee=54208df12fa28cc0.1703812775.; _pk_ses.16.72ee=1; SameSite=None",
    "EPO-Trace-Id": "z09bmc-43bgij-TTT-000088",
    "Referer": "https://worldwide.espacenet.com/patent/search/family/073002675/publication/TW202029089A?f=publications.pac%3Ain%3DHK&q=pd%20within%20%222020-01-01%2C%202020-12-31%22",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "sec-ch-ua": "\"Chromium\";v=\"118\", \"Google Chrome\";v=\"118\", \"Not=A?Brand\";v=\"99\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
}

Extracts the number of citations for each patent based on information presented in the Espacenet API. 
Returns 3 values with different processing methods. 

In [11]:
# Cell 11

# initialise variables with empty values

# Counts the backward citations of every patent family via the Espacenet API
# and stores three variants (raw / A / B) in the dataframe.

# initialise variables with empty values

last = ""
refCountRaw, refCountA, refCountB = None, None, None

# True once the counts above have been computed for the family named in "last"
countsValid = False


# set variables for tracking progress

total = len(df)
count = 0


# prints reference bar for tracking progress
print("|"*100)

# !! adjust start if using backup data. (e.g. if backup file contains Espacenet citation data until row index 1000, set start to 1000)
start = -1


# iterate through each patent in dataframe
for index, row in df.iterrows():
    # index = index label of the row (equal to the zero-based row number when the
    # dataframe was loaded from a CSV), row = series object holding row info

    # skips over rows before "start" row (if loading in backup file, remember to adjust "start" row)
    if index >= start:

        familyNumber = str(row['familyNumber'])

        # reuse last iteration's information if patent is from the same family as the last patent (citation data identical within families on espacenet)

        if familyNumber != last:

            countsValid = False

            # API request call

            url = "https://worldwide.espacenet.com/3.2/rest-services/search/family/"+familyNumber+"/aggregated/biblio"
            try:
                response = requests.request("GET", url, data=payload, headers=headers, params=querystring)
            except requests.exceptions.RequestException:
                time.sleep(1) # one retry after a short pause (e.g. timeout / dropped connection)
                response = requests.request("GET", url, data=payload, headers=headers, params=querystring)

            # convert API text string to a json object
            # (stored in "familyData" so the patent list "data" of the earlier cells is not overwritten)

            familyData = None
            extracted = None

            try:
                extracted = extractJSON(response.text)
                familyData = json.loads(extracted)

            except Exception as e:

                # if unable to load json, print patent and error information.
                # depending on the API call output, regex in function "extractJSON()" may need to be adjusted.

                print("Exception type:", type(e).__name__)
                print("\n\nError by: ", familyNumber)
                print("\n\nJson Text: ")
                print(response.text)
                print("\n\nExtracted Text: ")
                print(extracted)
                print()

            if not isinstance(familyData, dict):

                # nothing usable was returned: do not fall back to the previous
                # family's data; the rows of this family are left unchanged
                print("Citation counts not updated for family:", familyNumber)

            else:

                # lists to hold the patent citations ("id_pat") and citations of non-patent literature ("id_npl")

                id_pat = []
                id_npl = []

                for item in list(familyData):

                    # id the citation with publication number for patent references
                    # and normalised reference for non-patent literature

                    if "citation" in item:

                        if "NPL" in item:
                            id_npl.append(normaliseReference(familyData[item]['ct']['*']['reference']))

                        else:
                            match = re.match(r'citation\..*?-(.*)\..*', item)
                            # if the key does not have the expected shape, use the
                            # whole key as id so the citation is still counted
                            id_pat.append(match.group(1) if match else item)

                # RAW: data count from json

                refCountRaw = len(id_pat) + len(id_npl)

                # check similarity
                # after sorting, each id is compared with its predecessor; the first id gets 0

                id_pat = sorted(id_pat)
                id_npl = sorted(id_npl)

                simIndices_pat = [0 if i == 0 else compareReferences(id_pat[i], id_pat[i-1]) for i in range(len(id_pat))]
                simIndices_npl = [0 if i == 0 else compareReferences(id_npl[i], id_npl[i-1]) for i in range(len(id_npl))]

                # PROCESS A: count citations after removing duplicate items with 100% similarity

                simIndicesA = [x for x in simIndices_pat if x < 1] + [x for x in simIndices_npl if x < 1]
                refCountA = len(simIndicesA)

                # PROCESS B: count citations after removing NPL references with 95% similarity or more, and patent references with 100% similarity

                simIndicesB = [x for x in simIndices_pat if x < 1] + [x for x in simIndices_npl if x < 0.95]
                refCountB = len(simIndicesB)

                countsValid = True

        # update dataframe with calculated counts

        if countsValid:
            df.loc[index, 'backwardCitationsESpaceRaw'] = refCountRaw
            df.loc[index, 'backwardCitationsESpaceA'] = refCountA
            df.loc[index, 'backwardCitationsESpaceB'] = refCountB

        last = familyNumber

    # tracks progress

    count += 1

    if (int(count/total*100) > int((count-1)/total*100)):

        print("|", end="")

        if index >= start:
            # periodically saves backup file
            df.to_csv(os.getcwd()+f'/data/{region}/backup_{region}.csv',index=False,encoding='utf-8-sig')

print("\nDone.")

||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Done.


Find family citations -- GooglePatents

Defines parameters to be used in API call. No parameters to change.

In [12]:
# Cell 12

# request body and headers used by the Google Patents requests below

# the requests carry no body
payload = ""

# NOTE(review): the "Cookie" value below contains entries named SID, HSID,
# SAPISID, __Secure-*PSID etc., which look like a captured browser session --
# confirm whether it is required, and consider loading it from a local,
# uncommitted config instead of keeping it in the notebook.
headers = {
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Cookie": "AEC=Ackid1RnDVfJgNrOQbsJ-QvTlEFUBOFC1vy5l_VNxJfe3rYQVoOXuJonQIE; SID=ewjm5qMzOWKmTVjcF4ljDJ_Qjpa_VeZjBZZiYsL4YzU4zo418lqG3brekZR4-qj7fwTA5w.; __Secure-1PSID=ewjm5qMzOWKmTVjcF4ljDJ_Qjpa_VeZjBZZiYsL4YzU4zo41wqVyi148PMZsEQ9p0PeBDg.; __Secure-3PSID=ewjm5qMzOWKmTVjcF4ljDJ_Qjpa_VeZjBZZiYsL4YzU4zo41a4daAgzcdUc6V_3IkYoUVA.; HSID=A-NEN_B7y29vlurDW; SSID=ACq_HLV2Xi2TvGb1X; APISID=sgxNCGTS5IkotwRV/Aj48pz6c5j7BL2j5l; SAPISID=lMzSbV9Y5wi16KgA/AI1tD5whIm7rxQ0Ay; __Secure-1PAPISID=lMzSbV9Y5wi16KgA/AI1tD5whIm7rxQ0Ay; __Secure-3PAPISID=lMzSbV9Y5wi16KgA/AI1tD5whIm7rxQ0Ay; SEARCH_SAMESITE=CgQIjJoB; 1P_JAR=2024-01-02-06; NID=511=IMfcdhimmvf78f-GOe-jdouV4Z1gdEnemPMiZvnK80a7GUwUaBQXVmqttEIM4KRZ-yJ3xn6TaPD8pI8SPh4rAR0NIQpDifg3wdMqiqxoTOI3YPMs7FiiSQIoh1USlSCL9DdV4zWRxz1VCJ4veAI3kLmcJAkk66Wdbt-NWj6GvOttKng_Gr0I_u3ddvJmYfJW-VcdHSKEgWosRL_omb6PAzKDnofG9XD3BoLoYTSFfgjFjNc43ByUx0nEsUBJxTVVDW1iO-da86V8IqVFF_uJ2wAz3hsSXw3J0Z6fajVwa0PXWDS7e25H4cQQtF9l0AFWn4S3-TeuIcHQ; SIDCC=ABTWhQFIcVBrzBklfqYryiP1olTqMdDgMfKDrbYmjKfVmGOGcvImSeA57yptRpdK0S3QqdytpQ; __Secure-1PSIDCC=ABTWhQHx2PbNYCEnL5oO-78T6uyoaMjUUyKnQ6vgC9o5JWVsb-zNAOJUFHgs_odEtuSIGkci; __Secure-3PSIDCC=ABTWhQEoHLQSDx_AhlLyv-MTeWNXOy6hi_B0CTBawmGLQatMQhHWAG__eTIH6rvzwx-EN2hP",
    "Referer": "https://patents.google.com/",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "X-Client-Data": "CK6LywE=",
    "sec-ch-ua": "\"Chromium\";v=\"118\", \"Google Chrome\";v=\"118\", \"Not=A?Brand\";v=\"99\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\""
}

Extracts the number of backward and forward citations for each patent based on the information presented on Google Patents.

In [13]:
# Cell 13

# set variables for tracking progress

# Counts backward and forward citations of every patent on Google Patents and
# stores them in the dataframe. If a publication number is not found (404),
# the Google Patents search is used to find a replacement publication number.


def _getWithRetry(url, params):
    # GET "url" with the notebook's "payload"/"headers"; retried once after 1 s if the request itself fails
    try:
        return requests.request("GET", url, data=payload, headers=headers, params=params)
    except requests.exceptions.RequestException:
        time.sleep(1)
        return requests.request("GET", url, data=payload, headers=headers, params=params)


# endpoints: patent page, search-bar parser, full-text query
resultUrl = "https://patents.google.com/xhr/result"
parseUrl = "https://patents.google.com/xhr/parse"
queryUrl = "https://patents.google.com/xhr/query"

# set variables for tracking progress

total = len(df)
count = 0

# prints reference bar for tracking progress

print("|"*100)

# adjust start if using backup data. (e.g. if backup file contains Google Patent citation data until row index 1000, set start to 1000)

start = -1

# iterate through each patent in dataframe
    # index = index label of the row (equal to the zero-based row number when the
    # dataframe was loaded from a CSV), row = series object holding row info

for index, row in df.iterrows():

    if index >= start:
        publicationNumber = row['publicationNumber']
        response = _getWithRetry(resultUrl, {"id":"patent/"+str(publicationNumber)+"/en"})

        # if patent can't be found on Google Patents

        if response.status_code == 404:

            # find similar publication number through google patents search bar

            searchResult = _getWithRetry(parseUrl, {"text":publicationNumber}).json()

            # find top search result, store new publication number in new column

            if not searchResult.get('error_no_patents_found'):

                replacementNumber = None
                parsedResults = searchResult.get('results') or []

                if parsedResults and parsedResults[0].get('result'):
                    replacementNumber = parsedResults[0]['result']['number']

                else:
                    queryResult = _getWithRetry(queryUrl, {"url":f"q=({publicationNumber})&oq={publicationNumber}","exp":"","tags":""}).json()

                    try:
                        replacementNumber = queryResult['results']['cluster'][0]['result'][0]['patent']['publication_number']
                    except (KeyError, IndexError, TypeError):
                        # the query returned no usable hit; keep the original (404) response
                        replacementNumber = None

                if replacementNumber is not None:
                    publicationNumber = replacementNumber
                    df.loc[index,'publicationNumberGoogle'] = publicationNumber

                    # request the patent page again with the new publication number.
                    # This must go to the result endpoint; the query endpoint used
                    # above does not return the patent page.

                    response = _getWithRetry(resultUrl, {"id":"patent/"+str(publicationNumber)+"/en"})

        # create beautiful soup object to parse html data

        soup = BeautifulSoup(response.text, 'html.parser')

        # calculate number of backward citations

        origBackCitations = soup.find_all('tr', itemprop='backwardReferencesOrig')
        famBackCitations = soup.find_all('tr', itemprop='backwardReferencesFamily')
        nplBackCitations = soup.find_all('tr', itemprop='detailedNonPatentLiterature')
        df.loc[index, 'backwardCitationsGoogle'] = len(origBackCitations) + len(famBackCitations) + len(nplBackCitations)

        # calculate number of forward citations

        origFwdCitations = soup.find_all('tr', itemprop='forwardReferencesOrig')
        famFwdCitations = soup.find_all('tr', itemprop='forwardReferencesFamily')
        df.loc[index, 'forwardCitationsGoogle'] = len(origFwdCitations) + len(famFwdCitations)

    # tracks progress

    count += 1
    if (int(count/total*100) > int((count-1)/total*100)):
        print("|", end="")

        if index >= start:

            # periodically saves backup file
            df.to_csv(os.getcwd()+f'/data/{region}/backup_{region}.csv', index=False, encoding='utf-8-sig')

print("\nDone.")

||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Done.


Saves the dataframe as a csv file in the `/data/REGION` folder. File name can be adjusted below.

In [14]:
# Cell 14

# final export: written without the index column, encoded as UTF-8 with BOM
df.to_csv(
    os.getcwd()+f'/data/{region}/{startYear}-{endYear}PatentsWithCitations_{region}.csv',
    index=False,
    encoding='utf-8-sig',
)