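Extracts raw patent data from Espacenet. The variables "startYear", "endYear", and "regionCode" can be adjusted as needed. Downloads the data one date range at a time (monthly by default) and saves the combined records as a pickle file at data/<regionCode>/rawData<regionCode>.pickle.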

In [1]:
# Cell 1

import requests
import os
import calendar
import pickle

Variables to adjust parameters for the API call to Espacenet. Year dates are inclusive.

In [2]:
# Cell 2

# First and last publication year to download (both inclusive).
startYear, endYear = 2017, 2023

# Code passed to the "publications.pac" filter of the request payload and
# embedded in the output path data/<regionCode>/rawData<regionCode>.pickle.
regionCode = "TN"

# NOTE(review): not referenced by any other cell visible in this notebook --
# confirm whether it is still needed.
marker = ""

Defines functions that generate a list of tuples containing the date pairs. Used as inputs for the API request.  

In [3]:
# Cell 3 

# MONTHLY

def generateDates(year):
    """Return the 12 calendar-month date windows of ``year``.

    Each element is a ``(first_day, last_day)`` tuple of ``YYYY-MM-DD``
    strings, e.g. ``("2020-02-01", "2020-02-29")``. The last day comes from
    ``calendar.monthrange`` so leap years are handled.
    """
    return [
        (
            f"{year}-{month_no:02d}-01",
            f"{year}-{month_no:02d}-{calendar.monthrange(year, month_no)[1]:02d}",
        )
        for month_no in range(1, 13)
    ]

# SEMI-MONTHLY (two windows per month)

def generateDates_med(year):
    """Return 24 half-month date windows covering ``year``.

    Every month contributes two ``(start, end)`` tuples of ``YYYY-MM-DD``
    strings: the 1st through the 15th, and the 16th through the month's
    last day. For example:
    ``[("2003-01-01", "2003-01-15"), ("2003-01-16", "2003-01-31"), ...]``.
    """
    half_month_windows = []
    for month_no in range(1, 13):
        days_in_month = calendar.monthrange(year, month_no)[1]
        stem = f"{year}-{month_no:02d}"
        half_month_windows.extend(
            [
                (f"{stem}-01", f"{stem}-15"),
                (f"{stem}-16", f"{stem}-{days_in_month:02d}"),
            ]
        )
    return half_month_windows

# WEEKLY

def generateDates_large(year):
    """Return 48 roughly week-long date windows covering ``year``.

    Every month is cut into four ``(start, end)`` tuples of ``YYYY-MM-DD``
    strings: days 01-07, 08-15, 16-23, and 24 through the month's last day.
    For example: ``[("2003-01-01", "2003-01-07"), ("2003-01-08", "2003-01-15"),
    ("2003-01-16", "2003-01-23"), ("2003-01-24", "2003-01-31"), ...]``.
    """
    # Day boundaries shared by every month; the fourth window's end day
    # depends on the month length and is appended inside the loop.
    fixed_spans = ((1, 7), (8, 15), (16, 23))

    windows = []
    for month_no in range(1, 13):
        days_in_month = calendar.monthrange(year, month_no)[1]
        stem = f"{year}-{month_no:02d}"
        for first, last in (*fixed_spans, (24, days_in_month)):
            # :02d keeps every day two digits wide (2023-01-07, not 2023-1-7).
            windows.append((f"{stem}-{first:02d}", f"{stem}-{last:02d}"))
    return windows

Sets parameters used in the API call to Espacenet. The variable "regionCode", used in the "filters" section of the payload, can be adjusted in Cell 2 above.

In [4]:
# Cell 4

# Search endpoint that the download cell POSTs to.
url = "https://worldwide.espacenet.com/3.2/rest-services/search"

# JSON body of the search request. "size" asks for up to 1000 items per call;
# the download cell treats a response of 1000 or more hits as truncated.
payload = {
    "query": {
        "fields": [
            "publications.ti_*",
            "publications.abs_*",
            "publications.pn_docdb",
            "publications.in",
            "publications.inc",
            "publications.pa",
            "publications.pac",
            "publications.pd",
            "publications.pr_docdb",
            "publications.app_fdate.untouched",
            "publications.ipc_ic",
            "publications.ipc_icci",
            "publications.ipc_iccn",
            "publications.ipc_icai",
            "publications.ipc_ican",
            "publications.ci_cpci",
            "publications.ca_cpci",
            "publications.cl_cpci",
            "biblio:pa;pa_orig;pa_unstd;in;in_orig;in_unstd;pac;inc;pd;pn_docdb;allKindCodes;",
            "oprid_full.untouched",
            "opubd_full.untouched",
        ],
        "from": 0,
        "size": 1000,
        # Every highlighted field uses the same fragment settings, so the
        # entries are generated from the list of field names.
        "highlighting": [
            {
                "field": highlighted_field,
                "fragment_words_number": 20,
                "number_of_fragments": 3,
                "hits_only": True,
            }
            for highlighted_field in (
                "publications.ti_en",
                "publications.abs_en",
                "publications.ti_de",
                "publications.abs_de",
                "publications.ti_fr",
                "publications.abs_fr",
                "publications.pn_docdb",
                "publications.pa",
            )
        ],
    },
    "filters": {
        # regionCode is set in Cell 2.
        "publications.pac": [{"value": [regionCode]}],
        "publications.patent": [{"value": ["true"]}],
    },
    "widgets": {},
}

# Request headers sent with every call.
# NOTE(review): the Cookie / EPO-Trace-Id / Referer values look like they were
# captured from a browser session -- confirm whether they need refreshing.
headers = {
    "Accept": "application/json,application/i18n+xml",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Cookie": "_pk_id.16.72ee=217bfd78d38df5fc.1703641654.; _pk_ref.16.72ee=%5B%22%22%2C%22%22%2C1703665558%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; _pk_ses.16.72ee=1",
    "EPO-Trace-Id": "you8ki-tbjlcq-AAA-000000",
    "Origin": "https://worldwide.espacenet.com",
    "Referer": "https://worldwide.espacenet.com/patent/search?f=publications.pac%3Ain%3DHK&q=pd%20within%20%222020-01-01%2C%202020-12-31%22",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36",
    "X-EPO-PQL-Profile": "cpci",
    "sec-ch-ua": '"Chromium";v="104", "Not A;Brand";v="99", "Google Chrome";v="104"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
}

Iterates through each year to download monthly data using an API call to Espacenet. Appends patent information (in dictionary form) to a list "data".

In [5]:
# Cell 5

# Accumulates the hit records of every date window whose response stayed
# below the 1000-item cap.
data = []

for year in range(startYear, endYear + 1):

    print(f"Starting download for year {year}...")

    # !! comment out the functions you're not using (PICK ONE) !!
    dates = generateDates(year)
    # dates = generateDates_med(year)
    # dates = generateDates_large(year)

    # One request per (start, end) date window.
    for d in dates:

        # CQL query restricting the publication date to the current window.
        querystring = {
            "lang": "en,de,fr",
            "q": f'pd within "{d[0]}, {d[1]}"',
            "qlang": "cql",
        }

        # A timeout keeps a stalled connection from hanging the notebook
        # forever (requests waits indefinitely by default).
        response = requests.post(
            url,
            json=payload,
            headers=headers,
            params=querystring,
            timeout=120,
        )
        # Fail with a clear HTTPError on a 4xx/5xx response instead of a
        # confusing JSON-decoding or KeyError failure on the next line.
        response.raise_for_status()

        # NOTE(review): assumes the JSON body has a top-level 'hits' list.
        resp = response.json()['hits']

        # A window that returns 1000 or more hits has hit the download cap and
        # is incomplete: its date range is printed and its records are NOT
        # added to `data`. Re-run those windows with a shorter time frame.
        if len(resp) >= 1000:
            print(d)
        else:
            data += resp

print("Done.")

Starting download for year 2017...
Starting download for year 2018...
Starting download for year 2019...
Starting download for year 2020...
Starting download for year 2021...
Starting download for year 2022...
Starting download for year 2023...
Done.


NOTE: Ensure that the number of hits for each date range is < 1000. There is a download limit of 1000 items per request, so if a date range is printed above, some of its data has been lost.
In this case pick/build a generateDates function with shorter time frames.

Exports patent data as pickle file.

In [8]:
# Cell 6

# Destination: <cwd>/data/<regionCode>/rawData<regionCode>.pickle
output_dir = os.path.join(os.getcwd(), 'data', regionCode)

# Create the region folder on first run; without this, open() raises
# FileNotFoundError when data/<regionCode>/ does not exist yet.
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, f'rawData{regionCode}.pickle'), 'wb') as file:
    pickle.dump(data, file)