# Scraping granted patents from EPO publication server

This notebook is hardcoded to work with the EPO publication server at https://data.epo.org/publication-server/rest/v1.2/publication-dates/.

The notebook scrapes granted patents (identified by a URL ending with **B1**, e.g. EP3963522NWB1) and saves them in one subfolder per week (YYYYMMDD), mirroring how they are listed on the publication server. It is possible to change the suffix to A1 to scrape patent applications instead, for example, or to remove the suffix filter entirely to scrape both granted patents and patent applications. The rationale for scraping granted patents is that they are legally valid and correctly formatted, so they serve as better training data.

*The script's overall process is:*
- Get all weekly publication pages for given date range
- Get **B1** links to granted patent publications for each weekly publication page
- Filter links by what has already been downloaded (against .csv files)
- Attempt to download remaining files.

If the script fails to download some links, simply clear the log (finished.csv) and run it again. The notebook automatically detects which files have not been downloaded and tries them again. The failed-urls.csv file is kept for reference and monitoring.

## Settings

In [5]:
# Date range to attempt to download (YYYYMMDD). Both ends are inclusive.
START_DATE = "20150730"
END_DATE = "20190730"

# Suffix a publication link must end with to be downloaded; B1 = granted patent.
ENDS_WITH = 'B1'

# Directory to save the downloaded files; files are stored in one subdirectory per date (/YYYYMMDD/).
DIRECTORY = "./data/ep-b1"

# Directory to save finished.csv and failed-urls.csv. finished.csv lists the dates that have
# been completed; failed-urls.csv lists the URLs that failed to download.
LOG_DIRECTORY = "./data/logs"

# Prepended to the relative URLs to get a valid full URL.
BASE_URL = "https://data.epo.org"

# Max workers for the thread pool executors. 4 was found to be a good limit on a free VPN.
# NOTE(review): the value below is 6, not 4 - confirm which one is intended.
MAX_WORKERS = 6

# Retry settings: RETRIES, DELAY, BACKOFF and JITTER are passed to the @retry decorator on
# get_response; TIMEOUT is passed to requests.get as the request timeout.
RETRIES = 10
DELAY = 1
BACKOFF = 1
JITTER = (1, 3)
TIMEOUT = 5

In [6]:
import requests
import csv
import os
import tempfile

from retry import retry
from bs4 import BeautifulSoup
from bs4.filter import SoupStrainer
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm


# Handle requests, retries and errors
@retry(requests.exceptions.RequestException, tries=RETRIES, delay=DELAY, backoff=BACKOFF, jitter=JITTER)
def _get_response_or_raise(link):
    """Fetch *link* once and raise on any request or HTTP error.

    The exception must propagate out of this function: the @retry decorator
    only schedules another attempt when the wrapped call raises.
    """
    response = requests.get(link, timeout=TIMEOUT)
    response.raise_for_status()  # Turn HTTP error status codes into exceptions
    return response


def get_response(link):
    """Fetch *link*, retrying on failure, and return the response or None.

    Up to RETRIES attempts are made, waiting between attempts as configured
    by DELAY, BACKOFF and JITTER.

    Args:
        link: Absolute URL to request.

    Returns:
        The successful ``requests.Response``, or ``None`` when every attempt
        failed with a ``requests`` exception (connection error, timeout,
        HTTP error status, ...).
    """
    try:
        return _get_response_or_raise(link)
    except requests.exceptions.RequestException:
        # All attempts are exhausted; callers treat None as "download failed".
        return None

# Extract the href of every 'a' tag in the response
def extract_all_links_from_response(response_content):
    """Return the ``href`` value of each anchor found in *response_content*.

    Args:
        response_content: HTML document to parse.

    Returns:
        A list with one entry per anchor; an entry is ``None`` when the
        anchor has no ``href`` attribute.
    """
    # Restrict parsing to 'a' tags so the rest of the page is skipped
    anchors_only = SoupStrainer('a')
    parsed = BeautifulSoup(response_content, 'html.parser', parse_only=anchors_only)

    hrefs = []
    for anchor in parsed:
        hrefs.append(anchor.get('href'))
    return hrefs

  
def extract_links_ending_with(page_url, ends_with):
    """Return the hrefs on *page_url* whose value ends with *ends_with*.

    Args:
        page_url: Absolute URL of the page to fetch.
        ends_with: Suffix (or tuple of suffixes) a link must end with to be kept.

    Returns:
        A list of matching href strings, exactly as they appear in the page.
        The list is empty when the page could not be fetched.

    NOTE(review): a failed fetch and a page without matching links both yield
    an empty list, so callers cannot tell them apart; a date whose page failed
    to load can therefore end up recorded as finished.
    """
    response = get_response(page_url)
    if response is None:
        return []

    hrefs = extract_all_links_from_response(response.content)
    # Anchors without an href attribute come back as None; skip them instead
    # of crashing on None.endswith(...).
    return [href for href in hrefs if href and href.endswith(ends_with)]


def get_filtered_links(start_date_str, end_date_str):
    """Return the publication-date page URLs inside a date range that are not finished.

    The publication-dates index is fetched from BASE_URL, each link's date is
    read from the second-to-last segment of its path, and links are kept when
    that date lies within the range and is not listed in finished.csv.

    Args:
        start_date_str: First date to include, formatted as YYYYMMDD.
        end_date_str: Last date to include, formatted as YYYYMMDD.

    Returns:
        A list of absolute URLs, or ``None`` when the index page could not be
        retrieved.

    Raises:
        ValueError: If either date string is not in YYYYMMDD format.
    """
    # Dates listed in finished.csv are fully downloaded and are skipped.
    # A set gives constant-time membership tests in the loop below.
    finished_dates = set()
    try:
        with open(LOG_DIRECTORY + '/finished.csv', 'r', newline='') as csvfile:
            # Ignore blank rows, which would otherwise raise IndexError
            finished_dates = {row[0] for row in csv.reader(csvfile) if row}
    except FileNotFoundError:
        # No log yet: nothing has been finished
        pass

    # Validate the requested range before making any network request
    start_date = datetime.strptime(start_date_str, '%Y%m%d')
    end_date = datetime.strptime(end_date_str, '%Y%m%d')

    url = BASE_URL + "/publication-server/rest/v1.2/publication-dates/"
    response = get_response(url)
    if response is None:
        return None

    filtered_links = []
    for href in extract_all_links_from_response(response.content):
        if not href:
            # Anchor without an href attribute
            continue
        try:
            # The date is the second-to-last segment of the URL path
            date_str = href.split('/')[-2]
            link_date = datetime.strptime(date_str, '%Y%m%d')
        except (ValueError, IndexError):
            # Not a publication-date link
            continue
        if start_date <= link_date <= end_date and date_str not in finished_dates:
            filtered_links.append(BASE_URL + href)

    return filtered_links
  
def get_date_from_url(url):
    """Return the second-to-last path segment of *url*, ignoring trailing slashes.

    For a publication page such as ``.../publication-dates/20190724/patents``
    this segment is the date, ``20190724``.
    """
    trimmed = url.rstrip('/')
    # Only the last two separators matter, so split from the right
    segments = trimmed.rsplit('/', 2)
    return segments[-2]

def remove_existing_files(extracted_links_dict):
    """Drop every link whose document has already been saved to disk.

    A link counts as downloaded when the file
    ``DIRECTORY/<date>/<second-to-last URL segment>.xml`` exists.

    Args:
        extracted_links_dict: Mapping of date string to list of document URLs.
            It is updated in place.

    Returns:
        The same dictionary, with each list reduced to the links still missing.
    """
    for date in extracted_links_dict:
        still_missing = []
        for link in extracted_links_dict[date]:
            document_id = link.split("/")[-2]
            if os.path.isfile(f'{DIRECTORY}/{date}/{document_id}.xml'):
                continue
            still_missing.append(link)

        # Replacing the value of an existing key is safe while iterating
        extracted_links_dict[date] = still_missing

    return extracted_links_dict

def remove_empty_lists_and_write_to_csv(extracted_links_dict):
    """Record dates with no remaining links as finished and drop them.

    Each date whose link list is empty is removed from the dictionary and
    appended as a row to ``finished.csv`` in LOG_DIRECTORY, which is created
    if needed.

    Args:
        extracted_links_dict: Mapping of date string to list of document URLs.
            It is updated in place.

    Returns:
        The same dictionary, containing only dates that still have links.
    """
    os.makedirs(LOG_DIRECTORY, exist_ok=True)
    finished_log = LOG_DIRECTORY + '/finished.csv'

    with open(finished_log, 'a', newline='') as log_file:
        log_writer = csv.writer(log_file)

        # Walk a snapshot so entries can be deleted during the loop
        for date, links in list(extracted_links_dict.items()):
            if len(links) != 0:
                continue
            del extracted_links_dict[date]
            log_writer.writerow([date])

    return extracted_links_dict

def _log_failed_url(url):
    """Append a timestamped row for *url* to failed-urls.csv in LOG_DIRECTORY."""
    os.makedirs(LOG_DIRECTORY, exist_ok=True)
    with open(LOG_DIRECTORY + '/failed-urls.csv', 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow([datetime.now(), url])


def download_file(url, directory, file_name):
    """Download *url* and save it as ``DIRECTORY/<directory>/<file_name>``.

    The content is first written to a temporary file inside the target
    directory and then moved into place, so a partially written document
    never appears under its final name.

    Args:
        url: Absolute URL of the document to download.
        directory: Subdirectory of DIRECTORY to save into (the publication date).
        file_name: Name of the file to create.

    Returns:
        The number of bytes saved, or 0 when the download or the write failed.
        Failures are recorded in failed-urls.csv.
    """
    response = get_response(url)
    if response is None:
        _log_failed_url(url)
        return 0

    content = response.content
    target_dir = os.path.join(DIRECTORY, directory)
    temp_path = None
    try:
        os.makedirs(target_dir, exist_ok=True)
        # Create the temporary file next to its destination: a move within one
        # directory cannot fail with a cross-device error the way a move out
        # of the system temp directory can. The '.part' suffix keeps leftovers
        # from being mistaken for finished '.xml' documents.
        with tempfile.NamedTemporaryFile(dir=target_dir, suffix='.part', delete=False) as temp_file:
            temp_path = temp_file.name
            temp_file.write(content)
        os.replace(temp_path, os.path.join(target_dir, file_name))
    except OSError as e:
        # Clean up the partial file, if one was created
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)
        print(f"Error downloading {file_name} from {url}: {e}")
        _log_failed_url(url)
        return 0

    return len(content)

def download_all_files(extracted_links_dict):
    """Download every link in *extracted_links_dict* using a thread pool.

    Each link is saved through ``download_file`` under its date, named after
    the second-to-last segment of the URL with an ``.xml`` extension. A
    progress bar advances once per finished download.

    Args:
        extracted_links_dict: Mapping of date string to list of document URLs.
    """
    total_links = sum(len(links) for links in extracted_links_dict.values())

    # The context managers close the progress bar and shut the pool down
    # even when an error interrupts the run.
    with tqdm(total=total_links, desc="Downloading files") as pbar_links:
        with ThreadPoolExecutor(MAX_WORKERS) as executor:
            future_to_link = {}
            for date, links in extracted_links_dict.items():
                for link in links:
                    file_name = link.split('/')[-2] + '.xml'
                    future = executor.submit(download_file, link, date, file_name)
                    future_to_link[future] = link

            for future in as_completed(future_to_link):
                pbar_links.update()
                # An exception raised inside a worker is stored on its future;
                # report it instead of letting it disappear unnoticed.
                exc = future.exception()
                if exc is not None:
                    tqdm.write(f"{future_to_link[future]!r} generated an exception: {exc}")

def extract_all_links(date_links):
    """Collect the document URLs for every publication-date page.

    Each page is scanned concurrently for links ending with ENDS_WITH. Every
    link found is turned into an absolute URL with ``/document.xml`` appended
    and grouped under the date taken from the page URL.

    Args:
        date_links: Iterable of absolute publication-date page URLs.

    Returns:
        A dictionary mapping date string to a list of document URLs. Pages
        whose extraction raised an exception are reported and left out.
    """
    links_by_date = {}

    with ThreadPoolExecutor(MAX_WORKERS) as pool:
        pending = {}
        for page_url in date_links:
            task = pool.submit(extract_links_ending_with, page_url, ENDS_WITH)
            pending[task] = page_url

        for task in as_completed(pending):
            page_url = pending[task]
            try:
                relative_links = task.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (page_url, exc))
                continue

            # Group the results under the date found in the page URL
            bucket = links_by_date.setdefault(get_date_from_url(page_url), [])
            for relative_link in relative_links:
                # Build the full URL of the XML document
                bucket.append(BASE_URL + relative_link + "/document.xml")

    return links_by_date

# Usage

In [7]:
def _count_links(links_by_date):
    """Return the total number of links across all dates."""
    return sum(len(links) for links in links_by_date.values())


# Weekly publication pages in the configured range that are not yet marked as finished
date_links = get_filtered_links(START_DATE, END_DATE)
if date_links is None:
    # get_filtered_links returns None when the index page could not be fetched;
    # stop with a clear message instead of failing on len(None) below.
    raise RuntimeError("Could not retrieve the list of publication dates from the publication server.")
print(f"No. of dates to scrape: {len(date_links)}")
print(f"Dates to scrape: {date_links}")

# Document links for every page, grouped by date
extracted_links_dict = extract_all_links(date_links)

print(f"Total number of links for all dates in given date range: {_count_links(extracted_links_dict)}")
# Skip documents that are already on disk
remove_existing_files(extracted_links_dict)
print(f"Total number of links after removing existing files: {_count_links(extracted_links_dict)}")


# Dates with nothing left to download are logged as finished and dropped
remove_empty_lists_and_write_to_csv(extracted_links_dict)
print(f"Total number of dates after removing finished dates: {len(extracted_links_dict)}")

download_all_files(extracted_links_dict)

No. of dates to scrape: 208
Dates to scrape: ['https://data.epo.org/publication-server/rest/v1.2/publication-dates/20190724/patents', 'https://data.epo.org/publication-server/rest/v1.2/publication-dates/20190717/patents', 'https://data.epo.org/publication-server/rest/v1.2/publication-dates/20190710/patents', 'https://data.epo.org/publication-server/rest/v1.2/publication-dates/20190703/patents', 'https://data.epo.org/publication-server/rest/v1.2/publication-dates/20190626/patents', 'https://data.epo.org/publication-server/rest/v1.2/publication-dates/20190619/patents', 'https://data.epo.org/publication-server/rest/v1.2/publication-dates/20190612/patents', 'https://data.epo.org/publication-server/rest/v1.2/publication-dates/20190605/patents', 'https://data.epo.org/publication-server/rest/v1.2/publication-dates/20190529/patents', 'https://data.epo.org/publication-server/rest/v1.2/publication-dates/20190522/patents', 'https://data.epo.org/publication-server/rest/v1.2/publication-dates/20190

Downloading files: 100%|██████████| 437853/437853 [5:33:47<00:00, 21.86it/s]   
