In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import numpy as np
from pathlib import Path
import os
import itertools
import threading
import sys
import pyperclip

In [None]:
# Patstat credentials
#
# The values are read from the environment variables PATSTAT_USERNAME and
# PATSTAT_PASSWORD so that real credentials do not have to be stored in the
# notebook. The placeholders below are used when the variables are not set.

patstat_username = os.environ.get('PATSTAT_USERNAME', 'mail@domain.org')
patstat_password = os.environ.get('PATSTAT_PASSWORD', 'password')

In [None]:
# Browser information: location of each supported browser's executable.
# No path is configured for Safari (None).

path_to_browser = dict(
    Firefox=r'C:\Program Files\Mozilla Firefox\firefox.exe',
    Edge=r'C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe',
    Chrome=r"C:\Program Files\Google\Chrome\Application\chrome.exe",
    Safari=None,
)

# Path to web driver. For Firefox, Edge or Chrome, you have to download it.
# See https://github.com/mdhorne/patstat_with_python

path_to_web_driver = dict(
    Firefox=r"C:\Program Files\Mozilla Firefox\geckodriver.exe",
    Edge=r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe",
    Chrome=r"C:\Program Files\Google\Chrome\Application\chromedriver.exe",
    Safari=None,
)

In [None]:
def initialize_patstat(browser):
    """
    Start a Selenium session, log in to Patstat and locate the query editor.

    Parameters
    ----------
    browser : str
        One of 'Firefox', 'Edge', 'Chrome' or 'Safari'. Used as key into
        path_to_browser / path_to_web_driver.

    Returns
    -------
    tuple
        (driver, query_field, query_field_click): the web driver, the element
        that receives keystrokes, and the element that is clicked to focus the
        query editor.

    Raises
    ------
    Exception
        If the browser name is unknown, or if the query editor elements cannot
        be identified unambiguously.

    Notes
    -----
    The web driver is also stored in the global `driver`, which the other
    helper functions of this notebook read.
    NOTE(review): the driver constructors use the `executable_path` keyword;
    confirm that the installed Selenium version still accepts it.
    """

    global driver

    # Through Selenium we invoke the driver executable, which then starts the actual browser
    if browser == 'Firefox':
        options = webdriver.firefox.options.Options()
        options.binary_location = path_to_browser[browser]
        driver = webdriver.Firefox(executable_path=path_to_web_driver[browser], options=options)
    elif browser == 'Edge':
        driver = webdriver.Edge(executable_path=path_to_web_driver[browser])
    elif browser == 'Chrome':
        options = webdriver.chrome.options.Options()
        driver = webdriver.Chrome(executable_path=path_to_web_driver[browser], options=options)
    elif browser == 'Safari':
        # No options object exists in this branch, and the configured driver
        # path may be None: only pass the path when one is configured.
        safari_driver_path = path_to_web_driver[browser]
        if safari_driver_path:
            driver = webdriver.Safari(executable_path=safari_driver_path)
        else:
            driver = webdriver.Safari()
    else:
        raise Exception('Unknown browser')

    # Maximize the browser window
    driver.maximize_window()

    # Open the Patstat landing page
    driver.get("https://data.epo.org/expert-services/")

    # Close PatStat warning message
    try_click(By.ID, 'dijit_form_Button_0_label', 'Close')

    # Connect to Patstat using the credentials
    button_Username = try_click(By.ID, 'username')
    button_Username.send_keys(patstat_username)

    button_Password = try_click(By.ID,'password')
    button_Password.send_keys(patstat_password)

    try_click(By.ID,'buttonLogin_label')

    # Latest Patstat Online version
    time.sleep(2)
    try_click(By.CLASS_NAME, 'fullPart', 'PATSTAT Online')

    # Go to the tab Table
    time.sleep(2)
    try_click(By.ID,'goToTableLink')

    # The clickable area of the query editor is the only element of this class
    # whose text contains the default query (which mentions 'tls201').
    click_candidates = [
        el for el in driver.find_elements(By.CLASS_NAME, 'searchBlockBodyWithFooter')
        if 'tls201' in el.text
    ]
    if len(click_candidates) != 1:
        raise Exception(
            f'Expected exactly one clickable query area, found {len(click_candidates)}'
        )
    query_field_click = click_candidates[0]

    if browser == 'Chrome':
        # In Chrome the element receiving keystrokes is found by probing text areas.
        query_field = _locate_chrome_query_field(query_field_click)
        if query_field is None:
            raise Exception('No writable query field found')
    else:
        field_candidates = [
            el for el in driver.find_elements(By.CLASS_NAME, 'claro')
            if 'tls201' in el.text
        ]
        if len(field_candidates) != 1:
            raise Exception(
                f'Expected exactly one query field, found {len(field_candidates)}'
            )
        query_field = field_candidates[0]

    return driver, query_field, query_field_click


def _locate_chrome_query_field(query_field_click):
    """
    Return the text area that backs the query editor, or None if none is found.

    Each candidate is probed: the query area is clicked, a unique token is
    typed into the candidate and copied back through the clipboard. The
    candidate is accepted when the clipboard then holds that token.
    """
    # Ask the browser only for elements that can report type == 'textarea'
    # instead of walking every element of the page; the attribute check below
    # remains the actual filter.
    candidates = driver.find_elements(By.XPATH, "//textarea | //*[@type='textarea']")

    for position, candidate in enumerate(candidates):
        token = 'PATSTAT_PROBE_' + str(position)
        try:
            if candidate.get_attribute('type') != 'textarea':
                continue
            query_field_click.click()
            candidate.send_keys(Keys.CONTROL + "a")
            candidate.send_keys(token)
            candidate.send_keys(Keys.CONTROL + "c")
            if pyperclip.paste() == token:
                return candidate
        except Exception:
            # A candidate that cannot be interacted with is simply not the
            # query field; move on to the next one.
            continue

    return None

In [None]:
def write_query(text):
    """
    Replace the content of the query field with `text`.

    The text is copied to the system clipboard and pasted into the field
    (Ctrl+A followed by Ctrl+V), so the previous clipboard content is lost.
    Reads the globals `query_field_click` and `query_field`.

    An error raised by the first click propagates to the caller. Errors raised
    afterwards are reported with a printed message instead of being raised.
    Returns None.
    """
    query_field_click.click()
    try:
        query_field_click.click()
        query_field.send_keys(Keys.CONTROL + "a")
        pyperclip.copy(text) # sends text to clipboard
        query_field.send_keys(Keys.CONTROL + "v") # pastes text in the query field
    except Exception as error:
        # Best effort: keep the notebook running, but do not hide the failure.
        print('Could not write the query:', repr(error))
    
#def write_query(text):
#    """
#    Writes text to the query field.
#    """
#    query_field.click()
#    try:
#        query_field.click()
#        query_field.send_keys(Keys.CONTROL + "a")
#        query_field.send_keys(text)
#    except:
#        pass

In [None]:
def try_click(by_what, text_search, condition='', display=False, timeout=None, poll_interval=0.1):
    """
    Wait for elements matching a locator to appear, then try to click them.

    Parameters
    ----------
    by_what : locator strategy passed to driver.find_elements (e.g. By.ID).
    text_search : locator value passed to driver.find_elements.
    condition : str
        Only elements whose text contains this string are kept.
    display : bool
        Print diagnostics when several elements match or none could be clicked.
    timeout : float or None
        Maximum number of seconds to wait for a match. None waits forever.
    poll_interval : float
        Seconds to sleep between two lookups.

    Returns
    -------
    The clicked element when exactly one element matched (None if its click
    failed); otherwise the list of all matching elements, each of which was
    tried.

    Raises
    ------
    TimeoutError
        If `timeout` is given and no element matched in time.
    """
    deadline = None if timeout is None else time.monotonic() + timeout

    # Look for elements and keep waiting if none. The list found here is the
    # one that gets clicked, so the page is not queried a second time.
    while True:
        objects = [el for el in driver.find_elements(by_what, text_search) if condition in el.text]
        if objects:
            break
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError('No element found for ' + str(by_what) + ' ' + str(text_search))
        time.sleep(poll_interval)

    if len(objects) > 1 and display:
        print(len(objects), 'objects found (',by_what,text_search,condition,')')

    count = 0
    clicked_object = None

    # Try to click on every found element; failures are counted, not raised
    for obj in objects:
        try:
            obj.click()
            clicked_object = obj
        except Exception:
            count += 1

    if count == len(objects) and display:
        print('No element clicked')
    if len(objects) == 1:
        return clicked_object
    else:
        return objects

In [None]:
def download_all(expected_ready=1, poll_interval=0.5):
    """
    Open the download manager, wait for exports to be ready and click every
    save button.

    Parameters
    ----------
    expected_ready : int
        Number of status messages containing 'Ready' to wait for before
        downloading. NOTE(review): presumably one message per prepared export;
        pass the value returned by prepare_download() when several exports
        were prepared - confirm against the page.
    poll_interval : float
        Seconds to sleep between two checks of the status messages.

    Returns None. Save buttons that cannot be clicked are skipped.
    """

    try_click(By.ID,'download_text')
    try_click(By.ID,'downloadManagerMenuItem_text')

    # Wait until enough exports report 'Ready'. Using "at least" rather than
    # "exactly" avoids waiting forever when more exports than expected are ready.
    while True:
        ready_messages = [
            q for q in driver.find_elements(By.CLASS_NAME, 'exportStatusMessage')
            if 'Ready' in q.text
        ]
        if len(ready_messages) >= expected_ready:
            break
        time.sleep(poll_interval)

    save_buttons = driver.find_elements(
        By.XPATH,
        "//*[contains(@class, 'buttonDownloadSave') and not(contains(@class, 'buttonDownloadDelete'))]",
    )
    for download_button in save_buttons:
        try:
            download_button.click()
        except Exception:
            # Best effort: a button that cannot be clicked is skipped.
            pass

In [None]:
# Module state shared between calls of clear_all_downloads():
# show_again        - True until the confirmation dialogue has been answered once
# clear_list_button - cached 'Clear list' element, None until it has been found
show_again = True
clear_list_button = None
def clear_all_downloads():
    """
    Clears all the downloads listed in the download manager.

    On the first call the 'Clear list' button is searched with try_click and
    cached in the global `clear_list_button`; later calls click the cached
    element directly. The confirmation dialogue is answered (with
    'do not show again' ticked) only while the global `show_again` is True.
    Finally the download manager dialogue is closed.
    """

    global show_again, clear_list_button

    try_click(By.ID,'download_text')
    try_click(By.ID,'downloadManagerMenuItem_text')

    found_button = clear_list_button
    if clear_list_button is None:
        found = try_click(By.CLASS_NAME, 'dijitReset', condition='Clear list')
        # try_click returns a single element when exactly one matched and a
        # list otherwise; both shapes are accepted here.
        if isinstance(found, list):
            found_button = found[0] if found else None
        else:
            found_button = found
    else:
        clear_list_button.click()

    # Clicking the confirmation dialogue
    if show_again:
        try_click(By.CLASS_NAME, 'doNotShowAgain')
        try_click(By.CLASS_NAME, 'dijitReset', condition='Yes')
        show_again = False

    # Remember the button for the next call
    if clear_list_button is None:
        clear_list_button = found_button

    # Closing the download manager
    close_dialogue()

In [None]:
def close_dialogue():
    """
    Dismiss the open dialogue by clicking its close icon.
    """
    close_icon_class = 'dijitDialogCloseIcon'
    try_click(By.CLASS_NAME, close_icon_class)

In [None]:
def search():
    """
    Run the query currently in the editor by clicking the launch button.
    """
    launch_button_id = 'queryBlockContentResultTable_btLaunchQuery'
    try_click(By.ID, launch_button_id)

In [None]:
def find_nb_rows():
    """
    Function to find the number of rows of a query. Uses the red figure displayed on the top left.

    The page source is searched for the first occurrence of ' rows'; the number
    is read from the text between the closest preceding
    '"onDblClick:stopEvent">' marker (looked up in the 100 characters before
    ' rows') and ' rows', after removing '&nbsp;' separators.

    Returns
    -------
    int or None
        The row count, or None (after printing 'No query found.') when the
        page does not contain a parsable count.
    """
    marker = '"onDblClick:stopEvent">'

    # Read the page once so that both searches work on the same snapshot
    page = driver.page_source

    end = page.find(' rows')
    if end != -1:
        # max() keeps the window start from becoming negative, which would
        # silently slice from the end of the page instead
        window = page[max(0, end - 100):end]
        start = window.rfind(marker)
        if start != -1:
            digits = window[start + len(marker):].replace("&nbsp;", '')
            try:
                return int(digits)
            except ValueError:
                pass

    print('No query found.')
    return None

In [None]:
def prepare_download(delta_row=700000):
    """
    Function to prepare the download of the actual query.

    The result table is split into consecutive row ranges of at most
    `delta_row` rows, and one download is requested per range through the
    download dialogue.

    Parameters
    ----------
    delta_row : int
        Maximum number of rows per download (700000 is the maximum entry for a
        download in Patstat).

    Returns
    -------
    int
        The number of downloads requested.

    Raises
    ------
    ValueError
        If no row count can be read from the page, or if `delta_row` is not
        positive.
    """
    if delta_row <= 0:
        raise ValueError('delta_row must be positive')

    nb_rows = find_nb_rows()
    if nb_rows is None:
        raise ValueError('No query result to download')

    row_ranges = _download_row_ranges(nb_rows, delta_row)

    for position, (first_row, last_row) in enumerate(row_ranges):
        if position > 0:
            # Pause between two consecutive download requests
            time.sleep(1)
        _request_download(first_row, last_row)

    count_downloads = len(row_ranges)
    print(count_downloads,'download(s) ready')
    return count_downloads


def _download_row_ranges(nb_rows, delta_row):
    """
    Return the list of (first_row, last_row) ranges, 1-based and inclusive.

    Every range holds at most `delta_row` rows; a final range up to `nb_rows`
    is always present.
    """
    row_ranges = []
    row = 1
    # '<=' keeps the final range within delta_row rows: with '<' the final
    # range could hold delta_row + 1 rows.
    while row + delta_row <= nb_rows:
        row_ranges.append((row, row + delta_row - 1))
        row += delta_row
    row_ranges.append((row, nb_rows))
    return row_ranges


def _request_download(first_row, last_row):
    """
    Fill the download dialogue with one row range and confirm it.
    """
    driver.find_element(By.ID,'download_text').click()
    time.sleep(1)
    driver.find_element(By.ID,'downloadMenuItem_text').click()
    time.sleep(1)

    content_from = driver.find_element(By.ID,'dijit_form_TextBox_1')
    content_from.click()
    content_from.send_keys(Keys.CONTROL + "a") #select all
    content_from.send_keys(first_row)

    content_to = driver.find_element(By.ID,'dijit_form_TextBox_2')
    content_to.click()
    content_to.send_keys(Keys.CONTROL + "a") #select all
    content_to.send_keys(last_row)

    driver.find_element(By.ID,'btDownload_label').click()

In [None]:
def wait(timeout=None, poll_interval=0.2):
    '''
    A function that waits for a task to end on Patstat.

    The screen usually darkens while some tasks are performed on Patstat,
    preventing the background from being interacted with.
    This function repeatedly tries to click the query area (the global
    query_field_click). As long as the click raises an error, the task is
    considered to be still in progress.

    Parameters
    ----------
    timeout : float or None
        Maximum number of seconds to wait. None waits forever.
    poll_interval : float
        Seconds to sleep between two click attempts.

    Raises
    ------
    TimeoutError
        If `timeout` is given and the click keeps failing for that long.
    '''

    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        try:
            query_field_click.click()
            return
        except Exception:
            # The click is expected to fail while the overlay is displayed
            pass

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError('Patstat did not become available in time')
        time.sleep(poll_interval)

In the following example, the script launches three different queries: one for the patent count between 1990 and 1999, one between 2000 and 2009, and the last one between 2010 and 2019. Note that this could have been done with a single query covering 1990 to 2019; the split only serves as an example here.

In [None]:
# Start the browser session and log in to Patstat using Chrome.
# The three returned objects are bound as notebook globals: write_query() and
# wait() read query_field and query_field_click from the global namespace.
driver, query_field, query_field_click = initialize_patstat('Chrome')

# If TEST is displayed in the query field, the identification of the field
# worked properly.
write_query('TEST')
print('\nCheck if the word TEST is displayed in the query field.')

In [None]:
def example_query(date_start, date_end):
    """
    Build the example SQL text: number of distinct patent applications per
    filing year for IPC symbols starting with 'B08B', restricted to filing
    years between `date_start` and `date_end`.

    Returns the query as a string that starts and ends with a newline.
    """
    first_year = str(date_start)
    last_year = str(date_end)
    statement_lines = [
        "",
        "SELECT a.appln_filing_year, COUNT(DISTINCT a.appln_id) AS NumberOfPatentApplications",
        "FROM tls201_appln a",
        "JOIN tls209_appln_ipc i ON a.appln_id = i.appln_id",
        "WHERE i.ipc_class_symbol LIKE 'B08B%' -- Change IPC symbol here",
        "AND a.appln_filing_year BETWEEN " + first_year + " AND " + last_year + " -- Define year range here",
        "GROUP BY a.appln_filing_year",
        "ORDER BY a.appln_filing_year",
        "",
    ]
    return "\n".join(statement_lines)

In [None]:
# Preview the SQL text generated for one example year range.
print(example_query(2012,2013))

In [None]:
#clear_all_downloads()

# Folder in which files are downloaded. It can be overridden with the
# environment variable PATSTAT_DOWNLOAD_DIR; the original location is the default.
download_path = Path(os.environ.get('PATSTAT_DOWNLOAD_DIR', 'C:/Users/Dhorne/Downloads'))

for date_start in [1990,2000,2010]:

    wait()

    # Write the query to the field
    write_query(example_query(date_start, date_start+9))

    # Launch the query
    search()

    # Wait for query to end
    wait()
    print(find_nb_rows(), 'entries found')

    # Prepare downloads of the actual query
    count_downloads = prepare_download()

    # Wait for preparation and download all
    wait()
    download_all()
    close_dialogue()

    # Rename the downloads

    # Wait for the download to end: the expected number of .zip files must be
    # present and no partial file may be left.
    # NOTE(review): '.crdownload' and '.part' are assumed to be the temporary
    # suffixes used by the browser - confirm for the browser in use.
    while True:
        candidates = [file for file in download_path.iterdir() if 'resulttable' in file.name]
        downloaded_files = [file for file in candidates if file.suffix == '.zip']
        partial_files = [file for file in candidates if file.suffix in ('.crdownload', '.part')]
        if len(downloaded_files) >= count_downloads and not partial_files:
            break
        time.sleep(0.5)

    # Oldest file first, so that the numbering below does not depend on the
    # arbitrary order of iterdir()
    downloaded_files.sort(key=lambda file: file.stat().st_mtime)

    # Identifier to rename the generic file name. Here, the queries correspond to different dates.
    # Dates are therefore used to rename the files (do not use 'resulttable' in the name!)
    identifier = str(date_start)+'_'+str(date_start+9)

    if count_downloads != len(downloaded_files):
        raise Exception('Found '+str(len(downloaded_files))+' files, expected '+str(count_downloads))

    if len(downloaded_files) == 1:
        single_file = downloaded_files[0]
        single_file.rename(single_file.with_name(identifier + single_file.suffix))
    else:
        for count, file in enumerate(downloaded_files, start=1):
            file.rename(file.with_name(identifier + '_' + str(count) + file.suffix))

    clear_all_downloads()

In [None]:
# Scratch cell: clicks the first item of query_field.
# NOTE(review): initialize_patstat returns a single element for query_field, so
# this indexing only fits after a cell that rebinds query_field to a list -
# confirm or remove.
query_field[0].click()

In [None]:
# Scratch cell: collect every element of the current page in `aa` and show how
# many there are. The cells below index into `aa`.
aa = [el for el in driver.find_elements_by_xpath('//*')]
len(aa)

In [None]:
# Scratch cell: reset the counter used by the manual probing cell that
# increments `count`.
count = 0

In [None]:
# Scratch cell: click the element at index 13 of `aa`.
aa[13].click()

In [None]:
# Scratch cell: type 'a' into the element at index 13 of `aa`.
aa[13].send_keys('a')

In [None]:
# Scratch cell, meant to be run repeatedly: focus the query area, move to the
# next index of `aa` and type 'a' into that element.
query_field_click.click()
count += 1
print(count)
aa[count].send_keys('a')


In [None]:
# Scratch cell: reset the manual probing counter again.
count = 0

In [None]:
# Scratch cell: IPython help lookup on one element (the trailing `?` is IPython
# syntax, not plain Python).
aa[5461]?

In [None]:
# Scratch cell: the first line is a bare expression with no effect; the second
# binds `element` to a string.
# NOTE(review): the string looks pasted from an element representation - confirm.
aa[5461]
element="002cae85-38d7-45e3-a4a2-c3693c448c47"

In [None]:
# Scratch cell: rebinds the global query_field (read by write_query) to a
# hard-coded index of `aa`.
query_field = aa[5460]

In [None]:
# Scratch cell: display the text of the current query_field element.
query_field.text

In [None]:
# Scratch cell: type 'ABC' into the first element with id 'removeTechnology_5_label'.
# NOTE(review): uses find_elements_by_id, while the helper functions above use
# driver.find_elements(By.ID, ...).
[el for el in driver.find_elements_by_id('removeTechnology_5_label')][0].send_keys('ABC')

In [None]:
# Scratch cell: focus the query area, then type 'ABC' into the element at
# index 5461 of `aa`.
query_field_click.click()
element = aa[5461]
element.send_keys('ABC')

In [None]:
# Scratch cell: collect in `areas` every element of `aa` whose 'type' attribute
# is 'textarea'. Elements for which the attribute lookup raises are skipped
# (the bare except hides every error).
areas = []
for aaa in aa:
    try:
        if aaa.get_attribute('type')=='textarea':
            areas.append(aaa)
    except:
        pass

In [None]:
# Scratch cell: same filter as the `areas` loop, written as a comprehension and
# without error handling.
[el for el in aa if el.get_attribute('type')=='textarea']

In [None]:
# Scratch cell: element lookup with an empty class name.
# NOTE(review): looks like an unfinished experiment - confirm or remove.
driver.find_element_by_class_name('')

In [None]:
# Scratch cell: for each attribute name in the list, focus the query area, type
# the attribute name into element 5461 of `aa`, and print the attribute's value
# when that element reports one.
for attrib in ['accept','accept-charset','accesskey','action','align','alt','async','autocomplete','autofocus','autoplay','bgcolor','border','charset','checked','cite','class','color','cols','colspan','content','contenteditable','controls','coords','data','data-*','datetime','default','defer','dir','dirname','disabled','download','draggable','enctype','for','form','formaction','headers','height','hidden','high','href','hreflang','http-equiv','id','ismap','kind','label','lang','list','loop','low','max','maxlength','media','method','min','multiple','muted','name','novalidate','onabort','onafterprint','onbeforeprint','onbeforeunload','onblur','oncanplay','oncanplaythrough','onchange','onclick','oncontextmenu','oncopy','oncuechange','oncut','ondblclick','ondrag','ondragend','ondragenter','ondragleave','ondragover','ondragstart','ondrop','ondurationchange','onemptied','onended','onerror','onfocus','onhashchange','oninput','oninvalid','onkeydown','onkeypress','onkeyup','onload','onloadeddata','onloadedmetadata','onloadstart','onmousedown','onmousemove','onmouseout','onmouseover','onmouseup','onmousewheel','onoffline','ononline','onpagehide','onpageshow','onpaste','onpause','onplay','onplaying','onpopstate','onprogress','onratechange','onreset','onresize','onscroll','onsearch','onseeked','onseeking','onselect','onstalled','onstorage','onsubmit','onsuspend','ontimeupdate','ontoggle','onunload','onvolumechange','onwaiting','onwheel','open','optimum','pattern','placeholder','poster','preload','readonly','rel','required','reversed','rows','rowspan','sandbox','scope','selected','shape','size','sizes','span','spellcheck','src','srcdoc','srclang','srcset','start','step','style','tabindex','target','title','translate','type','usemap','value','width','wrap']:
    query_field_click.click()
    aa[5461].send_keys(attrib)
    if aa[5461].get_attribute(attrib) is not None:
        print(attrib, aa[5461].get_attribute(attrib))

In [None]:
# Scratch cell: type 'xpath' into element 5461 of `aa`, then print what the
# element reports for an attribute named 'xpath'.
query_field_click.click()
aa[5461].send_keys('xpath')
print(aa[5461].get_attribute('xpath'))

In [None]:
# Scratch cell: leftover attribute access on the driver.
# NOTE(review): looks like an unfinished expression - confirm or remove.
driver.find

In [None]:
dijitReset dijitInline dijitButtonText
removeTechnology_5_label

In [None]:
# Scratch cell: for each index in [count_start, count_end), focus the query
# area and type a marker text followed by RETURN and ENTER into aa[idx], then
# print the index. Any error for an index is silently ignored (bare except), so
# an index is only reported when all keystrokes were accepted.
count_start = 5460
count_end = 5463
for idx in range(count_start,count_end):
    try:
        print(idx, end='\r')
        query_field_click.click()
        aa[idx].send_keys(str(idx)+' OHHHH')
        aa[idx].send_keys(Keys.RETURN)
        aa[idx].send_keys(Keys.ENTER)
        time.sleep(2)
        print('')
        print('')
        print('--->',idx)
        print('')
        print('')

    except:
        pass

In [None]:
# Scratch cell: rebinds the global query_field to a LIST of elements with class
# 'CodeMirror-scroll' whose text contains 'tls201', then clicks and types into
# the first one.
# NOTE(review): the lines after the send_keys call look like pasted page
# attributes rather than code; the `class=...` lines are not valid Python -
# confirm or remove.
query_field = [el for el in driver.find_elements_by_class_name('CodeMirror-scroll') if 'tls201' in el.text]
query_field[0].click()
query_field[0].send_keys('A')
id="queryBlockContentResultTable"
searchBlockBodyWithFooter
class="dijitContentPane searchQueryEditor"
class="CodeMirror CodeMirror-wrap dojoDndTarget cm-s-sql dojoDndContainerOver"
class="CodeMirror-sizer"

In [None]:
# Scratch cell: focus the query area.
query_field_click.click()

In [None]:
# Scratch cell: stand-alone copy of the Chrome text-area probing done inside
# initialize_patstat. Each element whose 'type' is 'textarea' receives its own
# index as text, which is copied back through the clipboard; the element for
# which the clipboard matches becomes query_field.
# NOTE(review): `browser` is only a parameter of initialize_patstat; no cell
# visible here defines it as a global - confirm before running.
if browser == 'Chrome':
    all_elements = [el for el in driver.find_elements_by_xpath('//*')]
    
    text_areas = []

    for i in range(len(all_elements)):
        el = all_elements[i]
        try:
            if el.get_attribute('type')=='textarea':
                query_field_click.click()
                el.send_keys(Keys.CONTROL + "a")
                el.send_keys(str(i))
                el.send_keys(Keys.CONTROL + "c")
                copied_text = pyperclip.paste()
                if copied_text == str(i):
                    query_field = el
                    break
        except:
            pass

In [None]:
# Scratch cell: focus the query area and type "TEST" into `el`, the loop
# variable left over from the probing cell.
query_field_click.click()
el.send_keys("TEST")