In [1]:
from IPython.display import clear_output
from selenium import webdriver
from bs4 import BeautifulSoup
from tqdm import tqdm
from os import walk
import numpy as np
import datetime
import time
import re

In [2]:
# Paths and constants shared by the cells below.
# Placeholder: replace with the local path of the chromedriver binary
# (downloadable from https://chromedriver.chromium.org/).
chromedriver_file_name = r'PATH TO chromedriver' # can be found here: https://chromedriver.chromium.org/
# Directory whose files are read line by line as patent links. The file name is
# appended to this string directly, so the trailing '/' is required.
patents_directory = 'patents_2014-2016/'
# Template for the per-input-file output: '_<name of the links file before its
# first dot>' is inserted in front of the extension when records are saved.
features_json_style_file = 'data/features.txt'
# Links whose processing raised an error are appended here, one per line.
broken_links_file_name = 'data/broken_links.txt'
# NOTE(review): not referenced by the cells visible here — presumably used by a
# later step; verify.
features_file_name = 'data/features.csv'
# Prepended to every line read from the link files to form an absolute URL.
yandex_patents_prefix = 'https://yandex.ru'

In [3]:
# Maps the bracketed codes read from "header-sup-item" elements to the field
# names used in the saved records. prettify() looks every parsed code up here,
# so a page showing a code that is missing from this table fails with KeyError.
# NOTE(review): the codes look like INID bibliographic codes — confirm.
code_to_name = {'(21)(22)':'request',
                '(22)':'request_date',
                '(24)':'start_date',
                '(32)':'convention_priority',
                '(45)':'publish_date',
                '(72)':'authors',
                '(73)':'patent_holder',
                '(56)':'links'}

In [4]:
# Start the Chrome WebDriver session that the crawl loop below reuses for every page.
# NOTE(review): passing the chromedriver path positionally may not be accepted by
# newer Selenium releases — confirm against the installed version.
driver = webdriver.Chrome(chromedriver_file_name)

In [5]:
def parse_bottom_links(soup, doc_id):
    """Collect the cell values of the document table anchored at ``doc_id``.

    Looks up the ``div`` whose id equals ``doc_id`` and returns ``None`` when
    the page has no such element. Otherwise the next ``div`` in the document is
    taken as the table, and for each of its rows with class
    "doctable_row doctable_row_click" the first child of every
    "doctable_cell" span is gathered.

    Returns:
        ``None`` or a list with one list of cell values per row.
    """
    anchor = soup.find("div", {"id": doc_id})
    if anchor is None:
        return None

    table = anchor.find_next('div')
    rows = table.find_all("div", {"class": "doctable_row doctable_row_click"})
    return [
        [span.contents[0] for span in row.find_all("span", {"class": "doctable_cell"})]
        for row in rows
    ]

In [6]:
def prettify(content):
    """Clean raw header values and re-key them by field name.

    Every string is stripped of surrounding whitespace; a string containing
    '<br/>' is additionally split on that marker into a list. When a value is
    itself a list, the same cleaning is applied to each element. Keys are
    translated through the module-level ``code_to_name`` table, so a code that
    is not in the table raises ``KeyError``.
    """
    def clean(text):
        if '<br/>' in text:
            return text.strip().split('<br/>')
        return text.strip()

    result = {}
    for code, raw in content.items():
        if type(raw) is list:
            cleaned = [clean(part) for part in raw]
        else:
            cleaned = clean(raw)
        result[code_to_name[code]] = cleaned

    return result

In [7]:
def value_for_2122(item):
    """Extract the value displayed under the combined '(21)(22)' header.

    If ``item`` holds exactly one "header-content-item" div, the value is the
    first child of the ``span`` found inside that div's first child (the
    hyperlink form). Otherwise the first child of the second such div is
    returned.
    """
    content_divs = item.find_all("div", {"class": "header-content-item"})
    if len(content_divs) != 1:
        return content_divs[1].contents[0]

    # Single div: the value is wrapped in a link, so dig down to its span.
    wrapper = content_divs[0].contents[0]
    return wrapper.find('span').contents[0]

In [8]:
def value_for_56(item):
    """List the citation entries found under the '(56)' header.

    Walks the children of the "right-part__citations" div. For each child, its
    first child is used as-is when that object has no ``contents`` attribute;
    otherwise the value is taken two levels further down (first child of the
    first child).
    """
    citations = item.find("div", {"class": "right-part__citations"})

    def entry_value(entry):
        first = entry.contents[0]
        if hasattr(first, 'contents'):
            return first.contents[0].contents[0]
        return first

    return [entry_value(entry) for entry in citations.contents]

In [9]:
def parse_horizontal_part(part):
    """Parse every "header-item-part" div inside ``part`` into named fields.

    The code of each item is the first child of its "header-sup-item" div.
    '(21)(22)' and '(56)' are delegated to their dedicated extractors; for any
    other code the children of the item's "header-content-item" element are
    concatenated as strings. The collected code -> value mapping is returned
    after being passed through ``prettify``.
    """
    content_tag = re.compile("(div|span)")
    raw_fields = {}

    for block in part.find_all("div", {"class": "header-item-part"}):
        code = block.find("div", {"class": "header-sup-item"}).contents[0]

        if code == '(21)(22)':
            raw_fields[code] = value_for_2122(block)
        elif code == '(56)':
            raw_fields[code] = value_for_56(block)
        else:
            holder = block.find(content_tag, {"class": "header-content-item"})
            raw_fields[code] = ''.join(str(piece) for piece in holder.contents)

    return prettify(raw_fields)

In [10]:
def parse_head_part(soup):
    """Read the title, document identifiers and MPK entries from the page head.

    Single-valued fields take the first child of the first matching element;
    ``mpk_2`` and ``mpk_3`` collect the first child of every matching element.
    """
    def exact_class(tag_name, class_list):
        # Predicate for tags whose class attribute is exactly `class_list`,
        # so elements that carry additional classes are not matched.
        return lambda tag: tag.name == tag_name and tag.get('class') == class_list

    plain_url_item = exact_class('div', ['doc-url-item'])
    plain_mpk_item = exact_class('div', ['header-mpk-item'])

    title_div = soup.find("div", {"class": "document-title document-title__desktop"})
    return {
        'title': title_div.contents[0].contents[0],
        'country': soup.find(plain_url_item).contents[0],
        'document_number': soup.find("div", {"class": "doc-url-item doc-url-number"}).contents[0],
        'document_type_code': soup.find("div", {"class": "doc-url-item center-content"}).contents[0],
        'mpk_1': soup.find("div", {"class": "header-mpk-item header-mpk-title-item"}).contents[0],
        'mpk_2': [div.contents[0] for div in soup.find_all(plain_mpk_item)],
        'mpk_3': [span.contents[0]
                  for span in soup.find_all("span", {"class": "header-mpk-sub-item"})],
    }

In [11]:
def parse_yandex_links(soup):
    """Gather the "cited by" and "citations" tables from the bottom of the page.

    Each entry is whatever ``parse_bottom_links`` returns for the matching
    table id, which is ``None`` when the page has no such table.
    """
    return {
        'yandex_cited_by': parse_bottom_links(soup, "doc-table-cited-by"),
        'yandex_citations': parse_bottom_links(soup, "doc-table-citation"),
    }

In [12]:
def create_json(page_html):
    """Build one flat record from the HTML of a patent page.

    The record merges, in this order, the head fields, the fields of the
    desktop left and right header columns, and the bottom link tables. On a
    key clash the later section wins.
    """
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed — consider pinning it once the intended parser is confirmed.
    soup = BeautifulSoup(page_html)
    sections = [parse_head_part(soup)]

    left_column = soup.find("div", {"class": "left-part left-part__desktop"})
    right_column = soup.find("div", {"class": "right-part right-part__desktop"})
    sections.append(parse_horizontal_part(left_column))
    sections.append(parse_horizontal_part(right_column))

    sections.append(parse_yandex_links(soup))

    record = {}
    for section in sections:
        record.update(section)
    return record

In [13]:
# Names of the link files sitting directly inside `patents_directory`
# (only the first os.walk entry is used, so sub-directories are ignored).
# The triple is indexed rather than unpacked into `_`, because assigning `_`
# shadows IPython's last-output variable for the rest of the session. The names
# are sorted because os.walk returns them in arbitrary order, which would make
# the processing order differ between runs.
# Raises StopIteration when the directory cannot be listed.
year_files = sorted(next(walk(patents_directory))[2])

In [14]:
def save_json_line(json, year_file):
    """Append one record to the output file that belongs to ``year_file``.

    The output path is derived from the module-level
    ``features_json_style_file`` by inserting ``_<year>`` in front of its
    extension, where ``<year>`` is the part of ``year_file`` before its first
    dot ('2016.txt' -> 'data/features_2016.txt').

    Args:
        json: the record; its ``str()`` form is written, followed by a newline.
        year_file: name of the links file the record was produced from.
    """
    # Local import: the file only imports `walk` from os at the top level.
    from os.path import splitext

    year = year_file.split('.')[0]
    # splitext detaches only the final extension, so dots elsewhere in the
    # template path (directory names, versioned file names) are preserved and a
    # template without an extension still yields a sensible path.
    root, ext = splitext(features_json_style_file)
    with open(root + '_' + year + ext, 'a') as f:
        f.write(str(json) + '\n')

In [15]:
def save_broken_link(link):
    """Append ``link`` on a line of its own to the broken-links file.

    The destination is the module-level ``broken_links_file_name``; it is
    opened in append mode, so earlier entries are kept.
    """
    with open(broken_links_file_name, 'a') as out:
        out.writelines((link, '\n'))

In [16]:
def get_links(file_name):
    """Read patent paths from ``file_name`` and return them as absolute URLs.

    Each line is stripped of surrounding whitespace and of every double-quote
    character, then prefixed with the module-level ``yandex_patents_prefix``.
    Lines that are empty after this cleaning are skipped; otherwise they would
    produce the bare prefix as a "link" that gets crawled and then logged as
    broken.

    Returns:
        A list of URL strings in file order.
    """
    links = []
    with open(file_name, 'r') as f:
        for line in f:
            path = line.strip().replace('"', '')
            if path:
                links.append(yandex_patents_prefix + path)
    return links

In [17]:
def parse_html(driver, patent_link):
    """Point ``driver`` at ``patent_link`` and pause before returning.

    After the navigation call the function sleeps for a random 1.2 to 1.7
    seconds (presumably to let the page settle and to space out requests —
    confirm). Nothing is returned; the caller reads the page from the driver.
    """
    driver.get(patent_link)
    pause_seconds = np.random.rand() * 0.5 + 1.2
    time.sleep(pause_seconds)

In [18]:
# Crawl every link of every links file and append one parsed record per patent.
# A link whose processing fails is written to the broken-links file and the
# crawl moves on to the next one.
for year_file in year_files:
    patent_links = get_links(patents_directory + year_file)
    print('Parsing for', year_file)
    for i, patent_link in enumerate(patent_links):
        try:
            # Navigates and waits; it returns nothing, so the page is read from the driver.
            parse_html(driver, patent_link)
            json = create_json(driver.page_source)
            json['link'] = patent_link
            save_json_line(json, year_file)
            print('+ Link parsed', patent_link, '|', f'{i + 1} from {len(patent_links)}')
        except Exception as error:
            # Catching Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop the run, and the error is shown so that a failure
            # can be diagnosed from the log.
            print('For link', patent_link, '|', repr(error))
            save_broken_link(patent_link)
        # Keep the notebook output bounded: wipe it every 1000 links.
        if (i + 1) % 1000 == 0:
            clear_output(wait=True)
            print('Parsing for', year_file)

Parsing for 2016.txt
+ Link parsed https://yandex.ru/patents/doc/RU2639398C1_20171221 | 32001 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU2638460C1_20171213 | 32002 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU175069U1_20171117 | 32003 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU2634815C1_20171103 | 32004 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU2634007C1_20171023 | 32005 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU174368U1_20171011 | 32006 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU173978U1_20170922 | 32007 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU173863U1_20170915 | 32008 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU173377U1_20170824 | 32009 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU172972U1_20170802 | 32010 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU172873U1_20170728 | 32011 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU172647U1_2017071

+ Link parsed https://yandex.ru/patents/doc/RU2647784C1_20180319 | 32196 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU2646955C1_20180312 | 32197 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU2646671C1_20180306 | 32198 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU2645878C1_20180228 | 32199 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU177331U1_20180215 | 32200 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU2641140C1_20180116 | 32201 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU2640051C1_20171226 | 32202 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU2639410C1_20171221 | 32203 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU175078U1_20171117 | 32204 from 32489
For link https://yandex.ru/patents/doc/RU2633901C1_20171019
+ Link parsed https://yandex.ru/patents/doc/RU2630482C1_20170911 | 32206 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU172552U1_20170712 | 32207 from 32489
+ Link parsed https

+ Link parsed https://yandex.ru/patents/doc/RU2638698C1_20171215 | 32392 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU2636903C1_20171128 | 32393 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU175253U1_20171128 | 32394 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU174570U1_20171020 | 32395 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU174436U1_20171012 | 32396 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU174272U1_20171009 | 32397 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU174283U1_20171009 | 32398 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU173703U1_20170906 | 32399 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU173622U1_20170904 | 32400 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU173020U1_20170807 | 32401 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU171436U1_20170531 | 32402 from 32489
+ Link parsed https://yandex.ru/patents/doc/RU171270U1_20170529 | 32403 from 32489
+ 