In [19]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re

In [20]:
def get_articles_by_person(name):
    driver = webdriver.Chrome('./chromedriver')

    try:
        raw_data = ""
        
        driver.get("https://suw.biblos.pk.edu.pl/")
        input_element = driver.find_element("name", "query").send_keys(name)
        submit_button = driver.find_element("xpath", "/html/body/form/table/tbody/tr[3]/td[2]/center/table/tbody/tr[1]/td[2]/input")
        driver.execute_script("arguments[0].click();", submit_button);
        
        # WebDriverWait(driver, 25).until(EC.presence_of_element_located(("xpath", "//*[starts-with(@id, 'resultsDiv']")))
        element = WebDriverWait(driver, 25).until(EC.presence_of_element_located(("xpath", '//*[@id="resultsDiv_16453"]/center/center/table[2]')))

        for row in driver.find_elements('xpath',"//*[starts-with(@id, 'resourceItemTable')]"):
            record = row.get_attribute('innerHTML')
            raw_data+=record
        
        return raw_data
        
    finally:
        driver.delete_all_cookies()
        driver.quit()

In [21]:
class Article():
    def __init__(self):
        self.title = None
        self.authors = None
        self.ext_authors = None
        self.typ = None
        self.series = None
        self.release_data = None
        self.points = None
        self.mnisw_list = None
        self.impact = None
    
    def __str__(self):
        return f"Title: {self.title}\nAuthors: {self.authors}\nExt_authors: {self.ext_authors}\nType: {self.typ}\nSeries: {self.series}\nRelease data: {self.release_data}\nPoints: {self.points}\nImpact: {self.impact}\nMNiWS list: {self.mnisw_list}"
    
    # Builder pattern
    def add_title(self,title):
        self.title=title
    def add_authors(self,authors):
        self.authors=authors
    def add_ext_authors(self,ext_authors):
        self.ext_authors=ext_authors
    def add_typ(self,typ):
        self.typ=typ
    def add_series(self,series):
        self.series=series
    def add_release_data(self,release_data):
        self.release_data=release_data
    def add_points(self,points):
        self.points=points
    def add_mnisw_list(self,mnisw_list):
        self.mnisw_list=mnisw_list
    def add_impact(self,impact):
        self.impact=impact

In [22]:
def prettify(value):
    return ' '.join(value.split())

def parse_html_data(articles_html):
    articles_soup = BeautifulSoup(articles_html,'html.parser')
    
    articles = []

    documents = articles_soup.find_all('html')

    for document in documents:
        soup_title = document.find_all('b')[0] 
        soup_authors = document.find_all('a', {'title' : 'Profil w BPP'})
        soup_ext_authors = document.find_all('a', {'title' : 'Pokaż prace tego autora'})
        
        authors = [author.text for author in soup_authors]
        ext_authors = [author.text for author in soup_ext_authors]
        title = prettify(soup_title.text)
            
        try:
            typ = document.find(string=re.compile("Typ:")).find_next('b').text
        except:
            typ = ""
        try:
            series = document.find(string=re.compile("Seria/Czasopismo:")).find_next('b').text
        except:
            series = ""
        try:
            release_date = document.find(string=re.compile("Data wydania:")).find_next('b').text
        except:
            release_date = ""
        try:
            impact = document.find(string=re.compile("Impact Factor:")).find_next('b').text
        except:
            impact = ""
        try:
            mnisw_list = document.find(string=re.compile("Lista MNiSW/MEiN:")).find_next('b').text

        except:
            mnisw_list = ""
        try:
            points = document.find(string=re.compile("Punktacja czasopisma:")).find_next('b').text
        except:
            points = 0
            
        # assign special property of authors
       

        
        art = Article()
        art.add_authors(authors)
        art.add_ext_authors(ext_authors)
        art.add_title(title)
        art.add_typ(typ)
        art.add_points(points)
        art.add_points(series)
        art.add_release_data(release_date)
        art.add_impact(impact)
        art.add_mnisw_list(mnisw_list)
        
        articles.append(art)
                    
    return articles

In [23]:
def extract_data(raw_data):
    articles = []
    
    documents = BeautifulSoup(raw_data).find_all('tbody')

    for document in documents:
        soup_title = document.find_all('b')[0]    
        soup_authors = document.find_all('a', {'title' : 'Profil w BPP'})
        soup_ext_authors = document.find_all('a', {'title' : 'Pokaż prace tego autora'})
        
        title = prettify(soup_title.text)
        
        authors = [author.text for author in soup_authors]
        ext_authors = [author.text for author in soup_ext_authors]
        if not authors:
            continue
            
        try:
            typ = document.find(string=re.compile("Typ:")).find_next('b').text
        except:
            typ = ""
        try:
            series = document.find(string=re.compile("Seria/Czasopismo:")).find_next('b').text
        except:
            series = ""
        try:
            release_date = document.find(string=re.compile("Data wydania:")).find_next('b').text
        except:
            release_date = ""
        try:
            impact = document.find(string=re.compile("Impact Factor:")).find_next('b').text
        except:
            impact = ""
        try:
            mnisw_list = document.find(string=re.compile("Lista MNiSW/MEiN:")).find_next('b').text
        except:
            mnisw_list = ""
        try:
            points = document.find(string=re.compile("Punktacja czasopisma:")).find_next('b').text
        except:
            points = 0
            
        article = Article()
        article.add_authors(authors)
        article.add_ext_authors(ext_authors)
        article.add_title(title)
        article.add_series(series)
        article.add_typ(typ)
        article.add_points(points)
        article.add_mnisw_list(mnisw_list)
        article.add_impact(impact)
        article.add_release_data(release_date)
        
        articles.append(article)
                    
    return articles

In [24]:
raw_articles = get_articles_by_person("Wojnar")
articles=extract_data(raw_articles)

  driver = webdriver.Chrome('./chromedriver')


In [25]:

for article in articles:
    print(article)

Title: Wpływ parametrów elektroosadzania na zmiany nanostruktury warstw Al2O3/WS2 o przeznaczeniu tribologicznym
Authors: ['Wojnar, Leszek']
Ext_authors: ['Korzekwa, Joanna', 'Skoneczny, Władysław']
Type:   artykuł w czasopiśmie 
Series:  Technical Transactions. Mechanics = Czasopismo Techniczne. Mechanika 
Release data:   2011 
Points:   5 
Impact: 
MNiWS list:   B 
Title: Ocena powtarzalności wyników ilościowej oceny struktury
Authors: ['Wojnar, L.', 'Gądek, A.']
Ext_authors: []
Type:   artykuł w czasopiśmie 
Series:  Archiwum Odlewnictwa 
Release data:   2006 
Points: 0
Impact: 
MNiWS list: 
Title: Zastosowanie metod obliczeniowych w ocenie spawalności stali - wykorzystanie techniki komputerowej
Authors: ['Mikuła, Janusz', 'Wojnar, Leszek']
Ext_authors: []
Type:   artykuł w czasopiśmie 
Series:  Przegląd Spawalnictwa = Welding Technology Review 
Release data:   1993 
Points: 0
Impact: 
MNiWS list: 
Title: Machine learning versus human-developed algorithms in image analysis of micros