In [69]:
import pandas as pd
import re
import requests
import numpy as np
from bs4 import BeautifulSoup, NavigableString, Tag
from pathlib import Path
from time import sleep

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize

# Delay, in seconds, applied between requests because of the Crawl-delay
# directive mentioned in https://pureportal.coventry.ac.uk/robots.txt.
# NOTE(review): with a value of 0 every sleep(crawl_delay) call returns
# immediately, so no delay is actually applied -- confirm against robots.txt.
crawl_delay = 0

In [70]:
def get_page_count(base_url=None, timeout=30):
    """Return the number of result pages of the publications listing.

    The listing page is downloaded and the text of the last
    ``<a class="step">`` link inside ``<nav class="pages">`` is parsed as an
    integer.

    Args:
        base_url: Listing URL to inspect. Defaults to the School of Economics,
            Finance and Accounting publications listing.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook.

    Returns:
        int: The number shown on the last pagination link, or 1 when the page
        has no pagination links (assumed to mean a single page of results).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the last pagination link is not a number.
    """
    if base_url is None:
        base_url = "https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/"

    response = requests.get(base_url, timeout=timeout)
    # Fail loudly instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    nav = soup.find("nav", class_="pages")
    if nav is None:
        # No pagination block at all.
        return 1

    steps = nav.find_all("a", class_="step")
    if not steps:
        return 1

    return int(steps[-1].get_text(strip=True))

In [71]:
# Find out how many listing pages exist and echo the count.
page_count = get_page_count()
print(page_count)

# One listing URL per page, addressed through a zero-based ?page= parameter.
pub_urls = [
    f"https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/?page={i}"
    for i in range(page_count)
]

pub_urls

13


['https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/?page=0',
 'https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/?page=1',
 'https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/?page=2',
 'https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/?page=3',
 'https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/?page=4',
 'https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/?page=5',
 'https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/?page=6',
 'https://pureportal.coventry.ac.uk/en/organisations/school-of-economics-finance-and-accounting/publications/?page=7',
 'https://pureportal.coventry.ac.uk/en/organisat

In [72]:
# Crawl every listing page and collect one record per publication:
#   pub_link  -- href of the link inside the <h3 class="title"> heading
#   pub_title -- heading text with punctuation runs before a space removed
pub_list = []

for url in pub_urls:
    # A timeout keeps one stalled connection from hanging the whole crawl.
    page = requests.get(url, timeout=30)
    # Fail loudly rather than parsing an error page as an empty listing.
    page.raise_for_status()
    sleep(crawl_delay)
    soup = BeautifulSoup(page.content, "html.parser")
    for h3 in soup.find_all("h3", class_="title"):
        a_fetch = h3.find("a")
        title_span = h3.find("span")
        # Skip headings that carry no link or no title text instead of crashing.
        if a_fetch is None or title_span is None or not a_fetch.get("href"):
            continue
        # e.g. "theories: do they" -> "theories do they"
        title_clean = re.sub('[^A-Za-z0-9]+ ', ' ', title_span.text)
        pub_list.append({"pub_link": a_fetch["href"], "pub_title": title_clean})

In [73]:
# Show the scraped records (link and cleaned title of each publication).
pub_list

[{'pub_link': 'https://pureportal.coventry.ac.uk/en/publications/a-bibliometric-review-of-the-waqf-literature',
  'pub_title': 'A bibliometric review of the Waqf literature'},
 {'pub_link': 'https://pureportal.coventry.ac.uk/en/publications/a-note-on-covid-19-instigated-maximum-drawdown-in-islamic-markets',
  'pub_title': 'A note on COVID-19 instigated maximum drawdown in Islamic markets versus conventional counterparts'},
 {'pub_link': 'https://pureportal.coventry.ac.uk/en/publications/bank-stock-valuation-theories-do-they-explain-prices-based-on-the',
  'pub_title': 'Bank stock valuation theories do they explain prices based on theories?'},
 {'pub_link': 'https://pureportal.coventry.ac.uk/en/publications/ceo-duality-and-firm-performance-a-systematic-review-and-research',
  'pub_title': 'CEO Duality and Firm Performance A Systematic Review and Research Agenda'},
 {'pub_link': 'https://pureportal.coventry.ac.uk/en/publications/ceo-financial-experience-and-firms-earnings-management-in-m

In [74]:
# Number of publication records collected from the listing pages.
len(pub_list)

638

In [75]:
def get_pub_date(soup):
    """Return the publication date text found on a publication page.

    Every ``<tr class="status">`` row is inspected for a
    ``<span class="date">``. When several rows carry a date, the last one wins.

    Args:
        soup: Parsed publication page (a BeautifulSoup document or any object
            offering a compatible ``find_all``).

    Returns:
        str: The date text as it appears on the page (e.g. ``"Jun 2022"``),
        or an empty string when the page has no status row with a date.
    """
    pub_date = ""
    for row in soup.find_all("tr", class_="status"):
        date_span = row.find("span", class_="date")
        # Rows without a date span are ignored rather than raising.
        if date_span is not None:
            pub_date = date_span.text
    return pub_date

In [76]:
def get_auth(soup):
    """Collect the linked authors listed on a publication page.

    Only direct children of ``<p class="relations persons">`` that carry an
    ``href`` attribute are collected. Plain-text children and tags without a
    link are skipped.

    Args:
        soup: Parsed publication page (a BeautifulSoup document or any object
            offering a compatible ``find_all``).

    Returns:
        tuple[list[str], list[str]]: ``(auth_name, auth_link)``, two parallel
        lists holding each author's display name and profile URL. Both are
        empty when the page lists no linked author.
    """
    auth_name = []
    auth_link = []

    for persons in soup.find_all("p", class_="relations persons"):
        # recursive=False restricts the search to direct children only.
        # href=True skips child tags that have no link, which would otherwise
        # raise KeyError.
        for anchor in persons.find_all(href=True, recursive=False):
            name_span = anchor.find("span")
            # Fall back to the link's own text when there is no inner <span>.
            if name_span is not None:
                name = name_span.text
            else:
                name = anchor.get_text(strip=True)
            auth_name.append(name)
            auth_link.append(anchor["href"])

    return (auth_name, auth_link)

In [49]:
# Ad-hoc spot check of get_auth() against one publication page.
# Alternative pages that can be tried instead are kept commented out:
# url = "https://pureportal.coventry.ac.uk/en/publications/future-of-education-and-research-a-preliminary-thematic-analysis"
# url = "https://pureportal.coventry.ac.uk/en/publications/ceo-financial-experience-and-firms-earnings-management-in-mampa-t"
url = "https://pureportal.coventry.ac.uk/en/publications/fintech-financial-inclusion-and-income-inequality-a-quantile-regr"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
at_name,link = get_auth(soup)
# The recorded run printed "[] []": get_auth() returned no names and no links
# for this page.
print(at_name, link)

[] []


In [77]:
# Visit every publication page and add its date and linked authors to the
# record. The records are updated in place, so pub_list keeps its length.
for pub in pub_list:
    # A timeout keeps one stalled connection from hanging the whole crawl.
    page = requests.get(pub["pub_link"], timeout=30)
    # Fail loudly rather than scraping an error page as a publication.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    at_name, link = get_auth(soup)
    pub["pub_date"] = get_pub_date(soup)
    pub["auth_name"] = at_name
    pub["auth_link"] = link
    # Progress trace: one line per publication visited.
    print(at_name)
    sleep(crawl_delay)

['Rashedul Hasan']
['Rashedul Hasan']
['Alireza Zarei']
['Mei Yu']
['Thai Nguyen', 'Thang Nguyen', 'Panagiotis Andrikopoulos']
['Ruth Owusu-Mensah']
['Masud Ibrahim']
['John Ayuk Enombu']
['Ahmad Abras']
['Hafij Ullah']
['Hany Ahmed', 'Yilmaz Guney']
['Abay Mulatu', 'Boying Xu']
['Ibrahim Elmghaamez']
[]
['Ibrahim Elmghaamez']
['Uchenna Tony-Okeke']
[]
['Isaiah Oino']
[]
['Judith Kabajulizi', 'Francis Awuku Darko']
['Loai Alsaid']
['Rashedul Hasan']
['Jaliyyah Bello', 'Mohammad Khaleq Newaz']
[]
['Simon Huston']
['Mei Yu']
['Ahmed Saleh']
['Alireza Zarei', 'Mehdi Hosseini']
[]
['Simon Huston']
['Mehul Chhatbar']
['Tariq Al Montaser']
['Alaa Alhaj Ismail']
['Styliani (Elina) Panetsidou', 'Angelos Synapis']
['Loai Alsaid']
['Rashedul Hasan']
[]
[]
['Simon Huston']
['Simon Huston']
['Ibrahim Elmghaamez']
['Rashedul Hasan']
['Mehtap Hisarciklilar']
['Ahmad Abras']
[]
['Dimitris Serenis']
['Eliana Lauretta', 'Daniel Santamaria']
['Sarkar Kabir']
['Styliani Panetsidou']
['Sandar Win']
['Meht

In [78]:
# Record count after the detail pages were visited.
len(pub_list)

638

In [79]:
def remove_empty_auth(pub_list):
    """Drop, in place, every record whose ``auth_name`` value is empty.

    Args:
        pub_list: List of publication dicts, each holding an ``auth_name`` key.

    Returns:
        The same list object that was passed in, now without the records
        whose ``auth_name`` is falsy.
    """
    # Slice assignment rewrites the caller's list rather than rebinding a
    # local name, so the mutation is visible outside the function.
    pub_list[:] = [pub for pub in pub_list if pub["auth_name"]]
    return pub_list

# Keep only the records whose auth_name list is non-empty.
pub_list = remove_empty_auth(pub_list)

In [81]:
# Records remaining after dropping those with an empty author list.
len(pub_list)

372

In [82]:
# Show the enriched and filtered records.
pub_list

[{'pub_link': 'https://pureportal.coventry.ac.uk/en/publications/a-bibliometric-review-of-the-waqf-literature',
  'pub_title': 'A bibliometric review of the Waqf literature',
  'pub_date': 'Jun 2022',
  'auth_name': ['Rashedul Hasan'],
  'auth_link': ['https://pureportal.coventry.ac.uk/en/persons/rashedul-hasan']},
 {'pub_link': 'https://pureportal.coventry.ac.uk/en/publications/a-note-on-covid-19-instigated-maximum-drawdown-in-islamic-markets',
  'pub_title': 'A note on COVID-19 instigated maximum drawdown in Islamic markets versus conventional counterparts',
  'pub_date': 'May 2022',
  'auth_name': ['Rashedul Hasan'],
  'auth_link': ['https://pureportal.coventry.ac.uk/en/persons/rashedul-hasan']},
 {'pub_link': 'https://pureportal.coventry.ac.uk/en/publications/bank-stock-valuation-theories-do-they-explain-prices-based-on-the',
  'pub_title': 'Bank stock valuation theories do they explain prices based on theories?',
  'pub_date': '1 Mar 2022',
  'auth_name': ['Alireza Zarei'],
  'aut

In [83]:
# Tabulate the records: one row per publication, one column per record key.
df = pd.DataFrame(pub_list)

In [84]:
# Copy the row index into an explicit "id" column.
df['id'] = df.index

In [85]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,pub_link,pub_title,pub_date,auth_name,auth_link,id
0,https://pureportal.coventry.ac.uk/en/publicati...,A bibliometric review of the Waqf literature,Jun 2022,[Rashedul Hasan],[https://pureportal.coventry.ac.uk/en/persons/...,0
1,https://pureportal.coventry.ac.uk/en/publicati...,A note on COVID-19 instigated maximum drawdown...,May 2022,[Rashedul Hasan],[https://pureportal.coventry.ac.uk/en/persons/...,1
2,https://pureportal.coventry.ac.uk/en/publicati...,Bank stock valuation theories do they explain ...,1 Mar 2022,[Alireza Zarei],[https://pureportal.coventry.ac.uk/en/persons/...,2
3,https://pureportal.coventry.ac.uk/en/publicati...,CEO Duality and Firm Performance A Systematic ...,25 May 2022,[Mei Yu],[https://pureportal.coventry.ac.uk/en/persons/...,3
4,https://pureportal.coventry.ac.uk/en/publicati...,CEO Financial Experience and Firms Earnings Ma...,7 Mar 2022,"[Thai Nguyen, Thang Nguyen, Panagiotis Andriko...",[https://pureportal.coventry.ac.uk/en/persons/...,4


In [86]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pub_link   372 non-null    object
 1   pub_title  372 non-null    object
 2   pub_date   372 non-null    object
 3   auth_name  372 non-null    object
 4   auth_link  372 non-null    object
 5   id         372 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 17.6+ KB


In [87]:
# Inspect the authors of the row whose id is 23 (a multi-author record).
df.loc[df['id'] == 23, "auth_name"]

23    [Alireza Zarei, Mehdi Hosseini]
Name: auth_name, dtype: object

In [88]:
def rem_punc_low_case(text):
    """Strip punctuation from ``text`` and lower-case what is left.

    Every character that is neither a regex word character (letters, digits,
    underscore) nor whitespace is removed.

    Args:
        text: String to normalise.

    Returns:
        str: The lower-cased string without punctuation.
    """
    return re.sub(r'[^\w\s]', '', text).lower()

def clean_title(title_list):
    """Join text fragments into one lower-case, letters-only string.

    The fragments are joined with single spaces, every character outside
    ``a-z`` / ``A-Z`` is replaced by a space, and the result is lower-cased.

    Args:
        title_list: Iterable of strings, or a single string. A single string
            is treated as one fragment, so re-running the cell on an
            already-flattened column does not space out every character.

    Returns:
        str: The normalised text. Replaced characters leave spaces behind, so
        runs of several spaces can occur.
    """
    if isinstance(title_list, str):
        # ' '.join("abc") would yield "a b c"; wrap the string instead.
        title_list = [title_list]
    title = ' '.join(title_list)
    title = re.sub('[^a-zA-Z]', ' ', title)
    return title.lower()

def remove_brac(x):
    """Flatten a list of strings into one comma-separated string.

    Args:
        x: Iterable of strings, or a string that is already flattened.

    Returns:
        str: The items joined with ``", "``. A string input is returned
        unchanged, so applying the function twice (for example by re-running
        the cell) does not put a comma between every character.
    """
    if isinstance(x, str):
        return x
    return ', '.join(x)

In [89]:
# Author text for searching: names joined, reduced to letters, lower-cased.
df['auth_name_extract'] = df['auth_name'].apply(clean_title)

In [90]:
# Replace each author list by a comma-separated display string.
df['auth_name'] = df['auth_name'].apply(remove_brac)

In [91]:
# Normalise the dates: punctuation removed, lower-cased.
df['pub_date'] = df['pub_date'].apply(rem_punc_low_case)

In [92]:
# Check the normalised author text of the row whose id is 4.
df.loc[df['id'] == 4, "auth_name_extract"]

4    thai nguyen thang nguyen panagiotis andrikopoulos
Name: auth_name_extract, dtype: object

In [93]:
# Lower-cased titles, used to build the combined "text" column.
title_lower_case = df['pub_title'].str.lower()

In [94]:
# Combined text per publication: lower-cased title, normalised author names
# and normalised date, separated by single spaces.
df["text"] = title_lower_case + " " + df["auth_name_extract"] + " " + df["pub_date"]

In [95]:
# File name of the CSV export.
org_pub = "pub.csv"

In [96]:
# Write the table to CSV without the DataFrame index column.
# NOTE(review): auth_link still holds Python lists at this point, so it is
# written as their string form -- confirm the consumer of pub.csv expects that.
df.to_csv(org_pub, index=False)

In [97]:
# Preview the final table, including the derived columns.
df.head()

Unnamed: 0,pub_link,pub_title,pub_date,auth_name,auth_link,id,auth_name_extract,text
0,https://pureportal.coventry.ac.uk/en/publicati...,A bibliometric review of the Waqf literature,jun 2022,Rashedul Hasan,[https://pureportal.coventry.ac.uk/en/persons/...,0,rashedul hasan,a bibliometric review of the waqf literature r...
1,https://pureportal.coventry.ac.uk/en/publicati...,A note on COVID-19 instigated maximum drawdown...,may 2022,Rashedul Hasan,[https://pureportal.coventry.ac.uk/en/persons/...,1,rashedul hasan,a note on covid-19 instigated maximum drawdown...
2,https://pureportal.coventry.ac.uk/en/publicati...,Bank stock valuation theories do they explain ...,1 mar 2022,Alireza Zarei,[https://pureportal.coventry.ac.uk/en/persons/...,2,alireza zarei,bank stock valuation theories do they explain ...
3,https://pureportal.coventry.ac.uk/en/publicati...,CEO Duality and Firm Performance A Systematic ...,25 may 2022,Mei Yu,[https://pureportal.coventry.ac.uk/en/persons/...,3,mei yu,ceo duality and firm performance a systematic ...
4,https://pureportal.coventry.ac.uk/en/publicati...,CEO Financial Experience and Firms Earnings Ma...,7 mar 2022,"Thai Nguyen, Thang Nguyen, Panagiotis Andrikop...",[https://pureportal.coventry.ac.uk/en/persons/...,4,thai nguyen thang nguyen panagiotis andrikopoulos,ceo financial experience and firms earnings ma...
