In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
referer = 'https://www.resilience.org/latest-articles/'
headers = {'User-Agent': user_agent, 'referer':referer}

# Applied Ecology and Environmental Sciences

In [3]:

archive = 'http://www.sciepub.com/journal/AEES/archive'

In [4]:
response = requests.get(archive, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
issues = [e['href'] for e in soup.find_all('a', {'class':'ctl00_cph_journal_otherinfo_list_issues_0'}) 
          if 'AEES' in e['href']]

In [178]:
base_url = 'http://www.sciepub.com'

links = []
dates = []

for issue in issues:
    issue_url = base_url + issue[5:]
    response = requests.get(issue_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    new_links = [e['href'] for e in soup.find_all('a', {'class':'colortj ml20'}) if 'html' in e['id']]
    new_links.reverse()
    links += new_links
    
    new_dates = [e.find('span').text for e in soup.find_all('div', {'class':'aritlce-pubinfo mt5'})]
    new_dates.reverse()
    dates += new_dates

In [82]:
assert len(links) == len(dates)

In [203]:
new_format = links[:269]
old_format = links[269:]

In [190]:
def get_attributes_newformat(link):
    
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    sections = soup.find_all('div', {'class':'section'})
    content_indicies = [0, len(sections)]

    abstract = ''
    if sections[0].find('h3').text == 'Abstract':
        abstract = ' '.join([p.text for p in sections[0].find_all('p')])
        content_indicies[0] = 1

    for i, section in enumerate(sections):
        if 'Conclusion' in section.find('h3').text:
            content_indicies[1] = i + 1


    text = ' '.join([p.text for s in sections[content_indicies[0]:content_indicies[1]] 
                     for p in s.find_all('p')])
    
    title = soup.find('h3', {'class':'fw700 col333'}).text

    return title, abstract, text

In [191]:
def get_attributes_oldformat(link):
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    title = soup.find('div', {'class':'articleTitleArea'}).find('h2').text
    
    abstract = ' '.join([p.text for p in soup.find('div', {'class':'abstract'}).find_all('p')[1:]])
    
    paragraphs = [p.text for p in soup.find('div', {'class':'articleCon'}).find_all('p')]

    p_to_exclude = 0
    for h3 in soup.find('div', {'class':'articleCon'}).find_all('h3'):
        if h3.text == 'Acknowledgements':

            current = h3
            more_to_go = True

            while more_to_go:
                nextNode = current.nextSibling
                try:
                    tag_name = nextNode.name
                except AttributeError:
                    tag_name = ""
                    more_to_go = False

                if tag_name == "p":
                    p_to_exclude += 1
                current = nextNode
    
    if p_to_exclude > 0:
        paragraphs = paragraphs[:-p_to_exclude]
    text = ' '.join(paragraphs)
    
    return title, abstract, text

In [207]:
titles = []
abstracts = []
texts = []

for link in new_format:
    title, abstract, text = get_attributes_newformat(link)
    titles.append(title)
    abstracts.append(abstract)
    texts.append(text)
    
for link in old_format:
    title, abstract, text = get_attributes_oldformat(link)
    titles.append(title)
    abstracts.append(abstract)
    texts.append(text)

df = pd.DataFrame({'link':links,
                   'title':titles,
                   'date': dates,
                   'abstract':abstracts,
                   'text': texts})

In [208]:
df

Unnamed: 0,link,title,date,abstract,text
0,http://pubs.sciepub.com/aees/10/2/4/index.html,A Conceptual Framework on Bhutanâs Environme...,"Pub. Date: February 23, 2022",The small states enjoy equal status with other...,The problem and perspective of security differ...
1,http://pubs.sciepub.com/aees/10/2/3/index.html,Estimation of Indoor Radon and Thoron Concentr...,"Pub. Date: February 11, 2022",Measurement of indoor radon and thoron levels ...,In the past few yearsâ scientists have indic...
2,http://pubs.sciepub.com/aees/10/2/2/index.html,Diversity of Anurans in Temporary Breeding Gro...,"Pub. Date: February 07, 2022","To record anuran diversity, a study was conduc...","In amphibians, breeding grounds are of great i..."
3,http://pubs.sciepub.com/aees/10/2/1/index.html,Screening of Lipid Production in Marine Cyanob...,"Pub. Date: February 07, 2022",Microalgae and Cyanobacteria are potentially d...,Biofuel production coupled with carbon dioxide...
4,http://pubs.sciepub.com/aees/10/1/6/index.html,Assessment of the PM2.5 and PM10 Particulate C...,"Pub. Date: January 25, 2022","This work is a pilot study in Aurangabad, Maha...",The present study aims to find out the concent...
...,...,...,...,...,...
355,http://pubs.sciepub.com/aees/1/2/2/index.html,\r\n Variability in Cynara ...,"Pub. Date: May 05, 2013",\nCynara cardunculus.L exhibits an important a...,Cynara cardunculus. L is a robust thistle wid...
356,http://pubs.sciepub.com/aees/1/2/1/index.html,\r\n Using Geographic Infor...,"Pub. Date: April 15, 2013",\r\n Ne...,There are now so many methods and techniques ...
357,http://pubs.sciepub.com/aees/1/1/3/index.html,\r\n Haematological Respons...,"Pub. Date: March 16, 2013",\r\n A ...,Rapid growth and expansion of industries in r...
358,http://pubs.sciepub.com/aees/1/1/2/index.html,\r\n Sequestered Organic Ca...,"Pub. Date: March 05, 2013",\r\n Ve...,Global warming and emission of carbon are of ...


In [218]:
def clean(text):
    rv = text.replace('\r\n','')
    rv = rv.strip()
    return rv

df['title'] = df.title.apply(clean)
df['abstract'] = df.abstract.apply(clean)

In [219]:
df

Unnamed: 0,link,title,date,abstract,text
0,http://pubs.sciepub.com/aees/10/2/4/index.html,A Conceptual Framework on Bhutanâs Environme...,"Pub. Date: February 23, 2022",The small states enjoy equal status with other...,The problem and perspective of security differ...
1,http://pubs.sciepub.com/aees/10/2/3/index.html,Estimation of Indoor Radon and Thoron Concentr...,"Pub. Date: February 11, 2022",Measurement of indoor radon and thoron levels ...,In the past few yearsâ scientists have indic...
2,http://pubs.sciepub.com/aees/10/2/2/index.html,Diversity of Anurans in Temporary Breeding Gro...,"Pub. Date: February 07, 2022","To record anuran diversity, a study was conduc...","In amphibians, breeding grounds are of great i..."
3,http://pubs.sciepub.com/aees/10/2/1/index.html,Screening of Lipid Production in Marine Cyanob...,"Pub. Date: February 07, 2022",Microalgae and Cyanobacteria are potentially d...,Biofuel production coupled with carbon dioxide...
4,http://pubs.sciepub.com/aees/10/1/6/index.html,Assessment of the PM2.5 and PM10 Particulate C...,"Pub. Date: January 25, 2022","This work is a pilot study in Aurangabad, Maha...",The present study aims to find out the concent...
...,...,...,...,...,...
355,http://pubs.sciepub.com/aees/1/2/2/index.html,Variability in Cynara cardunculus L. Tunisian ...,"Pub. Date: May 05, 2013",Cynara cardunculus.L exhibits an important ada...,Cynara cardunculus. L is a robust thistle wid...
356,http://pubs.sciepub.com/aees/1/2/1/index.html,Using Geographic Information System and Analyt...,"Pub. Date: April 15, 2013",Nepal frequently suffers from various types of...,There are now so many methods and techniques ...
357,http://pubs.sciepub.com/aees/1/1/3/index.html,Haematological Responses of Tilapia guineensis...,"Pub. Date: March 16, 2013",A total of 180 Tilapia guineensis (mean length...,Rapid growth and expansion of industries in r...
358,http://pubs.sciepub.com/aees/1/1/2/index.html,Sequestered Organic Carbon Status in the Soils...,"Pub. Date: March 05, 2013",Vegetative growth serves as an important means...,Global warming and emission of carbon are of ...


In [None]:
# Fix dates

In [220]:
df.to_pickle('../data/aees.pkl')

# Environmental Sociology

In [239]:
pages = []
for vol in range(1, 9):
    for iss in range(1, 5):
        pages.append('https://www.tandfonline.com/toc/rens20/{}/{}?nav=tocList'.format(vol, iss))

pages = pages[:-3]

In [264]:
links = []
dates = []

for page in pages:
    response = requests.get(page, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    links += [e['href'] for e in soup.find_all('a', {'class':'ref nowrap full'})]
    dates += [e.text for e in soup.find_all('span', {'class':'date'})]

In [265]:
assert len(links) == len(dates)
print(len(links))

258


In [315]:
def get_attributes_aees(link):
    
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    title = soup.find('span', {'class':'NLM_article-title hlFld-title'}).text

    text_find = soup.find('div', {'class':'hlFld-Fulltext'})
    if str(text_find) == 'None':
        text = ''
    else:
        text = ' '.join([p.text for p in soup.find('div', {'class':'hlFld-Fulltext'}).find_all('p')])

    abstract_find = soup.find('div', {'class':'abstractSection abstractInFull'})
    if str(abstract_find) == 'None':
        abstract = ''
    else:
        abstract = abstract_find.text
    
    return title, abstract, text

In [316]:
titles = []
abstracts = []
texts = []

for link in links:
    title, abstract, text = get_attributes_aees('https://www.tandfonline.com' + link)
    titles.append(title)
    abstracts.append(abstract)
    texts.append(text)

df_aees = pd.DataFrame({'link':links,
                        'title':titles,
                        'date': dates,
                        'abstract':abstracts,
                        'text': texts})

In [None]:
# Need date and text cleaning

In [318]:
df_aees.to_pickle('../data/aees.pkl')