In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Browser-like headers passed to every requests.get call in this notebook.
user_agent = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/91.0.4472.114 Safari/537.36'
)
referer = 'https://www.resilience.org/latest-articles/'
headers = {
    'User-Agent': user_agent,
    'referer': referer,
}

# Applied Ecology and Environmental Sciences

In [3]:

# URL of the journal's archive page; the issue links are extracted from it below.
archive = 'http://www.sciepub.com/journal/AEES/archive'

In [4]:
# Download the archive page and parse it with the built-in HTML parser.
response = requests.get(url=archive, headers=headers)
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [5]:
# Keep the href of every matching anchor whose target mentions the 'AEES' journal code.
issue_anchors = soup.find_all('a', {'class': 'ctl00_cph_journal_otherinfo_list_issues_0'})
issues = [anchor['href'] for anchor in issue_anchors if 'AEES' in anchor['href']]

In [7]:
base_url = 'http://www.sciepub.com'

links = []
dates = []

# Visit every issue page and collect article links plus their publication-date strings.
for issue in issues:
    # The first five characters of the stored href are dropped before it is
    # appended to the site root.
    response = requests.get(base_url + issue[5:], headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Article anchors: keep those whose id attribute contains 'html'.
    article_anchors = soup.find_all('a', {'class': 'colortj ml20'})
    issue_links = [anchor['href'] for anchor in article_anchors if 'html' in anchor['id']]
    links.extend(issue_links[::-1])

    # Publication info blocks: the date is the text of the first <span> inside each.
    pubinfo_blocks = soup.find_all('div', {'class': 'aritlce-pubinfo mt5'})
    issue_dates = [block.find('span').text for block in pubinfo_blocks]
    dates.extend(issue_dates[::-1])

In [8]:
# Sanity check: links and dates are later combined column-wise, so they must be the same length.
assert len(links) == len(dates)

In [9]:
# Split the link list at a fixed position into two slices.
# NOTE(review): 269 is a hard-coded index, presumably where the page layout changes — confirm.
format_boundary = 269
new_format, old_format = links[:format_boundary], links[format_boundary:]

In [10]:
def get_attributes_newformat(link):
    """Fetch an article page in the newer layout and extract its main fields.

    The page is expected to contain ``div.section`` blocks, each headed by an
    ``<h3>``. If the first block is headed exactly ``Abstract`` its paragraphs
    become the abstract and the body starts at the second block. The body ends
    after the last block whose heading contains ``Conclusion`` (or at the final
    block when there is none).

    Parameters
    ----------
    link : str
        URL of the article page.

    Returns
    -------
    tuple
        ``(title, abstract, text)`` as strings; ``abstract`` is ``''`` when the
        first section is not an abstract.

    Raises
    ------
    IndexError
        If the page has no ``div.section`` blocks.
    AttributeError
        If a section has no ``<h3>`` or the title heading is missing.
    """
    page = requests.get(link, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')

    sections = soup.find_all('div', {'class': 'section'})

    # Decide where the body starts, pulling the abstract out if it comes first.
    first_heading = sections[0].find('h3').text
    if first_heading == 'Abstract':
        abstract = ' '.join(p.text for p in sections[0].find_all('p'))
        body_start = 1
    else:
        abstract = ''
        body_start = 0

    # Decide where the body ends; a later 'Conclusion' heading overrides an earlier one.
    body_stop = len(sections)
    for position, section in enumerate(sections):
        if 'Conclusion' in section.find('h3').text:
            body_stop = position + 1

    body_paragraphs = []
    for section in sections[body_start:body_stop]:
        for paragraph in section.find_all('p'):
            body_paragraphs.append(paragraph.text)
    text = ' '.join(body_paragraphs)

    title = soup.find('h3', {'class': 'fw700 col333'}).text

    return title, abstract, text

In [11]:
def get_attributes_oldformat(link):
    """Fetch an article page in the older layout and extract its main fields.

    Parameters
    ----------
    link : str
        URL of the article page; fetched with the notebook-level ``headers``.

    Returns
    -------
    tuple
        ``(title, abstract, text)`` as strings. ``text`` is every ``<p>`` found
        inside the ``articleCon`` div joined with single spaces, minus the
        trailing paragraphs counted after an ``Acknowledgements`` heading.

    Raises
    ------
    AttributeError
        If the title area, the abstract div or the ``articleCon`` div is
        missing from the page.
    """
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    title = soup.find('div', {'class':'articleTitleArea'}).find('h2').text

    # The first <p> of the abstract block is skipped.
    abstract = ' '.join([p.text for p in soup.find('div', {'class':'abstract'}).find_all('p')[1:]])

    # Look the article container up once and reuse it for paragraphs and headings.
    article = soup.find('div', {'class':'articleCon'})
    paragraphs = [p.text for p in article.find_all('p')]

    # Count the <p> siblings that follow each 'Acknowledgements' heading.
    # find_next_siblings('p') skips non-<p> siblings, so a stray node between
    # the heading and its paragraphs cannot cut the count short.
    p_to_exclude = sum(
        len(h3.find_next_siblings('p'))
        for h3 in article.find_all('h3')
        if h3.text == 'Acknowledgements'
    )

    # Those paragraphs are assumed to be the last ones in the article, so they
    # are trimmed from the end of the paragraph list.
    if p_to_exclude > 0:
        paragraphs = paragraphs[:-p_to_exclude]
    text = ' '.join(paragraphs)

    return title, abstract, text

In [14]:
titles = []
abstracts = []
texts = []

for link in links:
    # Try the newer page layout first; if that raises, retry with the older layout.
    # `except Exception` rather than a bare `except` so that KeyboardInterrupt and
    # SystemExit still stop this long-running loop instead of triggering the fallback.
    try:
        title, abstract, text = get_attributes_newformat(link)
    except Exception:
        title, abstract, text = get_attributes_oldformat(link)
    titles.append(title)
    abstracts.append(abstract)
    texts.append(text)

# An earlier, disabled variant of this loop dispatched on the precomputed
# `new_format` / `old_format` slices instead of using try/except.

# One row per article; all five lists are index-aligned with `links`.
df = pd.DataFrame({'link':links,
                   'title':titles,
                   'date': dates,
                   'abstract':abstracts,
                   'text': texts})

In [15]:
# Preview the first rows of the scraped AEES table.
df.head()

Unnamed: 0,link,title,date,abstract,text
0,http://pubs.sciepub.com/aees/10/3/10/index.html,Maxent Modelling for Predicting the Spatial Di...,"Pub. Date: March 23, 2022",Vultures are ecologically important primarily ...,Long-billed vulture (LBV) Gyps indicus is an o...
1,http://pubs.sciepub.com/aees/10/3/9/index.html,A Comparative Study Focusing the Effect of Cri...,"Pub. Date: March 23, 2022","Now a day, India along with whole world is fac...",According to the latest data compiled in the W...
2,http://pubs.sciepub.com/aees/10/3/8/index.html,Studies of Seasonal Algal Composition during M...,"Pub. Date: March 20, 2022",This study is on seasonal algal composition in...,River Rupnarayan begins as the Dhaleswari (Dha...
3,http://pubs.sciepub.com/aees/10/3/7/index.html,"Morphological Characteristics of Capsules, See...","Pub. Date: March 15, 2022",An endangered species of Aquilaria malaccensis...,An endangered species of Aquilaria malaccensis...
4,http://pubs.sciepub.com/aees/10/3/6/index.html,Ingestion Effect of Polyethylene Terephthalate...,"Pub. Date: March 15, 2022",Polyethylene terephthalate (PET) is among the ...,Since 1950 plastic has become a material globa...


In [17]:
def clean(text):
    """Return ``text`` with every ``'\\r\\n'`` pair removed and outer whitespace stripped."""
    return text.replace('\r\n', '').strip()

# Normalise whitespace in the title and abstract columns (overwrites them in place).
for column in ('title', 'abstract'):
    df[column] = df[column].apply(clean)

In [None]:
# Display the table after cleaning.
df

In [None]:
# TODO: Fix dates — the `date` column still holds raw strings such as 'Pub. Date: March 23, 2022'.

In [18]:
# Persist the AEES table as a pickle file (path is relative to the notebook's working directory).
df.to_pickle('../Data/aees.pkl')

# Environmental Sociology

In [19]:
# Build table-of-contents URLs for volumes 1-8, issues 1-4 of each.
toc_template = 'https://www.tandfonline.com/toc/rens20/{}/{}?nav=tocList'
pages = [toc_template.format(vol, iss) for vol in range(1, 9) for iss in range(1, 5)]

# Drop the last three generated URLs (volume 8, issues 2-4).
pages = pages[:-3]

In [20]:
links, dates = [], []

# Walk every table-of-contents page and collect article hrefs and date strings.
for page in pages:
    response = requests.get(page, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    article_anchors = soup.find_all('a', {'class': 'ref nowrap full'})
    links.extend([anchor['href'] for anchor in article_anchors])

    date_spans = soup.find_all('span', {'class': 'date'})
    dates.extend([span.text for span in date_spans])

In [21]:
# Sanity check that links and dates line up, then report how many articles were found.
assert len(links) == len(dates)
print(len(links))

258


In [22]:
# Inspect one scraped link: the hrefs are site-relative paths, not full URLs.
links[0]

'/doi/full/10.1080/23251042.2015.1022983'

In [23]:
def get_attributes_es(link):
    """Fetch an Environmental Sociology article page and extract its main fields.

    Parameters
    ----------
    link : str
        Absolute URL of the article page.

    Returns
    -------
    tuple
        ``(title, abstract, text)`` as strings. ``text`` is ``''`` when the page
        has no ``hlFld-Fulltext`` div, and ``abstract`` is ``''`` when it has no
        ``abstractSection abstractInFull`` div.

    Raises
    ------
    AttributeError
        If the title span is missing from the page.
    """
    page = requests.get(link, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')

    title = soup.find('span', {'class': 'NLM_article-title hlFld-title'}).text

    # Full text: join all paragraphs of the full-text container, if there is one.
    fulltext_div = soup.find('div', {'class': 'hlFld-Fulltext'})
    if fulltext_div is None:
        text = ''
    else:
        text = ' '.join(p.text for p in fulltext_div.find_all('p'))

    # Abstract: the whole text of the abstract container, if there is one.
    abstract_div = soup.find('div', {'class': 'abstractSection abstractInFull'})
    abstract = '' if abstract_div is None else abstract_div.text

    return title, abstract, text

In [24]:
titles, abstracts, texts = [], [], []

# The scraped hrefs are site-relative, so the site root is prepended before fetching.
site_root = 'https://www.tandfonline.com'
for link in links:
    title, abstract, text = get_attributes_es(site_root + link)
    titles.append(title)
    abstracts.append(abstract)
    texts.append(text)

# One row per article; the `link` column keeps the relative href as scraped.
df_es = pd.DataFrame(dict(
    link=links,
    title=titles,
    date=dates,
    abstract=abstracts,
    text=texts,
))

In [25]:
# Display the Environmental Sociology table.
df_es

Unnamed: 0,link,title,date,abstract,text
0,/doi/full/10.1080/23251042.2015.1022983,Why environmental sociology?,19 Mar 2015,,"In some ways, the answer is obvious. As the sc..."
1,/doi/full/10.1080/23251042.2014.971479,The lie of the lion: racialization of nature i...,19 Mar 2015,"The narrative of the Maasai lion hunt, invoked...",
2,/doi/full/10.1080/23251042.2015.1012617,The territorialization of environmental Govern...,19 Mar 2015,The territorial/local level of action appears ...,
3,/doi/full/10.1080/23251042.2015.1020466,A behavioural measure of environmental decisio...,19 Mar 2015,There is great benefit in using measures of en...,
4,/doi/full/10.1080/23251042.2014.965402,Natural waste: canine companions and the lure ...,19 Mar 2015,The most organized and regulated societies in ...,
...,...,...,...,...,...
253,/doi/full/10.1080/23251042.2021.1980936,Livelihood discourses at the water-energy-food...,28 Oct 2021,ABSTRACTOnshore Coal Seam Gas (CSG) extraction...,
254,/doi/full/10.1080/23251042.2021.1958545,Understanding the rebound: normative evaluatio...,01 Aug 2021,ABSTRACTRenewable energy may have smaller envi...,
255,/doi/full/10.1080/23251042.2021.2002000,Community sharing: sustainable mobility in a p...,14 Nov 2021,ABSTRACTThis paper examines new initiatives in...,"Having ratified the Paris Agreement, Japan has..."
256,/doi/full/10.1080/23251042.2021.1975350,The relationship between state-level carbon em...,12 Sep 2021,ABSTRACTClimate change is perhaps the most pre...,


In [None]:
# TODO: The dates and text still need cleaning (e.g. some abstracts start with a fused 'ABSTRACT' prefix).

In [26]:
# Persist the Environmental Sociology table as a pickle file (path is relative to the notebook's working directory).
df_es.to_pickle('../Data/es.pkl')