In [1]:
import pandas as pd
import plotly.express as px

In [18]:
# Load the previously saved article table from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle('../Data/Resilience/final.pkl')

In [6]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,url,title,date,author,text,text_list,num_words,month
21,https://www.resilience.org/stories/2021-07-12/...,Scientists Call Northwest Heatwave the ‘Most E...,2021-07-12,Jake Johnson,Ed. note: This piece was originally published ...,"[Ed., note:, This, piece, was, originally, pub...",635,7
22,https://www.resilience.org/stories/2021-07-12/...,Letter From The Farm | Welcome To The Burren,2021-07-12,Shane Casey,Ed. note: This article first appeared on ARC20...,"[Ed., note:, This, article, first, appeared, o...",1165,7
23,https://www.resilience.org/stories/2021-07-12/...,"Wisdom Traditions, Science and the Search for ...",2021-07-12,David Bollier,Jeremy Lent has taken on an audacious task for...,"[Jeremy, Lent, has, taken, on, an, audacious, ...",616,7
24,https://www.resilience.org/stories/2021-07-12/...,Defending Beef: Excerpt,2021-07-12,Nicolette Hahn Niman,The following excerpt is from Nicolette Hahn N...,"[The, following, excerpt, is, from, Nicolette,...",1928,7
25,https://www.resilience.org/stories/2021-07-12/...,The show is over,2021-07-12,Greta Thunberg,Ed. note: This piece was originally published ...,"[Ed., note:, This, piece, was, originally, pub...",984,7


In [7]:
# Split each article's publication date into calendar month and year columns.
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

In [10]:
# One row per (year, month): article count, mean word count, and earliest publication date.
toplot = df.groupby(['year', 'month']).agg(
    title=('title', 'count'),
    num_words=('num_words', 'mean'),
    date=('date', 'min'),
)

In [15]:
# Bubble chart of the monthly summary: x is the 'date' column, y is the
# 'title' column (shown as "Number of Articles"), marker size is 'num_words'.
fig = px.scatter(
    toplot,
    x='date',
    y='title',
    size='num_words',
    labels={'date': 'Date', 'title': 'Number of Articles'},
    title='Figure 3',
)

# Figure-wide font settings; as the last expression this also renders the figure.
fig.update_layout(font_family='Times New Roman', font_color='black')

### Updating the Resilience Dataset to Include Articles Through 2021

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Build the address of every listing page to crawl: pages 1 through 199 of latest-articles
pagelist = [
    'https://www.resilience.org/latest-articles/page/{}'.format(page_number)
    for page_number in range(1, 200)
]

In [12]:
# Function that lists the urls of the 12 articles presented on a given page
def list_articles(url, timeout=30):
    """Return the article links found on one listing page.

    Downloads ``url`` using the notebook-level ``headers`` dict, parses the
    response as HTML, and collects the ``href`` of the first ``<a>`` inside
    every ``<h2 class="post-title">`` element.

    Parameters
    ----------
    url : str
        Address of the listing page to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the crawl forever.

    Returns
    -------
    list of str
        One ``href`` per matching heading, in document order. Headings that
        contain no link are skipped. Empty when the page has no such headings.
    """
    page = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')

    links = []
    for heading in soup.find_all("h2", {'class': 'post-title'}):
        anchor = heading.find('a')
        # A heading without an <a href=...> previously raised and aborted the whole page.
        if anchor is not None and anchor.has_attr('href'):
            links.append(anchor['href'])
    return links

In [4]:
# Request headers sent with each fetch: a desktop-browser User-Agent string
# plus a referer pointing at the site's latest-articles listing.
user_agent = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/91.0.4472.114 Safari/537.36'
)
referer = 'https://www.resilience.org/latest-articles/'
headers = dict([('User-Agent', user_agent), ('referer', referer)])

In [11]:
# NOTE(review): presumably a quick check that outbound HTTP requests work;
# the response is not assigned to anything.
requests.get('https://en.wikipedia.org/wiki/Main_Page')

<Response [200]>

In [13]:
# Collect the article links from every listing page, in page order.
urls = []
for page in pagelist:
    # extend() appends in place; `urls = urls + ...` re-copied the whole list on every page.
    urls.extend(list_articles(page))

In [16]:
# Scrape every collected link; each call yields a one-row DataFrame, or None when scraping failed.
# NOTE: scrape_article is defined in a later cell of this notebook, so that cell must be run first.
data = list(map(scrape_article, urls))

# Stack the per-article frames into a single table.
new = pd.concat(data)

In [17]:
# Display the scraped articles.
new

Unnamed: 0,url,title,date,author,text
0,https://www.resilience.org/stories/2022-02-06/...,Increased U.S. natural gas exports = higher U....,"February 6, 2022",Kurt Cobb,Few people noticed when energy reporters wrote...
0,https://www.resilience.org/stories/2022-02-04/...,Ecological Civilisation: Beyond Consumerism an...,"February 4, 2022",Samuel Alexander,This is the introduction to the Ecological Civ...
0,https://www.resilience.org/stories/2022-02-04/...,Growing community in vacant Chicago lots,"February 4, 2022",Jordyn Harrison,"From 100 feet in th­­e air, the parcel at 500 ..."
0,https://www.resilience.org/stories/2022-02-04/...,Does renewable energy threaten efforts to cons...,"February 4, 2022",Sebastian Dunnett,The world is facing a climate and ecological c...
0,https://www.resilience.org/stories/2022-02-04/...,Túmin: the alternative currency rebuilding com...,"February 4, 2022",Mattha Busby,"In southern Mexico, Itzel Castro sits behind t..."
...,...,...,...,...,...
0,https://www.resilience.org/stories/2020-04-16/...,COVID-19 and the Death of Market Fundamentalism,"April 16, 2020",Paul Gilding,"On top of the countless human tragedies, there..."
0,https://www.resilience.org/stories/2020-04-15/...,In Praise of Short Supply Chains,"April 15, 2020",Francesca Price,As the coronavirus pandemic affects every area...
0,https://www.resilience.org/stories/2020-04-15/...,Climate Change Won’t Stop for the Coronavirus ...,"April 15, 2020",Abrahm Lustgarten,The next several months could bring hurricanes...
0,https://www.resilience.org/stories/2020-04-15/...,Open and Answered Questions,"April 15, 2020",Chris Nelder,Your browser does not support the audio elemen...


In [15]:
# Function that scrapes an individual article
def scrape_article(url, timeout=30):
    """Fetch one article page and return its contents as a one-row DataFrame.

    The page is requested with the notebook-level ``headers`` dict and parsed
    with BeautifulSoup. Four fields are extracted independently; a field whose
    element is missing from the page keeps the placeholder string ``'None'``,
    so one absent tag does not discard the whole article.

    Parameters
    ----------
    url : str
        Address of the article to download.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot block the crawl indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        A single row with columns ``url``, ``title``, ``date``, ``author``
        and ``text``. ``None`` is returned (after printing a message) when
        the page could not be downloaded or processed.
    """
    try:
        page = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(page.text, 'html.parser')

        # Body text: every non-empty <p> inside the post-content section, joined by spaces.
        text = 'None'
        post_content = soup.find("section", {"class": 'post-content'})
        if post_content is not None:
            text = ' '.join(
                p.text.replace('\n', ' ')
                      .replace('\xa0', ' ')   # non-breaking space
                      .replace('â€™', "'")    # mis-decoded curly apostrophe
                for p in post_content.find_all('p')
                if len(p.text) > 0
            )

        title_tag = soup.find('h1', {"class": "post-title"})
        title = 'None' if title_tag is None else title_tag.text

        date_tag = soup.find("li", {"class": 'post-date'})
        date = 'None' if date_tag is None else date_tag.text

        # The author is read from the link text inside the featured-image-credit span.
        author = 'None'
        credit = soup.find("span", {"class": "featured-image-credit"})
        if credit is not None:
            author_link = credit.find("a")
            if author_link is not None:
                author = author_link.text

        return pd.DataFrame({'url': [url], 'title': [title], 'date': [date], 'author': [author], 'text': [text]})
    except Exception:
        # Best-effort scraping: report the failure and let the caller carry on.
        # Catching Exception rather than using a bare `except` means Ctrl-C
        # (KeyboardInterrupt) can still stop a long crawl.
        print("Failure to scrape: " + url)
        return None

In [31]:
# Show the rows carrying the most recent date present in df.
df.loc[df['date'] == df['date'].max()]

Unnamed: 0,url,title,date,author,text,text_list,num_words
0,https://www.resilience.org/stories/2021-07-16/...,Elinor Ostrom’s Essential Lessons for Collecti...,2021-07-16,Erik Nordman,The Uncommon Knowledge of Elinor Ostrom by Eri...,"[The, Uncommon, Knowledge, of, Elinor, Ostrom,...",1267
1,https://www.resilience.org/stories/2021-07-16/...,No Time for Castles: From Closed to Open Democ...,2021-07-16,Hélène Landemore,"For proponents of deliberative democracy, toda...","[For, proponents, of, deliberative, democracy,...",2922
2,https://www.resilience.org/stories/2021-07-16/...,Greed and Its Offsets,2021-07-16,Simon Fairlie,Farmland shouldn’t be used to expiate the carb...,"[Farmland, shouldn’t, be, used, to, expiate, t...",3965


In [34]:
# Keep only the freshly scraped rows dated after the newest article already stored in df.
# NOTE(review): the comparison needs new['date'] to hold datetimes rather than the raw
# scraped strings — confirm the date-conversion cell has been run first.
new_trim = new.loc[new['date'] > df['date'].max()]

In [35]:
# Append only the rows newer than the latest date already in df (new_trim).
# Concatenating the untrimmed `new` would duplicate every article present in both frames.
final = pd.concat([df, new_trim])

In [37]:
# Persist the combined table. This is the same path that pd.read_pickle loads at the
# top of the notebook, so the previous file is overwritten.
final.to_pickle('../Data/Resilience/final.pkl')

In [23]:
# Parse the scraped date strings (e.g. "February 6, 2022") into pandas datetimes.
new['date'] = pd.to_datetime(new['date'])

In [30]:
# Preview the head of the first 2,500 scraped rows.
new.iloc[:2500].head()

Unnamed: 0,url,title,date,author,text
0,https://www.resilience.org/stories/2022-02-06/...,Increased U.S. natural gas exports = higher U....,2022-02-06,Kurt Cobb,Few people noticed when energy reporters wrote...
0,https://www.resilience.org/stories/2022-02-04/...,Ecological Civilisation: Beyond Consumerism an...,2022-02-04,Samuel Alexander,This is the introduction to the Ecological Civ...
0,https://www.resilience.org/stories/2022-02-04/...,Growing community in vacant Chicago lots,2022-02-04,Jordyn Harrison,"From 100 feet in th­­e air, the parcel at 500 ..."
0,https://www.resilience.org/stories/2022-02-04/...,Does renewable energy threaten efforts to cons...,2022-02-04,Sebastian Dunnett,The world is facing a climate and ecological c...
0,https://www.resilience.org/stories/2022-02-04/...,Túmin: the alternative currency rebuilding com...,2022-02-04,Mattha Busby,"In southern Mexico, Itzel Castro sits behind t..."
