In [None]:
# Largely based on: https://gist.github.com/dannguyen/c9cb220093ee4c12b840

In [57]:
import json
import os
from datetime import date, timedelta
from os import makedirs
from os.path import join, exists, abspath

import requests
from newspaper import Article

In [91]:
# Directory where one JSON file per day of Guardian search results is cached.
# NOTE(review): this is an absolute path on one developer's machine; anyone
# else running the notebook has to edit it.
guardian_root = '/home/ostapkharysh/Documents/bt_data/Guardian_news'
ARTICLES_DIR = join(guardian_root, 'tempdata', 'articles')
# exist_ok=True makes re-running this cell a no-op when the directory exists.
makedirs(ARTICLES_DIR, exist_ok=True)

In [92]:
# Guardian Content API settings.
#
# The key is taken from the GUARDIAN_API_KEY environment variable when it is
# set; the literal fallback only keeps existing runs of this notebook working.
# SECURITY: a key committed inside a notebook is effectively public -- rotate
# it and remove the fallback once the environment variable is in place.
MY_API_KEY = os.environ.get("GUARDIAN_API_KEY", "0abd707a-96aa-4e21-b5cd-b314c6955753")

# HTTPS, so the 'api-key' query parameter is not sent in clear text.
API_ENDPOINT = 'https://content.guardianapis.com/search'

# Query parameters shared by every search request. The two date fields are
# left empty here and are set per day by the download cell.
my_params = {
    'from-date': "",
    'to-date': "",
    'order-by': "newest",
    'show-fields': 'all',   # ask the API to include every article field
    'page-size': 200,       # results per page of the paginated response
    'api-key': MY_API_KEY,
    'lang': 'en'
}

In [93]:
# Download all Guardian search results for every day in [start_date, end_date]
# (both inclusive) and cache them as one JSON file per day in ARTICLES_DIR.
# Days whose file already exists are skipped, so the cell can be re-run to
# resume an interrupted download.
start_date = date(2016, 2, 1)  # year, month, day
end_date = date(2016, 3, 1)
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests can block forever

dayrange = range((end_date - start_date).days + 1)
for daycount in dayrange:
    dt = start_date + timedelta(days=daycount)
    datestr = dt.strftime('%Y-%m-%d')
    fname = join(ARTICLES_DIR, datestr + '.json')
    if exists(fname):
        # already cached by an earlier run
        continue

    print("Downloading", datestr)
    all_results = []
    # Work on a per-day copy so the shared my_params template is not left
    # holding the last day's dates and page number after the loop.
    day_params = dict(my_params)
    day_params['from-date'] = datestr
    day_params['to-date'] = datestr

    current_page = 1
    total_pages = 1  # real value is only known after the first response
    while current_page <= total_pages:
        print("...page", current_page)
        day_params['page'] = current_page
        resp = requests.get(API_ENDPOINT, params=day_params, timeout=REQUEST_TIMEOUT)
        # Fail loudly on HTTP errors (bad key, rate limit, ...) instead of
        # hitting a KeyError on the missing 'response' field below.
        resp.raise_for_status()
        data = resp.json()
        all_results.extend(data['response']['results'])
        # the response reports how many pages this day has in total
        total_pages = data['response']['pages']
        current_page += 1

    # The file is written only after every page was fetched, so a failed day
    # leaves no file behind and is retried on the next run.
    with open(fname, 'w') as f:
        print("Writing to", fname)
        # re-serialize it for pretty indentation
        f.write(json.dumps(all_results, indent=2))

Downloading 2016-02-01
...page 1
...page 2
Writing to /home/ostapkharysh/Documents/bt_data/Guardian_news/tempdata/articles/2016-02-01.json
Downloading 2016-02-02
...page 1
...page 2
Writing to /home/ostapkharysh/Documents/bt_data/Guardian_news/tempdata/articles/2016-02-02.json
Downloading 2016-02-03
...page 1
...page 2
Writing to /home/ostapkharysh/Documents/bt_data/Guardian_news/tempdata/articles/2016-02-03.json
Downloading 2016-02-04
...page 1
...page 2
...page 3
Writing to /home/ostapkharysh/Documents/bt_data/Guardian_news/tempdata/articles/2016-02-04.json
Downloading 2016-02-05
...page 1
...page 2
Writing to /home/ostapkharysh/Documents/bt_data/Guardian_news/tempdata/articles/2016-02-05.json
Downloading 2016-02-06
...page 1
Writing to /home/ostapkharysh/Documents/bt_data/Guardian_news/tempdata/articles/2016-02-06.json
Downloading 2016-02-07
...page 1
...page 2
Writing to /home/ostapkharysh/Documents/bt_data/Guardian_news/tempdata/articles/2016-02-07.json
Downloading 2016-02-08
...p

In [94]:
import pandas as pd

# Load one cached day of results into a DataFrame and look at its first row.
# The path is built from ARTICLES_DIR (same directory the download cell writes
# to) instead of repeating the absolute path a second time.
sample_path = join(ARTICLES_DIR, '2016-02-01.json')
with open(sample_path, 'r') as f:
    data = json.load(f)  # list of result dicts, as written by the download cell
df = pd.DataFrame(data)
el = df.iloc[0]  # first article of that day; later cells read its fields
el

apiUrl                https://content.guardianapis.com/us-news/2016/...
fields                {'bodyText': 'New York lawyers have been secre...
id                    us-news/2016/jan/31/lawyers-grey-money-underco...
isHosted                                                          False
pillarId                                                    pillar/news
pillarName                                                         News
sectionId                                                       us-news
sectionName                                                     US news
type                                                            article
webPublicationDate                                 2016-02-01T00:00:12Z
webTitle              Undercover film shows how lawyers could ease f...
webUrl                https://www.theguardian.com/us-news/2016/jan/3...
Name: 0, dtype: object

In [60]:
# Keep the article's public web URL for the scraping cells that follow, and
# show the publication timestamp reported by the API for the same row.
k = el['webUrl']
print(el['webPublicationDate'])

2016-02-01T00:00:12Z


In [62]:
# Fetch the article page with newspaper so it can be parsed in the next cell.
article = Article(k.strip())
try:
    article.download()
except requests.exceptions.RequestException:
    # The original handler named HTTPError, which is never imported in this
    # notebook, so any raised error would have turned into a NameError.
    # RequestException is the base class of requests' HTTPError.
    print('***FAILED TO DOWNLOAD***', article.url)
else:
    # NOTE(review): newspaper appears to record a failed download on the
    # Article instead of raising, in which case no HTML is stored -- confirm
    # against the installed newspaper version.
    if not article.html:
        print('***FAILED TO DOWNLOAD***', article.url)


In [75]:
# Sanity check: request the same URL directly with requests and display the
# resulting Response object (its repr shows the HTTP status code).
requests.get(k)

<Response [200]>

In [64]:
# Parse the downloaded page; the cells below then read the article's
# authors, publish_date and text attributes.
article.parse()

In [65]:
# Author names extracted by newspaper (displayed as a list of strings).
article.authors

['Rupert Neate']

In [85]:
# Publication date as extracted by newspaper.
# NOTE(review): the value shown is 2016-01-31 00:00, while the API's
# webPublicationDate printed earlier is 2016-02-01T00:00:12Z. Presumably
# newspaper derives the date from the URL or page metadata -- confirm before
# treating the two as interchangeable.
article.publish_date

datetime.datetime(2016, 1, 31, 0, 0)

In [69]:
# Body text extracted by newspaper.
# NOTE(review): the API result already carries a fields['bodyText'] entry that
# starts with the same sentence -- check whether scraping is needed at all.
article.text

"New York lawyers have been secretly filmed advising how an African minister could use “grey money” to buy up a Manhattan brownstone, a private jet and a luxury yacht without US authorities – or his impoverished citizens back home – ever knowing.\n\nAn undercover activist posing as an adviser to an African mining minister filmed encounters with 13 US law firms, offering a rare glimpse into how US lawyers could possibly act to facilitate the flow of dirty money into the country despite Barack Obama repeatedly demanding a crackdown on global corruption.\n\nAlmost all the lawyers offered suggestions as to how the minister could move “grey money” and “suspect funds” into the US while ensuring his identity was never disclosed. None of them made suggestions that were illegal.\n\nOne lawyer, Mark Koplik, was recorded by an undercover investigator from anti-corruption charity Global Witness as saying: “So we have to scrub it at the beginning, if we can, or scrub it at the intermediary location