In [26]:
# Import the two main libraries
from bs4 import BeautifulSoup       # to process html
import requests
import urllib.request,sys,time
import pandas as pd
from tqdm import tqdm
import pickle
import datetime
import json
import re

## Why we scrape using the APIs instead of Selenium

[This is the page where we scraped the links to the articles](https://www.reuters.com/site-search/?query=russia-ukraine+war)  
[This is the API endpoint that we found](https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2?query={%22keyword%22%3A%22russia-ukraine%20war%22%2C%22offset%22%3A0%2C%22orderby%22%3A%22display_date%3Adesc%22%2C%22size%22%3A100%2C%22website%22%3A%22reuters%22}&d=138&_website=reuters)

The problem we had is that the page from which we scraped the article links is client-side rendered, or at least the part with the links is. That means the server sends the client the API data that we found plus a script that renders the page as HTML in the browser. So we can't scrape that type of page with only requests and Beautiful Soup; the alternatives were to use Selenium or to query the API directly. We chose the API because it is faster to scrape and also more reliable.

To find the API, we ran a search on the page linked above and then looked at the Network tab of the browser's developer tools, where we found the request to the server containing the API URL.

![Screenshot of how we found the APIs](reuters-api-screen.png "Screenshot of how we found the APIs")

To do this, run a query in the search bar and then look at the responses that arrive from the server: one of them should be a JSON reply, which is the one containing the data we are looking for.

In [2]:
# Fetch the search-result pages from the Reuters search API.
# NOTE: despite its name, `saved_links` holds the raw response bodies (JSON text
# when the request succeeds); the article URLs are read out of them later.
saved_links = []

PAGE_SIZE = 100  # articles requested per API call (the "size" field of the query)
N_PAGES = 95     # number of API calls to make

# Scraping data directly from the Reuters API.
# This loop takes some minutes; consider loading the pickle file instead.
# `numberOfArticles` is the "offset" field of the query: it must advance by one
# page per request, otherwise every call returns the same first page of results.
for numberOfArticles in tqdm(range(0, N_PAGES * PAGE_SIZE, PAGE_SIZE)):
    url = ('https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2?query={%22keyword%22%3A%22russia-ukraine%20war%22%2C%22offset%22%3A'
           + str(numberOfArticles)
           + '%2C%22orderby%22%3A%22display_date%3Adesc%22%2C%22size%22%3A'
           + str(PAGE_SIZE)
           + '%2C%22website%22%3A%22reuters%22}&d=138&_website=reuters')
    result = requests.get(url).text

    saved_links.append(result)

100%|██████████| 95/95 [04:04<00:00,  2.58s/it]


We save the links that we obtained because we may get blocked by the server, so it is better to have a backup.

In [3]:
file_name = "reuters_links.pkl"

# Back up the downloaded responses. The context manager closes the file even
# if pickling raises part-way through.
with open(file_name, "wb") as open_file:
    pickle.dump(saved_links, open_file)

In [65]:
# to load
# NOTE(security): pickle.load can execute arbitrary code, so only load a
# backup file that you created yourself.
with open("reuters_links.pkl", "rb") as open_file:
    saved_links = pickle.load(open_file)

Sometimes the server replies with an HTML page instead of JSON because there was an issue loading the JSON file. We therefore try to parse each reply as JSON and remove the ones that fail to parse (the HTML ones).

In [66]:
before = len(saved_links)


def _is_json(body):
    """Return True when `body` parses as JSON, False otherwise."""
    try:
        json.loads(body)
    except (ValueError, TypeError):  # JSONDecodeError is a ValueError; TypeError covers non-text input
        return False
    return True


# Build a new list instead of calling saved_links.remove() inside the loop:
# removing items from a list while iterating over it makes the iterator skip
# the element that follows each removal, so some replies were never checked.
saved_links = [link for link in tqdm(saved_links) if _is_json(link)]

print("Length before: ", before, "\nLength after: ", len(saved_links))

92it [00:00, 175.04it/s]

Length before:  95 
Length after:  92





Here we save in the dataset all the information that we obtained from the APIs, apart from the text, which wasn't there; for the text we need to open each page and scrape it. We took all the other data from the APIs because, as we know, pages can differ from one another, so it is better to use the information that we already have instead of trying to read it from the HTML. For the text, we look into the article tag, where the text of the article is supposed to be, and then use a regex to find all the elements whose data-testid attribute is set to paragraph-x, where x is a number. These are the paragraph tags that contain the text of the article.

In [58]:
# Creating the dataframe: title, author and date come from the API payloads,
# the body text is scraped from each article page.


def _join_authors(article_meta):
    """Return the article's author names joined by ', ', or None if unavailable."""
    try:
        return ', '.join(entry['name'] for entry in article_meta['authors'])
    except (KeyError, TypeError):
        return None


# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic when called per row.
records = []
skipped_pages = 0     # API replies that could not be used
skipped_articles = 0  # articles whose page could not be fetched or parsed

for raw_response in tqdm(saved_links):
    try:
        # The API returns JSON text; the articles are under result -> articles.
        articles_meta = list(json.loads(raw_response)['result']['articles'])
    except (ValueError, KeyError, TypeError):
        skipped_pages += 1
        continue

    for article_meta in articles_meta:
        try:
            # canonical_url is site-relative, so the host has to be prepended.
            page = requests.get('https://www.reuters.com' + article_meta['canonical_url'], timeout=30)
            soup = BeautifulSoup(page.text, "html.parser")

            # get article tag (IndexError when the page has none)
            article = soup.find_all("article")[0]

            # get title
            title = article_meta['title']

            # get date, stored as a dd/mm/yy string
            date = pd.to_datetime(article_meta['published_time']).strftime('%d/%m/%y')

            # get text: the body paragraphs carry data-testid="paragraph-<n>"
            text = ''.join(
                p.text
                for p in article.find_all('p', {'data-testid': re.compile(r'paragraph-\d')})
            )
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
            # Best effort: skip articles that cannot be scraped, but count them
            # so that failures are visible instead of silently swallowed.
            skipped_articles += 1
            continue

        records.append({"title": title, "author": _join_authors(article_meta), "date": date, "text": text})

ReutersArticles = pd.DataFrame(records, columns=['title', 'author', 'date', 'text'])

print("Articles scraped: ", len(ReutersArticles),
      "\nSkipped API replies: ", skipped_pages,
      "\nSkipped articles: ", skipped_articles)


92it [2:16:52, 89.27s/it] 


In [59]:
# Display the scraped table (pandas renders a truncated head/tail view for large frames)
ReutersArticles

Unnamed: 0,title,author,date,text
0,"Homes smashed, 34 wounded in latest Russian st...",Ivan Lubysh-Kirdey,01/05/23,May 1 (Reuters) - Russia unleashed a fresh vol...
1,Factbox: Countries rush to evacuate foreign ci...,Reuters,25/04/23,"KHARTOUM, April 25 (Reuters) - Countries have ..."
2,Latest on the Ukraine war: Pope says Vatican i...,Reuters,22/04/23,April 22 (Reuters) - Russia unleashed a fresh ...
3,Russia's Prigozhin renews appeal for more ammu...,Reuters,01/05/23,May 1 (Reuters) - The head of the Wagner priva...
4,Ukraine to boycott World Judo Championships ov...,Reuters,01/05/23,May 1 (Reuters) - Ukrainian judokas will not t...
...,...,...,...,...
9188,"Russian parliament votes to tighten ""foreign a...",Reuters,07/06/22,"LONDON, June 7 (Reuters) - The lower house of ..."
9189,Russian attack destroys warehouses of major Uk...,Pavel Polityuk,07/06/22,"KYIV, June 7 (Reuters) - Russian shelling dest..."
9190,Russian parliament votes to break with Europea...,Reuters,07/06/22,"LONDON, June 7 (Reuters) - Russia's parliament..."
9191,Russia says two Ukrainian ports ready to ship ...,Reuters,07/06/22,"LONDON, June 7 (Reuters) - Russia said on Tues..."


In [60]:
# Persist the scraped articles to CSV. With pandas' default index=True, the row index
# is written as the first, unnamed column.
ReutersArticles.to_csv("ReutersArticles.csv")