In [1]:
# Import the libraries used throughout the notebook
import datetime
import pickle
import sys
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup  # to process html
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from seleniumwire import webdriver  # Import from seleniumwire
from tqdm import tqdm

We read the listing from a local HTML file because it was more time-effective to manually download the page containing the links to the articles.

In [2]:
# Parse the manually saved listing page. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open in the kernel.
with open("sputnik.html", "r") as sputnik:
    soup = BeautifulSoup(sputnik, "html.parser")

We save all the links to the articles that we will scrape.

In [3]:
# Find every article anchor on the listing page. BeautifulSoup treats a
# non-dict `attrs` value as a CSS-class filter, so this matches
# <a class="list__title">.
links = soup.find_all("a", {"list__title"})

# Keep only the link target of each anchor.
saved_links = [anchor["href"] for anchor in links]

In [4]:
# Sanity check: number of article links collected from the listing page
len(saved_links)

1719

In [5]:
# creating the dataframe

# Empty, typed frame that the scraper fills with one row per article.
# `date` is declared as a string column because the scraper stores dates as
# "%d/%m/%y"-formatted text, not as numbers.
# `comments` holds a Python list of comment strings per article, hence `object`.
Articles = pd.DataFrame(
    {
        "link": pd.Series([], dtype="string"),
        "title": pd.Series([], dtype="string"),
        "author": pd.Series([], dtype="string"),
        "date": pd.Series([], dtype="string"),
        "text": pd.Series([], dtype="string"),
        "comments": pd.Series([], dtype="object"),
    }
)

Since the scraper takes around 3 hours, it is sometimes better to stop it, save the results, and resume from here.

In [4]:
# Articles = pd.read_parquet("sputnik.parquet.snappy", engine="fastparquet")

We set up a Selenium Wire instance. Selenium Wire extends Selenium with extra features such as request interception. This is useful because we can block the download of images and scripts that slow down the scraper. We still need to let some JavaScript run in order to show the comment section, which is why we block only some of the scripts rather than all of them.

In [5]:
# Run Firefox without a visible window. The `-headless` argument replaces the
# `options.headless = True` setter, which Selenium 4 deprecated and later removed.
options = Options()
options.add_argument("-headless")

# Open Firefox through selenium-wire so that outgoing requests can be intercepted
driver = webdriver.Firefox(options=options)


# Path suffixes of resources the scraper does not need: images, stylesheets and
# a hand-picked set of scripts. Built once instead of on every request.
_BLOCKED_SUFFIXES = (
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".webp",
    ".svg",
    ".css",
    "iubenda.js",
    "analytics.js",
    "core-en.js",
    "tag.js",
    "stub-v2.js",
    "iubenda_cs.js",
    "iubenda_cons.js",
    "safe-tcf-v2.js",
    "context.js",
    "host.js",
    "tcf-v2-0.21.0.js",
)


def interceptor(request):
    """Abort requests for resources that are not needed for scraping.

    Parameters
    ----------
    request
        Intercepted request object. Only its ``path`` attribute is read and
        its ``abort()`` method is called.

    Returns
    -------
    None
        Requests whose path ends with one of ``_BLOCKED_SUFFIXES`` are
        aborted; every other request is left untouched.
    """
    # Compare case-insensitively so that e.g. "PHOTO.JPG" is blocked as well.
    if request.path.lower().endswith(_BLOCKED_SUFFIXES):
        request.abort()


# Register the hook so that selenium-wire passes outgoing browser requests to `interceptor`
driver.request_interceptor = interceptor

  options.headless = True


We download the HTML of each page that we open. With `WebDriverWait` we tell Selenium to wait until the comments are ready, or to give up after two seconds.

In [6]:
def parse_article(page_html):
    """Extract one article record from the rendered HTML of an article page.

    Parameters
    ----------
    page_html : str
        Page source as returned by the browser.

    Returns
    -------
    dict or None
        A dict with the keys ``title``, ``author``, ``date`` (formatted as
        ``%d/%m/%y``), ``text`` and ``comments`` (list of strings).
        ``None`` when the page has no ``article__meta`` block; such pages
        are skipped by the caller.
    """
    page_soup = BeautifulSoup(page_html, "html.parser")
    meta = page_soup.find("div", class_="article__meta")
    if meta is None:
        return None

    def meta_text(itemprop):
        # A missing field yields None instead of raising AttributeError.
        node = meta.find("div", {"itemprop": itemprop})
        return node.get_text() if node is not None else None

    published = meta_text("datePublished")

    # The article text is the concatenation of all "text" and "quote" blocks.
    body = page_soup.find("div", class_="article__body")
    paragraphs = (
        body.find_all("div", {"data-type": ["text", "quote"]})
        if body is not None
        else []
    )

    comment_nodes = page_soup.find_all("div", class_="best-comments__item-message")

    return {
        "title": meta_text("name"),
        "author": meta_text("author"),
        "date": pd.to_datetime(published).strftime("%d/%m/%y") if published else None,
        "text": "".join(p.text for p in paragraphs),
        "comments": [c.get_text() for c in comment_nodes],
    }


# Links already present in `Articles` (e.g. reloaded from the parquet file) are
# skipped, so an interrupted run can be resumed without duplicating rows.
already_scraped = set(Articles["link"])

# Rows are collected in a list and turned into a frame once: `DataFrame.append`
# was removed in pandas 2.0 and copied the whole frame on every call.
records = []

try:
    # Wrapping the list itself (not `enumerate`) lets tqdm show the total and an ETA.
    for link in tqdm(saved_links):
        if link in already_scraped:
            continue

        driver.get("https://sputnikglobe.com" + link)

        # Wait up to two seconds for the comment section to appear. Articles
        # without it are still scraped, just with an empty comment list.
        try:
            WebDriverWait(driver, 2).until(
                EC.presence_of_element_located((By.CLASS_NAME, "best-comments"))
            )
        except TimeoutException:
            pass

        record = parse_article(driver.page_source)
        if record is not None:
            records.append({"link": link, **record})
finally:
    # Runs on normal completion, on errors and on a manual interrupt, so the
    # rows scraped so far are always kept and the browser is always shut down.
    if records:
        scraped = pd.DataFrame(records)
        Articles = (
            scraped
            if Articles.empty
            else pd.concat([Articles, scraped], ignore_index=True)
        )
    # `quit()` ends the whole browser session, not just the current window.
    driver.quit()

179it [19:40,  6.59s/it]


In [7]:
# Display the scraped articles table
Articles

Unnamed: 0,link,title,author,date,text,comments
0,/20230501/watch-russian-army-sappers-blow-up-a...,Watch Russian Army Sappers Blow Up Abandoned Ammo,Oleg Burunov https://cdn1.img.sputnikglobe.com...,01/05/23,The Russian Ministry of Defense (MoD) has rele...,[]
1,/20230501/kiev-lost-over-300-soldiers-over-pas...,Kiev Lost Over 300 Soldiers Over Past 24 Hours...,Sputnik International,01/05/23,"""Over the past day, the aviation carried out s...",[330 US mercenaries with mostly Ukrainian pass...
2,/20230430/russia-destroys-up-to-200-tonnes-of-...,Russia Destroys Up to 200 Tons of Ukrainian Am...,Sputnik International,30/04/23,"""As a result of a strike on an echelon at a ra...",[Very soon they will only have stones to throw...
3,/20230430/russian-forces-discover-underground-...,Russian Forces Discover Underground Soledar Ar...,Oleg Burunov https://cdn1.img.sputnikglobe.com...,30/04/23,The Armed Forces of Ukraine failed in its effo...,[Kudos to the Russian explosive ordnance dispo...
4,/20230430/ukraine-loses-over-480-military-merc...,"Ukraine Loses Over 480 Military, Mercenaries i...",Sputnik International,30/04/23,"""Over the past 24 hours, over 480 Ukrainian se...",[]
...,...,...,...,...,...,...
1714,/20220226/russian-fsb-border-service-agent-inj...,Russian FSB Border Service Agent Injured in Pr...,Sputnik International,26/02/22,In the course of suppressing the provocation o...,[If SN says one injured = 3-5 killed in realit...
1715,/20220226/three-mines-fired-from-ukraine-land-...,Three Mines Fired From Ukraine Land in Russia'...,Sputnik International,26/02/22,"""On 26 February, the border department of the ...",[]
1716,/20220226/video-ukrainian-flag-replaced-as-lpr...,Video: Ukrainian Flag Replaced as LPR Takes Co...,Sofia Chegodaeva,26/02/22,The people's militia of the Lugansk People's R...,[Russia can drop leaflets on enemy soldiers an...
1717,/20220226/zelensky-has-fled-kiev-for-lvov-russ...,"Zelensky Has Fled Kiev for Lvov, Russian State...",Sputnik International,26/02/22,Russian State Duma speaker Vyacheslav Volodin ...,[]


In [8]:
# Persist the scraped articles to a parquet file using the fastparquet engine
Articles.to_parquet("sputnik.parquet.snappy", engine="fastparquet")