##### !! Web scraping requires the Firefox browser and geckodriver to be installed !!

# Gathering Data from lrt.lt and 15min.lt Websites

In [None]:
# Importing necessary modules
import datetime
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

## lrt.lt Data Scraping

In [None]:
# lrt.lt news categories to scrape; each name is inserted into the category URL
categories = [
    "lietuvoje",
    "nuomones",
    "pasaulyje",
    "verslas",
    "lrt-tyrimai",
    "eismas",
    "sportas",
    "pozicija",
    "sveikata",
    "mokslas-ir-it",
    "kultura",
    "veidai",
    "gyvenimas",
    "tavo-lrt",
    "muzika",
]

In [None]:
# One empty column (list) per field that is recorded for every article
news_dict = {
    field: []
    for field in ("source", "category", "date", "title", "score", "last_updated")
}

In [None]:
# Today's date as a YYYY-MM-DD string; stored with every article as its 'last_updated' value
today_date = datetime.date.today().isoformat()

In [None]:
# Upper bound on how many pages of articles the browser loads for each category
max_pages = 800

# State used while dismissing the 'Cookies' window: how many more failed attempts
# are tolerated, and whether the window has already been dealt with
warmup_time = 15
cookies_accepted = False

In [None]:
# Starting up headless browser as driver

# geckodriver.exe is used to interact with the headless browser
geckodriver_path = "geckodriver.exe"

service = Service(geckodriver_path)
options = Options()
# Request headless mode through a command-line argument: the `Options.headless`
# setter was deprecated and later removed in Selenium 4, after which assigning it
# silently does nothing and Firefox starts with a visible window
options.add_argument("-headless")
driver = webdriver.Firefox(options=options, service=service)

In [None]:
# Gathering information of articles for every category in categories list.
# The whole run sits inside try/finally so the browser and geckodriver are shut
# down even when scraping fails half-way.
try:
    for category_index, category in enumerate(categories):
        # Articles for each category can be reached by 'https://www.lrt.lt/naujienos/CATEGORY',
        # where CATEGORY is the name of the category
        url = f"https://www.lrt.lt/naujienos/{category}"
        driver.get(url)

        # Accepting cookies if needed, because the 'Cookies' window blocks the button clicked below.
        # The click is retried once per second until it succeeds or warmup_time runs out.
        while not cookies_accepted:
            try:
                driver.find_element(by=By.CSS_SELECTOR, value="#CybotCookiebotDialogBodyButtonAccept").click()
            except WebDriverException:
                # Only Selenium failures are treated as "button not there yet";
                # KeyboardInterrupt and programming errors still propagate
                if warmup_time <= 0:
                    cookies_accepted = True
                    print("Warmup time ended, no cookies this time!")
                warmup_time -= 1
            else:
                cookies_accepted = True
                print("Cookies successfully accepted!")
            time.sleep(1)

        # Loading additional pages of articles for as long as the 'load more' button can be clicked
        for page_num in range(1, max_pages + 1):
            print(f"{category}. Getting page number: {page_num}")
            try:
                driver.find_element(by=By.CSS_SELECTOR, value='[onclick="_load_more(this)"]').click()
            except WebDriverException:
                print(f"Not enough pages in {category} category!")
                break
            time.sleep(0.005)

        # Parsing the fully loaded page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")

        category_list = soup.find(id="category_list")
        if category_list is None:
            print(f"No article list found in {category} category!")
            article_tiles = []
        else:
            article_tiles = category_list.find_all(class_="col")

        # Iterating through articles and collecting desired information
        for news in article_tiles:
            link_tag = news.find("a")
            date_tag = news.find(class_="info-block__text")
            href = link_tag.get("href") if link_tag is not None else None
            date_parts = date_tag.text.split() if date_tag is not None else []
            if not href or not date_parts:
                # Not a complete article tile (no link or no publication date): skip it
                continue

            # The title is taken from the last URL segment, with dashes turned into spaces
            title = href.split("/")[-1].replace("-", " ")

            # The publication date is the first whitespace-separated token of the info text
            pub_date = date_parts[0]

            # Getting article's 'facebook likes' collected so far as the score ("0" when absent)
            fb_score = "0"
            fb_button = news.find(class_="btn btn--primary btn--xs btn--fb")
            if fb_button:
                score_tag = fb_button.find(class_="btn__text text-purple-2")
                if score_tag is not None:
                    fb_score = score_tag.text

            # Saving all the data to a dictionary
            news_dict["source"].append("lrt.lt")
            news_dict["category"].append(category)
            news_dict["date"].append(pub_date)
            news_dict["title"].append(title)
            news_dict["score"].append(fb_score)
            news_dict["last_updated"].append(today_date)

        print(f"{category} completed! {len(categories)-category_index-1} categories remaining")

    print("Data successfully collected!")
finally:
    # quit() ends the whole WebDriver session; close() would only close the current window
    driver.quit()

In [None]:
# Converting collected information to a pandas dataframe and saving it locally
output_path = Path("../Main/Data/data_lrt.csv")
# Create the target folder first: without it to_csv raises OSError and the
# scraped data would only survive in memory
output_path.parent.mkdir(parents=True, exist_ok=True)
df = pd.DataFrame(news_dict)
df.to_csv(output_path)

## 15min.lt Data Scraping

In [None]:
# Categories to scrape from 15min.lt, each mapped to the URL path of its listing page
categories_links = {
    "lietuvoje": "naujienos/aktualu/lietuva",
    "pasaulyje": "naujienos/aktualu/pasaulis",
    "muzika": "kultura/naujienos/muzika",
    "sveikata": "gyvenimas/naujienos/sveikata",
    "sportas": "sportas/naujienos",
    "mokslas-ir-it": "verslas/naujienos/mokslas-it",
    "kultura": "kultura/naujienos",
    "nuomones": "naujienos/aktualu/komentarai",
    "verslas": "verslas/naujienos",
    "gyvenimas": "gyvenimas/naujienos",
    "eismas": "verslas/naujienos/transportas",
    "kriminalai": "naujienos/aktualu/nusikaltimaiirnelaimes",
    "maistas": "maistas/naujienos/naujienos",
}

In [None]:
# Fresh, empty column (list) per field that is recorded for every 15min.lt article
news_dict = {
    field: []
    for field in ("source", "category", "date", "title", "score", "last_updated")
}

In [None]:
# Today's date as a YYYY-MM-DD string; it is the starting point of the day-by-day
# walk below and is stored with every article as its 'last_updated' value
today_date = datetime.date.today().isoformat()

In [None]:
# Number of consecutive days, counting back from today, for which articles are collected
# in every category (the scraping loop runs this many iterations per category)
days_to_extract = 410

In [None]:
# Gathering information of articles for every category (and its link) in categories_links dictionary


def _listing_url(link, day):
    """Return the listing URL of category *link* with its offset set to 00:00:00 of *day*."""
    return f"https://www.15min.lt/{link}?offset={day.isoformat()} 00:00:00"


def _fetch_articles(url):
    """Download a listing page and return its article tags in page order.

    Returns None when the request fails or the page has no 'visual-list' element,
    so the caller can skip that day instead of crashing with an AttributeError.
    """
    try:
        # Without a timeout a stalled connection would block the run forever
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        print(f"Request failed for {url}: {error}")
        return None
    listing = BeautifulSoup(response.text, "html.parser").find(class_="visual-list")
    if listing is None:
        return None
    return [
        article
        for row in listing.select("div[class^='vl-row vl-row']")
        for article in row.select("article[class^='item item-col-']")
    ]


def _article_link(article):
    """Return the href of the article's image container, or None when it is missing."""
    container = article.select_one(".vl-img-container")
    return container.get("href") if container is not None else None


one_day = datetime.timedelta(days=1)
# The walk starts at tomorrow's midnight, so the first day collected is today
first_day = datetime.date.fromisoformat(today_date) + one_day

for cat, link in categories_links.items():
    # The listing fetched as "previous day" in one iteration is exactly the
    # "current day" listing of the next one, so it is kept instead of re-downloaded
    curr_articles = None

    # Proceeding through each day for (days_to_extract) number of days
    for i in range(days_to_extract):
        print(f"{cat}. Getting {i} day")
        curr_day = first_day - i * one_day
        prev_day = curr_day - one_day

        if curr_articles is None:
            curr_articles = _fetch_articles(_listing_url(link, curr_day))
        prev_articles = _fetch_articles(_listing_url(link, prev_day))

        # Articles on the curr_day listing are recorded with date prev_day until the
        # first article of the prev_day listing is reached.
        # NOTE(review): this relies on the listing for offset=DATE starting with the
        # newest article published before DATE - confirm against the site.
        prev_day_link = _article_link(prev_articles[0]) if prev_articles else None

        if prev_day_link is None or not curr_articles:
            # Without both listings the day's boundary is unknown; recording anything
            # here would attach the wrong date to the articles
            print(f"{cat}. Skipping {prev_day}: listing page could not be read")
        else:
            # Collecting information of each article up to the boundary article
            for article in curr_articles:
                article_link = _article_link(article)
                if article_link == prev_day_link:
                    break
                if article_link is None:
                    continue

                # The title is the last URL segment without its two trailing dash-separated parts
                title = " ".join(article_link.split("/")[-1].split("-")[:-2])

                # Getting article's 'facebook likes' collected so far as the score;
                # "0" (a string, like every scraped score) when the counter is absent
                fb_count = article.select_one(".item-fb-count")
                fb_text = fb_count.select_one(".icon-text") if fb_count is not None else None
                fb_score = fb_text.text if fb_text is not None else "0"

                # Saving all the data to a dictionary
                news_dict["source"].append("15min.lt")
                news_dict["category"].append(cat)
                news_dict["date"].append(prev_day.isoformat())
                news_dict["title"].append(title)
                news_dict["score"].append(fb_score)
                news_dict["last_updated"].append(today_date)

        # A failed fetch leaves None here, which triggers a fresh download next iteration
        curr_articles = prev_articles
    print(f"{cat} completed!")

In [None]:
# Converting collected information to a pandas dataframe and saving it locally
output_path = Path("../Main/Data/data_15min.csv")
# Create the target folder first: without it to_csv raises OSError and the
# scraped data would only survive in memory
output_path.parent.mkdir(parents=True, exist_ok=True)
df = pd.DataFrame(news_dict)
df.to_csv(output_path)