In [None]:
# Import the standard-library and third-party modules used throughout the notebook
import datetime
import locale
import pickle
import sys
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup  # to process html
from tqdm import tqdm

# Switch the whole process to the Italian locale. The available locale names
# depend on the operating system; on Linux, run "locale -a" to list them.
locale.setlocale(locale.LC_ALL, "it_IT.utf8")

In [None]:
# Collect the URL of every article listed under the "guerra-russia-ucraina" tag.
#
# The listing is paginated; pages 1..299 are requested in order and the crawl
# stops early at the first page that no longer contains the listing container
# (previously this raised IndexError and aborted the cell).

saved_links = []
for page_number in tqdm(range(1, 300)):

    url = (
        "https://www.ilfattoquotidiano.it/tag/guerra-russia-ucraina/page/"
        + str(page_number)
    )
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # The article teasers live inside this container.
    main = soup.find("div", class_="rullo-tag-main default-block-color-rullo")
    if main is None:
        # No listing on this page: assume we ran past the last page.
        break

    for heading in main.find_all("h3", class_="p-item"):
        anchor = heading.find("a")
        # Skip teasers without a usable link instead of crashing.
        if anchor is not None and anchor.get("href"):
            saved_links.append(anchor["href"].strip())

In [None]:
# Persist the collected links so the listing crawl does not have to be repeated.
file_name = "falso_links.pkl"

# The context manager closes the file even if pickle.dump raises.
with open(file_name, "wb") as open_file:
    pickle.dump(saved_links, open_file)

In [None]:
# Reload the links written to "falso_links.pkl".
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# a file that this notebook produced itself.
with open("falso_links.pkl", "rb") as open_file:
    saved_links = pickle.load(open_file)

In [None]:
# Create the empty, typed DataFrame that describes one scraped article per row.
#
# The "date" column uses an explicit nanosecond unit: pandas >= 2.0 rejects the
# unit-less "datetime64" dtype, while "datetime64[ns]" works on old and new
# versions alike.

Articles = pd.DataFrame(
    {
        "link": pd.Series([], dtype="string"),
        "title": pd.Series([], dtype="string"),
        "author": pd.Series([], dtype="string"),
        "date": pd.Series([], dtype="datetime64[ns]"),
        "text": pd.Series([], dtype="string"),
    }
)

In [None]:
# Download every saved article and extract link, title, author, date and text.


def _clean_author(raw):
    """Return the author name from a byline of the form "di Nome Cognome".

    Only the text up to and including the first "di " is discarded, so names
    that themselves contain "di" (e.g. "Giudici") are kept intact. If the
    marker is absent the stripped byline is returned unchanged.
    """
    _, marker, name = raw.partition("di ")
    return name.strip() if marker else raw.strip()


def _parse_article(link, soup):
    """Build one article record from a parsed page.

    Returns None when the page has no <section class="article-content">, in
    which case the page is skipped. Title, author and date are set to None
    when their tag is missing, so one odd page does not abort the crawl.
    """
    article = soup.find("section", class_="article-content")
    if article is None:
        return None

    title_tag = soup.find("h1", class_="title-article")
    title = title_tag.get_text() if title_tag is not None else None

    cite = soup.find("cite")
    byline = cite.find("a") if cite is not None else None
    author = _clean_author(byline.get_text()) if byline is not None else None

    date = None
    date_tag = soup.find("span", class_="date")
    if date_tag is not None:
        parts = date_tag.get_text().split("|")
        if len(parts) > 1:
            # "%B" is the locale's full month name, so this relies on the
            # locale configured earlier in the notebook. A value that does not
            # match the format raises ValueError on purpose: it signals a
            # locale or markup problem rather than a single bad page.
            date = datetime.datetime.strptime(parts[1].strip(), "%d %B %Y")

    text = "".join(p.text for p in article.find_all("p"))

    return {"link": link, "title": title, "author": author, "date": date, "text": text}


# Collect plain dicts and build the frame once: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic.
records = []
for link in tqdm(saved_links):
    # A timeout keeps one stalled connection from hanging the whole crawl.
    page = requests.get(link, timeout=30)
    soup = BeautifulSoup(page.text, "html.parser")

    record = _parse_article(link, soup)
    if record is not None:
        records.append(record)

# Rebuild Articles from scratch (re-running the cell does not duplicate rows)
# with the same columns and dtypes as the empty schema frame.
Articles = pd.DataFrame(records, columns=["link", "title", "author", "date", "text"])
Articles = Articles.astype(
    {"link": "string", "title": "string", "author": "string", "text": "string"}
)
Articles["date"] = pd.to_datetime(Articles["date"])

In [None]:
# Export the scraped articles to a parquet file via the fastparquet engine.
output_path = "FalsoArticles.parquet.snappy"
Articles.to_parquet(output_path, engine="fastparquet")