# Scrape administrative text samples

In this step we scrape [news bulletins from our own website](https://www.zh.ch/de/news-uebersicht.html?page=1&orderBy=new). 

**Imports**

In [1]:
import pandas as pd
from pandarallel import pandarallel

# Turn off pandas' chained-assignment (SettingWithCopy) warnings for the whole notebook.
pd.options.mode.chained_assignment = None
# Allow up to 500 rows / sequence items when pandas displays frames and sequences.
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500
# NOTE(review): pandarallel is initialised here, but no pandarallel call is
# visible in this part of the notebook — confirm it is still needed.
pandarallel.initialize(progress_bar=True)

import re
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import requests
import random
import warnings

# Silence every UserWarning raised from here on.
warnings.simplefilter("ignore", category=UserWarning)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


**Constants and functions**

In [2]:
# Directory the scraped corpus is written to. The trailing slash is required
# because the output path is built by plain string concatenation.
INPUT_DIR = "_input/"

## Scrape news texts from zh.ch

In [3]:
def get_sitemap(url, timeout=30):
    """Download a sitemap and return the URLs listed in its ``<loc>`` elements.

    Args:
        url: Address of the sitemap XML document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the stripped text of every ``<loc>`` element, in document
        order.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    return [loc.text.strip() for loc in soup.find_all("loc")]

In [4]:
# Load the root sitemap, which lists one sub sitemap per site section, and
# keep only the sub sitemaps whose URL contains "mitteilungen".
url = "https://www.zh.ch/bin/zhweb/publish/sitemap.xml"
sitemaps = get_sitemap(url)
sitemaps_news = list(filter(lambda entry: "mitteilungen" in entry, sitemaps))
print(sitemaps_news)

['https://www.zh.ch/de/news-uebersicht/medienmitteilungen.zhweb-sitemap.xml', 'https://www.zh.ch/de/news-uebersicht/mitteilungen.zhweb-sitemap.xml']


In [5]:
# Fetch links of HTML pages from all sub sitemaps.
# `results` keeps (sub sitemap URL, page URL) pairs; `links` is the flat list
# of page URLs that the scraping step iterates over.
results = []
for sitemap in tqdm(sitemaps_news):
    results.extend((sitemap, link) for link in get_sitemap(sitemap))

links = [link for _, link in results]
# Shuffle with a fixed seed so that, for an unchanged sitemap, the slice of
# links scraped afterwards is the same on every run.
random.Random(42).shuffle(links)

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Download each news page and collect (url, text) pairs. The text is the lead
# paragraph followed by all body paragraphs, joined with single spaces.
corpus = []
failed_links = []  # pages that could not be downloaded
for link in tqdm(links[:1500]):
    try:
        raw = requests.get(link, timeout=30)
        raw.raise_for_status()
    except requests.RequestException:
        # One unreachable page must not abort the whole crawl; remember it and move on.
        failed_links.append(link)
        continue
    soup = BeautifulSoup(raw.text, "lxml")
    # Pages without a lead paragraph are skipped.
    lead = soup.find("p", class_="atm-lead")
    if lead is None:
        continue
    # The multi-word class string is matched against the exact value of the
    # element's class attribute.
    content = soup.find("div", class_="text aem-GridColumn aem-GridColumn--default--12")
    if content is None:
        continue
    paragraphs = content.find_all("p", class_="atm-paragraph")
    if len(paragraphs) == 0:
        continue
    paragraphs = [p.get_text() for p in paragraphs]
    paragraphs = [lead.get_text()] + paragraphs
    final_text = " ".join(paragraphs)
    corpus.append((link, final_text))

if failed_links:
    print(f"Skipped {len(failed_links)} pages that could not be fetched.")

In [7]:
def clean_text(data):
    """Normalise a scraped news text.

    Steps, in order:
      1. Remove runs of two or more hyphens.
      2. Reduce runs of two or more dots to a single dot.
      3. Collapse every run of whitespace to one space. For ``str`` patterns
         ``\\s`` also matches newlines and non-breaking spaces, so no separate
         substitutions are needed for them.
      4. Cut the text at the first occurrence of
         "Kantonspolizei Zürich Mediendienst" and drop everything after it.
      5. Strip leading and trailing whitespace.

    Whitespace is collapsed *after* the hyphen/dot removal so that deleting a
    hyphen run between two spaces does not leave a double space behind, and
    *before* the split so the marker is found regardless of the original
    spacing between its words.

    Args:
        data: Raw text of one news bulletin.

    Returns:
        The cleaned text.
    """
    data = re.sub(r"-{2,}", "", data)
    data = re.sub(r"\.{2,}", ".", data)
    data = re.sub(r"\s+", " ", data)
    data = data.split("Kantonspolizei Zürich Mediendienst")[0]
    return data.strip()

In [8]:
# Build the corpus frame, clean every text, keep only texts longer than 200
# characters, drop exact duplicate texts and renumber the rows from 0.
df = pd.DataFrame(corpus, columns=["url", "text"])
df["text"] = df["text"].apply(clean_text)
long_enough = df["text"].map(len) > 200
df = (
    df[long_enough]
    .drop_duplicates(subset=["text"])
    .reset_index(drop=True)
)
df.to_parquet(f"{INPUT_DIR}zh_news.parq")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1245 entries, 0 to 1244
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     1245 non-null   object
 1   text    1245 non-null   object
dtypes: object(2)
memory usage: 19.6+ KB
