# Scrape data from Landratsamt (LR) Traunstein (TS)

LR TS: https://www.traunstein.com/aktuelles/meldungen

In [None]:
%reset -f 

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from datetime import datetime, timedelta, date
import math
import pandas as pd

In [None]:
# make sure chromedriver is installed
# sudo apt install chromium-driver
from selenium.webdriver.chrome.service import Service

CHROMEDRIVER_PATH = "/usr/bin/chromedriver"

try:
    # Selenium 4: the driver binary is passed via a Service object; the
    # positional executable path is no longer accepted by recent releases.
    driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))
except TypeError:
    # Selenium 3 does not know the ``service`` keyword; fall back to the
    # positional executable path.
    driver = webdriver.Chrome(CHROMEDRIVER_PATH)

In [None]:
def get_soup(driver, url, wait=2, parser="html.parser"):
    """Load ``url`` in the browser and return the page source as soup.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the page to load.
        wait: Seconds to sleep after ``driver.get`` before reading the page
            source (defaults to the previously hard-coded 2 seconds).
        parser: Name of the BeautifulSoup parser. Passing it explicitly keeps
            the parse result independent of which optional parsers happen to
            be installed and avoids bs4's "no parser specified" warning.

    Returns:
        A ``BeautifulSoup`` object built from ``driver.page_source``.
    """
    driver.get(url)
    time.sleep(wait)
    return BeautifulSoup(driver.page_source, parser)

## Search Bing for pages under `https://www.traunstein.com/aktuelles/meldungen` containing the keywords "Insgesamt" and "Corona"

In [None]:
# Bing query restricted to traunstein.com/aktuelles/meldungen; it is requested
# twice, once as-is and once with the "&first=49" suffix appended.
url = "https://www.bing.com/search?q=site%3Atraunstein.com%2Faktuelles%2Fmeldungen+Insgesamt+Corona&count=100"

urls = set()
for ext in ('', '&first=49'):
    soup = get_soup(driver, url + ext)
    for a in soup.find_all('a'):
        href = a.get('href', '').lower()
        # keep only links to traunstein.com whose address mentions "faelle"
        if href.startswith('https://www.traunstein.com') and 'faelle' in href:
            urls.add(href)

len(urls)

## Extract the actual data from the URLs retrieved from Bing

In [None]:
def extract_cases_table(soup):
    """Return the non-empty cell texts of the first table in the page content.

    Only ``div`` elements with class ``field-item even`` and property
    ``content:encoded`` are searched; the first one containing a ``table``
    wins.

    Args:
        soup: Parsed page (anything offering BeautifulSoup's ``find_all``).

    Returns:
        A list with one entry per ``tr``; each entry is the list of stripped,
        non-empty ``td`` texts of that row. ``None`` if no matching ``div``
        contains a table.
    """
    content_divs = soup.find_all(
        'div', attrs={'class': 'field-item even', 'property': 'content:encoded'}
    )
    table = next(
        (found for div in content_divs if (found := div.find('table')) is not None),
        None,
    )
    if table is None:
        return None

    data = []
    for row in table.find_all('tr'):
        cells = (cell.text.strip() for cell in row.find_all('td'))
        data.append([text for text in cells if text])  # drop empty cells
    return data

def get_matching_row(s, rows=None):
    """Return the first row that has a cell containing ``s`` (case-insensitive).

    Args:
        s: Text to look for inside the cells.
        rows: List of rows (each a list of strings) to search. When omitted,
            the notebook-level ``data`` variable is searched, which keeps
            existing one-argument calls working.

    Returns:
        The first matching row, or ``None`` if no cell contains ``s``.
    """
    if rows is None:
        rows = data  # fall back to the table parsed last at notebook level
    needle = s.lower()
    return next(
        (row for row in rows if any(needle in col.lower() for col in row)),
        None,
    )

### Example

In [None]:
# Try the table extraction on a single report page.
example_url = "https://www.traunstein.com/aktuelles/meldungen/insgesamt-1278-bestaetigte-corona-faelle-im-landkreis-traunstein"
extract_cases_table(get_soup(driver, example_url))

### Get them all

In [None]:
# Visit every report URL and collect one dict per page that could be parsed.
all_data = []

for i, url in enumerate(urls):
    print(i, end=',')
    soup = get_soup(driver, url)

    if soup is None:
        print('\nCannot retrieve URL:', url, i)
        continue

    # Must remain a notebook-level name: get_matching_row() searches `data`.
    data = extract_cases_table(soup)

    if data is None:
        print('\nCannot parse URL:', url, i)
        continue

    stand_match = get_matching_row('stand')
    recovered_match = get_matching_row('geheilt')
    # First row with a cell containing "gesamt", plus the index of that cell.
    header = next(
        ((col_idx, row) for row in data
         for col_idx, col in enumerate(row) if 'gesamt' in col.lower()),
        None,
    )
    # A page lacking any of these rows is skipped with a message instead of
    # aborting the whole run with a TypeError on None.
    if stand_match is None or recovered_match is None or header is None:
        print('\nCannot parse URL:', url, i)
        continue

    total_idx, total_line = header
    date_idx = next(
        (col_idx for col_idx, col in enumerate(total_line) if 'datum' in col.lower()),
        None,
    )
    if date_idx is None:
        print('\nCannot parse URL:', url, i)
        continue

    # These rows are optional: a page without them yields NaN.
    death_match = get_matching_row('verstorbene')
    active_match = get_matching_row('aktive')

    # Only rows with as many cells as the header row are taken as table rows
    # (the header row itself is included).
    table_values = [
        (row[date_idx], row[total_idx])
        for row in data if len(row) == len(total_line)
    ]

    all_data.append(dict(
        date=stand_match[0],
        recovered=recovered_match[1],
        table_values=table_values,
        url=url,
        death=float('nan') if death_match is None else death_match[1],
        active=float('nan') if active_match is None else active_match[1],
    ))

### Reformat

In [None]:
# Pair every report with its parsed "Stand: ..." timestamp and order the
# pairs chronologically. Sorting uses the timestamp only: comparing whole
# (timestamp, dict) tuples would raise TypeError as soon as two reports share
# a timestamp, because dicts cannot be ordered.
all_data_sorted = sorted(
    ((datetime.strptime(d['date'].replace('\xa0', ''), 'Stand: %d.%m.%Y %H:%M Uhr'), d)
     for d in all_data),
    key=lambda pair: pair[0],
)

In [None]:
# Per-day record: maps a date to a dict with the keys 'cases' and
# EXTRA_INT_KEYS.
cases_ts = {}

EXTRA_INT_KEYS = ['recovered', 'death', 'active']

for date_updated, dvals in all_data_sorted:
    # The recovered/death/active figures belong to the day of the report.
    du = date_updated.date()
    report_day = cases_ts.setdefault(du, {})
    for key in EXTRA_INT_KEYS:
        report_day[key] = float(dvals[key])

    # The first table entry is skipped (presumably the header row).
    for date_str, total in dvals['table_values'][1:]:
        d = datetime.strptime(date_str, '%d.%m.%Y').date()
        table_day = cases_ts.setdefault(d, {})
        table_day['cases'] = int(total)
        # Days seen only in a table get NaN placeholders for the extra keys.
        for key in EXTRA_INT_KEYS:
            table_day.setdefault(key, float('nan'))

cases_ts[du]

### Clean the data

In [None]:
# The earliest day has no recovered/death/active figures of its own (checked
# by the assert); copy them over from the second-earliest day.
date0, date1 = sorted(cases_ts)[:2]
earliest, following = cases_ts[date0], cases_ts[date1]
for key in EXTRA_INT_KEYS:
    assert math.isnan(earliest[key])
    earliest[key] = following[key]

In [None]:
# Missing death counts only occur at the beginning of the series; treat them as 0.
for day_values in cases_ts.values():
    if math.isnan(day_values['death']):
        day_values['death'] = 0

### Write to file

In [None]:
# One record per day in chronological order, then indexed by timestamp.
records = [
    (day, vals['cases'], vals['recovered'], vals['death'], vals['active'])
    for day, vals in sorted(cases_ts.items())
]
df = pd.DataFrame(
    records,
    columns=['date', 'confirmed', 'recovered_alive', 'death', 'active'],
)
df = df.set_index('date')
df.index = pd.to_datetime(df.index)
df

In [None]:
# Always write "\n" line endings. pandas 1.5 renamed ``line_terminator`` to
# ``lineterminator`` and pandas 2.0 removed the old spelling, so try the new
# keyword first and fall back for older pandas versions.
try:
    df.to_csv('data_raw_TS_LR.csv', lineterminator="\n")
except TypeError:
    df.to_csv('data_raw_TS_LR.csv', line_terminator="\n")