# Data Collection

This notebook performs all data collection for the thesis project.

## COVID-19 Data RIVM

In [None]:
import requests, re
from bs4 import BeautifulSoup
from pprint import pprint

In [None]:
# Scrape the RIVM index page for links to the COVID-19 CSV files.
base_url = 'https://data.rivm.nl/covid-19/'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(base_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# silently produce an empty list of files.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Keep only anchors whose href looks like "COVID-19<...>.csv".
pat = re.compile(r'COVID\-19.+\.csv')
csv_anchors = soup.find_all('a', href=pat)

# One record per file: the href as listed on the page (used as the local file
# name) and the download URL built by appending that href to base_url.
csv_urls = [
    {'file_name': anchor.get('href'), 'url': f"{base_url}{anchor.get('href')}"}
    for anchor in csv_anchors
]
pprint(csv_urls)

In [None]:
# Download every CSV found on the RIVM index page into data_dir.
data_dir = '../data/rivm/'
for csv_url in csv_urls:
    dest_file = f"{data_dir}{csv_url['file_name']}"
    print(f"Downloading file {csv_url['file_name']} into {data_dir}")
    # Fetch and validate the response BEFORE opening the destination file:
    # opening with 'wb' truncates it, so a failed request would otherwise wipe
    # a previously downloaded copy or store an HTTP error page as a .csv file.
    download = requests.get(csv_url['url'], timeout=60)
    download.raise_for_status()
    with open(dest_file, 'wb') as file_obj:
        file_obj.write(download.content)

### Compressing files

The data is provided as CSV, which results in significantly larger files than necessary. We will 'compress' the files by converting them to Parquet to save storage.

In [None]:
import pandas as pd
import os

In [None]:
# Convert every CSV file in data_dir to a parquet file with the same base name.
data_dir = '../data/rivm/'
for file_name in os.listdir(data_dir):
    # Match on the extension only; a substring test would also pick up names
    # such as "x.csv.bak", for which stripping the last 4 characters is wrong.
    if file_name.endswith('.csv'):
        print(f"Compressing {file_name} to parquet")
        csv_path = os.path.join(data_dir, file_name)
        parquet_path = os.path.join(data_dir, f"{os.path.splitext(file_name)[0]}.parquet")
        # The files are read with ';' as the column separator.
        data = pd.read_csv(csv_path, sep=";")
        data.to_parquet(parquet_path)

In [None]:
import shutil

# Bundle the contents of ../data/rivm/ into a zip archive named ../data/rivm.zip
# (make_archive appends the ".zip" extension to base_name itself).
shutil.make_archive(base_name='../data/rivm', format='zip', root_dir='../data/rivm/')

In [None]:
# Delete the original CSV files from data_dir.
for file_name in os.listdir(data_dir):
    # Only remove files that really end in ".csv"; a substring test would also
    # delete unrelated files whose name merely contains ".csv".
    if file_name.endswith('.csv'):
        os.remove(os.path.join(data_dir, file_name))

## Get AMS stock pricing history

In [None]:
# Request the full historical price series for the AMS index code from the
# Financial Modeling Prep API.
# Read the key from the environment so it is never committed with the notebook;
# falls back to an empty string when the variable is not set.
api_key = os.environ.get("FMP_API_KEY", "")
index_code = "AMS"
base_url = "https://financialmodelingprep.com/api/v3/historical-price-full/"
# base_url already ends with '/', so no extra separator is added here.
full_url = f"{base_url}{index_code}?apikey={api_key}"
response = requests.get(full_url, timeout=30)
# Surface authentication and HTTP errors here rather than as a confusing
# failure when the JSON payload is parsed later.
response.raise_for_status()

In [None]:
# Build a DataFrame from the 'historical' entry of the JSON response.
historical_records = response.json()['historical']
ams_data = pd.DataFrame(historical_records)

# Store the same frame both as CSV and as parquet.
ams_data.to_csv('../data/stockmarkets/AMS.csv')
ams_data.to_parquet('../data/stockmarkets/AMS.parquet')

In [None]:
# Show the downloaded price history; as the last expression of the cell the
# frame is rendered with its rich notebook representation.
ams_data

## Johns Hopkins

In [None]:
# Clone the govex COVID-19 GitHub repository into ../data/govex/ via a shell escape.
# NOTE(review): git normally refuses to clone into an existing non-empty directory,
# so re-running this cell presumably fails once the data is present — confirm.
!git clone https://github.com/govex/COVID-19.git ../data/govex/