# Data Acquisition

## Start-up

In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to local modules (e.g. crawler.utils) are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

In [2]:
# Set main variables
# Landing page of the site to crawl; this is the first page fetched below
MAIN_URL = "https://www.munichre.com/en"

In [3]:
# Import libraries
from crawler.utils import (
    get_html_from_page,
    get_html_from_page_selenium,
    get_all_urls_from_html,
    get_visible_text_from_html
)
import tqdm
import pandas as pd

## Let's scrape

In [4]:
# Get the html from the main page
# NOTE(review): headless=True presumably runs the Selenium browser without a
# visible window — confirm in crawler.utils
main_html = get_html_from_page_selenium(MAIN_URL, headless=True)

In [5]:
# Get all href
# These URLs found in the main page's HTML are the candidate pages to crawl.
# NOTE(review): presumably the href values of the page's links — confirm in crawler.utils
urls_from_main = get_all_urls_from_html(main_html)

In [6]:
# Only those that point at a munich re webpage.
# The prefix is MAIN_URL itself, so the filter cannot drift from the configured
# entry point if that constant is ever changed.
#
# Remove duplicates with a set, then sort: plain list(set(...)) yields an
# arbitrary order that can differ between kernel sessions, which would make
# the crawl order (and the row order of the stored results) non-reproducible.
useful_urls = sorted({url for url in urls_from_main if url.startswith(MAIN_URL)})

In [8]:
# Iterate over all urls and store the visible text of each page.
# `scrapped_urls` and `text_list` are always appended together, so they stay
# aligned index-by-index. Pages that fail are skipped and recorded in
# `failed_urls` so they can be inspected or retried afterwards.
scrapped_urls = []
text_list = []
failed_urls = []  # (url, error description) for every page that could not be scraped
for url in tqdm.tqdm(useful_urls):
    # Skip urls that were already scraped
    if url in scrapped_urls:
        continue
    try:
        # Get html from page
        html = get_html_from_page_selenium(url, headless=True)
        # Extract only the visible text
        text = get_visible_text_from_html(html)
    except Exception as exc:
        # Catch Exception instead of using a bare `except:` — a bare except
        # also swallows KeyboardInterrupt/SystemExit, which makes this
        # long-running loop impossible to interrupt cleanly.
        failed_urls.append((url, repr(exc)))
        # tqdm.write prints the message without breaking the progress bar
        tqdm.tqdm.write(f"Could not get text from {url}")
        continue
    # Append
    text_list.append(text)
    scrapped_urls.append(url)

 12%|█▏        | 93/805 [22:25<2:14:57, 11.37s/it]

Could not get text from https://www.munichre.com/en/company/media-relations/media-information-and-corporate-news/media-information/2007/2007-11-06-munich-re-support-for-project-to-develop-a-global-open-source-earthquake-model-global-earthquake-model-generating-information-of-the-highest-standard-through-cooperation-between-many-of-the-worlds-top-earthquake-experts-munich-re-sees-improved-opportunities-in-the-medium-term-for-loss-prevention-and-insurability.html


 23%|██▎       | 184/805 [39:41<1:57:36, 11.36s/it]

Could not get text from https://www.munichre.com/en/company/media-relations/media-information-and-corporate-news/media-information/2005/2005-03-15-munich-re-group-2004-profit-of-eur1833bn-pleasing-combined-ratio-of-989-in-reinsurance-despite-high-natural-catastrophe-losses-combined-ratio-in-primary-insurance-of-only-930-dividend-proposal-of-eur200-125-per-share-target-for-anniversary-year-2005-12-return-on-equity-innovation-offensive-to-create-new-earnings-potentials.html


 73%|███████▎  | 584/805 [2:03:01<41:08, 11.17s/it]  

Could not get text from https://www.munichre.com/en/company/media-relations/media-information-and-corporate-news/media-information/2007/2007-12-27-natural-catastrophe-figures-for-2007-higher-losses-despite-absence-of-megacatastrophes-very-many-loss-events-overall-economic-losses-of-us-75bn-board-member-dr-torsten-jeworrek-loss-figures-in-line-with-the-rising-trend-in-natural-catastrophes-munich-re-is-prepared.html


 96%|█████████▌| 770/805 [2:41:30<06:34, 11.26s/it]

Could not get text from https://www.munichre.com/en/company/media-relations/media-information-and-corporate-news/media-information/2007/2007-12-13-munich-re-investors-day-focuses-on-the-groups-primary-insurance-business-munich-re-demonstrates-success-of-integrated-business-model-with-primary-insurance-and-reinsurance-ergo-insurance-group-to-contribute-significantly-to-achieving-the-financial-objectives-of-the-munich-re-groups-changing-gear-programme-ergo-with-ambitious-targets-for-2012-normalised-profit-of-over-eur900m-and-sustainable-roe-of-12-15-with-a-premium-volume-of-over-eur23bn-optimised-capital-structure-ergo-plans-payment-of-eur10bn-dividend-and-financing-through-hybrid-capital.html


100%|██████████| 805/805 [2:48:27<00:00, 12.56s/it]


In [9]:
# Build a two-column frame (page url + its extracted text) from the lists
# filled by the scraping loop, then write it to disk as the raw dataset
results_df = pd.DataFrame({'url': scrapped_urls, 'text': text_list})
results_df.to_csv('../data/01_raw_df.csv', index=False)