In [None]:
import time
import requests
import os
import codecs
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Entry URL (German "de-de" locale) that the scraper opens first.
course = (
    'https://learn.microsoft.com/de-de/credentials/certifications/'
    'power-bi-data-analyst-associate/'
)

In [None]:
# Let webdriver_manager install a chromedriver binary, then start Chrome
# through a Service object pointing at it.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=ChromeService(chromedriver_path))

### Sections

In [None]:
driver.get(course)

# Wait (at most 10 seconds) until the element with this ID is present in the
# DOM, so the page source is only captured after it has been added.
title_locator = (By.ID, "learn-bizapps.get-started-data-analytics_title")
wait = WebDriverWait(driver, 10)
element = wait.until(EC.presence_of_element_located(title_locator))

html = driver.page_source

In [None]:
# Every <div class="card-template"> is treated as one section; the href of
# its first <a> is prefixed with the site root to form the section URL.
soup = BeautifulSoup(html, 'html.parser')
section_cards = soup.find_all('div', class_='card-template')
section_hrefs = [card.a.get('href') for card in section_cards]
sections = ['https://learn.microsoft.com/' + href for href in section_hrefs]

### Modules

In [None]:
modules = []

for index, section_url in enumerate(sections):
    driver.get(section_url)
    time.sleep(2)  # fixed pause before the page source is read
    html = driver.page_source

    # The first section is searched with a shorter class string than the rest
    # (presumably its page uses a different layout -- verify against the site).
    if index == 0:
        link_class = 'display-block text-decoration-none'
    else:
        link_class = 'display-block text-decoration-none font-weight-semibold font-size-h6 margin-none has-content-margin-right-xxl-tablet'

    soup = BeautifulSoup(html, 'html.parser')
    module_anchors = soup.find_all('a', class_=link_class)

    # Append each href to the module base URL, then collapse the
    # '/../../modules/' part that hrefs of the form '../../modules/<name>'
    # leave in the combined string.
    section_modules = [
        ('https://learn.microsoft.com/de-de/training/modules/' + anchor.get('href'))
        .replace('/../../modules/', '/')
        for anchor in module_anchors
    ]

    modules.extend(section_modules)

In [None]:
# Drop every occurrence of this segment from the collected module URLs
# (presumably the duplicate produced when a root-relative href was appended
# to the module base URL -- verify against the scraped hrefs).
duplicated_segment = '//de-de/training/modules'
modules = [url.replace(duplicated_segment, '') for url in modules]

### Pages

In [None]:
pages = []

for module_url in modules:
    driver.get(module_url)
    time.sleep(2)  # fixed pause before the page source is read

    unit_soup = BeautifulSoup(driver.page_source, 'html.parser')
    unit_links = unit_soup.find_all(
        'a', class_='unit-title display-block font-size-md has-line-height-reset'
    )

    # A page URL is the module URL with the link's href appended verbatim.
    # The list is built completely before it is added to `pages`.
    pages.extend([module_url + link.get('href') for link in unit_links])

In [None]:
# Persist the collected page URLs, one per line.
page_links_path = 'Data/page_links.csv'

# Create the target directory first so this cell also runs on a fresh
# checkout where 'Data/' does not exist yet.
os.makedirs(os.path.dirname(page_links_path), exist_ok=True)

# Explicit encoding keeps the file identical across platforms.
with open(page_links_path, 'w', encoding='utf-8') as f:
    f.writelines(page + '\n' for page in pages)

### HTML

In [None]:
output_folder = 'Data/Content/HTML'
knowledge_checks = []
excercises = []

# Make sure the download folder exists so the first write cannot fail.
os.makedirs(output_folder, exist_ok=True)

for i, page in tqdm(enumerate(pages), total=len(pages)):

    # URLs containing 'check' or 'lab' are only recorded, not downloaded.
    if 'check' in page:
        knowledge_checks.append(page)
        continue

    if 'lab' in page:
        excercises.append(page)
        continue

    # Without a timeout a stalled connection would block the loop forever.
    response = requests.get(page, timeout=30)
    if response.status_code != 200:
        # Name the failing URL and status so the run can be diagnosed.
        print(f'Error when requesting {page}: HTTP {response.status_code}')
        break

    # Files are named after the 1-based position of the URL in `pages`;
    # skipped URLs therefore leave gaps in the numbering.
    file_path = os.path.join(output_folder, str(i+1) + '.html')
    with open(file_path, 'w', encoding='utf-8') as f:
        # Stored as the repr of the bytes object (b'...' with escapes); the
        # chapter-splitting cell reverses this with 'unicode_escape', so the
        # on-disk format is kept unchanged here.
        f.write(str(response.content))

In [None]:
# Write the skipped URLs to one CSV per category, one URL per line.
for csv_path, links in (
    ('Data/Content/knowledge_checks.csv', knowledge_checks),
    ('Data/Content/excercises.csv', excercises),
):
    with open(csv_path, 'w') as f:
        f.writelines(link + '\n' for link in links)

### Chapters

In [None]:
input_folder = 'Data/Content/HTML'
output_folder = 'Data/Content/Chapters'

# Only real heading tags start a new chapter. A substring test such as
# `'h' in name` would also match tags like 'hr' or 'th'.
HEADING_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}


def _decode_saved_text(text):
    r"""Turn text taken from a saved page back into readable Unicode.

    The pages were written as ``str(response.content)``, i.e. the repr of a
    bytes object, so non-ASCII characters appear as ``\xNN`` escapes of their
    individual bytes. ``unicode_escape`` turns each escape into one code
    point; re-encoding as Latin-1 restores the original bytes, which are then
    decoded as UTF-8. If that is not possible the unescaped text is returned
    unchanged.
    """
    unescaped = codecs.decode(text, 'unicode_escape')
    try:
        return unescaped.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return unescaped


def _write_chapter(folder, number, module_name, content):
    """Write '<number>.txt': the module name, then one decoded text block per line."""
    file_path = os.path.join(folder, str(number) + '.txt')
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(module_name)
        f.write('\n')
        for element in content:
            f.write(_decode_saved_text(element))
            f.write('\n')


os.makedirs(output_folder, exist_ok=True)

chapter_count = 1

# Keep only '<number>.html' files and process them in numeric order; stray
# entries in the folder would otherwise break the int() sort key.
html_pages = sorted(
    (name for name in os.listdir(input_folder)
     if name.endswith('.html') and name.split('.')[0].isdecimal()),
    key=lambda name: int(name.split('.')[0]),
)

for page in html_pages:

    # File '<n>.html' was saved from pages[n - 1]. The numbering has gaps for
    # pages that were skipped, so the position in `html_pages` must not be
    # used as the index into `pages`.
    page_number = int(page.split('.')[0])
    module = pages[page_number - 1].split('/')[-2]

    html_path = os.path.join(input_folder, page)
    # The saved files contain only ASCII (bytes repr), so utf-8 reads them safely.
    with open(html_path, encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')
    inner_section = soup.find(id="unit-inner-section")

    if inner_section is None:
        # Nothing to split; skip instead of failing on iteration over None.
        print(f'No unit-inner-section found in {html_path}, skipping')
        continue

    # Direct children that are tags (text nodes have no name), excluding <div>.
    items = [
        item for item in inner_section
        if item.name is not None and item.name != 'div'
    ]

    content = []
    for item in items:
        # A heading closes the chapter collected so far and opens a new one.
        if item.name in HEADING_TAGS and content:
            _write_chapter(output_folder, chapter_count, module, content)
            chapter_count += 1
            content = []

        content.append(item.text)

    # Flush whatever was collected after the last heading of the page.
    _write_chapter(output_folder, chapter_count, module, content)
    chapter_count += 1