In [2]:
import json
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Functions for scraping the pages

In [3]:
def sessionInfo(raw_data, url):
    """Collect the header fields of a parsed session page.

    Parameters
    ----------
    raw_data
        Parsed page object; only its ``find(tag, attrs)`` method is used.
    url : str
        Address the page was fetched from; stored unchanged under ``'url'``.

    Returns
    -------
    dict
        ``'date'`` (value of the ``#date`` input with ``/`` replaced by ``-``),
        then up to four of ``'term'``, ``'ejlasie'``, ``'preset_mps'`` and
        ``'session'`` (each the first run of digits, as a string, of the
        corresponding header fragment), then ``'url'``.

    Raises
    ------
    AttributeError
        If one of the looked-up elements is missing from the page.
    IndexError
        If a header fragment contains no digits.
    """
    field_names = ('term', 'ejlasie', 'preset_mps', 'session')

    date_input = raw_data.find("input", {"id": "date"})
    session_info = {'date': date_input.get("value").replace("/", "-")}

    # The header is spread over two columns; glue them into one " - " separated
    # string so both can be split with the same delimiter.
    right_column = raw_data.find("div", {"class": "col-sm-4 paddingHR"}).text.strip()
    left_column = raw_data.find("div", {"class": "col-sm-3 paddingHL"}).text.strip()
    header_text = " - ".join((right_column, left_column))

    # A line break followed by indentation also separates two fields.
    fragments = re.sub("\\r\\n +", " - ", header_text).split(" - ")

    # zip() stops at the shorter sequence, so surplus fragments are ignored and
    # missing fragments simply leave their key out.
    session_info.update(
        (name, re.findall(r"([0-9]+)", fragment)[0])
        for name, fragment in zip(field_names, fragments)
    )

    session_info['url'] = url
    return session_info

In [4]:
def scrape_url(url):
    """Download one session page and extract its header info and discussions.

    Parameters
    ----------
    url : str
        Address of the session page to fetch.

    Returns
    -------
    dict
        ``{"sessionInfo": ..., "mashrooh": ...}``. ``sessionInfo`` is whatever
        ``sessionInfo(raw_data, url)`` returns. ``mashrooh`` maps
        ``"order" + <first word of the order title>`` to
        ``{"title": <title text>, "discussions": [[name, opinion, lecture], ...]}``
        and is empty when the page yields no rows.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched, times out, or answers with an HTTP
        error status.
    """
    # Read data, create soup object & filter data.
    # The timeout keeps a single unresponsive page from stalling the whole run;
    # raise_for_status() stops an HTTP error page from being parsed as a session.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    raw_data = BeautifulSoup(page.content, 'html.parser')
    results = raw_data.find_all("div", {"class": "col-sm-10"})

    # Create dictionary & insert `sessionInfo` into it.
    extracted = {"sessionInfo": sessionInfo(raw_data, url)}

    # Flatten the page into rows: a one-element row [text] for every div that
    # holds an <i class="fa"> icon (used below as an order title), and a
    # three-element row [name, opinion, lecture] for every div that holds a
    # "personInfoCol" block. Other divs are ignored.
    content = []
    for line in results:
        if line.find("i", {"class": "fa"}) is not None:
            content.append([line.text.strip()])
            continue

        person = line.find("div", {"class": "personInfoCol"})
        if person is None:
            continue

        name = person.find("div", {"class": "personInfoCol"}).text.strip()

        # Reset for every row: without this, a row lacking the "colLeft" column
        # would silently reuse the previous speaker's value (or raise
        # UnboundLocalError if it is the first speech row).
        opinion = ""
        opinion_column = line.find("div", {"class": "col-sm-10 colLeft"})
        if opinion_column is not None:
            opinion = opinion_column.find("div", {"class": "lector"}).text.strip()

        lecture = line.find("div", {"class": "lectorText"}).text.strip()
        content.append([name, opinion, lecture])

    # Group the rows: each one-element row opens a new order and the rows up to
    # the next one-element row are its discussions.
    # NOTE(review): the key is built from the first word of the title only, so
    # two orders whose titles start with the same word overwrite each other.
    mashrooh = dict()

    def store_order(first, last):
        title = content[first][0]
        mashrooh["order" + title.split()[0]] = {
            "title": title,
            "discussions": content[first + 1:last],
        }

    start = 0
    # Positions come from enumerate() rather than content.index(row): index()
    # returns the first equal row, which is wrong when a title text repeats.
    for position, row in enumerate(content[1:], start=1):
        if len(row) == 1:
            store_order(start, position)
            start = position

    # The last order runs to the end of the rows; skip it when nothing was found.
    if content:
        store_order(start, None)

    extracted['mashrooh'] = mashrooh

    return extracted

### Function for exporting extracted data

In [5]:
def export_to_file(extracted_data):
    """Write one scraped session to the ``exports`` directory as UTF-8 JSON.

    The file is named
    ``{date}_Term-{term}_Session-{session}_Ejlasie-{ejlasie}.json`` using the
    values found under ``extracted_data['sessionInfo']``.

    Parameters
    ----------
    extracted_data : dict
        JSON-serialisable dictionary with a ``'sessionInfo'`` entry providing
        the ``'term'``, ``'ejlasie'``, ``'session'`` and ``'date'`` keys.

    Raises
    ------
    KeyError
        If one of the keys needed for the file name is missing.
    """
    session_info = extracted_data['sessionInfo']
    term = session_info['term']
    ejlasie = session_info['ejlasie']
    session = session_info['session']
    date = session_info['date']

    filename = f"{date}_Term-{term}_Session-{session}_Ejlasie-{ejlasie}.json"

    # Create the target directory on first use instead of failing with
    # FileNotFoundError on a fresh checkout.
    os.makedirs("exports", exist_ok=True)

    with open(os.path.join("exports", filename), 'w', encoding='utf8') as json_file:
        # Serialise the dictionary itself. Dumping an already-dumped string
        # would store a single JSON string literal instead of a JSON object.
        json.dump(extracted_data, json_file, ensure_ascii=False)

### Reading valid session URLs from `valid_session_urls.csv`

In [6]:
# Load the CSV of session pages and keep its "url" column as a plain Python list.
urls = (
    pd.read_csv("valid_session_urls.csv")
    ["url"]
    .to_list()
)

### Scraping the pages

In [7]:
# Scrape every session page and write one JSON file per page.
# enumerate() supplies the running count directly; urls.index(url) would rescan
# the list on every iteration and report the wrong position for a URL that
# appears more than once.
total_pages = len(urls)
for page_number, url in enumerate(urls, start=1):
    extracted = scrape_url(url)
    export_to_file(extracted)
    # '\r' rewinds to the start of the line so the counter updates in place.
    print(f"Scraped and exported {page_number} out of {total_pages} page contents.", end='\r')

print("\nDone!")

Scraped and exported 82 out of 82 page contents.
Done!
