# Import Necessary Modules

In [1]:
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import ElementNotInteractableException, TimeoutException, WebDriverException, ElementClickInterceptedException
from selenium.webdriver.support import expected_conditions as EC
import openpyxl

## Loading Webdriver for selenium

In [2]:
# Service wrapper around the local ChromeDriver path; it is handed to Chrome()
# when the browser is started.
ser = Service(executable_path='/home/irfan/Downloads/chromedriver_linux64')

## Initializing web driver

In [3]:
# Default Chrome options (nothing is customised here), then start the browser
# session through the Service object created earlier.
myoptions = Options()
driver = Chrome(
    options=myoptions,
    service=ser,
)

## Opening the browser or redirecting to the link using get method

In [4]:
# Load the site's home page; the sector tiles are read from it next.
driver.get(url='https://www.eduvision.edu.pk/')

### Using Selenium selectors, find all the anchor tags with class = 'list_hlaf_block' <br> and extract the sector names nested in those anchor tags. <br>This scrapes the home page and collects the anchor tags of all sectors.

In [5]:
# Every element with the 'list_hlaf_block' class is a tile on the home page.
# The reversed slice keeps the 4th-, 5th- and 6th-from-last tiles, in that order.
anchors = driver.find_elements(By.CLASS_NAME, "list_hlaf_block")[-4:-7:-1]

# The visible sector name sits in the <b> element nested inside each tile.
sectors_name = [
    tile.find_element(By.CSS_SELECTOR, 'b').text
    for tile in anchors
]

# The tile's href points at the sector's listing page.
sectors_links = [tile.get_attribute('href') for tile in anchors]

### Scraping the links and names of Universities of all Sectors

In [None]:
# Institute name -> URL of its page, filled in by scrape_sectors().
institutes = {}


def scrape_sectors():
    """Record every institute listed on the page currently open in ``driver``.

    Each element with class 'threecolumn' is expected to contain a 'fixText'
    element whose first anchor holds the institute's name (link text) and
    URL (href).  Results go into the module-level ``institutes`` dict; an
    institute name seen again overwrites the earlier entry.
    """
    for column in driver.find_elements(By.CLASS_NAME, "threecolumn"):
        title_box = column.find_element(By.CLASS_NAME, 'fixText')
        link = title_box.find_element(By.CSS_SELECTOR, 'a')
        # The right-hand side is evaluated first, so the href is read before
        # the link text.
        institutes[link.text] = link.get_attribute('href')


# Visit every sector page and harvest the institutes listed on it.
for link in sectors_links:
    try:
        driver.get(link)
        scrape_sectors()
    except WebDriverException:
        # Every Selenium error (timeouts, stale or missing elements, ...)
        # derives from WebDriverException.  Catching it instead of using a
        # bare ``except`` lets KeyboardInterrupt and programming errors
        # surface instead of being retried.
        #
        # An empty title is treated as the sign of a page that did not load:
        # refresh a bounded number of times so a dead page cannot hang the run.
        for _ in range(3):
            if driver.title:
                break
            driver.refresh()
        # Make sure a document is present before reading it again.
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        # Second and last attempt; a failure here propagates to the caller.
        scrape_sectors()

### scrap_info() <-- this scrapes the program information and stores it in the dictionary

In [None]:
# Institute name -> {tab label text -> {program name -> [other cell texts]}}.
institutes_degrees = {}
# Retained so existing references to this name keep working.  scrap_info()
# no longer stores this single dict for every institute.
degree_programs = {}


def _read_program_rows(table):
    """Return ``{first cell text: [remaining cell texts]}`` for one table.

    Only rows located under the table's elements with class 'para' are read.
    """
    rows = {}
    for section in table.find_elements(By.CLASS_NAME, 'para'):
        for row in section.find_elements(By.TAG_NAME, 'tr'):
            cells = [cell.text for cell in row.find_elements(By.TAG_NAME, 'td')]
            if cells:  # a row without <td> cells has no program to record
                rows[cells[0]] = cells[1:]
    return rows


def scrap_info(labels, ta=1):
    """Scrape the program tables of the institute page open in ``driver``.

    Walks ``labels`` looking for the label whose ``for`` attribute is
    ``tab-<ta>``, clicks it, reads table number ``ta`` (1-based, in document
    order) and then moves on to the next tab number.  The rows of each tab
    are stored under the label's text.

    Results are written to ``institutes_degrees[name]``, where ``name`` is
    the module-level variable set by the loop that drives this function.
    Each institute gets its own dict, so data from one institute can no
    longer show up under another.  Entries are stored as soon as a tab is
    read, so a retry after a failed click adds to what was already
    collected.

    Args:
        labels: ``<label>`` WebElements found on the page.
        ta: Number of the first tab to read (default 1).

    Raises:
        IndexError: if a matching label exists but the page has fewer than
            ``ta`` tables.
        selenium WebDriverException subclasses raised by ``click()`` or the
            element look-ups are not handled here.
    """
    tables = driver.find_elements(By.TAG_NAME, 'table')
    programs = institutes_degrees.setdefault(name, {})
    for label in labels:
        if label.get_attribute('for') != f"tab-{ta}":
            continue
        label.click()
        # The table is read first and the label text afterwards.
        programs[label.text] = _read_program_rows(tables[ta - 1])
        ta += 1
    if not labels and len(tables) >= ta:
        # No tab labels at all: file the table under a generic heading.
        # NOTE(review): assumes an untabbed page keeps its programs in
        # table number ``ta`` - confirm against such a page.
        programs['Programs'] = _read_program_rows(tables[ta - 1])


# Visit every institute page and scrape its program tables.  ``name`` is read
# as a global by scrap_info(), so the loop variable must keep that name.
for name, link in institutes.items():
    try:
        driver.get(link)
        labels = driver.find_elements(By.CSS_SELECTOR, "label")
        scrap_info(labels)
    except ElementNotInteractableException:
        # A tab label could not be clicked: start again from the second tab.
        scrap_info(labels, ta=2)
    except ElementClickInterceptedException:
        # Something is covering the label.  Give it time to become clickable.
        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "label"))
            )
        except TimeoutException:
            # Still not clickable: click the first label directly if it is
            # visible, otherwise give up on this institute.
            overlay = driver.find_element(By.CSS_SELECTOR, "label")
            if not overlay.is_displayed():
                continue
            overlay.click()
        # Retry now that the label should be reachable; without this the
        # institute would be left without any data.  A second failure skips
        # the institute so one bad page cannot abort the whole run.
        try:
            scrap_info(driver.find_elements(By.CSS_SELECTOR, "label"))
        except WebDriverException:
            continue
    except WebDriverException:
        # TimeoutException is a WebDriverException, so one clause covers both.
        # An empty title is treated as the sign of a page that did not load:
        # refresh a bounded number of times so a dead page cannot hang the run.
        for _ in range(3):
            if driver.title:
                break
            driver.refresh()
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        labels = driver.find_elements(By.CSS_SELECTOR, "label")
        scrap_info(labels)

# Retrieved data will be stored in a text file

In [None]:
def write_data_into_text_file(filename='scraped data.txt', data=None):
    """Dump the scraped programs to a tab-separated text file.

    Output format, one block per institute::

        <institute name>
        ------<level>------
        <program>\\t<info 1>\\t<info 2>\\t...\\t

    Every cell, including the last one on a line, is followed by a tab.

    Args:
        filename: Path of the file to (over)write.  Defaults to
            'scraped data.txt' in the current directory.
        data: Mapping ``{institute: {level: {program: [info, ...]}}}``.
            Defaults to the module-level ``institutes_degrees``.
    """
    if data is None:
        data = institutes_degrees

    lines = []
    for uni, uni_info in data.items():
        lines.append(f"{uni}\n")
        for level, programs in uni_info.items():
            lines.append(f"------{level}------\n")
            for program, info in programs.items():
                cells = "".join(f"{value}\t" for value in (program, *info))
                lines.append(f"{cells}\n")

    # Explicit UTF-8 so names with non-ASCII characters are written the same
    # way on every platform instead of depending on the locale's encoding.
    with open(filename, 'w', encoding='utf-8') as file:
        file.writelines(lines)

# Write everything collected in institutes_degrees to 'scraped data.txt'.
write_data_into_text_file()

# If you want to store the whole data in Excel

In [None]:
def header(c, r, worksheet):
    """Write the five program-column titles one row below ``r``.

    The titles go into consecutive cells of row ``r + 1``, starting at
    column ``c``.

    Args:
        c: Column of the first title.
        r: Row above the title row.
        worksheet: Object exposing ``cell(row=, column=, value=)``.

    Returns:
        ``c + 4``, the column of the last title written.
    """
    titles = ('Program', 'Durations', 'Fee', 'Deadline', 'Merit')
    title_row = r + 1
    for offset, title in enumerate(titles):
        worksheet.cell(row=title_row, column=c + offset, value=title)
    return c + len(titles) - 1
    
def write_in_excel():
    """Export ``institutes_degrees`` to 'Universities.xlsx'.

    Sheet layout (sheet title "institutes")::

        row 1      : 'Institutes' in column 1
        row r      : institute name in column 1, then one level name at the
                     first column of each level block
        row r + 1  : the titles written by header() for each level block
        row r + 2… : one row per program: name in the block's first column,
                     its details in the columns to the right

    Level blocks sit side by side and never share a column.  The next
    institute starts below the tallest block, so nothing written earlier is
    overwritten.  Program rows start below the title row rather than on it,
    and a level without programs is handled.
    """
    filename = "Universities.xlsx"
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.title = "institutes"
    worksheet.cell(row=1, column=1, value='Institutes')

    r = 2
    for uni_name, uni_info in institutes_degrees.items():
        worksheet.cell(row=r, column=1, value=uni_name)
        c = 2
        tallest = 0  # most program rows used by any level of this institute
        for level, programs in uni_info.items():
            worksheet.cell(row=r, column=c, value=level)
            # header() fills row r + 1 and returns the last column it used.
            last_col = header(c, r, worksheet)
            for row, (program, details) in enumerate(programs.items(), start=r + 2):
                worksheet.cell(row=row, column=c, value=program)
                for col, detail in enumerate(details, start=c + 1):
                    worksheet.cell(row=row, column=col, value=detail)
                # A program may carry more details than there are titles.
                last_col = max(last_col, c + len(details))
            tallest = max(tallest, len(programs))
            c = last_col + 1  # next level block starts on a free column
        # Name row + title row + program rows; an institute without any
        # level only occupies its name row.
        r += tallest + 2 if uni_info else 1
    workbook.save(filename)


write_in_excel()