In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import time

<p>ChromeDriver is a separate executable that Selenium WebDriver uses to control Chrome. It is maintained by the Chromium team with help from WebDriver contributors. If you are unfamiliar with Selenium WebDriver, you should check out the <a href="https://www.selenium.dev/">Selenium site</a>.</p>

https://googlechromelabs.github.io/chrome-for-testing/

# 1. Create df_state_universities dataframe.
#### Index of the dataframe is university code and the single column is the university name.

In [2]:
service = Service("chromedriver.exe")
driver= webdriver.Chrome(service=service)
driver.get("https://yokatlas.yok.gov.tr/lisans-anasayfa.php")
driver.maximize_window()
driver.implicitly_wait(5)
optgroups = driver.find_elements(by=By.XPATH,value="//optgroup[@label]")[:4]
labels = [optgroup.get_attribute("label").split()[0]   for optgroup in optgroups]
universities = { }
 
for label , optgroup in zip(labels,optgroups):
    options = optgroup.find_elements (by=By.TAG_NAME,value="option")
    universities[label] = {int(option.get_attribute("value")):option.get_attribute("innerText").strip()  for option in options}
df_universities =pd.DataFrame.from_dict(universities).stack().to_frame().reset_index()
df_universities.set_index(df_universities.columns[0],inplace=True) 
df_universities.index.name = "uni_code"
df_universities.columns = ["type","name"]
df_universities.head(3)  

Unnamed: 0_level_0,type,name
uni_code,Unnamed: 1_level_1,Unnamed: 2_level_1
1065,Devlet,ABDULLAH GÜL ÜNİVERSİTESİ
1104,Devlet,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİV...
1002,Devlet,ADIYAMAN ÜNİVERSİTESİ


# 2- Add the city column
#### Create new data frame with columns city name and university name

In [3]:
driver.get("https://yokatlas.yok.gov.tr/universite.php")
driver.implicitly_wait(10)
city_web_elements = driver.find_elements(by=By.CLASS_NAME, value="sehir")

cities = [web_element.get_attribute("innerText").strip() for web_element in city_web_elements]

name_web_elements = driver.find_elements(by=By.CLASS_NAME, value="baslik")
names =[web_element.get_attribute("innerText").strip() for web_element in name_web_elements]
df_city = pd.DataFrame({"city":cities,"name":names})
df_city.head(3)

Unnamed: 0,city,name
0,Kayseri,ABDULLAH GÜL ÜNİVERSİTESİ
1,İstanbul,ACIBADEM MEHMET ALİ AYDINLAR ÜNİVERSİTESİ
2,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİV...


Merge two dataframes: df_state_universities and df_city

In [4]:
df_universities=df_universities.reset_index().merge(df_city, on="name", how='left').set_index("uni_code")
df_universities.head(3)

Unnamed: 0_level_0,type,name,city
uni_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1065,Devlet,ABDULLAH GÜL ÜNİVERSİTESİ,Kayseri
1104,Devlet,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİV...,Adana
1002,Devlet,ADIYAMAN ÜNİVERSİTESİ,Adıyaman


In [5]:
df_universities.shape

(224, 3)

In [6]:
df_universities.to_csv("df_universities.csv")

# 3- Create df_quota 
### df_quota includes all departments and their quota/enrollments 

In [7]:
base_url = "https://yokatlas.yok.gov.tr/lisans-univ.php?u="

In [8]:
def get_department_names(driver,base_url, uni_codes):
    department_names = set()
    for uni_code in uni_codes:
        driver.get(base_url + str(uni_code) )  
        driver.implicitly_wait(2)
        divs_departments =  driver.find_elements(by=By.XPATH, value="//a[@data-parent='#']/div")
        for div_department  in divs_departments:
            department_name = div_department.get_attribute("innerText")
            if "KKTC" not in department_name:
                department_names.add(department_name) 
                
    return department_names    

In [9]:
 df_universities.head(3)

Unnamed: 0_level_0,type,name,city
uni_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1065,Devlet,ABDULLAH GÜL ÜNİVERSİTESİ,Kayseri
1104,Devlet,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİV...,Adana
1002,Devlet,ADIYAMAN ÜNİVERSİTESİ,Adıyaman


In [10]:
department_names = get_department_names(driver,base_url, df_universities.index)
len(department_names)

1545

In [11]:
department_names = list(department_names)
department_names.sort()
midx = pd.MultiIndex.from_product([ department_names, ["quota","enrollments"]  ]) 
df_quota = pd.DataFrame(index = midx,columns=df_universities.index)
df_quota.head()

Unnamed: 0,uni_code,1065,1104,1002,1004,1126,1005,1007,1008,1105,1009,...,3004,3005,4011,4013,4017,4020,4039,4052,4096,4088
Acil Yardım ve Afet Yönetimi,quota,,,,,,,,,,,...,,,,,,,,,,
Acil Yardım ve Afet Yönetimi,enrollments,,,,,,,,,,,...,,,,,,,,,,
Acil Yardım ve Afet Yönetimi (İngilizce) (%50 İndirimli),quota,,,,,,,,,,,...,,,,,,,,,,
Acil Yardım ve Afet Yönetimi (İngilizce) (%50 İndirimli),enrollments,,,,,,,,,,,...,,,,,,,,,,
Acil Yardım ve Afet Yönetimi (İngilizce) (Burslu),quota,,,,,,,,,,,...,,,,,,,,,,


In [12]:
def get_quota_not_enrolled(driver):
    try:
        pop_up_element_xpath = "//span[@class='featherlight-close-icon featherlight-close']"
        WebDriverWait(driver,30).until( EC.presence_of_element_located((By.XPATH,pop_up_element_xpath) )  )
        driver.implicitly_wait(5)
        pop_ups = driver.find_elements(by=By.XPATH, value=pop_up_element_xpath)
        for i in range(len(pop_ups)-1,-1,-1): # close pop-ups
            pop_ups[i].click() 
    except:
        pass
    WebDriverWait(driver, 30).until( EC.presence_of_element_located((By.XPATH,"//*[contains(text(),'Genel Bilgiler')]") )  ).click()
    td = WebDriverWait(driver, 30).until( EC.presence_of_element_located((By.XPATH,"//td/strong[contains(text(),'Toplam Kontenjan')]") )  )
    tr_parent=WebDriverWait(td, 30).until( EC.presence_of_element_located((By.XPATH,"./../..") ) )
    WebDriverWait(tr_parent, 30).until( EC.presence_of_element_located((By.TAG_NAME,"td") ) )
    tds = tr_parent.find_elements(by=By.TAG_NAME, value="td")
    
    WebDriverWait(tds[1], 30).until( EC.presence_of_element_located((By.TAG_NAME,"strong") )  )                                   
    quota = tds[1].find_element(by=By.TAG_NAME,value='strong').get_attribute("innerText")
    td = WebDriverWait(driver, 30).until( EC.presence_of_element_located((By.XPATH,"//td[contains(text(),'Boş Kalan Kontenjan')]") )  )
    
    tr= WebDriverWait(td, 30).until( EC.presence_of_element_located((By.XPATH,"..") )  )# td.find_element(by=By.XPATH, value="..")
    tds = tr.find_elements(by=By.TAG_NAME, value="td")
    not_enrolled = tds[1].get_attribute("innerText")
    
    return quota, not_enrolled

In [14]:
def get_department_anchors_divs(driver):
    time.sleep(5)
    driver.implicitly_wait(30)
    anchors          =  driver.find_elements(by=By.XPATH, value="//a[@data-parent='#']") #links to departments
    time.sleep(5)
    driver.implicitly_wait(30)
    divs_departments =  driver.find_elements(by=By.XPATH, value="//a[@data-parent='#']/div")
    return anchors, divs_departments

In [20]:
def create_df_quota(driver, df_quota,uni_code):
    driver.get(base_url + str(uni_code) )
   
    anchors, divs_departments = get_department_anchors_divs(driver)
    for anchor,div_department in zip(anchors, divs_departments):
        result=False
        while not result:
            time.sleep(1)
            try:               
                department_name = div_department.get_attribute("innerText").strip()  
                result=True
            except:
                print("Failed to get department name. uni_code:",uni_code)
                create_df_quota(driver, df_quota,uni_code) #recursive call to restart scraping the same page
   
        if "KKTC" not in department_name:         
            driver.get(anchor.get_attribute("href"))
            driver.implicitly_wait(30)  
            quota, not_enrolled =  get_quota_not_enrolled(driver)           
            df_quota.loc[department_name,"quota"][uni_code] = int(quota)
            df_quota.loc[department_name,"enrollments"][uni_code] = int(quota)-int(not_enrolled)
            #print(department_name,"int(quota)",int(quota)," int(quota)-int(not_enrolled)", int(quota)-int(not_enrolled))    
            driver.implicitly_wait(30)
            driver.back()        

In [23]:
for uni_code in df_universities.index: #list(uni_codes):#
    create_df_quota(driver, df_quota,uni_code)    

TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF62340AD32+56930]
	(No symbol) [0x00007FF62337F632]
	(No symbol) [0x00007FF6232342E5]
	(No symbol) [0x00007FF6232798ED]
	(No symbol) [0x00007FF623279A2C]
	(No symbol) [0x00007FF6232BA967]
	(No symbol) [0x00007FF62329BCDF]
	(No symbol) [0x00007FF6232B81E2]
	(No symbol) [0x00007FF62329BA43]
	(No symbol) [0x00007FF62326D438]
	(No symbol) [0x00007FF62326E4D1]
	GetHandleVerifier [0x00007FF623786ABD+3709933]
	GetHandleVerifier [0x00007FF6237DFFFD+4075821]
	GetHandleVerifier [0x00007FF6237D818F+4043455]
	GetHandleVerifier [0x00007FF6234A9766+706710]
	(No symbol) [0x00007FF62338B90F]
	(No symbol) [0x00007FF623386AF4]
	(No symbol) [0x00007FF623386C4C]
	(No symbol) [0x00007FF623376904]
	BaseThreadInitThunk [0x00007FF8EFB0257D+29]
	RtlUserThreadStart [0x00007FF8F17EAA58+40]


In [None]:
df_quota.info()

In [None]:
df_quota.to_csv("df_quota.csv")

In [22]:
service = Service("chromedriver.exe")
driver= webdriver.Chrome(service=service)