In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import numpy as np
from tqdm import tqdm
import locale
service = Service("chromedriver.exe")
driver= webdriver.Chrome(service=service)
driver.maximize_window()
base_url = "https://yokatlas.yok.gov.tr/lisans-univ.php?u="

 In the previous notebook we have prepared a template df including university type,city,name and code.<br>
Firstly, we read this df which is ready to extend and sort index. In reading process we use columns as index.

In [4]:
locale.setlocale(locale.LC_ALL, 'tr_TR.utf8')
df = pd.read_csv("df_template.csv",index_col=[0,1,2,3])
df = df.sort_index(level=[0,1,2],key=lambda x: pd.Index([locale.strxfrm(e) for e in x] ))
df.head(10)

uni_type,city,uni_name,uni_code
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104
Devlet,Adana,ÇUKUROVA ÜNİVERSİTESİ,1029
Devlet,Adana,SAĞLIK BİLİMLERİ ÜNİVERSİTESİ,1110
Devlet,Adıyaman,ADIYAMAN ÜNİVERSİTESİ,1002
Devlet,Afyonkarahisar,AFYON KOCATEPE ÜNİVERSİTESİ,1004
Devlet,Afyonkarahisar,AFYONKARAHİSAR SAĞLIK BİLİMLERİ ÜNİVERSİTESİ,1126
Devlet,Ağrı,AĞRI İBRAHİM ÇEÇEN ÜNİVERSİTESİ,1005
Devlet,Aksaray,AKSARAY ÜNİVERSİTESİ,1008
Devlet,Amasya,AMASYA ÜNİVERSİTESİ,1009
Devlet,Ankara,ANKARA HACI BAYRAM VELİ ÜNİVERSİTESİ,1117


In this notebook we are going to:
* extend df to include departments and their features.
* fill the df with feature values.

## 1- Add department names

### 1.2 Add department names

In [6]:
def get_department_names(driver,base_url, city,uni_code):
    department_names = set()   
    driver.get(base_url + str(uni_code) )  
    driver.implicitly_wait(2)
    anchor_departments =  driver.find_elements(by=By.XPATH, value="//a[@data-parent='#']")
    for anchor_department  in anchor_departments:
        department_name = anchor_department.find_element(by=By.XPATH, value="div").get_attribute("innerText")
        place_name = anchor_department.find_element(by=By.XPATH, value="small").get_attribute("innerText")        
        # Exclude abroad programs 
        excluded_strings = ["KKTC", "UOLP"]
        if all(exclude_str not in department_name for exclude_str in excluded_strings)  and "KKTC" not in place_name:
            if uni_code==1110: # We deal with Sağlık Bilimleri Üniversitesi seperately
                if (city=="Ankara" and "Ankara" not in department_name) or (city!="Ankara" and "Ankara" in department_name) or\
                    (city in ["Adana","Erzurum","Bursa","Trabzon","İzmir","Kayseri"] and city not in place_name):
                    department_name = None
                if department_name!=None and "(" in department_name:
                        department_name = department_name[:department_name.index("(")].strip()    
            
            if department_name!=None: 
                department_names.add(department_name) 
               
    return department_names    

In [7]:
df_new = pd.DataFrame() 
for _,city,_,uni_code in tqdm(df.index):
    department_names = get_department_names(driver,base_url,city, uni_code) 
    midx = pd.MultiIndex.from_product([[uni_code],department_names] ,names=["uni_code","department"]) 
    df_temp = pd.DataFrame(index = midx)
    df_new = pd.concat( (df_new,df.loc[pd.IndexSlice[:,city,:,uni_code],:].join(df_temp)) ) #.loc[pd.IndexSlice[:, :,uni_code,:],:]#data.loc[:, pd.IndexSlice[:, ["a", "b"]]]

# rename df_new as df again and shift type to first index
df = df_new.reorder_levels([1,2,3,0,4]) 
df.head()

100%|██████████| 207/207 [04:35<00:00,  1.33s/it]


uni_type,city,uni_name,uni_code,department
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,İngilizce Mütercim ve Tercümanlık
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Psikoloji (İngilizce)
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Bilgisayar Mühendisliği (İngilizce)
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Endüstri Mühendisliği (İngilizce)
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Enerji Sistemleri Mühendisliği (İngilizce)


Sort the new department-index.

In [None]:
df = df.sort_index(level=[0,1,2,4],key=lambda x: pd.Index([locale.strxfrm(e) for e in x] ))
df.head()

## 2- Add other features of departments

In [None]:
cities = sorted( df.index.get_level_values(1).unique().tolist() )
len(cities)

In [None]:
department_features = ["quota","placements","not_registered","enrollments","faculty","department_type","scholarship","num_female","num_male","num_city_same","num_city_different","region","language"]
df[department_features[:4]] = 0         # the first four columns are numeric(integers)
df[department_features[2:-1]] = np.nan  # other columns are categorical
df[department_features[-1]] = "Turkish" # default education language is Turkish
df[cities]= 0                           # add number of students coming from cities

### Extract the language if  exists in department name
In this step we :
* remove some expressions like 'Açıköğretim' (open education) ,  'Uzaktan Öğretim' (distant education), 'İÖ' (evening education)
      which will be extracted seperately and saved as features.
* extract language feature if it exists inside parentheses and save it in the corresponding language column.

As a result a department name  such as Psikoloji (İngilizce)  will be Psikoloji while language column will be "İngilizce".

In [None]:
print("Number of deparment names before extracting language and removing expressions in parantheses",len(df.index.get_level_values(4).unique()))

In [None]:
import re
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
#languages_dict = {'Almanca':"German", 'Arapça':"Arabic", 'Bulgarca':"Bulgarian", 'Ermenice':"Armenian", 'Fransızca':"French", 'Korece':"Korean",
 #                 'Lehçe':"Polish", 'Rusça':"Russian",  'Çince':"Chinese", 'İngilizce':"English", 'İspanyolca':"Spanish", 'İtalyanca':"Italian"}
languages_set= {'Almanca', 'Arapça', 'Bulgarca', 'Ermenice', 'Fransızca', 'Korece', 'Lehçe', 'Rusça',  'Çince', 'İngilizce','İspanyolca', 'İtalyanca'}
#res=pd.DataFrame(df.loc[pd.IndexSlice["Devlet",:,:,:]].index.get_level_values(3).unique())["department"]
#res
pattern = r'\((.*?)\)'  # Matches text inside parentheses
new_department_index = {}
for uni_type, city, uni_name, uni_code,department in df.index:
    parentheses_content = set(re.findall(pattern, department))
    if parentheses_content:
        if parentheses_content & languages_set:    
            language, =parentheses_content & languages_set
            df.loc[(uni_type, city, uni_name, uni_code,department),"language"] = language # set the language value 
        new_department = department[:department.index("(")].strip() # remove all parentheses
        new_department_index[department] = new_department

In [None]:
df = df.rename(index=new_department_index, level=4)
print("Number of deparment names before extracting language",len(df.index.get_level_values(4).unique()))

# 3- Fill features 

In [None]:
def get_department_anchors_divs(driver):
  #  time.sleep(2)
    driver.implicitly_wait(imp_wait_time)
    anchors          =  driver.find_elements(by=By.XPATH, value="//a[@data-parent='#']") #links to departments
   # time.sleep(2)
    driver.implicitly_wait(imp_wait_time)
    divs_departments =  driver.find_elements(by=By.XPATH, value="//a[@data-parent='#']/div")
    return anchors, divs_departments

In [None]:
def get_row_content(driver,searched_text,add_strong=False):
    # Find the td element within the row_element,# Selecting the last td element  
    strong=""
    if add_strong:
        strong ="/strong"
    x_path = f"//tr[td{strong}[contains(text(),'{searched_text}')]]/td[last()]"
    td = WebDriverWait(driver, 3).until( EC.presence_of_element_located((By.XPATH, x_path)) )  
    return td.get_attribute("innerText")

In [None]:
wait_time = .2
imp_wait_time= .2
def get_cell_contents(driver,tab_name):
    # close pop-ups
    try:
        pop_up_element_xpath = "//span[@class='featherlight-close-icon featherlight-close']"
        WebDriverWait(driver,wait_time).until( EC.presence_of_element_located((By.XPATH,pop_up_element_xpath) )  )
        #driver.implicitly_wait(5)
        pop_ups = driver.find_elements(by=By.XPATH, value=pop_up_element_xpath)
        for i in range(len(pop_ups)-1,-1,-1): 
            pop_ups[i].click() 
    except:
        pass
    
    result=False
    while not result:
        try:
            x_path_tab = "//*[contains(text(),"+tab_name +")]" # find the tab   
            WebDriverWait(driver, wait_time).until( EC.presence_of_element_located((By.XPATH, x_path_tab) )  ).click() #click the tab
            quota = int(get_row_content(driver,searched_text= "Toplam Kontenjan",add_strong = True) )       
            placements = int( get_row_content(driver,searched_text= "Toplam Yerleşen",add_strong = True)   )
            non_registered = int(get_row_content(driver,searched_text= "Yerleşip Kayıt Yaptırmayan")  )
            uni_type = get_row_content(driver,searched_text= "Burs Türü") 
            faculty = get_row_content(driver,searched_text= "Fakülte")
            department_type = get_row_content(driver,searched_text= "Puan Türü")
            result=True
        except:
            pass        
    
    return quota, placements,non_registered,uni_type

In [None]:
def fill_features(driver, df, uni_code,sleep_time=2):    
    driver.implicitly_wait(imp_wait_time)
    driver.get(base_url + str(uni_code) )
    #  time.sleep(sleep_time)  
    driver.implicitly_wait(imp_wait_time)
    anchors, divs_departments = get_department_anchors_divs(driver)
  #  time.sleep(sleep_time)  
    print(len(divs_departments)) 
    for anchor,div_department in zip(anchors, divs_departments):      
        result=False
        while not result:
          #  time.sleep(sleep_time)   
            try:           
                driver.implicitly_wait(imp_wait_time)
                department_name = div_department.get_attribute("innerText").strip()  
                result=True
                print(department_name)
            except:
                print("Failed to get department name. uni_code:",uni_code)
                exit_result = fill_features(driver, df,uni_code, 5) #recursive call to restart scraping the same page
                if exit_result:
                    return True
                
        if "KKTC" not in department_name:         
            driver.get(anchor.get_attribute("href"))
            driver.implicitly_wait(imp_wait_time)  
            quota, placements,non_registered, scholarship =  get_cell_contents(driver, tab_name = "'Genel Bilgiler'")             
            df.loc[pd.IndexSlice[:,:,:,uni_code,department_name], ["quota","placements","non_registered","scholarship"]] = /
                                                                int(quota),int(placements),int(non_registered),scholarship
            driver.implicitly_wait(imp_wait_time)
            driver.back()    
   # print("Completed "+str(uni_code),end=",")
    return True