In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import numpy as np
from tqdm import tqdm
import locale
import re
service = Service("chromedriver.exe")
driver= webdriver.Chrome(service=service)
driver.maximize_window()
base_url = "https://yokatlas.yok.gov.tr/lisans-univ.php?u="

 In the previous notebook we have prepared a template df including university type,city,name and code.<br>
Firstly, we read this df which is ready to extend and sort index. In reading process we use columns as index.

In [2]:
locale.setlocale(locale.LC_ALL, 'tr_TR.utf8')
df = pd.read_csv("df_template.csv",index_col=[0,1,2,3])
df = df.sort_index(level=[0,1,2],key=lambda x: pd.Index([locale.strxfrm(e) for e in x] ))
df.head(10)

uni_type,city,uni_name,uni_code
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104
Devlet,Adana,ÇUKUROVA ÜNİVERSİTESİ,1029
Devlet,Adana,SAĞLIK BİLİMLERİ ÜNİVERSİTESİ,1110
Devlet,Adıyaman,ADIYAMAN ÜNİVERSİTESİ,1002
Devlet,Afyonkarahisar,AFYON KOCATEPE ÜNİVERSİTESİ,1004
Devlet,Afyonkarahisar,AFYONKARAHİSAR SAĞLIK BİLİMLERİ ÜNİVERSİTESİ,1126
Devlet,Ağrı,AĞRI İBRAHİM ÇEÇEN ÜNİVERSİTESİ,1005
Devlet,Aksaray,AKSARAY ÜNİVERSİTESİ,1008
Devlet,Amasya,AMASYA ÜNİVERSİTESİ,1009
Devlet,Ankara,ANKARA HACI BAYRAM VELİ ÜNİVERSİTESİ,1117


In this notebook we are going to:
* extend df to include departments and their features.
* fill the df with feature values.

## 1- Add department names

In [3]:
def get_department_names_codes(driver,base_url, city,uni_code):
    uni_code_department_names_codes = set()  
    driver.get(base_url + str(uni_code) )  
    driver.implicitly_wait(2)
    anchor_departments =  driver.find_elements(by=By.XPATH, value="//a[@data-parent='#']")
    for anchor_department  in anchor_departments:
        department_code = anchor_department.get_attribute("href")[anchor_department.get_attribute("href").index("=")+1:]  
        department_name = anchor_department.find_element(by=By.XPATH, value="div").get_attribute("innerText")
        place_name = anchor_department.find_element(by=By.XPATH, value="small").get_attribute("innerText")        
        # Exclude abroad programs 
        excluded_strings = ["KKTC", "UOLP"]
        if all(exclude_str not in department_name for exclude_str in excluded_strings)  and "KKTC" not in place_name:
            if uni_code==1110: # We deal with Sağlık Bilimleri Üniversitesi seperately
                if (city=="Ankara" and "Ankara" not in department_name) or (city!="Ankara" and "Ankara" in department_name) or\
                    (city in ["Adana","Erzurum","Bursa","Trabzon","İzmir","Kayseri"] and city not in place_name):
                    department_name = None
                if department_name!=None and "(" in department_name:
                        department_name = department_name[:department_name.index("(")].strip()    
            
            if department_name!=None: 
                uni_code_department_names_codes.add((uni_code,department_name,department_code)) 
                
    return uni_code_department_names_codes

In [4]:
df_new = pd.DataFrame() 
for _,city,_,uni_code in tqdm(df.index):
    uni_code_department_names_codes = get_department_names_codes(driver,base_url,city, uni_code) 
    midx = pd.MultiIndex.from_tuples(uni_code_department_names_codes,names=["uni_code","dep_name","dep_code"])
   # midx = pd.MultiIndex.from_product([[uni_code],department_names,department_codes] ,names=["uni_code","dep_name","dep_code"]) 
    df_temp = pd.DataFrame(index = midx)
    df_new = pd.concat( (df_new,df.loc[pd.IndexSlice[:,city,:,uni_code],:].join(df_temp)) ) #.loc[pd.IndexSlice[:, :,uni_code,:],:]#data.loc[:, pd.IndexSlice[:, ["a", "b"]]]

# rename df_new as df again and shift type to first index
df = df_new.reorder_levels([1,2,3,0,4,5]) 
df.head()

100%|██████████| 207/207 [07:23<00:00,  2.14s/it]


uni_type,city,uni_name,uni_code,dep_name,dep_code
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,İnşaat Mühendisliği (İngilizce),110410046
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Yapay Zeka Mühendisliği (İngilizce),110410258
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Elektrik-Elektronik Mühendisliği (İngilizce),110410019
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Malzeme Bilimi ve Mühendisliği (İngilizce),110410272
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Enerji Sistemleri Mühendisliği (İngilizce),110410225


Sort the new department-index.

In [5]:
df = df.sort_index(level=[0,1,2,4],key=lambda x: pd.Index([locale.strxfrm(e) for e in x] ))
df.head()

uni_type,city,uni_name,uni_code,dep_name,dep_code
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Bilgisayar Mühendisliği (İngilizce),110410244
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Biyomühendislik (İngilizce),110410064
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Elektrik-Elektronik Mühendisliği (İngilizce),110410019
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Endüstri Mühendisliği (İngilizce),110410037
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Enerji Sistemleri Mühendisliği (İngilizce),110410225


## 2- Add other features of departments

In [6]:
cities = sorted( df.index.get_level_values(1).unique().tolist() )
len(cities)

81

In [7]:
department_features = ["quota","placements","not_registered","enrollments","faculty","entrance_score_type","scholarship","num_female","num_male","num_city_same","num_city_different","region","language"]
df[department_features[:4]] = 0         # the first four columns are numeric(integers)
df[department_features[2:-1]] = np.nan  # other columns are categorical
df[department_features[-1]] = "Türkçe" # default education language is Turkish
df[cities]= 0                           # add number of students coming from cities

In [8]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,quota,placements,not_registered,enrollments,faculty,entrance_score_type,scholarship,num_female,num_male,num_city_same,...,Yalova,Yozgat,Zonguldak,Çanakkale,Çankırı,Çorum,İstanbul,İzmir,Şanlıurfa,Şırnak
uni_type,city,uni_name,uni_code,dep_name,dep_code,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Bilgisayar Mühendisliği (İngilizce),110410244,0,0,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Biyomühendislik (İngilizce),110410064,0,0,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Elektrik-Elektronik Mühendisliği (İngilizce),110410019,0,0,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Endüstri Mühendisliği (İngilizce),110410037,0,0,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Enerji Sistemleri Mühendisliği (İngilizce),110410225,0,0,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0


### Extract the language if  exists in department name
In this step we :
* remove some expressions like 'Açıköğretim' (open education) ,  'Uzaktan Öğretim' (distant education), 'İÖ' (evening education)
      which will be extracted seperately and saved as features.
* extract language feature if it exists inside parentheses and save it in the corresponding language column.

As a result a department name  such as Psikoloji (İngilizce)  will be Psikoloji while language column will be "İngilizce".

In [9]:
print("Number of deparment names before extracting language and removing expressions in parantheses",len(df.index.get_level_values(4).unique()))

Number of deparment names before extracting language and removing expressions in parantheses 1353


In [10]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

languages_set= {'Almanca', 'Arapça', 'Bulgarca', 'Ermenice', 'Fransızca', 'Korece', 'Lehçe', 'Rusça',  'Çince', 'İngilizce','İspanyolca', 'İtalyanca'}
pattern = r'\((.*?)\)'  # Matches text inside parentheses
new_department_index = {}

for uni_type, city, uni_name, uni_code,dep_name,dep_code in df.index:
    parentheses_content = set(re.findall(pattern, dep_name))
    if parentheses_content:
        if parentheses_content & languages_set:    
            language, =parentheses_content & languages_set
            df.loc[(uni_type, city, uni_name, uni_code,dep_name,dep_code),"language"] = language # set the language value 
        new_dep_name= dep_name[:dep_name.index("(")].strip() # remove all parentheses
        new_department_index[dep_name] = new_dep_name

In [11]:
df = df.rename(index=new_department_index, level=4)
print("Number of deparment names before extracting language",len(df.index.get_level_values(4).unique()))

Number of deparment names before extracting language 350


We can see that departments like are converted. For instance "Bilgisayar Mühendisliği (İngilizce)" has become "Bilgisayar Mühendisliği"
and its language feature is changed as "İngilizce".

In [12]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,quota,placements,not_registered,enrollments,faculty,entrance_score_type,scholarship,num_female,num_male,num_city_same,num_city_different,region,language,Adana,Adıyaman,Afyonkarahisar,Aksaray,Amasya,Ankara,Antalya,Ardahan,Artvin,Aydın,Ağrı,Balıkesir,Bartın,Batman,Bayburt,Bilecik,Bingöl,Bitlis,Bolu,Burdur,Bursa,Denizli,Diyarbakır,Düzce,Edirne,Elazığ,Erzincan,Erzurum,Eskişehir,Gaziantep,Giresun,Gümüşhane,Hakkari,Hatay,Isparta,Iğdır,Kahramanmaraş,Karabük,Karaman,Kars,Kastamonu,Kayseri,Kilis,Kocaeli,Konya,Kütahya,Kırklareli,Kırıkkale,Kırşehir,Malatya,Manisa,Mardin,Mersin,Muğla,Muş,Nevşehir,Niğde,Ordu,Osmaniye,Rize,Sakarya,Samsun,Siirt,Sinop,Sivas,Tekirdağ,Tokat,Trabzon,Tunceli,Uşak,Van,Yalova,Yozgat,Zonguldak,Çanakkale,Çankırı,Çorum,İstanbul,İzmir,Şanlıurfa,Şırnak
uni_type,city,uni_name,uni_code,dep_name,dep_code,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Bilgisayar Mühendisliği,110410244,0,0,,,,,,,,,,,İngilizce,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Biyomühendislik,110410064,0,0,,,,,,,,,,,İngilizce,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Elektrik-Elektronik Mühendisliği,110410019,0,0,,,,,,,,,,,İngilizce,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Endüstri Mühendisliği,110410037,0,0,,,,,,,,,,,İngilizce,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Devlet,Adana,ADANA ALPARSLAN TÜRKEŞ BİLİM VE TEKNOLOJİ ÜNİVERSİTESİ,1104,Enerji Sistemleri Mühendisliği,110410225,0,0,,,,,,,,,,,İngilizce,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# 3- Fill features 

In [13]:
imp_wait_time=1

In [14]:
def get_row_content(driver,searched_text,imp_wait_time = 1, add_strong=False):
    # Find the td element within the row_element 
    strong=""
    if add_strong:
        strong ="/strong"
    x_path = f"//tr[td{strong}[contains(text(),'{searched_text}')]]/td[last()]" # Selecting the last td element  
    td = WebDriverWait(driver, imp_wait_time).until( EC.presence_of_element_located((By.XPATH, x_path)) )  
    return td.get_attribute("innerText")

In [15]:
def close_pop_ups(driver,imp_wait_time):
    try:
        pop_up_element_xpath = "//span[@class='featherlight-close-icon featherlight-close']"
        WebDriverWait(driver,imp_wait_time).until( EC.presence_of_element_located((By.XPATH,pop_up_element_xpath) )  )
        #driver.implicitly_wait(5)
        pop_ups = driver.find_elements(by=By.XPATH, value=pop_up_element_xpath)
        for i in range(len(pop_ups)-1,-1,-1): 
            pop_ups[i].click() 
    except:
        pass

In [16]:
def get_cell_contents(driver, uni_name,dep_name,tab_name,imp_wait_time=1):
    result=False
    while not result:
        try:
            x_path_tab = f"//*[contains(text(),'{tab_name}')]" # find the tab  
            WebDriverWait(driver, imp_wait_time).until( EC.presence_of_element_located((By.XPATH, x_path_tab) )  ).click() #click the tab
            quota = int(get_row_content(driver,searched_text= "Toplam Kontenjan",add_strong = True) )   
            try:
                placements = int( get_row_content(driver,searched_text= "Toplam Yerleşen",add_strong = True)   )
            except:
                placements = 0            
            try:
                not_registered = int(get_row_content(driver,searched_text= "Yerleşip Kayıt Yaptırmayan")  )
            except:
                not_registered = 0
            scholarship = get_row_content(driver,searched_text= "Burs Türü") 
            faculty = get_row_content(driver,searched_text= "Fakülte")
            entrance_score_type = get_row_content(driver,searched_text= "Puan Türü")
            result=True
        except:
            print(f"Failed to get content of the department {dep_name} at {uni_name}")
            close_pop_ups(driver,imp_wait_time)            
            time.sleep(5)
            imp_wait_time = imp_wait_time + 1
            pass        
    return quota, placements,not_registered,scholarship,faculty,entrance_score_type

In [17]:
def fill_features(driver, df, uni_name,dep_name,dep_code, sleep_time=2, base_dep_url="https://yokatlas.yok.gov.tr/lisans.php?y="):    
    driver.implicitly_wait(imp_wait_time)
    driver.get(base_dep_url + dep_code )
    #  time.sleep(sleep_time)  
    driver.implicitly_wait(imp_wait_time)
    close_pop_ups(driver,imp_wait_time)   # close pop-ups
    df.loc[pd.IndexSlice[:,:,:,:,:,dep_code], ["quota","placements","not_registered","scholarship","faculty","entrance_score_type"]] \
        = get_cell_contents(driver, uni_name,dep_name, tab_name = "Genel Bilgiler")  
    driver.implicitly_wait(imp_wait_time) 

In [None]:
#call fill_features
for _, _ , uni_name, _, dep_name, dep_code in tqdm(df.index):
    fill_features(driver, df, uni_name,dep_name,dep_code)

In [21]:
df["enrollments"] = df["placements"]- df["not_registered"]
df.to_csv("df.csv")