In [1]:
# Import Dependencies
from bs4 import BeautifulSoup
import os
import pandas as pd

## Ranking Page

In [2]:
# Ranking Page
# Read the locally saved ranking page into memory as text.
# The path is built from separate components so os.path.join actually
# inserts the platform's separator (a single-argument join is a no-op).
filepath = os.path.join("sources", "World University Rankings - Ranking.html")
with open(filepath, encoding='utf-8') as file:
    htmlRanking = file.read()

In [3]:
# Parse the ranking page markup with Python's built-in 'html.parser' backend
soupRanking = BeautifulSoup(
    markup=htmlRanking,
    features='html.parser',
)

In [4]:
# Every university name sits in an <a class="ranking-institution-title"> element
titleLinks = soupRanking.find_all('a', class_='ranking-institution-title')

# Keep the anchor text of each match, in document order
universitiesNames = [link.text for link in titleLinks]

In [5]:
# Each location is wrapped in a <div class="location"> that contains an anchor
locationDivs = soupRanking.find_all('div', class_='location')

# Take the text of the first anchor inside every location div, in document order
universitiesLocations = [div.find('a').text for div in locationDivs]

In [6]:
# Stats - Number of Students: select the matching <td> cells
studentCells = soupRanking.find_all('td', class_='stats_number_students')

# Keep each cell's raw text (no numeric conversion is done here)
numberStudents = [cell.text for cell in studentCells]

In [7]:
# Stats - Student Staff Ratio: select the matching <td> cells
ratioCells = soupRanking.find_all('td', class_='stats stats_student_staff_ratio')

# Keep each cell's raw text, in document order
studentsStaffRatio = [cell.text for cell in ratioCells]

In [8]:
# Stats - % of international students: select the matching <td> cells
intlCells = soupRanking.find_all('td', class_='stats stats_pc_intl_students')

# Keep each cell's raw text, in document order
percIntStudents = [cell.text for cell in intlCells]

In [9]:
# Stats - female/male ratio: select the matching <td> cells
genderCells = soupRanking.find_all('td', class_='stats stats_female_male_ratio')

# Keep each cell's raw text, in document order
genderRatio = [cell.text for cell in genderCells]

In [10]:
# Map each output column name to the list scraped for it (column order preserved)
rankingColumns = (
    'title',
    'location',
    'number students',
    'students staff ratio',
    'perc intl students',
    'gender ratio',
)
rankingValues = (
    universitiesNames,
    universitiesLocations,
    numberStudents,
    studentsStaffRatio,
    percIntStudents,
    genderRatio,
)
rankingTable = dict(zip(rankingColumns, rankingValues))

In [11]:
# Build the ranking frame (one column per dict key) and index it by
# (title, location) in a single chained expression
ranking_df = pd.DataFrame(rankingTable).set_index(['title', 'location'])
ranking_df

Unnamed: 0_level_0,Unnamed: 1_level_0,number students,students staff ratio,perc intl students,gender ratio
title,location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
University of Oxford,United Kingdom,20774,11.1,41%,46 : 54
Stanford University,United States,16223,7.4,23%,44 : 56
Harvard University,United States,21261,9.3,25%,49 : 51
California Institute of Technology,United States,2238,6.3,33%,36 : 64
Massachusetts Institute of Technology,United States,11276,8.4,34%,39 : 61
...,...,...,...,...,...
Yuan Ze University,Taiwan,8188,19.7,7%,42 : 58
Yuriy Fedkovych Chernivtsi National University,Ukraine,12616,10.7,0%,57 : 43
Zagazig University,Egypt,156270,24.4,2%,54 : 46
University of Zagreb,Croatia,59336,15.3,3%,59 : 41


In [12]:
# Write the ranking table to CSV; 'utf-8-sig' prefixes the file with a UTF-8 BOM
ranking_df.to_csv(
    'universities_ranking.csv',
    encoding='utf-8-sig',
    header=True,
)

## Scores Page

In [13]:
# Scores Page
# Read the locally saved scores page into memory as text.
# The path is built from separate components so os.path.join actually
# inserts the platform's separator (a single-argument join is a no-op).
filepath = os.path.join("sources", "World University Rankings - Scores.html")
with open(filepath, encoding='utf-8') as file:
    htmlScores = file.read()

In [14]:
# Parse the scores page markup with Python's built-in 'html.parser' backend
soupScores = BeautifulSoup(
    markup=htmlScores,
    features='html.parser',
)

In [15]:
# Every university name sits in an <a class="ranking-institution-title"> element
titleLinks = soupScores.find_all('a', class_='ranking-institution-title')

# Keep the anchor text of each match, in document order
universitiesNames = [link.text for link in titleLinks]

In [16]:
# Each location is wrapped in a <div class="location"> that contains an anchor
locationDivs = soupScores.find_all('div', class_='location')

# Take the text of the first anchor inside every location div, in document order
universitiesLocations = [div.find('a').text for div in locationDivs]

In [17]:
# Scores - Overall Score: select the matching <td> cells
overallCells = soupScores.find_all('td', class_='scores overall-score')

# Keep each cell's raw text (no numeric conversion is done here)
overallScore = [cell.text for cell in overallCells]

In [18]:
# Scores - Teaching Score: select the matching <td> cells
teachingCells = soupScores.find_all('td', class_='scores teaching-score')

# Keep each cell's raw text, in document order
teachingScore = [cell.text for cell in teachingCells]

In [19]:
# Scores - Research Score: select the matching <td> cells
researchCells = soupScores.find_all('td', class_='scores research-score')

# Keep each cell's raw text, in document order
researchScore = [cell.text for cell in researchCells]

In [20]:
# Scores - Citations Score: select the matching <td> cells
citationCells = soupScores.find_all('td', class_='scores citations-score')

# Keep each cell's raw text, in document order
citationsScore = [cell.text for cell in citationCells]

In [21]:
# Scores - Industry Income Score: select the matching <td> cells
incomeCells = soupScores.find_all('td', class_='scores industry_income-score')

# Keep each cell's raw text, in document order
industryIncomeScore = [cell.text for cell in incomeCells]

In [22]:
# Scores - International Outlook Score: select the matching <td> cells
outlookCells = soupScores.find_all('td', class_='scores international_outlook-score')

# Keep each cell's raw text, in document order
intlOutlookScore = [cell.text for cell in outlookCells]

In [23]:
# Map each output column name to the list scraped for it (column order preserved)
scoreColumns = (
    'title',
    'location',
    'overall score',
    'teaching score',
    'research score',
    'citations score',
    'industry income score',
    'intl outlook score',
)
scoreValues = (
    universitiesNames,
    universitiesLocations,
    overallScore,
    teachingScore,
    researchScore,
    citationsScore,
    industryIncomeScore,
    intlOutlookScore,
)
scoreTable = dict(zip(scoreColumns, scoreValues))

In [24]:
# Build the scores frame (one column per dict key) and index it by
# (title, location) in a single chained expression
score_df = pd.DataFrame(scoreTable).set_index(['title', 'location'])
score_df

Unnamed: 0_level_0,Unnamed: 1_level_0,overall score,teaching score,research score,citations score,industry income score,intl outlook score
title,location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
University of Oxford,United Kingdom,95.6,91.3,99.6,98.0,68.7,96.4
Stanford University,United States,94.9,92.2,96.7,99.9,90.1,79.5
Harvard University,United States,94.8,94.4,98.8,99.4,46.8,77.7
California Institute of Technology,United States,94.5,92.5,96.9,97.0,92.7,83.6
Massachusetts Institute of Technology,United States,94.4,90.7,94.4,99.7,90.4,90.0
...,...,...,...,...,...,...,...
Yuan Ze University,Taiwan,10.3–25.0,18.8,14.1,19.7,50.1,30.9
Yuriy Fedkovych Chernivtsi National University,Ukraine,10.3–25.0,17.9,7.7,5.3,33.4,22.2
Zagazig University,Egypt,10.3–25.0,13.7,7.4,42.3,33.4,40.6
University of Zagreb,Croatia,10.3–25.0,19.5,13.2,27.2,40.3,34.3


In [25]:
# Write the scores table to CSV; 'utf-8-sig' prefixes the file with a UTF-8 BOM
score_df.to_csv(
    'universities_scores.csv',
    encoding='utf-8-sig',
    header=True,
)

## Merge

In [26]:
# Place the ranking and score columns side by side; pandas aligns the two
# frames on their index when concatenating along the column axis
framesToJoin = [ranking_df, score_df]
universities_df = pd.concat(framesToJoin, axis='columns')
universities_df

Unnamed: 0_level_0,Unnamed: 1_level_0,number students,students staff ratio,perc intl students,gender ratio,overall score,teaching score,research score,citations score,industry income score,intl outlook score
title,location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
University of Oxford,United Kingdom,20774,11.1,41%,46 : 54,95.6,91.3,99.6,98.0,68.7,96.4
Stanford University,United States,16223,7.4,23%,44 : 56,94.9,92.2,96.7,99.9,90.1,79.5
Harvard University,United States,21261,9.3,25%,49 : 51,94.8,94.4,98.8,99.4,46.8,77.7
California Institute of Technology,United States,2238,6.3,33%,36 : 64,94.5,92.5,96.9,97.0,92.7,83.6
Massachusetts Institute of Technology,United States,11276,8.4,34%,39 : 61,94.4,90.7,94.4,99.7,90.4,90.0
...,...,...,...,...,...,...,...,...,...,...,...
Yuan Ze University,Taiwan,8188,19.7,7%,42 : 58,10.3–25.0,18.8,14.1,19.7,50.1,30.9
Yuriy Fedkovych Chernivtsi National University,Ukraine,12616,10.7,0%,57 : 43,10.3–25.0,17.9,7.7,5.3,33.4,22.2
Zagazig University,Egypt,156270,24.4,2%,54 : 46,10.3–25.0,13.7,7.4,42.3,33.4,40.6
University of Zagreb,Croatia,59336,15.3,3%,59 : 41,10.3–25.0,19.5,13.2,27.2,40.3,34.3


In [27]:
# Write the combined table to CSV; 'utf-8-sig' prefixes the file with a UTF-8 BOM
universities_df.to_csv(
    'universities_rankings_scores.csv',
    encoding='utf-8-sig',
    header=True,
)