In [58]:
# Page on gened.aua.am that lists the courses and their clusters; fetched and parsed below.
GENED_COURSES_URL = "https://gened.aua.am/courses-and-their-clusters/"

In [59]:
import requests

# Download the courses page. An explicit timeout is passed because requests
# waits indefinitely by default, which would hang the kernel on a stalled
# connection instead of raising.
response = requests.get(GENED_COURSES_URL, timeout=30)  # seconds
if response.status_code != 200:
  raise Exception(f"Failed to fetch courses with status {response.status_code}: {response.content}")

# Keep the decoded body for the parsing step.
raw_html = response.text
"Fetched raw HTML courses successfully"

'Fetched raw HTML courses successfully'

In [60]:
import bs4

soup = bs4.BeautifulSoup(raw_html, "html.parser")

# The courses table is the second <table> on the page. Check the count before
# indexing so a missing table produces the descriptive error below rather
# than a bare IndexError.
tables = soup.find_all("table")
if len(tables) < 2:
  raise Exception("AUA Gened courses table is missing. Are you accessing the correct table?")
courses_table = tables[1]

# The first <tr> of the table holds the column names; same length-before-index
# check as above.
table_rows = courses_table.find_all("tr")
if not table_rows:
  raise Exception("AUA Gened course names row is missing. Are you accessing the correct row?")
col_names_html_row = table_rows[0]

# Column names are the text of each <td> in the header row.
col_names = [td.text for td in col_names_html_row.find_all("td")]
col_names

['Subject Code', 'Course Number', 'Course Title', 'Themes']

In [61]:
# Every <tr> after the header row is one course.
course_rows_html = courses_table.find_all("tr")[1:]

# One list per row, holding the text of each <td> in document order.
courses_data = [
  [cell.text for cell in row_html.find_all("td")]
  for row_html in course_rows_html
]

courses_data[:5]

[['BUS', '101', 'Introduction to Business', '4,5'],
 ['BUS', '105', 'Foundations of Management', '5'],
 ['BUS',
  '109',
  'Single-Variable Calculus for Business and Economics (Previously Business Math) (Not open to CS, DS, ES majors)\xa0',
  '7,8,9'],
 ['BUS',
  '110',
  'Applied Statistics\xa0 (Not open to CS, DS, ES majors)',
  '7,8,9'],
 ['BUS', '112', 'Social, Legal & Ethical Environment of Business', '3, 6']]

In [62]:
import pandas as pd

# Build the courses frame from the scraped rows, labelled with the header texts.
courses_df = pd.DataFrame(data=courses_data, columns=col_names)
courses_df

Unnamed: 0,Subject Code,Course Number,Course Title,Themes
0,BUS,101,Introduction to Business,45
1,BUS,105,Foundations of Management,5
2,BUS,109,Single-Variable Calculus for Business and Econ...,789
3,BUS,110,"Applied Statistics (Not open to CS, DS, ES ma...",789
4,BUS,112,"Social, Legal & Ethical Environment of Business","3, 6"
...,...,...,...,...
240,PH,101,Basics of Healthy Lifestyle,7
241,PH,102,Understanding Substance Use and Addictions (Pr...,46
242,PH,201,Global Health,47
243,PH,202,"Causes, Treatment and Prevention of Cancer",47


In [63]:
# Remove all whitespace inside the theme lists (e.g. "3, 6" -> "3,6").
# \s is used instead of a literal space so non-breaking spaces (\xa0), which
# appear in other columns of this page, are removed as well. regex=True is
# passed explicitly because the default of this parameter differs between
# pandas versions.
courses_df['Themes'] = courses_df['Themes'].str.replace(r"\s+", "", regex=True)
courses_df

Unnamed: 0,Subject Code,Course Number,Course Title,Themes
0,BUS,101,Introduction to Business,45
1,BUS,105,Foundations of Management,5
2,BUS,109,Single-Variable Calculus for Business and Econ...,789
3,BUS,110,"Applied Statistics (Not open to CS, DS, ES ma...",789
4,BUS,112,"Social, Legal & Ethical Environment of Business",36
...,...,...,...,...
240,PH,101,Basics of Healthy Lifestyle,7
241,PH,102,Understanding Substance Use and Addictions (Pr...,46
242,PH,201,Global Health,47
243,PH,202,"Causes, Treatment and Prevention of Cancer",47


In [64]:
# List the distinct subject codes to spot inconsistent spellings.
courses_df.loc[:, "Subject Code"].unique()

array(['BUS', 'CBE', 'CHSS', 'CHSS\xa0', 'CS', 'CSE', 'EC', 'EC\xa0',
       'ECON', 'ENGS', 'ESS', 'ENV', 'EPIC', 'FND', 'LAW', 'PG', 'PH'],
      dtype=object)

In [65]:
# Trim leading/trailing whitespace from the subject codes, then list the
# distinct values again to confirm the duplicates collapsed.
courses_df["Subject Code"] = courses_df.loc[:, "Subject Code"].str.strip()
courses_df.loc[:, "Subject Code"].unique()

array(['BUS', 'CBE', 'CHSS', 'CS', 'CSE', 'EC', 'ECON', 'ENGS', 'ESS',
       'ENV', 'EPIC', 'FND', 'LAW', 'PG', 'PH'], dtype=object)

In [66]:
# Number of rows per subject code, most frequent first.
courses_df.loc[:, "Subject Code"].value_counts()

Subject Code
CHSS    102
EC       35
CSE      32
ENV      15
PG       13
BUS       8
LAW       7
ESS       7
FND       6
CS        6
ECON      5
PH        5
ENGS      2
CBE       1
EPIC      1
Name: count, dtype: int64

In [67]:
# Remove the "Course Title" column. drop(..., errors="ignore") keeps this cell
# re-runnable: a second execution is a no-op, whereas `del` would raise
# KeyError once the column is gone.
courses_df = courses_df.drop(columns=["Course Title"], errors="ignore")
courses_df

Unnamed: 0,Subject Code,Course Number,Themes
0,BUS,101,45
1,BUS,105,5
2,BUS,109,789
3,BUS,110,789
4,BUS,112,36
...,...,...,...
240,PH,101,7
241,PH,102,46
242,PH,201,47
243,PH,202,47


In [None]:
from pathlib import Path

# Write the cleaned frame to CSV without the index column. The parent
# directory is created first because to_csv does not create missing
# directories and would fail on a fresh checkout.
output_path = Path("./.localdata/aua__geneds-cleaned-full.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
courses_df.to_csv(output_path, index=False)