# Web Scraping: Crawling FAP class timetable and schedule on Google Calendar 

A program that available to automatically scrape the class timetable data for a specific semester from my university academic portal website (fap.fpt.edu - FPT University).

The schedule information is subsequently automatically updated as events in my personal Google calendar so that I can receive notifications about the schedule via phone or email and be aware of any changes to the classroom, online/offline, lecturer, etc.

### Notes:
- Create and store edu email & password in `personal_credentials.txt` to access FAP.
- Predefine the information of the classes, semesters, subjects, etc. in [info.txt](/info.txt).
- [Authorizing Requests to the Google Calendar API](https://developers.google.com/calendar/api/guides/auth)
- Raw timetable data is stored in the `timetable.json` file

In [59]:
# import necessary libraries (Selenium, BeautifulSoup, json, time)

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from time import sleep
import re
import json

In [60]:
# init setup of wed driver

driver = webdriver.Chrome("./chromedriver.exe")
# driver.maximize_window()
url = 'https://fap.fpt.edu.vn/Student.aspx'
driver.get(url)

# storing the current window handle to get back to dashboard
main_page = driver.current_window_handle

  driver = webdriver.Chrome("./chromedriver.exe")


In [61]:
# get email, password to access FAP

credentials = open('personal_credentials.txt').readlines()
email = credentials[0].strip()
password = credentials[1].strip()

In [62]:
# automate login FAP and access class timetable page

campus_select = Select(driver.find_element(By.ID, 'ctl00_mainContent_ddlCampus'))
campus_select.select_by_value('4') # campus FUHCM

sleep(2)

login_email_btn = driver.find_element(By.XPATH, '//*[@id="loginform"]/center/div/div[2]/div/div/div')
login_email_btn.click()

# changing the handles to access login page
for handle in driver.window_handles:
    if handle != main_page:
        login_page = handle
        
# change the control to login page        
driver.switch_to.window(login_page)
sleep(2)

# input email
driver.find_element(By.XPATH, '//*[@id ="identifierId"]').send_keys(email)
sleep(2)

# submit email
driver.find_element(By.XPATH, '//*[@id="identifierNext"]/div/button').click()
sleep(2)

# input password 
driver.find_element(By.XPATH, '//*[@id="password"]/div[1]/div/div[1]/input').send_keys(password)
sleep(2)

# submit password 
driver.find_element(By.XPATH, '//*[@id ="passwordNext"]').click()
sleep(2)

# change control to main page
driver.switch_to.window(main_page)
sleep(5)

# click view 'class timetable'
driver.find_element(By.XPATH, '//a[@href="Schedule/TimeTable.aspx"]').click()
sleep(2)


In [63]:
# get campus, semester, class and subjects information

info = open('info.txt').readlines()

campus = info[0].strip().split()[0]
term = info[1].strip().split()[0]
term_id = info[2].strip().split()[0]
class_ = info[3].strip().split()[0]
s = info[4].strip()
subjects = re.findall("(.*)#.*", s)[0].strip().split()

campus, term, term_id, class_, subjects

('4', 'Summer2022', '49', 'SE1618', ['SWP391', 'SWT301'])

In [64]:
# select semester 
try:
    # if current displayed semester is not the one we need 
    cur_term = driver.find_element(By.XPATH, "//a[contains(text(), '" + term + "')]")    
    cur_term.click()
except:
    # if current displayed semester is correct
    cur_term = driver.find_element(By.XPATH, "//b[contains(text(), '" + term + "')]")    

sleep(2)

In [65]:
# select class (e.g: SE1618)
driver.find_element(By.XPATH, '//a[@href="?campus=' + campus + '&term=' + term_id + '&group=' + class_ + '"]').click()

In [66]:
# parse HTML using BeautifulSoup

html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

with open('data.html', 'w', encoding='utf-8') as soup_html:
    print(soup, file=soup_html)

In [67]:
# dictionary for time frame of the slots

start_slot = {
    '1': '07:00:00+07:00',
    '2': '08:45:00+07:00',
    '3': '10:30:00+07:00',
    '4': '12:30:00+07:00',
    '5': '14:15:00+07:00',
    '6': '16:00:00+07:00',
}
end_slot = {
    '1': '08:30:00+07:00',
    '2': '10:15:00+07:00',
    '3': '12:00:00+07:00',
    '4': '14:00:00+07:00',
    '5': '15:45:00+07:00',
    '6': '17:30:00+07:00',
}

In [68]:
# parse timetable data to 'timetable.json'

table_subjects = soup.findChild("div", id="ctl00_mainContent_divDetail")
my_timetable = dict()

for cur_subject in table_subjects:
    
    subject_title = cur_subject.caption.get_text().split()[0] # caption
    
    if subject_title in subjects:
        print(subject_title)
        # my_timetable.append({'subject': subject_title})
        data = cur_subject.tbody
        rows = data.findAll('tr')
        slot_list = []
        for row in rows:
            cur_slot = dict()
            slot = row.findAll('td')
            raw_slot_dates = slot[0].text.strip().split()[1].split("/") # replace("/", "-")
            # raw_slot_dates.reverse() 
            raw_slot_dates = reversed(raw_slot_dates)
            slot_date = "-".join(raw_slot_dates)
            
            slot_number = slot[1].text.strip()[0]
            start_time = start_slot[slot_number]
            end_time = end_slot[slot_number]
            
            slot_room = slot[2].text.strip()
            slot_teacher = slot[3].text.strip()
            
            slot_start_time = slot_date + 'T' + start_time
            slot_end_time = slot_date + 'T' + end_time
            
            cur_slot["start_time"] = slot_start_time
            cur_slot["end_time"] = slot_end_time
            cur_slot["room"] = slot_room
            cur_slot["teacher"] = slot_teacher
            
            slot_list.append(cur_slot)
            # print(slot_number, slot_start_time, slot_end_time, slot_room, slot_teacher)
        
        my_timetable[subject_title] = slot_list
        
# print(my_timetable)
json_object = json.dumps(my_timetable, indent=4, ensure_ascii=False)

# Writing to timetable.json
with open("timetable.json", "w", encoding='utf-8') as outfile:
    outfile.write(json_object)
    
# print(len(table_subjects))

SWP391
SWT301


In [69]:
# connect to Google Calendar API

import datetime
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import pickle

# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/calendar']

CREDENTIALS_FILE = 'credentials.json'

def get_calendar_service():
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is  created automatically when the authorization flow completes for the first time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                CREDENTIALS_FILE, SCOPES)
            creds = flow.run_local_server(port=0)

        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('calendar', 'v3', credentials=creds)
    return service

get_calendar_service()

<googleapiclient.discovery.Resource at 0x27aa702b6d0>

In [71]:
# create events in Google calendar

from datetime import datetime, timedelta

service = get_calendar_service()    

with open('timetable.json', 'r', encoding='utf-8') as openfile:

    json_object = json.load(openfile)

# print(json_object)
for item in json_object:
    subject_slot_list = json_object[item]
    for slot in subject_slot_list:
        print(slot['start_time'], slot['end_time'], slot['room'], slot['teacher'])
        print()
        event_result = service.events().insert(calendarId='primary',
            body={
                "summary": item,
                "description": "Room: " + slot['room'] + ",\nTeacher: " + slot['teacher'],
                "start": {
                    "dateTime": slot['start_time'], 
                    "timeZone": 'Asia/Ho_Chi_Minh'
                },
                "end": {
                    "dateTime": slot['end_time'], 
                    "timeZone": 'Asia/Ho_Chi_Minh'
                },
                "reminders": {
                    "useDefault": "true",
                },
            }
        ).execute()
        # print("created event")
        # print("id: ", event_result['id'])
        # print("summary: ", event_result['summary'])
        # print("starts at: ", event_result['start']['dateTime'])
        # print("ends at: ", event_result['end']['dateTime'])
    


2022-05-09T16:00:00+07:00 2022-05-09T17:30:00+07:00 Phòng Ảo_12 VanVTT10

2022-05-11T16:00:00+07:00 2022-05-11T17:30:00+07:00 Phòng Ảo_12 VanVTT10

2022-05-13T16:00:00+07:00 2022-05-13T17:30:00+07:00 Phòng Ảo_12 VanVTT10

2022-05-16T16:00:00+07:00 2022-05-16T17:30:00+07:00 117 VanVTT10

2022-05-18T16:00:00+07:00 2022-05-18T17:30:00+07:00 117 VanVTT10

2022-05-20T16:00:00+07:00 2022-05-20T17:30:00+07:00 117 VanVTT10

2022-05-23T16:00:00+07:00 2022-05-23T17:30:00+07:00 301 VanVTT10

2022-05-25T16:00:00+07:00 2022-05-25T17:30:00+07:00 222 VanVTT10

2022-05-27T16:00:00+07:00 2022-05-27T17:30:00+07:00 101 VanVTT10

2022-05-30T16:00:00+07:00 2022-05-30T17:30:00+07:00 117 VanVTT10

2022-06-01T16:00:00+07:00 2022-06-01T17:30:00+07:00 117 VanVTT10

2022-06-03T16:00:00+07:00 2022-06-03T17:30:00+07:00 117 VanVTT10

2022-06-06T16:00:00+07:00 2022-06-06T17:30:00+07:00 024 VanVTT10

2022-06-08T16:00:00+07:00 2022-06-08T17:30:00+07:00 006 VanVTT10

2022-06-10T16:00:00+07:00 2022-06-10T17:30:00+07:00 

Done programming!

---

Check the google calendar to check whether timetable is updated!