This repository has been archived by the owner on Nov 12, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
tasks.py
209 lines (157 loc) · 7.61 KB
/
tasks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""
Interact with your school website with telegram!
Copyright (c) 2016-2017 Paolo Barbolini <paolo@paolo565.org>
Released under the MIT license
"""
from bs4 import BeautifulSoup
import botogram
import requests
import urllib
from . import models
from . import utils
class Tasks(botogram.components.Component):
    """All of the TelegramSchoolBot tasks.

    Periodically scrapes the school website for new posts and timetable
    ("orario") pages, keeps the database tables in sync, and notifies
    Telegram subscribers about newly published articles.
    """

    component_name = "tsb-tasks"

    def __init__(self, config, db):
        # config: dict-like, must contain "school_website" (base url).
        # db: object exposing a SQLAlchemy-style Session factory.
        self.config = config
        self.db = db
        # Re-scrape the website once per hour.
        self.add_timer(3600, self.run)

    def _fetch_html(self, url, description):
        """Fetch *url* and return it parsed with BeautifulSoup.

        Raises ValueError on a non-200 status code; prints a warning and
        returns None when the response is not HTML.  *description* is the
        human-readable page name used in the error messages.
        """
        response = requests.get(url, headers=utils.REQUESTS_HEADERS)
        if response.status_code != 200:
            raise ValueError(
                "Failed to query the %s, server responded with response code: %i"
                % (description, response.status_code))
        if "text/html" not in response.headers['Content-Type']:
            print("Failed to query the %s, server responded with content type: %s"
                  % (description, response.headers['Content-Type']))
            return None
        return BeautifulSoup(response.text, "html.parser")

    def query_main_page(self):
        """Scrape the school home page.

        Returns a ``(calendar_article_urls, posts)`` tuple, or None when
        the page could not be parsed (callers must handle None).
        """
        parsed_html = self._fetch_html(self.config["school_website"], "main page")
        if parsed_html is None:
            return None

        # Find the urls of the calendar articles: links in the left column
        # whose <span> text mentions both "Orario" and "lezioni".
        calendar_articles = []
        left_content = parsed_html.find("div", {"id": "jsn-pleft"})
        for link in left_content.find_all("a"):
            tag = link.find("span")
            if tag is None:
                continue
            if "Orario" in tag.text and "lezioni" in tag.text:
                calendar_articles.append(urllib.parse.urljoin(
                    self.config["school_website"], link.get("href")))

        # Generate the list of posts shown on the home page.  zip() keeps
        # titles and "read more" links paired even if the counts ever
        # mismatch (plain indexing would raise IndexError).
        posts = []
        post_titles = parsed_html.find_all("h2", {"class": "contentheading"})
        post_urls = parsed_html.find_all("p", {"class": "readmore"})
        for title_tag, readmore_tag in zip(post_titles, post_urls):
            url = urllib.parse.urljoin(self.config["school_website"],
                                       readmore_tag.find("a").get("href"))
            posts.append(models.Post(url=url, title=title_tag.text.strip()))
        return calendar_articles, posts

    def query_calendar_article(self, url):
        """Find the "orario facile" timetable url inside a calendar article.

        Returns the absolute timetable url, or None when the article is
        unusable or contains no timetable link.
        """
        parsed_html = self._fetch_html(url, "calendar article page")
        if parsed_html is None:
            return None

        post_content = parsed_html.find("div", {"id": "jsn-mainbody"})
        for link in post_content.find_all("a"):
            href = link.get("href")
            if href is None:
                # <a> tags without an href (e.g. anchors) have no target.
                continue
            if href.lower().startswith(("/web_orario", "/weborario")):
                return urllib.parse.urljoin(url, href)
        return None

    def query_calendar(self, url):
        """Scrape the timetable index page.

        Returns a list of models.Page covering every class, teacher and
        classroom page found, or None when the page is not usable.
        """
        parsed_html = self._fetch_html(url, "calendar page")
        if parsed_html is None:
            return None

        # Generate the list of pages about classes, teachers and classrooms;
        # the path prefix determines the page type.
        pages = []
        for link in parsed_html.find_all("a"):
            href = link.get("href")
            if href is None:
                continue  # anchor without a target
            if href.startswith("Classi/"):
                page_type = "class"
            elif href.startswith("Docenti/"):
                page_type = "teacher"
            elif href.startswith("Aule/"):
                page_type = "classroom"
            else:
                continue
            pages.append(models.Page(type=page_type, name=link.text,
                                     url=urllib.parse.urljoin(url, href)))
        return pages

    def update_pages_table(self, pages):
        """Synchronize the pages table with the freshly scraped *pages*.

        Adds pages missing from the database and deletes database rows no
        longer present on the website.
        """
        session = self.db.Session()
        database_pages = list(session.query(models.Page).all())

        def _same(a, b):
            # Pages are compared by full identity: type, name and url.
            return a.type == b.type and a.name == b.name and a.url == b.url

        # Add missing pages
        for page in pages:
            if not any(_same(page, db_page) for db_page in database_pages):
                session.add(page)
        # Remove removed pages
        for db_page in database_pages:
            if not any(_same(page, db_page) for page in pages):
                session.delete(db_page)
        session.commit()

    def update_posts_table_and_notify(self, bot, posts):
        """Store newly published posts and notify every subscriber."""
        session = self.db.Session()
        post_urls = [post.url for post in posts]
        database_posts = list(
            session.query(models.Post).filter(models.Post.url.in_(post_urls)))
        known_urls = {db_post.url for db_post in database_posts}
        writes = [post for post in posts if post.url not in known_urls]
        if not writes:
            return

        # Generate the text of the message
        messages = []
        if len(writes) == 1:
            messages.append("<b>È uscito il seguente articolo:</b>")
        else:
            messages.append("<b>Sono usciti i seguenti articoli:</b>")
        for write in writes:
            messages.append("▪️ <a href=\"%s\">%s</a>" % (write.url, write.title))
        message = "\n".join(messages)

        # Send the message to the subscribers, dropping subscribers whose
        # chat is gone.
        for subscriber in session.query(models.Subscriber).all():
            try:
                chat = bot.chat(subscriber.chat_id)
                chat.send(message)
            except botogram.api.ChatUnavailableError as e:
                print("ChatUnavailableError - Removing subscriber",
                      subscriber.chat_id, str(e))
                session.delete(subscriber)
            except botogram.api.APIError as e:
                # This should fall under ChatUnavailableError but botogram
                # doesn't recognize it.
                if "deactivated" in e.description:
                    print("APIError - Removing subscriber",
                          subscriber.chat_id, e.description)
                    session.delete(subscriber)
                else:
                    print(e)

        for write in writes:
            session.add(write)
        # Commit new posts only at this point because it's better sending
        # a notification more than one time rather than skipping some
        # users because of an unexpected error
        session.commit()

    def run(self, bot):
        """Timer entry point: re-scrape the website and update everything."""
        result = self.query_main_page()
        if result is None:
            # The main page was not parseable (e.g. wrong content type);
            # just retry on the next timer tick.
            return
        calendar_articles, posts = result

        # This default makes all of the classes, teachers and classrooms
        # go away if we can't find the page listing them
        calendar_pages = []
        for article in calendar_articles:
            calendar_url = self.query_calendar_article(article)
            if calendar_url is None:
                continue
            found_pages = self.query_calendar(calendar_url)
            if found_pages is None:
                # Timetable page unusable; try the next calendar article.
                continue
            calendar_pages = found_pages
            break

        self.update_pages_table(calendar_pages)
        self.update_posts_table_and_notify(bot, posts)