From 5f6d8448bcac2bdf7aced85c37f2964bffbec790 Mon Sep 17 00:00:00 2001 From: dansubak Date: Tue, 16 Sep 2025 12:53:44 -0400 Subject: [PATCH 1/5] Add browser header for podcast extraction --- learning_resources/etl/podcast.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index 0b07b84a7b..eedcd7f32a 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -19,6 +19,8 @@ CONFIG_FILE_REPO = "mitodl/open-podcast-data" CONFIG_FILE_FOLDER = "podcasts" TIMESTAMP_FORMAT = "%a, %d %b %Y %H:%M:%S %z" +BROWSER_UA_HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} + log = logging.getLogger() @@ -128,7 +130,7 @@ def extract(): for playlist_config in configs: rss_url = playlist_config["rss_url"] try: - response = requests.get(rss_url) # noqa: S113 + response = requests.get(rss_url, headers=BROWSER_UA_HEADERS) # noqa: S113 response.raise_for_status() feed = bs(response.content, "xml") From e1b178c7f1e41134032add870fba8c59bfe66266 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:56:56 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- learning_resources/etl/podcast.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index eedcd7f32a..238c6ca8f1 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -19,7 +19,9 @@ CONFIG_FILE_REPO = "mitodl/open-podcast-data" CONFIG_FILE_FOLDER = "podcasts" TIMESTAMP_FORMAT = "%a, %d %b %Y %H:%M:%S %z" -BROWSER_UA_HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} +BROWSER_UA_HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36" +} log = logging.getLogger() From f3057f8a30f1bd126eb9bc8dde98d17e08f8f6ef Mon Sep 17 00:00:00 2001 From: dansubak Date: Tue, 16 Sep 2025 13:45:27 -0400 Subject: [PATCH 3/5] User agent line too long --- learning_resources/etl/podcast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index 238c6ca8f1..080a7ad4d1 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -23,7 +23,6 @@ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36" } - log = logging.getLogger() From eb99c74ee66adac640fb8d7ccced48c02fb63d9f Mon Sep 17 00:00:00 2001 From: dansubak Date: Tue, 16 Sep 2025 13:47:59 -0400 Subject: [PATCH 4/5] Line too long --- learning_resources/etl/podcast.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index 080a7ad4d1..c03d7b3da4 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -20,7 +20,9 @@ CONFIG_FILE_FOLDER = "podcasts" TIMESTAMP_FORMAT = "%a, %d %b %Y %H:%M:%S %z" BROWSER_UA_HEADERS = { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36" + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/39.0.2171.95 Safari/537.36" } log = logging.getLogger() From 0fa4e497591dffb627f2d10e8894ce9e2d22a7fc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Sep 2025 17:48:29 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- learning_resources/etl/podcast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index c03d7b3da4..c1256003ca 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -21,8 +21,8 @@ TIMESTAMP_FORMAT = "%a, %d %b %Y %H:%M:%S %z" BROWSER_UA_HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/39.0.2171.95 Safari/537.36" + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/39.0.2171.95 Safari/537.36" } log = logging.getLogger()