mxrch · justanotherosinter · May 24, 2023 · May 24, 2023 · May 24, 2023
diff --git a/ghunt/cli.py b/ghunt/cli.py
@@ -25,6 +25,11 @@ def parse_and_run():
     parser_drive = subparsers.add_parser('drive', help="Get information on a Drive file or folder.")
     parser_drive.add_argument("file_id", help="Example: 1N__vVu4c9fCt4EHxfthUNzVOs_tp8l6tHcMBnpOZv_M")
     parser_drive.add_argument('--json', type=str, help="File to write the JSON output to.")
+
+    ### YouTube module
+    parser_youtube = subparsers.add_parser('youtube', help="Get information on a YouTube channel (doesn't work with channels created after Google removed IDs from the page source, and relies on the page having been archived by Wayback Machine.")
+    parser_youtube.add_argument("channel_url", help="Example: https://www.youtube.com/@YouTube")
+    parser_youtube.add_argument('--json', type=str, help="File to write the JSON output to.")
 
     ### Parsing
     args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])
@@ -44,4 +49,7 @@ def process_args(args: argparse.Namespace):
             trio.run(gaia.hunt, None, args.gaia_id, args.json)
         case "drive":
             from ghunt.modules import drive
-            trio.run(drive.hunt, None, args.file_id, args.json)
+            trio.run(drive.hunt, None, args.file_id, args.json)
+        case "youtube":
+            from ghunt.modules import youtube
+            trio.run(youtube.hunt, None, args.channel_url, args.json)
diff --git a/ghunt/modules/youtube.py b/ghunt/modules/youtube.py
@@ -0,0 +1,73 @@
+from ghunt.helpers.utils import get_httpx_client
+from ghunt import globals as gb
+
+import requests, re, waybackpy, argparse, trio, httpx
+
+# User agent handed to waybackpy for its Wayback Machine lookups.
+# later: add a way to change this
+_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0"
+# Seconds to wait on YouTube / the Wayback Machine before giving up.
+_REQUEST_TIMEOUT = 30
+# Canonical /channel/<id> URL as it appears in a channel page's source.
+_CHANNEL_URL_RE = re.compile(r"https?://(?:www\.)?youtube\.com/channel/[\w-]+")
+# Google+ profile link; the numeric path component is captured as the Gaia ID.
+_GAIA_ID_RE = re.compile(r"https?://plus\.google\.com/([0-9]+)")
+
+
+async def _fetch_text(url: str) -> str:
+    """Download `url` in a worker thread and return the response body as text."""
+    def _get() -> str:
+        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
+        response.raise_for_status()
+        return response.text
+
+    try:
+        # requests is blocking; keep it off the trio event loop.
+        return await trio.to_thread.run_sync(_get)
+    except requests.RequestException as err:
+        raise SystemExit(f"[-] Request to {url} failed : {err}") from err
+
+
+def _nearest_archive_url(url: str, year: int) -> str:
+    """Return the Wayback Machine snapshot URL of `url` closest to `year`."""
+    # later: switch to memento API for access to more archives?
+    # later: check whether any archive exists, and whether one predates the removal of Plus IDs.
+    snapshot = waybackpy.Url(url, _USER_AGENT).near(year=year)
+    # NOTE(review): depending on the waybackpy version near() may return an
+    # object rather than a plain string; the previous code passed it straight
+    # to requests, so the string conversion is simply made explicit here.
+    return str(snapshot)
+
+
+async def hunt(as_client: httpx.AsyncClient, channel_url: str, json_file: str=None):
+    """
+        Resolve a YouTube channel to a Gaia ID and run the Gaia module on it.
+
+        The channel page is fetched to find its /channel/<id> URL, a Wayback
+        Machine snapshot of that URL near 2019 is downloaded, and the Google+
+        profile link found in the snapshot provides the Gaia ID, which is
+        passed to gaia.hunt() together with json_file.
+
+        as_client is accepted so the signature matches how cli.py invokes the
+        modules, but it is not used: pages are fetched with requests and
+        gaia.hunt() receives None as its client.
+
+        Raises SystemExit with an explanatory message when a page request
+        fails or when the channel URL or the Gaia ID cannot be found.
+        Errors raised by waybackpy propagate unchanged.
+    """
+    channel_page = await _fetch_text(channel_url)
+    # later: let the user confirm the channel ID, or choose between several matches.
+    channel_match = _CHANNEL_URL_RE.search(channel_page)
+    if channel_match is None:
+        raise SystemExit("[-] No YouTube channel ID URL found in the page source.")
+
+    # waybackpy is a synchronous API as well, hence the worker thread.
+    archive_url = await trio.to_thread.run_sync(_nearest_archive_url, channel_match.group(0), 2019)
+    archived_page = await _fetch_text(archive_url)
+    gaia_match = _GAIA_ID_RE.search(archived_page)
+    if gaia_match is None:
+        raise SystemExit("[-] No Google+ profile link (Gaia ID) found in the archived page.")
+
+    from ghunt.modules import gaia
+    await gaia.hunt(None, gaia_match.group(1), json_file)