From d18f23d6f0f049290d6c1fa8e1cd50096f275efe Mon Sep 17 00:00:00 2001
From: Carey Metcalfe
Date: Thu, 1 Oct 2020 01:48:27 -0400
Subject: [PATCH 01/23] Provide a ScrapingClient that doesn't need API access

Also adds the ability to list activities using web scraping instead of
the API. The activities are returned as `ScrapedActivity` objects that
are mostly compatible with the `Activity` objects returned by the
API-based activity listing.
---
 stravaweblib/webclient.py | 207 +++++++++++++++++++++++++++++++++-----
 1 file changed, 182 insertions(+), 25 deletions(-)

diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py
index 4e1e5e9..a3baee9 100644
--- a/stravaweblib/webclient.py
+++ b/stravaweblib/webclient.py
@@ -6,22 +6,92 @@
 import functools
 import json
 import time
+import uuid

 from bs4 import BeautifulSoup
 import requests
 import stravalib
+from stravalib.attributes import Attribute, TimestampAttribute, TimeIntervalAttribute
+from stravalib.model import Activity, BaseEntity

-__all__ = ["WebClient", "FrameType", "DataFormat", "ExportFile", "ActivityFile"]
+__all__ = ["WebClient", "ScrapingClient", "FrameType", "DataFormat", "ExportFile", "ActivityFile", "ScrapedActivity"]

 BASE_URL = "https://www.strava.com"

+# Used for filtering when scraping the activity list
+ACTIVITY_WORKOUT_TYPES = {
+    "Ride": {None: 10, "Race": 11, "Workout": 12},
+    "Run": {None: 0, "Race": 1, "Long Run": 2, "Workout": 3}
+}

 ExportFile = namedtuple("ExportFile", ("filename", "content"))
 ActivityFile = ExportFile  # TODO: deprecate and remove

+class ScrapingError(ValueError):
+    """An error that is raised when something fails during scraping

+    This can happen because something on the website changed.
+    """


+class ScrapedActivity(BaseEntity):
+    """
+    Represents an Activity (ride, run, etc.) 
that was scraped from the website + + The attributes are compatible with stravalib.model.Activity where they exist + """ + + id = Attribute(int) + name = Attribute(str) + description = Attribute(str) + type = Attribute(str) + workout_type = Attribute(str) + + start_date = TimestampAttribute() + distance = Attribute(float) + moving_time = TimeIntervalAttribute() + elapsed_time = TimeIntervalAttribute() + total_elevation_gain = Attribute(float) + suffer_score = Attribute(int) + calories = Attribute(float) + gear_id = Attribute(str) + + # True if the activity has GPS coordinates + # False for trainers, manual activities, etc + has_latlng = Attribute(bool) + + trainer = Attribute(bool) + commute = Attribute(bool) + private = Attribute(bool) + flagged = Attribute(bool) + + def from_dict(self, d): + bike_id = d.pop("bike_id", None) + shoes_id = d.pop("athlete_gear_id", None) + if bike_id: + d["gear_id"] = "b{}".format(bike_id) + elif shoes_id: + d["gear_id"] = "g{}".format(shoes_id) + + d["start_date"] = d.pop("start_time") + d["distance"] = d.pop("distance_raw") + d["moving_time"] = d.pop("moving_time_raw") + d["elapsed_time"] = d.pop("elapsed_time_raw") + d["total_elevation_gain"] = d.pop("elevation_gain_raw") + + wt = d.pop("workout_type") + if d["type"] in ACTIVITY_WORKOUT_TYPES: + for k, v in ACTIVITY_WORKOUT_TYPES[d["type"]].items(): + if wt == v: + d["workout_type"] = k + break + + return super().from_dict(d) + + class DataFormat(enum.Enum): ORIGINAL = "original" GPX = "gpx" @@ -48,10 +118,11 @@ def __str__(self): return str(self.name).replace("_", " ").title() -class WebClient(stravalib.Client): +class ScrapingClient: """ - An extension to the stravalib Client that fills in some of the gaps in - the official API using web scraping. + A client that uses web scraping to interface with Strava. + + Can be used as a mixin to add the extra methods to the main stravalib.Client """ def __init__(self, *args, **kwargs): @@ -75,20 +146,8 @@ def __init__(self, *args, **kwargs): else: raise ValueError("'jwt' or both of 'email' and 'password' are required") - # Init the normal stravalib client with remaining args super().__init__(*args, **kwargs) - # Verify that REST API and Web API correspond to the same Strava user account - if self.access_token is not None: - rest_id = str(self.get_athlete().id) - web_id = self._session.cookies.get('strava_remember_id') - if rest_id != web_id: - raise stravalib.exc.LoginFailed("API and web credentials are for different accounts") - else: - # REST API does not have an access_token (yet). Should we verify the match after - # exchange_code_for_token()? - pass - @property def jwt(self): return self._session.cookies.get('strava_remember_token') @@ -155,6 +214,89 @@ def _login_with_password(self, email, password): if not resp.is_redirect or resp.next.url == "{}/login".format(BASE_URL): raise stravalib.exc.LoginFailed("Couldn't log in to website, check creds") + def scrape_activities(self, keywords=None, activity_type=None, workout_type=None, + commute=False, is_private=False, indoor=False, gear_id=None): + """A scraping-based alternative to stravalib.Client.get_activities() + + Note that when using multiple parameters they are treated as AND, not OR + + :param keywords: Text to search for + :param activity_type: The type of the activity. 
See stravalib.model:Activity.TYPES
+        :param workout_type: The type of workout ("Race", "Workout", etc.)
+        :param commute: Only return activities marked as commutes
+        :param is_private: Only return private activities
+        :param indoor: Only return indoor/trainer activities
+        :param gear_id: Only return activities using this gear

+        :yield: ScrapedActivity objects
+        """

+        if activity_type is not None and activity_type not in Activity.TYPES:
+            raise ValueError(
+                "Invalid activity type. Must be one of: {}".format(",".join(Activity.TYPES))
+            )

+        if activity_type in ACTIVITY_WORKOUT_TYPES:
+            workout_type = ACTIVITY_WORKOUT_TYPES[activity_type].get(workout_type)
+            if workout_type is None:
+                raise ValueError(
+                    "Invalid workout type for a {}. Must be one of: {}".format(
+                        activity_type,
+                        ", ".join(ACTIVITY_WORKOUT_TYPES[activity_type].keys())
+                    )
+                )
+        elif workout_type is not None or gear_id is not None:
+            raise ValueError(
+                "Can only filter using workout type or gear when activity type is one of: {}".format(
+                    ", ".join(ACTIVITY_WORKOUT_TYPES.keys())
+                )
+            )

+        page = 1
+        per_page = 20
+        search_session_id = uuid.uuid4()

+        conv_bool = lambda x: "" if not x else "true"

+        while True:
+            resp = self._session.get(
+                "{}/athlete/training_activities".format(BASE_URL),
+                headers={
+                    "Accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript",
+                    #"X-CSRF-Token": next(iter(self.csrf.values())),
+                    "X-Requested-With": "XMLHttpRequest",
+                },
+                params={
+                    "search_session_id": search_session_id,
+                    "page": page,
+                    "per_page": per_page,
+                    "keywords": keywords,
+                    "new_activity_only": "false",
+                    "activity_type": activity_type or "",
+                    "commute": conv_bool(commute),
+                    "private_activities": conv_bool(is_private),
+                    "trainer": conv_bool(indoor),
+                    "gear": gear_id or "",
+                }
+            )
+            if resp.status_code != 200:
+                raise stravalib.exc.Fault(
+                    "Failed to list activities (status code {})".format(resp.status_code)
+                )
+            try:
+                data = resp.json()["models"]
+            except (ValueError, TypeError, KeyError) as e:
+                raise ScrapingError(
+                    "Invalid JSON response from Strava"
+                ) from e

+            for activity in data:
+                yield ScrapedActivity(**activity)

+            # No results = stop requesting pages
+            if not data:
+                break

     def delete_activity(self, activity_id):
         """
         Deletes the specified activity.
@@ -201,8 +343,7 @@ def _make_export_file(resp, id_):
             content=resp.iter_content(chunk_size=16*1024)  # 16KB
         )

-    def get_activity_data(self, activity_id, fmt=DataFormat.ORIGINAL,
-                          json_fmt=None):
+    def get_activity_data(self, activity_id, fmt=DataFormat.ORIGINAL, json_fmt=None):
         """
         Get a file containing the provided activity's data

@@ -231,6 +372,9 @@ def get_activity_data(self, activity_id, fmt=DataFormat.ORIGINAL,
         fmt = DataFormat.classify(fmt)
         url = "{}/activities/{}/export_{}".format(BASE_URL, activity_id, fmt)
         resp = self._session.get(url, stream=True, allow_redirects=False)
+
+        # Gives a 302 back to the activity URL when trying to export a manual activity
+        # TODO: Does this also happen with other errors? 
if resp.status_code != 200: raise stravalib.exc.Fault("Status code '{}' received when trying " "to download an activity" @@ -246,7 +390,8 @@ def get_activity_data(self, activity_id, fmt=DataFormat.ORIGINAL, return self._make_export_file(resp, activity_id) - def _parse_date(self, date_str): + @staticmethod + def _parse_date(date_str): if not date_str: return None if date_str.lower() == "since beginning": @@ -254,7 +399,7 @@ def _parse_date(self, date_str): return datetime.utcfromtimestamp(0).date() try: return datetime.strptime(date_str, "%b %d, %Y").date() - except ValueError as e: + except ValueError: return None @functools.lru_cache() @@ -278,12 +423,15 @@ def _get_all_bike_components(self, bike_id): "Failed to load bike details page (status code: {})".format(resp.status_code), ) - soup = BeautifulSoup(resp.text, 'html.parser') - for table in soup.find_all('table'): - if table.find('thead'): + soup = BeautifulSoup(resp.text, 'html5lib') + table = None + for t in soup.find_all('table'): + if t.find('thead'): + table = t break - else: - raise ValueError("Bike component table not found in the HTML - layout update?") + + if not table: + raise ScrapingError("Bike component table not found in the HTML - layout update?") components = [] for row in table.tbody.find_all('tr'): @@ -363,6 +511,15 @@ def get_route_data(self, route_id, fmt=DataFormat.GPX): return self._make_export_file(resp, route_id) +# Mix in the ScrapingClient to inherit all its methods +class WebClient(ScrapingClient, stravalib.Client): + """ + An extension to the stravalib Client that fills in some of the gaps in + the official API using web scraping. + + Requires a JWT or both of email and password + """ + # Inherit parent documentation for WebClient.__init__ WebClient.__init__.__doc__ = stravalib.Client.__init__.__doc__ + \ From cc9e606114e4d7cf95798d41748a0ac0767391b8 Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Sat, 3 Oct 2020 23:51:37 -0400 Subject: [PATCH 02/23] Add the ability to scrape photos --- stravaweblib/webclient.py | 84 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 5 deletions(-) diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py index a3baee9..7518c70 100644 --- a/stravaweblib/webclient.py +++ b/stravaweblib/webclient.py @@ -5,17 +5,22 @@ import enum import functools import json +import re import time import uuid from bs4 import BeautifulSoup import requests import stravalib -from stravalib.attributes import Attribute, TimestampAttribute, TimeIntervalAttribute -from stravalib.model import Activity, BaseEntity +from stravalib.attributes import (Attribute, TimestampAttribute, + TimeIntervalAttribute, LocationAttribute) +from stravalib.model import Activity, BaseEntity, BoundEntity -__all__ = ["WebClient", "ScrapingClient", "FrameType", "DataFormat", "ExportFile", "ActivityFile", "ScrapedActivity"] +__all__ = [ + "WebClient", "ScrapingClient", "FrameType", "DataFormat", "ExportFile", + "ActivityFile", "ScrapedActivity", "ScrapedPhoto" +] BASE_URL = "https://www.strava.com" @@ -26,6 +31,8 @@ "Run": {None: 0, "Race": 1, "Long Run": 2, "Workout": 3} } +PHOTOS_REGEX = re.compile(r"var photosJson\s*=\s*(\[.*\]);") + ExportFile = namedtuple("ExportFile", ("filename", "content")) ActivityFile = ExportFile # TODO: deprecate and remove @@ -37,7 +44,38 @@ class ScrapingError(ValueError): """ -class ScrapedActivity(BaseEntity): +class ScrapedPhoto(BaseEntity): + """Represents a photo scraped from Strava's activity details page + + The attributes are compatible with 
stravalib.models.ActivityPhoto where + they exist. + """ + + unique_id = Attribute(str) + activity_id = Attribute(int) + athlete_id = Attribute(int) + caption = Attribute(str) + + location = LocationAttribute() + + urls = Attribute(dict) # dimension: url + + def from_dict(self, d): + d["unique_id"] = d.pop("photo_id") + d["athlete_id"] = d.pop("owner_id") + + # The caption has unicode escapes (ie. \uFFFF) embedded in the string + d["caption"] = d.pop("caption_escaped", "").encode("utf-8").decode("unicode_escape") + d["urls"] = { + str(min(dim.values())): d.pop(name) + for name, dim in d.pop("dimensions").items() + } + d["location"] = [d.pop("lat"), d.pop("lng")] + + return super().from_dict(d) + + +class ScrapedActivity(BoundEntity): """ Represents an Activity (ride, run, etc.) that was scraped from the website @@ -68,6 +106,20 @@ class ScrapedActivity(BaseEntity): private = Attribute(bool) flagged = Attribute(bool) + _photos = None + + @property + def photos(self): + """Returns a list of ScrapedPhoto objects""" + if self._photos is None: + self.assert_bind_client() + self._photos = self.bind_client.scrape_activity_photos(self.id) + return self._photos + + @property + def total_photo_count(self): + return len(self.photos) + def from_dict(self, d): bike_id = d.pop("bike_id", None) shoes_id = d.pop("athlete_gear_id", None) @@ -214,6 +266,28 @@ def _login_with_password(self, email, password): if not resp.is_redirect or resp.next.url == "{}/login".format(BASE_URL): raise stravalib.exc.LoginFailed("Couldn't log in to website, check creds") + def scrape_activity_photos(self, activity_id): + """Get photos for an activity""" + resp = self._session.get("{}/activities/{}".format(BASE_URL, activity_id)) + resp.raise_for_status() + + soup = BeautifulSoup(resp.content, 'html5lib') + try: + script = next((x for x in soup.find_all("script") if "var photosJson" in x.text)) + except StopIteration: + raise ScrapingError("Failed to find photo data in page") + + m = PHOTOS_REGEX.search(script.text) + if not m: + raise ScrapingError("Failed to extract photo data from page") + + try: + photos = json.loads(m.group(1)) + except (TypeError, ValueError) as e: + raise ScrapingError("Failed to parse extracted photo data") from e + + return [ScrapedPhoto(**p) for p in photos] + def scrape_activities(self, keywords=None, activity_type=None, workout_type=None, commute=False, is_private=False, indoor=False, gear_id=None): """A scraping-based alternative to stravalib.Client.get_activities() @@ -291,7 +365,7 @@ def scrape_activities(self, keywords=None, activity_type=None, workout_type=None ) from e for activity in data: - yield ScrapedActivity(**activity) + yield ScrapedActivity(bind_client=self, **activity) # No results = stop requesting pages if not data: From 5cab40dc86440b6d5060dac82e1f26f9e3be07d3 Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Sat, 3 Oct 2020 23:52:25 -0400 Subject: [PATCH 03/23] Remove caching for bike component scraping This should be done by the library consumer if it's needed --- stravaweblib/webclient.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py index 7518c70..094538d 100644 --- a/stravaweblib/webclient.py +++ b/stravaweblib/webclient.py @@ -3,7 +3,6 @@ from collections import namedtuple from datetime import date, datetime import enum -import functools import json import re import time @@ -476,7 +475,6 @@ def _parse_date(date_str): except ValueError: return None - @functools.lru_cache() def _get_all_bike_components(self, 
bike_id): """ Get all components for the specified bike From 0f55bd30c2e931ba71958190b2b3d40789991d17 Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Sun, 4 Oct 2020 02:30:28 -0400 Subject: [PATCH 04/23] Make the ScrapingClient somewhat api-compatible It's not going to be perfect, but the idea is that for the most basic of cases it should be a pretty close replacement. The goal is to keep the amount of work to support both API and scraping-based clients to a minimum. To support this, the WebClient now uses delegation instead of inheritance to add scraper-based functionality. This enables the `ScrapingClient` class to use the same function names without automatically overriding the `stravalib.Client` functions when used through the `WebClient` class. --- stravaweblib/webclient.py | 84 +++++++++++++++++++++++++++++++++------ 1 file changed, 71 insertions(+), 13 deletions(-) diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py index 094538d..0bf1509 100644 --- a/stravaweblib/webclient.py +++ b/stravaweblib/webclient.py @@ -3,6 +3,7 @@ from collections import namedtuple from datetime import date, datetime import enum +import functools import json import re import time @@ -112,7 +113,7 @@ def photos(self): """Returns a list of ScrapedPhoto objects""" if self._photos is None: self.assert_bind_client() - self._photos = self.bind_client.scrape_activity_photos(self.id) + self._photos = self.bind_client.get_activity_photos(self.id) return self._photos @property @@ -265,8 +266,13 @@ def _login_with_password(self, email, password): if not resp.is_redirect or resp.next.url == "{}/login".format(BASE_URL): raise stravalib.exc.LoginFailed("Couldn't log in to website, check creds") - def scrape_activity_photos(self, activity_id): - """Get photos for an activity""" + def get_activity_photos(self, activity_id): + """A scraping-based alternative to stravalib.Client.get_activity_photos + + :param activity_id: The activity for which to fetch photos. + + :return: A list of ScrapedPhoto objects + """ resp = self._session.get("{}/activities/{}".format(BASE_URL, activity_id)) resp.raise_for_status() @@ -287,9 +293,10 @@ def scrape_activity_photos(self, activity_id): return [ScrapedPhoto(**p) for p in photos] - def scrape_activities(self, keywords=None, activity_type=None, workout_type=None, - commute=False, is_private=False, indoor=False, gear_id=None): - """A scraping-based alternative to stravalib.Client.get_activities() + def get_activities(self, keywords=None, activity_type=None, workout_type=None, + commute=False, is_private=False, indoor=False, gear_id=None, + before=None, after=None, limit=None): + """A scraping-based alternative to stravalib.Client.get_activities Note that when using multiple parameters they are treated as AND, not OR @@ -301,6 +308,14 @@ def scrape_activities(self, keywords=None, activity_type=None, workout_type=None :param indoor: Only return indoor/trainer activities :param gear_id: Only return activities using this gear + Parameters for compatibility with stravalib.Client.get_activities: + + :param before: Result will start with activities whose start date is + before specified date. (UTC) + :param after: Result will start with activities whose start date is after + specified value. (UTC) + :param limit: How many maximum activities to return. 
+ :yield: ScrapedActivity objects """ @@ -325,6 +340,10 @@ def scrape_activities(self, keywords=None, activity_type=None, workout_type=None ) ) + before = stravalib.Client._utc_datetime_to_epoch(None, before or datetime.max) + after = stravalib.Client._utc_datetime_to_epoch(None, after or datetime.min) + + num_yielded = 0 page = 1 per_page = 20 search_session_id = uuid.uuid4() @@ -364,11 +383,20 @@ def scrape_activities(self, keywords=None, activity_type=None, workout_type=None ) from e for activity in data: - yield ScrapedActivity(bind_client=self, **activity) + # Respect the limit + if limit is not None and num_yielded >= limit: + return + + activity = ScrapedActivity(bind_client=self, **activity) - # No results = stop requesting pages + # Respect the before and after filters + if after < activity.start_date.timestamp() < before: + yield activity + num_yielded += 1 + + # No results = done if not data: - break + return def delete_activity(self, activity_id): """ @@ -583,8 +611,8 @@ def get_route_data(self, route_id, fmt=DataFormat.GPX): return self._make_export_file(resp, route_id) -# Mix in the ScrapingClient to inherit all its methods -class WebClient(ScrapingClient, stravalib.Client): + +class WebClient(stravalib.Client): """ An extension to the stravalib Client that fills in some of the gaps in the official API using web scraping. @@ -592,9 +620,18 @@ class WebClient(ScrapingClient, stravalib.Client): Requires a JWT or both of email and password """ + def __new__(cls, *args, **kwargs): + self = super().__new__(cls) + + # Prepend __init__'s docstring with the parent classes one + cls.__init__.__doc__ = super().__init__.__doc__ + cls.__init__.__doc__ -# Inherit parent documentation for WebClient.__init__ -WebClient.__init__.__doc__ = stravalib.Client.__init__.__doc__ + \ + # Delegate certain methods and properties to the scraper instance + for fcn in ("delete_activity", "get_bike_components", "get_activity_data", "jwt", "csrf"): + setattr(cls, fcn, cls._delegate(ScrapingClient, fcn)) + return self + + def __init__(self, *args, **kwargs): """ :param email: The email of the account to log into :type email: str @@ -604,6 +641,7 @@ class WebClient(ScrapingClient, stravalib.Client): :param jwt: The JWT of an existing session. If not specified, email and password are required. + Can be accessed from the `.jwt` property. :type jwt: str :param csrf: A dict of the form: `{: }`. @@ -611,3 +649,23 @@ class WebClient(ScrapingClient, stravalib.Client): Can be accessed from the `.csrf` property. 
:type csrf: dict """ + sc_kwargs = { + k: kwargs.pop(k, None) for k in ("email", "password", "jwt", "csrf") + } + self._scraper = ScrapingClient(**sc_kwargs) + super().__init__(*args, **kwargs) + + @staticmethod + def _delegate(cls, name): + func = getattr(cls, name) + is_prop = isinstance(func, property) + + @functools.wraps(func) + def delegator(self, *args, **kwargs): + if is_prop: + return getattr(self._scraper, name) + return getattr(self._scraper, name)(*args, **kwargs) + + if is_prop: + delegator = property(delegator) + return delegator From 720958689598b2a7110d9fb4665e2944ee074a47 Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Tue, 3 Nov 2020 22:59:45 -0500 Subject: [PATCH 05/23] Provide convenience functions for requesting data --- stravaweblib/webclient.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py index 0bf1509..0fefed7 100644 --- a/stravaweblib/webclient.py +++ b/stravaweblib/webclient.py @@ -5,6 +5,7 @@ import enum import functools import json +import logging import re import time import uuid @@ -22,6 +23,7 @@ "ActivityFile", "ScrapedActivity", "ScrapedPhoto" ] +__log__ = logging.getLogger(__name__) BASE_URL = "https://www.strava.com" @@ -193,8 +195,10 @@ def __init__(self, *args, **kwargs): if jwt: self._login_with_jwt(jwt) + __log__.info("Resumed session using JWT '%s'", jwt) elif email and password: self._login_with_password(email, password) + __log__.info("Logged in as '%s'", email) else: raise ValueError("'jwt' or both of 'email' and 'password' are required") @@ -210,14 +214,29 @@ def csrf(self): self._csrf = self._get_csrf_token() return self._csrf + def request(self, method, service, *args, **kwargs): + """Request a URL from Strava + + :service: The URL to send the request to without the base URL + """ + return self._session.request(method, "https://www.strava.com/{}".format(service), *args, **kwargs) + + def request_head(self, service, *args, **kwargs): + return self.request("HEAD", service, *args, **kwargs) + + def request_get(self, service, *args, **kwargs): + return self.request("GET", service, *args, **kwargs) + + def request_post(self, service, *args, **kwargs): + return self.request("POST", service, *args, **kwargs) + def _get_csrf_token(self): """Get a CSRF token Uses the about page because it's small and doesn't redirect based on if the client is logged in or not. 
""" - login_html = self._session.get("{}/about".format(BASE_URL)).text - soup = BeautifulSoup(login_html, 'html.parser') + soup = BeautifulSoup(self.request_get("about").text, 'html5lib') try: head = soup.head @@ -253,8 +272,8 @@ def _login_with_jwt(self, jwt): def _login_with_password(self, email, password): """Log into the website using a username and password""" - resp = self._session.post( - "{}/session".format(BASE_URL), + resp = self.request_post( + "session", allow_redirects=False, data={ "email": email, @@ -263,7 +282,7 @@ def _login_with_password(self, email, password): **self.csrf } ) - if not resp.is_redirect or resp.next.url == "{}/login".format(BASE_URL): + if not resp.is_redirect or resp.next.url.endswith("/login"): raise stravalib.exc.LoginFailed("Couldn't log in to website, check creds") def get_activity_photos(self, activity_id): From 24eec45893c6c80b23b921fdc9385dad4e2a9190 Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Tue, 3 Nov 2020 23:07:54 -0500 Subject: [PATCH 06/23] Ensure scraping and API are accessing the same account --- stravaweblib/webclient.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py index 0fefed7..32fc520 100644 --- a/stravaweblib/webclient.py +++ b/stravaweblib/webclient.py @@ -214,6 +214,10 @@ def csrf(self): self._csrf = self._get_csrf_token() return self._csrf + @property + def athlete_id(self): + return int(self._session.cookies.get('strava_remember_id')) + def request(self, method, service, *args, **kwargs): """Request a URL from Strava @@ -674,6 +678,9 @@ def __init__(self, *args, **kwargs): self._scraper = ScrapingClient(**sc_kwargs) super().__init__(*args, **kwargs) + if self._scraper.athlete_id != self.get_athlete().id: + raise ValueError("API and web credentials are for different accounts") + @staticmethod def _delegate(cls, name): func = getattr(cls, name) From f30882e039137d065b7c1a8e35d4add0444fba0d Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Tue, 3 Nov 2020 23:11:27 -0500 Subject: [PATCH 07/23] Change default fallback for JSON activity downloads The default used to be to just download the JSON blob. It was changed to request the GPX format instead since this is a more standardized format for an activity. --- stravaweblib/webclient.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py index 32fc520..3e2cf44 100644 --- a/stravaweblib/webclient.py +++ b/stravaweblib/webclient.py @@ -483,19 +483,26 @@ def get_activity_data(self, activity_id, fmt=DataFormat.ORIGINAL, json_fmt=None) :param json_fmt: The backup format to request in the event that the `fmt` was DataFormat.ORIGINAL and the request returned - a JSON blob (happens for uploads from mobile apps). - Using `None` (default) will cause the JSON blob to be - returned. - :type json_fmt: :class:`DataFormat` or None + a JSON blob (happens for uploads from older mobile apps). + Using `DataFormat.ORIGINAL` will cause the JSON blob to + be returned. 
(defaults to DataFormat.GPX)
+        :type json_fmt: :class:`DataFormat`

         :return: A namedtuple with `filename` and `content` attributes:
                  - `filename` is the filename that Strava suggests for the file
                  - `contents` is an iterator that yields file contents as bytes
         :rtype: :class:`ExportFile`
         """
+        __log__.debug("Getting data (in %s format) for activity %s", fmt, activity_id)
+
+        fmt = DataFormat(fmt)
+        json_fmt = DataFormat(json_fmt or DataFormat.GPX)  # default JSON fallback is GPX
+        resp = self.request_get(
+            "activities/{}/export_{}".format(activity_id, fmt),
+            stream=True,
+            allow_redirects=False
+        )

         # Gives a 302 back to the activity URL when trying to export a manual activity
         # TODO: Does this also happen with other errors?
@@ -504,13 +511,12 @@ def get_activity_data(self, activity_id, fmt=DataFormat.ORIGINAL, json_fmt=None)
                                              "to download an activity"
                                              "".format(resp.status_code))

-        # In the case of downloading JSON, the Content-Type header will
-        # correctly be set to 'application/json'
-        if (json_fmt and fmt == DataFormat.ORIGINAL and
+        # When downloading JSON, the Content-Type header will be set to 'application/json'
+        # If the json_fmt is not DataFormat.ORIGINAL, try the download again asking
+        # for the json_fmt.
+        if (fmt == DataFormat.ORIGINAL and json_fmt != fmt and
                 resp.headers['Content-Type'].lower() == 'application/json'):
-            if json_fmt == DataFormat.ORIGINAL.value:
-                raise ValueError("`json_fmt` parameter cannot be DataFormat.ORIGINAL")
-            return self.get_activity_data(activity_id, fmt=json_fmt)
+            return self.get_activity_data(activity_id, fmt=json_fmt, json_fmt=DataFormat.ORIGINAL)

         return self._make_export_file(resp, activity_id)

From 26334b370418b915d14d47e716ddab3dd0234b7e Mon Sep 17 00:00:00 2001
From: Carey Metcalfe
Date: Wed, 4 Nov 2020 10:15:36 -0500
Subject: [PATCH 08/23] Increase compatibility for get_activity_photos

Now accepts (but ignores) parameters that the `stravalib` version
accepts
---
 stravaweblib/webclient.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py
index 3e2cf44..ad63de6 100644
--- a/stravaweblib/webclient.py
+++ b/stravaweblib/webclient.py
@@ -289,10 +289,12 @@ def _login_with_password(self, email, password):
         if not resp.is_redirect or resp.next.url.endswith("/login"):
             raise stravalib.exc.LoginFailed("Couldn't log in to website, check creds")

-    def get_activity_photos(self, activity_id):
+    def get_activity_photos(self, activity_id, size=None, only_instagram=None):
         """A scraping-based alternative to stravalib.Client.get_activity_photos

         :param activity_id: The activity for which to fetch photos. 
+        :param size: [unused] (for compatibility with stravalib)
+        :param only_instagram: [unused] (for compatibility with stravalib)

         :return: A list of ScrapedPhoto objects
         """
         resp = self._session.get("{}/activities/{}".format(BASE_URL, activity_id))
         resp.raise_for_status()

From dac6cc3027102b58217ba0e849558d3e8f6b0b58 Mon Sep 17 00:00:00 2001
From: Carey Metcalfe
Date: Wed, 4 Nov 2020 10:21:27 -0500
Subject: [PATCH 09/23] Improve get_activities function

- Make pagination actually work (forgot to increment page number)
- Handle stopping based on the `before` param
- Properly handle workout types
---
 stravaweblib/webclient.py | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py
index ad63de6..f3c2529 100644
--- a/stravaweblib/webclient.py
+++ b/stravaweblib/webclient.py
@@ -344,6 +344,7 @@ def get_activities(self, keywords=None, activity_type=None, workout_type=None,

         :yield: ScrapedActivity objects
         """
+        __log__.debug("Getting activities")
         if activity_type is not None and activity_type not in Activity.TYPES:
             raise ValueError(
                 "Invalid activity type. Must be one of: {}".format(",".join(Activity.TYPES))
@@ -376,11 +377,11 @@ def get_activities(self, keywords=None, activity_type=None, workout_type=None,
         conv_bool = lambda x: "" if not x else "true"

         while True:
-            resp = self._session.get(
-                "{}/athlete/training_activities".format(BASE_URL),
+            __log__.debug("Getting page %s of activities", page)
+            resp = self.request_get(
+                "athlete/training_activities",
                 headers={
                     "Accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript",
-                    #"X-CSRF-Token": next(iter(self.csrf.values())),
                     "X-Requested-With": "XMLHttpRequest",
                 },
                 params={
@@ -394,6 +395,7 @@ def get_activities(self, keywords=None, activity_type=None, workout_type=None,
                     "private_activities": conv_bool(is_private),
                     "trainer": conv_bool(indoor),
                     "gear": gear_id or "",
+                    "order": "start_date_local DESC"  # Return in reverse-chronological order
                 }
             )
             if resp.status_code != 200:
@@ -403,9 +405,11 @@ def get_activities(self, keywords=None, activity_type=None, workout_type=None,
             try:
                 data = resp.json()["models"]
             except (ValueError, TypeError, KeyError) as e:
-                raise ScrapingError(
-                    "Invalid JSON response from Strava"
-                ) from e
+                raise ScrapingError("Invalid JSON response from Strava") from e
+
+            # No results = done
+            if not data:
+                return

             for activity in data:
                 # Respect the limit
@@ -415,13 +419,20 @@ def get_activities(self, keywords=None, activity_type=None, workout_type=None,
                 activity = ScrapedActivity(bind_client=self, **activity)

                 # Respect the before and after filters
-                if after < activity.start_date.timestamp() < before:
-                    yield activity
-                    num_yielded += 1
+                # Will see activities from newest to oldest so can do less
+                # work to limit by time
+                ts = activity.start_date.timestamp()
+                if ts < after:
+                    # Activity is too old; no more results (listing is newest-first)
+                    return
+                elif ts > before:
+                    # Activity is too new, skip it
+                    continue

-            # No results = done
-            if not data:
-                return
+                yield activity
+                num_yielded += 1
+
+            page += 1

From 3cb2f48eb1631cace3668725de8a325ca888ddab Mon Sep 17 00:00:00 2001
From: Carey Metcalfe
Date: Wed, 4 Nov 2020 10:30:31 -0500
Subject: [PATCH 10/23] Refactor `delete_activity` to use `request_post`
---
 stravaweblib/webclient.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py
index f3c2529..965763d 100644
--- a/stravaweblib/webclient.py
+++ b/stravaweblib/webclient.py
@@ 
-441,8 +441,9 @@ def delete_activity(self, activity_id):
         :param activity_id: The activity to delete.
         :type activity_id: int
         """
-        resp = self._session.post(
-            "{}/activities/{}".format(BASE_URL, activity_id),
+        __log__.debug("Deleting activity %s", activity_id)
+        resp = self.request_post(
+            "activities/{}".format(activity_id),
             allow_redirects=False,
             data={
                 "_method": "delete",
@@ -450,7 +451,7 @@ def delete_activity(self, activity_id):
             }
         )

-        if not resp.is_redirect or resp.next.url != "{}/athlete/training".format(BASE_URL):
+        if not resp.is_redirect or not resp.next.url.endswith("/athlete/training"):
             raise stravalib.exc.Fault(
                 "Failed to delete activity (status code: {})".format(resp.status_code),
             )

From 0e2edb1146d60c2857e6ee5b04bd509849da8c93 Mon Sep 17 00:00:00 2001
From: Carey Metcalfe
Date: Wed, 4 Nov 2020 11:27:17 -0500
Subject: [PATCH 11/23] WIP

- Move models to a separate file
- Add more detailed scraping of activity details
- Add more detailed scraping of bike data
---
 stravaweblib/model.py     | 319 ++++++++++++++++++++++++++++++++++++++
 stravaweblib/webclient.py | 298 +++++++++++++++----------------------
 2 files changed, 428 insertions(+), 189 deletions(-)
 create mode 100644 stravaweblib/model.py

diff --git a/stravaweblib/model.py b/stravaweblib/model.py
new file mode 100644
index 0000000..47a37af
--- /dev/null
+++ b/stravaweblib/model.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python
+
+import enum
+from datetime import date, datetime
+
+from stravalib.attributes import (Attribute, DateAttribute, TimestampAttribute,
+                                  TimeIntervalAttribute, LocationAttribute)
+from stravalib.model import (BaseEntity, BoundEntity, LoadableEntity,
+                             Bike as _Bike)
+from stravalib import unithelper as uh
+
+
+def _parse_component_date(date_str):
+    if not date_str:
+        return None
+    if date_str.lower() == "since beginning":
+        # Different from no date, but don't know exactly when it was
+        return datetime.utcfromtimestamp(0).date()
+    try:
+        return datetime.strptime(date_str, "%b %d, %Y").date()
+    except ValueError:
+        return None
+
+
+def _dict_modify(d, prev, target, overwrite=True, default=None, fcn=None):
+    """Translate the prev key to target
+
+    Only non-None values will be set.
+
+    If overwrite is True, the target key will be overwritten even if something truthy is already there.
+    default is the value to use if the prev key is not available.
+    fcn is a function that the value will be passed through before being set. 
+ """ + if not overwrite and d.get(target): + return + + t = d.pop(prev, default) + if t is None: + return + if fcn: + t = fcn(t) + if t is None: + return + d[target] = t + + +class DataFormat(enum.Enum): + ORIGINAL = "original" + GPX = "gpx" + TCX = "tcx" + + def __str__(self): + return str(self.value) + + +class FrameType(enum.Enum): + MOUNTAIN_BIKE = 1 + CROSS_BIKE = 2 + ROAD_BIKE = 3 + TIME_TRIAL_BIKE = 4 + + def __str__(self): + return str(self.name).replace("_", " ").title() + + @classmethod + def from_str(cls, s): + if isinstance(s, cls): + return s + return cls[s.replace(" ", "_").upper().replace("TT_", "TIME_TRIAL_")] + + +class ExpandableEntity(LoadableEntity): + """Allows for an object to be "expanded" on demand""" + + _expanded = False + _expandable = set() + + def __getattribute__(self, k): + if k != "_expandable" and k in self._expandable and not self._expanded: + self.expand() + return super().__getattribute__(k) + + def _do_expand(self, d, overwrite=True): + if self._expanded: + return + + if overwrite: + self.from_dict(d) + self._expanded = True + return + + # Only set non-null attributes + # Mark as expanded before doing the expansion so __getatttribute__ + # doesn't cause infinte recursion + try: + self._expanded = True + self.from_dict({ + k: v for k, v in d.items() + if not getattr(self, k, None) + }) + except Exception: + self._expanded = False + raise + + def expand(self): + # Needs to call self._do_expand with some data + raise NotImplementedError() + + +class ScrapedGear(BaseEntity): + """Represents gear scraped from Strava + + The attributes are compatible with stravalib.model.Gear where they exist + """ + id = Attribute(str) + name = Attribute(str) + distance = Attribute(float, units=uh.meters) + primary = Attribute(bool) + brand_name = Attribute(str) + model_name = Attribute(str) + description = Attribute(str) + + def from_dict(self, d): + _dict_modify(d, "display_name", "name", overwrite=False) + _dict_modify(d, "default", "primary", overwrite=False) + _dict_modify(d, "total_distance", "distance", overwrite=False, + fcn=lambda x: float(x.replace(",", "")) * 1000) + + return super().from_dict(d) + + def __repr__(self): + return "<{} id={} name={!r}>".format( + self.__class__.__name__, + self.id, + self.name + ) + + +class ScrapedShoe(ScrapedGear): + """Represents a pair of shoes scraped from Strava + + The attributes are compatible with stravalib.model.Shoe where they exist + """ + pass + + +class ScrapedBikeComponent(BaseEntity): + """Represents a bike component scraped from Strava""" + + id = Attribute(int) + type = Attribute(str) + brand_name = Attribute(str) + model_name = Attribute(str) + added = DateAttribute() + removed = DateAttribute() + distance = Attribute(int, units=uh.meters) + + def from_dict(self, d): + # Parse and convert dates into something DateAttribute can understand + _dict_modify(d, "added", "added", fcn=_parse_component_date) + _dict_modify(d, "removed", "removed", fcn=_parse_component_date) + + return super().from_dict(d) + + def __repr__(self): + return "<{} id={} type={!r}>".format( + self.__class__.__name__, + self.id, + self.type + ) + + +class _BikeData(ExpandableEntity): + """Mixin class to add weight and components to a Bike""" + frame_type = Attribute(FrameType) + components = Attribute(list) + weight = Attribute(float, units=uh.kg) + + _expandable = {"weight", "components"} + + def expand(self): + """Expand the bike with more details using scraping""" + self.assert_bind_client() + 
self._do_expand(self.bind_client.get_bike_details(self.id)) + + def components_on_date(self, on_date): + """Get bike components installed on the specified date + + :type on_date: None or datetime.date or datetime.datetime + (datetimes will lose time-precision) + """ + if on_date is None: + return self.components + + if isinstance(on_date, datetime): + on_date = on_date.date() + + return [ + c for c in self.components + if (c.added or date.min) <= on_date <= (c.removed or date.max) + ] + + def from_dict(self, d): + # Upgrade the frame_type to the enum + _dict_modify(d, "frame_type", "frame_type", fcn=lambda x: FrameType(x)) + return super().from_dict(d) + + +class Bike(_BikeData, _Bike) : + __doc__ = _Bike.__doc__ + """ + Scraping adds weight and components attributes + """ + + +class ScrapedBike(ScrapedGear, _BikeData): + """Represents a bike scraped from Strava + + The attributes are compatible with stravalib.models.Bike where they exist. + """ + + _expandable = {'frame_type', 'brand_name', 'model_name'} + + def from_dict(self, d): + # Upgrade the scraped frame_type string to the enum + _dict_modify(d, "frame_type", "frame_type", fcn=lambda x: FrameType.from_str(x)) + return super().from_dict(d) + + +class ScrapedActivityPhoto(BaseEntity): + """Represents a photo scraped from Strava's activity details page + + The attributes are compatible with stravalib.models.ActivityPhoto where + they exist. + """ + + unique_id = Attribute(str) + activity_id = Attribute(int) + athlete_id = Attribute(int) + caption = Attribute(str) + + location = LocationAttribute() + + urls = Attribute(dict) # dimension: url + + def from_dict(self, d): + _dict_modify(d, "photo_id", "unique_id") + _dict_modify(d, "owner_id", "athlete_id") + + # The caption has unicode escapes (ie. \uFFFF) embedded in the string + _dict_modify(d, "caption_escaped", "caption", fcn=lambda x: x.encode("utf-8").decode("unicode_escape")) + + if "dimensions" in d: + d["urls"] = { + str(min(dim.values())): d.pop(name) + for name, dim in d.pop("dimensions").items() + } + if "lat" in d and "lng" in d: + d["location"] = [d.pop("lat"), d.pop("lng")] + + return super().from_dict(d) + + +class ScrapedActivity(ExpandableEntity): + """ + Represents an Activity (ride, run, etc.) 
that was scraped from the website
+
+    The attributes are compatible with stravalib.model.Activity where they exist
+    """
+
+    name = Attribute(str)
+    description = Attribute(str)
+    type = Attribute(str)
+    workout_type = Attribute(str)
+
+    start_date = TimestampAttribute()
+    distance = Attribute(float)
+    moving_time = TimeIntervalAttribute()
+    elapsed_time = TimeIntervalAttribute()
+    total_elevation_gain = Attribute(float)
+    suffer_score = Attribute(int)
+    calories = Attribute(float)
+    gear_id = Attribute(str)
+
+    # True if the activity has GPS coordinates
+    # False for trainers, manual activities, etc
+    has_latlng = Attribute(bool)
+
+    trainer = Attribute(bool)
+    commute = Attribute(bool)
+    private = Attribute(bool)
+    flagged = Attribute(bool)
+
+    manual = Attribute(bool)
+    photos = Attribute(list)  # list of ScrapedActivityPhoto objects
+    device_name = Attribute(str)
+
+    _expandable = {"photos", "manual", "device_name"}
+
+    def expand(self):
+        """Expand the activity with more details using scraping"""
+        self.assert_bind_client()
+        self._do_expand(self.bind_client.get_extra_activity_details(self.id), overwrite=False)
+
+    @property
+    def total_photo_count(self):
+        return len(self.photos)
+
+    def from_dict(self, d):
+        # Only 1 of these will set the gear_id
+        _dict_modify(d, "bike_id", "gear_id", fcn=lambda x: "b{}".format(x))
+        _dict_modify(d, "athlete_gear_id", "gear_id", fcn=lambda x: "g{}".format(x))
+
+        _dict_modify(d, "start_time", "start_date")
+        _dict_modify(d, "distance_raw", "distance")
+        _dict_modify(d, "moving_time_raw", "moving_time")
+        _dict_modify(d, "elapsed_time_raw", "elapsed_time")
+        _dict_modify(d, "elevation_gain_raw", "total_elevation_gain")
+
+        return super().from_dict(d)
diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py
index 965763d..48d4393 100644
--- a/stravaweblib/webclient.py
+++ b/stravaweblib/webclient.py
@@ -1,8 +1,8 @@
+#!/usr/bin/env python
 from base64 import b64decode
 import cgi
 from collections import namedtuple
-from datetime import date, datetime
+from datetime import datetime
 import functools
 import json
 import logging
 import re
 import time
 import uuid

 from bs4 import BeautifulSoup
 import requests
 import stravalib
-from stravalib.attributes import (Attribute, TimestampAttribute,
-                                  TimeIntervalAttribute, LocationAttribute)
-from stravalib.model import Activity, BaseEntity, BoundEntity
+from stravalib.model import Activity, Bike as _Bike

+from stravaweblib.model import (DataFormat, ScrapedShoe, Bike, ScrapedBike,
+                                ScrapedBikeComponent, ScrapedActivity,
+                                ScrapedActivityPhoto)

-__all__ = [
-    "WebClient", "ScrapingClient", "FrameType", "DataFormat", "ExportFile",
-    "ActivityFile", "ScrapedActivity", "ScrapedPhoto"
-]

 __log__ = logging.getLogger(__name__)

-BASE_URL = "https://www.strava.com"

 # Used for filtering when scraping the activity list
 ACTIVITY_WORKOUT_TYPES = {
     "Ride": {None: 10, "Race": 11, "Workout": 12},
     "Run": {None: 0, "Race": 1, "Long Run": 2, "Workout": 3}
 }

+# Regexes for pulling information out of the activity details page
 PHOTOS_REGEX = re.compile(r"var photosJson\s*=\s*(\[.*\]);")
+PAGE_VIEW_REGEX = re.compile(r"pageView\s*=\s*new\s+Strava.Labs.Activities.Pages.(\S+)PageView\([\"']?\d+[\"']?,\s*[\"']([^\"']+)")
+
+NON_NUMBERS = re.compile(r'[^\d\.]')

 ExportFile = namedtuple("ExportFile", ("filename", "content"))
 ActivityFile = ExportFile  # TODO: deprecate and remove
@@ -46,132 +45,6 @@ class ScrapingError(ValueError):
     """An error that is raised when something fails during scraping
     """

-class ScrapedPhoto(BaseEntity):
-    """Represents a photo scraped from Strava's activity details page
-
-    The attributes 
are compatible with stravalib.models.ActivityPhoto where - they exist. - """ - - unique_id = Attribute(str) - activity_id = Attribute(int) - athlete_id = Attribute(int) - caption = Attribute(str) - - location = LocationAttribute() - - urls = Attribute(dict) # dimension: url - - def from_dict(self, d): - d["unique_id"] = d.pop("photo_id") - d["athlete_id"] = d.pop("owner_id") - - # The caption has unicode escapes (ie. \uFFFF) embedded in the string - d["caption"] = d.pop("caption_escaped", "").encode("utf-8").decode("unicode_escape") - d["urls"] = { - str(min(dim.values())): d.pop(name) - for name, dim in d.pop("dimensions").items() - } - d["location"] = [d.pop("lat"), d.pop("lng")] - - return super().from_dict(d) - - -class ScrapedActivity(BoundEntity): - """ - Represents an Activity (ride, run, etc.) that was scraped from the website - - The attributes are compatible with stravalib.model.Activity where they exist - """ - - id = Attribute(int) - name = Attribute(str) - description = Attribute(str) - type = Attribute(str) - workout_type = Attribute(str) - - start_date = TimestampAttribute() - distance = Attribute(float) - moving_time = TimeIntervalAttribute() - elapsed_time = TimeIntervalAttribute() - total_elevation_gain = Attribute(float) - suffer_score = Attribute(int) - calories = Attribute(float) - gear_id = Attribute(str) - - # True if the activity has GPS coordinates - # False for trainers, manual activities, etc - has_latlng = Attribute(bool) - - trainer = Attribute(bool) - commute = Attribute(bool) - private = Attribute(bool) - flagged = Attribute(bool) - - _photos = None - - @property - def photos(self): - """Returns a list of ScrapedPhoto objects""" - if self._photos is None: - self.assert_bind_client() - self._photos = self.bind_client.get_activity_photos(self.id) - return self._photos - - @property - def total_photo_count(self): - return len(self.photos) - - def from_dict(self, d): - bike_id = d.pop("bike_id", None) - shoes_id = d.pop("athlete_gear_id", None) - if bike_id: - d["gear_id"] = "b{}".format(bike_id) - elif shoes_id: - d["gear_id"] = "g{}".format(shoes_id) - - d["start_date"] = d.pop("start_time") - d["distance"] = d.pop("distance_raw") - d["moving_time"] = d.pop("moving_time_raw") - d["elapsed_time"] = d.pop("elapsed_time_raw") - d["total_elevation_gain"] = d.pop("elevation_gain_raw") - - wt = d.pop("workout_type") - if d["type"] in ACTIVITY_WORKOUT_TYPES: - for k, v in ACTIVITY_WORKOUT_TYPES[d["type"]].items(): - if wt == v: - d["workout_type"] = k - break - - return super().from_dict(d) - - -class DataFormat(enum.Enum): - ORIGINAL = "original" - GPX = "gpx" - TCX = "tcx" - - def __str__(self): - return str(self.value) - - @classmethod - def classify(cls, value): - for x in cls: - if x.value == str(value): - return x - raise ValueError("Invalid format '{}'".format(value)) - - -class FrameType(enum.Enum): - MOUNTAIN_BIKE = 1 - CROSS_BIKE = 2 - ROAD_BIKE = 3 - TIME_TRIAL_BIKE = 4 - - def __str__(self): - return str(self.name).replace("_", " ").title() - - class ScrapingClient: """ A client that uses web scraping to interface with Strava. 
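
The next hunk scrapes the activity details page using the PHOTOS_REGEX and
PAGE_VIEW_REGEX patterns added above. A minimal sketch of how they are expected
to match (the inline script content below is invented for illustration; the
real page markup may differ):

    import json
    import re

    PHOTOS_REGEX = re.compile(r"var photosJson\s*=\s*(\[.*\]);")
    PAGE_VIEW_REGEX = re.compile(r"pageView\s*=\s*new\s+Strava.Labs.Activities.Pages.(\S+)PageView\([\"']?\d+[\"']?,\s*[\"']([^\"']+)")

    # Invented stand-in for the inline <script> content on an activity page
    script = """
    var photosJson = [{"photo_id": 1234, "caption_escaped": "sunset"}];
    pageView = new Strava.Labs.Activities.Pages.ManualPageView('42', 'Ride');
    """

    photos = json.loads(PHOTOS_REGEX.search(script).group(1))
    m = PAGE_VIEW_REGEX.search(script)
    print(photos[0]["photo_id"])                        # -> 1234
    print(m.group(1).lower() == "manual", m.group(2))   # -> True Ride
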
@@ -289,6 +162,55 @@ def _login_with_password(self, email, password):
         if not resp.is_redirect or resp.next.url.endswith("/login"):
             raise stravalib.exc.LoginFailed("Couldn't log in to website, check creds")

+    def get_extra_activity_details(self, activity_id):
+        """Scrapes the full activity page for various details
+
+        Returns a dict of the properties
+        """
+        __log__.debug("Getting extra information for activity %s", activity_id)
+        resp = self.request_get("activities/{}".format(activity_id))
+        if not resp.ok:
+            raise stravalib.exc.Fault("Failed to load activity page to get details")
+
+        ret = {}
+
+        soup = BeautifulSoup(resp.text, 'html5lib')
+
+        summary = soup.find("div", class_="activity-summary-container")
+        if summary:
+            name = summary.find("h1", class_="activity-name")
+            if name:
+                ret["name"] = name.text.strip()
+            description = summary.find("div", class_="activity-description")
+            if description:
+                ret["description"] = description.text.strip()
+            device = summary.find("div", class_="device")
+            if device:
+                ret["device_name"] = device.text.strip()
+
+        for script in soup.find_all("script"):
+            if "var pageView;" in script.text:
+                m = PAGE_VIEW_REGEX.search(script.text)
+                if not m:
+                    __log__.error("Failed to extract manual and type data from page")
+                    continue
+                ret["manual"] = m.group(1).lower() == "manual"
+                ret["type"] = m.group(2)
+
+            elif "var photosJson" in script.text:
+                m = PHOTOS_REGEX.search(script.text)
+                if not m:
+                    __log__.error("Failed to extract photo data from page")
+                    continue
+                try:
+                    photos = json.loads(m.group(1))
+                except (TypeError, ValueError):
+                    __log__.error("Failed to parse extracted photo data", exc_info=True)
+                    continue
+                ret["photos"] = [ScrapedActivityPhoto(**p) for p in photos]
+
+        return ret
+
     def get_activity_photos(self, activity_id, size=None, only_instagram=None):
         """A scraping-based alternative to stravalib.Client.get_activity_photos

@@ -296,27 +218,9 @@ def get_activity_photos(self, activity_id, size=None, only_instagram=None):
         :param size: [unused] (for compatibility with stravalib)
         :param only_instagram: [unused] (for compatibility with stravalib)

-        :return: A list of ScrapedPhoto objects
+        :return: A list of ScrapedActivityPhoto objects
         """
-        resp = self._session.get("{}/activities/{}".format(BASE_URL, activity_id))
-        resp.raise_for_status()
-
-        soup = BeautifulSoup(resp.content, 'html5lib')
-        try:
-            script = next((x for x in soup.find_all("script") if "var photosJson" in x.text))
-        except StopIteration:
-            raise ScrapingError("Failed to find photo data in page")
-
-        m = PHOTOS_REGEX.search(script.text)
-        if not m:
-            raise ScrapingError("Failed to extract photo data from page")
-
-        try:
-            photos = json.loads(m.group(1))
-        except (TypeError, ValueError) as e:
-            raise ScrapingError("Failed to parse extracted photo data") from e
-
-        return [ScrapedPhoto(**p) for p in photos]
+        return self.get_extra_activity_details(activity_id).get("photos", None)

     def get_activities(self, keywords=None, activity_type=None, workout_type=None,
                        commute=False, is_private=False, indoor=False, gear_id=None,
@@ -416,6 +320,14 @@ def get_activities(self, keywords=None, activity_type=None, workout_type=None,
             if limit is not None and num_yielded >= limit:
                 return

+            # Translate workout types from ints back to strings
+            wt = activity.pop("workout_type")
+            if activity["type"] in ACTIVITY_WORKOUT_TYPES:
+                for k, v in ACTIVITY_WORKOUT_TYPES[activity["type"]].items():
+                    if wt == v:
+                        activity["workout_type"] = k
+                        break
+
             activity = ScrapedActivity(bind_client=self, **activity)

             # 
Respect the before and after filters
@@ -534,49 +446,57 @@ def get_activity_data(self, activity_id, fmt=DataFormat.ORIGINAL, json_fmt=None)

         return self._make_export_file(resp, activity_id)

-    @staticmethod
-    def _parse_date(date_str):
-        if not date_str:
-            return None
-        if date_str.lower() == "since beginning":
-            # Different from no date, but don't know exactly when it was
-            return datetime.utcfromtimestamp(0).date()
-        try:
-            return datetime.strptime(date_str, "%b %d, %Y").date()
-        except ValueError:
-            return None
-
-    def _get_all_bike_components(self, bike_id):
+    def get_bike_details(self, bike_id):
         """
-        Get all components for the specified bike
+        Scrape the details of the specified bike

         :param bike_id: The id of the bike to retrieve components for
                         (must start with a "b")
         :type bike_id: str
         """
+        __log__.debug("Getting bike details for bike %s", bike_id)
         if not bike_id.startswith('b'):
             raise ValueError("Invalid bike id (must start with 'b')")

-        # chop off the leading "b"
-        url = "{}/bikes/{}".format(BASE_URL, bike_id[1:])
-
-        resp = self._session.get(url, allow_redirects=False)
+        resp = self.request_get(
+            "bikes/{}".format(bike_id[1:]),  # chop off the leading "b"
+            allow_redirects=False
+        )
         if resp.status_code != 200:
             raise stravalib.exc.Fault(
                 "Failed to load bike details page (status code: {})".format(resp.status_code),
             )

         soup = BeautifulSoup(resp.text, 'html5lib')
+
+        ret = {}
+
+        # Get data about the bike
+        gear_table = soup.find("div", class_="gear-details").find("table")
+        for k, v in zip(
+                ["frame_type", "brand_name", "model_name", "weight"],
+                [x.text for x in gear_table.find_all("td")][1::2]
+        ):
+            if not k:
+                continue
+            if k == "weight":
+                # Strip non-number chars ("kg")
+                # TODO: other units?
+                v = float(NON_NUMBERS.sub('', v))
+            ret[k.lower()] = v
+
+        # Get component data
         table = None
         for t in soup.find_all('table'):
             if t.find('thead'):
                 table = t
                 break
+        else:
+            raise ScrapingError(
+                "Bike component table not found in the HTML - layout update?" 
+            )

-        if not table:
-            raise ScrapingError("Bike component table not found in the HTML - layout update?")
-
-        components = []
+        ret["components"] = []
         for row in table.tbody.find_all('tr'):
             cells = row.find_all('td')
             text = [cell.text.strip() for cell in cells]
@@ -591,16 +511,16 @@ def _get_all_bike_components(self, bike_id):

             component_id = cells[6].find('a', text="Delete")['href'].rsplit("/", 1)[-1]

-            components.append({
-                'id': component_id,
-                'type': text[0],
-                'brand': text[1],
-                'model': text[2],
-                'added': self._parse_date(text[3]),
-                'removed': self._parse_date(text[4]),
-                'distance': distance
-            })
-        return components
+            ret["components"].append(ScrapedBikeComponent(
+                id=component_id,
+                type=text[0],
+                brand_name=text[1],
+                model_name=text[2],
+                added=text[3],
+                removed=text[4],
+                distance=distance
+            ))
+        return ret

From 99d0b775daeee30f88682ce211c95d169fd8d82d Mon Sep 17 00:00:00 2001
From: Carey Metcalfe
Date: Tue, 10 Nov 2020 23:03:07 -0500
Subject: [PATCH 12/23] Pull unicode_escapes out
---
 stravaweblib/model.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/stravaweblib/model.py b/stravaweblib/model.py
index 47a37af..6cab1c0 100644
--- a/stravaweblib/model.py
+++ b/stravaweblib/model.py
@@ -21,6 +21,10 @@ def _parse_component_date(date_str):
     except ValueError:
         return None

+def _decode_unicode_escapes(s):
+    """Decodes unicode escapes (ie. \uFFFF) embedded in a string"""
+    return s.encode("utf-8").decode("unicode_escape")
+

 def _dict_modify(d, prev, target, overwrite=True, default=None, fcn=None):
     """Translate the prev key to target
@@ -247,7 +251,7 @@ def from_dict(self, d):
         _dict_modify(d, "owner_id", "athlete_id")

         # The caption has unicode escapes (ie. \uFFFF) embedded in the string
-        _dict_modify(d, "caption_escaped", "caption", fcn=lambda x: x.encode("utf-8").decode("unicode_escape"))
+        _dict_modify(d, "caption_escaped", "caption", fcn=_decode_unicode_escapes)

         if "dimensions" in d:
             d["urls"] = {

From 2ae1c3d69733f7127fe114d2f6abb6f77a8eb659 Mon Sep 17 00:00:00 2001
From: Carey Metcalfe
Date: Wed, 4 Nov 2020 11:34:16 -0500
Subject: [PATCH 13/23] Implement a replacement for `get_activity`
---
 stravaweblib/webclient.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py
index 48d4393..d13cbb2 100644
--- a/stravaweblib/webclient.py
+++ b/stravaweblib/webclient.py
@@ -346,6 +346,19 @@ def get_activities(self, keywords=None, activity_type=None, workout_type=None,

             page += 1

+    def get_activity(self, activity_id):
+        """A scraping-based alternative to stravalib.Client.get_activity
+
+        Note that this actually performs a search for the activity using
+        `get_activities` to get most of the information. Generally, it would be
+        more efficient to use `get_activities` to find the activities directly.
+        """
+        d = self.get_extra_activity_details(activity_id)
+        for x in self.get_activities(keywords=d["name"], activity_type=d["type"]):
+            if x.id == activity_id:
+                x._do_expand(d, overwrite=False)
+                return x
+
     def delete_activity(self, activity_id):
         """
         Deletes the specified activity. 
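
With `get_activity` in place, the scraping client now covers the most common
read paths of stravalib.Client. A usage sketch (the credentials and activity id
below are made up; a real session needs a valid login):

    from stravaweblib.webclient import ScrapingClient

    client = ScrapingClient(email="athlete@example.com", password="hunter2")

    # Iterating the filtered listing directly is the efficient path
    for activity in client.get_activities(activity_type="Ride", limit=5):
        print(activity.id, activity.name, activity.start_date)

    # Single-activity lookup reuses the same search under the hood
    activity = client.get_activity(1234567890)
    print(activity.type, activity.total_photo_count)
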
From 030b3a3572e631891f9d96e592d826a2dc931c2e Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Wed, 4 Nov 2020 11:32:27 -0500 Subject: [PATCH 14/23] Add scraped components to Bikes returned from get_gear Replaces `get_bike_components` --- setup.py | 2 +- stravaweblib/webclient.py | 45 ++++++++++++++------------------------- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/setup.py b/setup.py index 81c1798..9104b99 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ packages=["stravaweblib"], python_requires=">=3.4.0", install_requires=[ - "stravalib>=0.6.6,<1.0.0", + "stravalib>=0.10.4,<1.0.0", "beautifulsoup4>=4.6.0,<5.0.0", ], ) diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py index d13cbb2..d594046 100644 --- a/stravaweblib/webclient.py +++ b/stravaweblib/webclient.py @@ -535,29 +535,6 @@ def get_bike_details(self, bike_id): )) return ret - def get_bike_components(self, bike_id, on_date=None): - """ - Get components for the specified bike - - :param bike_id: The id of the bike to retreive components for - (must start with a "b") - :type bike_id: str - - :param on_date: Only return components on the bike for this day. If - `None`, return all components regardless of date. - :type on_date: None or datetime.date or datetime.datetime - """ - components = self._get_all_bike_components(bike_id) - - # Filter by the on_date param - if on_date: - if isinstance(on_date, datetime): - on_date = on_date.date() - return [c for c in components if \ - (c['added'] or date.min) <= on_date <= (c['removed'] or date.max)] - else: - return components - def get_route_data(self, route_id, fmt=DataFormat.GPX): """ Get a file containing the provided route's data @@ -596,14 +573,15 @@ class WebClient(stravalib.Client): Requires a JWT or both of email and password """ - def __new__(cls, *args, **kwargs): + def __new__(cls, *_, **__): self = super().__new__(cls) - # Prepend __init__'s docstring with the parent classes one - cls.__init__.__doc__ = super().__init__.__doc__ + cls.__init__.__doc__ + # Prepend some docstrings with the parent classes one + for fcn in ("__init__", "get_gear"): + getattr(cls, fcn).__doc__ = getattr(super(), fcn).__doc__ + getattr(cls, fcn).__doc__ # Delegate certain methods and properties to the scraper instance - for fcn in ("delete_activity", "get_bike_components", "get_activity_data", "jwt", "csrf"): + for fcn in ("delete_activity", "get_activity_data", "jwt", "csrf"): setattr(cls, fcn, cls._delegate(ScrapingClient, fcn)) return self @@ -634,9 +612,18 @@ def __init__(self, *args, **kwargs): if self._scraper.athlete_id != self.get_athlete().id: raise ValueError("API and web credentials are for different accounts") + def get_gear(self, gear_id): + """ + Returned Bikes will have scraped attributes lazily added + """ + gear = super().get_gear(gear_id) + if isinstance(gear, _Bike): + return Bike(bind_client=self._scraper, **gear.to_dict()) + return gear + @staticmethod - def _delegate(cls, name): - func = getattr(cls, name) + def _delegate(clazz, name): + func = getattr(clazz, name) is_prop = isinstance(func, property) @functools.wraps(func) From c13b6bded16bcab9941a1b4c2eb5c9e038500c60 Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Wed, 4 Nov 2020 11:55:02 -0500 Subject: [PATCH 15/23] Implement a scraping-based `get_gear` function --- stravaweblib/webclient.py | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py index d594046..68d3dbe 100644 --- 
a/stravaweblib/webclient.py +++ b/stravaweblib/webclient.py @@ -563,6 +563,50 @@ def get_route_data(self, route_id, fmt=DataFormat.GPX): return self._make_export_file(resp, route_id) + def get_all_bikes(self): + """Scrape all bike information from Strava + + :yield: `ScrapedBike` objects + """ + __log__.debug("Getting all bike data") + resp = self.request_get("athletes/{}/gear/bikes".format(self.athlete_id)) + if not resp.ok: + raise stravalib.exc.Fault("Failed to get list of bikes") + try: + yield from ( + ScrapedBike( + bind_client=self, + id="b{}".format(b.pop("id")), # add "b" to gear id + **b + ) + for b in resp.json() + ) + except (TypeError, ValueError) as e: + raise ScrapingError("Failed to parse bike data") from e + + def get_all_shoes(self): + """Scrape all shoe information from Strava + + :yield: `ScrapedShoe` objects + """ + __log__.debug("Getting all shoe data") + resp = self.request_get("athletes/{}/gear/shoes".format(self.athlete_id)) + if not resp.ok: + raise stravalib.exc.Fault("Failed to get list of shoes") + try: + yield from (ScrapedShoe(**s) for s in resp.json()) + except (TypeError, ValueError) as e: + raise ScrapingError("Failed to parse shoe data") from e + + def get_gear(self, gear_id): + """A scraping-based replacement for `stravalib.Client.get_gear`""" + try: + if gear_id.startswith("b"): + return next(x for x in self.get_all_bikes() if x.id == gear_id) + else: + return next(x for x in self.get_all_shoes() if x.id == gear_id) + except StopIteration: + raise KeyError("No gear with id '{}' found".format(gear_id)) class WebClient(stravalib.Client): From 4eba61aa5fede95a2eb92222ddfd9365499925ab Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Wed, 4 Nov 2020 22:26:23 -0500 Subject: [PATCH 16/23] Use EntityCollection type for lists of entities --- stravaweblib/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stravaweblib/model.py b/stravaweblib/model.py index 6cab1c0..7654051 100644 --- a/stravaweblib/model.py +++ b/stravaweblib/model.py @@ -6,7 +6,7 @@ from stravalib.attributes import (Attribute, DateAttribute, TimestampAttribute, TimeIntervalAttribute, LocationAttribute) from stravalib.model import (BaseEntity, BoundEntity, LoadableEntity, - Bike as _Bike) + EntityCollection, Bike as _Bike) from stravalib import unithelper as uh @@ -177,7 +177,7 @@ def __repr__(self): class _BikeData(ExpandableEntity): """Mixin class to add weight and components to a Bike""" frame_type = Attribute(FrameType) - components = Attribute(list) + components = Attribute(EntityCollection(ScrapedBikeComponent)) weight = Attribute(float, units=uh.kg) _expandable = {"weight", "components"} @@ -295,7 +295,7 @@ class ScrapedActivity(ExpandableEntity): flagged = Attribute(bool) manual = Attribute(bool) - photos = Attribute(list) # list of ScrapedActivityPhoto objects + photos = Attribute(EntityCollection(ScrapedActivityPhoto)) device_name = Attribute(str) _expandable = {"photos", "manual", "device_name"} From dde930baa5be43a1328f4fdfddf810d13c687949 Mon Sep 17 00:00:00 2001 From: Carey Metcalfe Date: Wed, 4 Nov 2020 22:40:20 -0500 Subject: [PATCH 17/23] Refactor how lazy loading works --- stravaweblib/model.py | 146 +++++++++++++++++++++++------------------- 1 file changed, 81 insertions(+), 65 deletions(-) diff --git a/stravaweblib/model.py b/stravaweblib/model.py index 7654051..52dc4ac 100644 --- a/stravaweblib/model.py +++ b/stravaweblib/model.py @@ -73,42 +73,71 @@ def from_str(cls, s): return cls[s.replace(" ", "_").upper().replace("TT_", "TIME_TRIAL_")] 
-class ExpandableEntity(LoadableEntity):
-    """Allows for an object to be "expanded" on demand"""
-
-    _expanded = False
-    _expandable = set()
-
-    def __getattribute__(self, k):
-        if k != "_expandable" and k in self._expandable and not self._expanded:
-            self.expand()
-        return super().__getattribute__(k)
-
-    def _do_expand(self, d, overwrite=True):
-        if self._expanded:
-            return
-
-        if overwrite:
-            self.from_dict(d)
-            self._expanded = True
-            return
-
-        # Only set non-null attributes
-        # Mark as expanded before doing the expansion so __getatttribute__
-        # doesn't cause infinte recursion
-        try:
-            self._expanded = True
-            self.from_dict({
-                k: v for k, v in d.items()
-                if not getattr(self, k, None)
-            })
-        except Exception:
-            self._expanded = False
-            raise
-
-    def expand(self):
-        # Needs to call self._do_expand with some data
-        raise NotImplementedError()
+class MetaLazy(type):
+    """A metaclass that returns subclasses of the class of the passed in Attribute
+
+    This is used with the LazyLoaded class wrapper below to dynamically create
+    lazy-loaded subclasses.
+
+    Also, it names the returned types LazyLoaded<attr class name>
+    """
+    def __call__(cls, attr, *args, **kwargs):
+        attr_cls = attr.__class__
+        cls = cls.__class__(cls.__name__ + attr_cls.__name__, (cls, attr_cls), {})
+        return super(MetaLazy, cls).__call__(attr, *args, **kwargs)
+
+
+class LazyLoaded(metaclass=MetaLazy):
+    """Class wrapper that handles lazy-loading an Attribute as it is requested"""
+
+    def __init__(self, attr, fcn=None, key=None):
+        """Set up the LazyLoaded wrapper
+
+        Can expand attributes individually using a lambda function (fcn), or
+        multiple attributes at a time via an `expand` function defined on the
+        class that houses it (key).
+
+        Using `fcn`-based attributes is recommended when each attribute needs
+        to be retrieved separately. Using `key`-based attributes is recommended
+        when multiple attributes can be retrieved at the same time.
+
+        :param attr: The `Attribute` to wrap (ie. `Attribute(int)`)
+        :param fcn: This function will be called the first time the attribute
+                    is requested. The result will be set as the attribute value.
+        :param key: The key of the attribute in the lazyload cache. The lazyload
+                    cache is stored on the parent class. When this attribute is
+                    requested and the key is not in the cache, the `load_attribute`
+                    function on the parent class is called and the result is
+                    added to the cache. At this point, the key is poped out of
+                    the cache and set as the attribute variable.
+ """ + if not (bool(fcn) ^ bool(key)): + raise ValueError("One of fcn or key (not both) is required") + self._fcn = fcn + self._key = key + # Mimic the child Attribute's properties + super().__init__( + type_=attr.type, + resource_states=attr.resource_states, + units=attr.units + ) + + def __get__(self, obj, clazz): + if obj is not None and obj not in self.data: + if self._fcn: + # Call the provided function to load the attribute + value = self._fcn(obj) + elif self._key: + if not hasattr(obj, "_lazyload_cache"): + obj._lazyload_cache = {} + + # Use obj.load_attribute() to ensure the object is in the cache + if self._key not in obj._lazyload_cache: + obj._lazyload_cache.update(obj.load_attribute(self._key)) + value = obj._lazyload_cache.pop(self._key) + + self.__set__(obj, value) + return super().__get__(obj, clazz) class ScrapedGear(BaseEntity): @@ -174,18 +203,20 @@ def __repr__(self): ) -class _BikeData(ExpandableEntity): +class _BikeData(LoadableEntity): """Mixin class to add weight and components to a Bike""" - frame_type = Attribute(FrameType) - components = Attribute(EntityCollection(ScrapedBikeComponent)) - weight = Attribute(float, units=uh.kg) - - _expandable = {"weight", "components"} + frame_type = LazyLoaded(Attribute(FrameType), key="frame_type") + components = LazyLoaded(EntityCollection(ScrapedBikeComponent), key="components") + weight = LazyLoaded(Attribute(float, units=uh.kg), key="weight") - def expand(self): + def load_attribute(self, _): """Expand the bike with more details using scraping""" self.assert_bind_client() - self._do_expand(self.bind_client.get_bike_details(self.id)) + + d = self.bind_client.get_bike_details(self.id) + # Upgrade the frame_type to the enum + _dict_modify(d, "frame_type", "frame_type", fcn=lambda x: FrameType.from_str(x)) + return d def components_on_date(self, on_date): """Get bike components installed on the specified date @@ -204,11 +235,6 @@ def components_on_date(self, on_date): if (c.added or date.min) <= on_date <= (c.removed or date.max) ] - def from_dict(self, d): - # Upgrade the frame_type to the enum - _dict_modify(d, "frame_type", "frame_type", fcn=lambda x: FrameType(x)) - return super().from_dict(d) - class Bike(_BikeData, _Bike) : __doc__ = _Bike.__doc__ + """ @@ -222,13 +248,6 @@ class ScrapedBike(ScrapedGear, _BikeData): The attributes are compatible with stravalib.models.Bike where they exist. """ - _expandable = {'frame_type', 'brand_name', 'model_name'} - - def from_dict(self, d): - # Upgrade the scraped frame_type string to the enum - _dict_modify(d, "frame_type", "frame_type", fcn=lambda x: FrameType.from_str(x)) - return super().from_dict(d) - class ScrapedActivityPhoto(BaseEntity): """Represents a photo scraped from Strava's activity details page @@ -264,7 +283,7 @@ def from_dict(self, d): return super().from_dict(d) -class ScrapedActivity(ExpandableEntity): +class ScrapedActivity(LoadableEntity): """ Represents an Activity (ride, run, etc.) 
that was scraped from the website
@@ -294,16 +313,13 @@ class ScrapedActivity(LoadableEntity):
     private = Attribute(bool)
     flagged = Attribute(bool)
 
-    manual = Attribute(bool)
-    photos = Attribute(EntityCollection(ScrapedActivityPhoto))
-    device_name = Attribute(str)
-
-    _expandable = {"photos", "manual", "device_name"}
+    manual = LazyLoaded(Attribute(bool), key="manual")
+    photos = LazyLoaded(EntityCollection(ScrapedActivityPhoto), key="photos")
+    device_name = LazyLoaded(Attribute(str), key="device_name")
 
-    def expand(self):
-        """Expand the activity with more details using scraping"""
+    def load_attribute(self, _):
         self.assert_bind_client()
-        self._do_expand(self.bind_client.get_extra_activity_details(self.id), overwrite=False)
+        return self.bind_client.get_extra_activity_details(self.id)
 
     @property
     def total_photo_count(self):
From 6b68efd0a97fc98bcb1658f4f450bd6d3573aca9 Mon Sep 17 00:00:00 2001
From: Carey Metcalfe
Date: Mon, 9 Nov 2020 14:48:58 -0500
Subject: [PATCH 18/23] Allow LazyLoaded Attributes to behave like properties

---
 stravaweblib/model.py | 28 ++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/stravaweblib/model.py b/stravaweblib/model.py
index 52dc4ac..1559b07 100644
--- a/stravaweblib/model.py
+++ b/stravaweblib/model.py
@@ -90,7 +90,7 @@ def __call__(cls, attr, *args, **kwargs):
 class LazyLoaded(metaclass=MetaLazy):
     """Class wrapper that handles lazy-loading an Attribute as it is requested"""
 
-    def __init__(self, attr, fcn=None, key=None):
+    def __init__(self, attr, *, fcn=None, key=None, property=False):
         """Set up the LazyLoaded wrapper
 
         Can expand attributes individually using a lambda function (fcn), or
         multiple attributes at a time via an `expand` function defined on the
         class that houses it (key).
 
+        If `property` is True, the attribute will be loaded each time it is
+        requested. This makes the attribute act more like a property.
+
         :param attr: The `Attribute` to wrap (ie. `Attribute(int)`)
         :param fcn: This function will be called the first time the attribute
                     is requested. The result will be set as the attribute value.
         :param key: The key of the attribute in the lazyload cache. The lazyload
@@ -108,11 +111,14 @@ class that houses it (key).
                     cache is stored on the parent class. When this attribute is
                     requested and the key is not in the cache, the `load_attribute`
                     function on the parent class is called and the result is
-                    added to the cache. At this point, the key is poped out of
-                    the cache and set as the attribute variable.
+                    added to the cache. At this point, the key is popped out of
+                    the cache and set as the attribute variable. If the key is
+                    not in the cache, `None` is set as the value of the attribute.
+        :param property: Don't store the result of the lazy load
         """
         if not (bool(fcn) ^ bool(key)):
             raise ValueError("One of fcn or key (not both) is required")
+        self._property = property
         self._fcn = fcn
         self._key = key
         # Mimic the child Attribute's properties
         super().__init__(
             type_=attr.type,
             resource_states=attr.resource_states,
             units=attr.units
         )
 
     def __get__(self, obj, clazz):
-        if obj is not None and obj not in self.data:
+        if obj is not None and (self._property or obj not in self.data):
             if self._fcn:
                 # Call the provided function to load the attribute
                 value = self._fcn(obj)
@@ -133,12 +139,22 @@ def __get__(self, obj, clazz):
 
                 # Use obj.load_attribute() to ensure the object is in the cache
                 if self._key not in obj._lazyload_cache:
-                    obj._lazyload_cache.update(obj.load_attribute(self._key))
-                value = obj._lazyload_cache.pop(self._key)
+                    obj._lazyload_cache.update(obj.load_attribute(self._key) or {})
+                value = obj._lazyload_cache.pop(self._key, None)
+
+            if self._property:
+                return value
 
             self.__set__(obj, value)
         return super().__get__(obj, clazz)
 
+    def __set__(self, obj, val):
+        if self._property:
+            raise AttributeError(
+                "Can't set {} property on {!r}".format(self.__class__.__name__, obj)
+            )
+        super().__set__(obj, val)
+
 
 class ScrapedGear(BaseEntity):
     """Represents gear scraped from Strava
From a22ea26e8f5b19c2838318f87fe67791513be40a Mon Sep 17 00:00:00 2001
From: Carey Metcalfe
Date: Thu, 19 Aug 2021 23:18:48 -0400
Subject: [PATCH 19/23] Add ScrapedAthlete

---
 stravaweblib/model.py     |  60 ++++++++++++++++++
 stravaweblib/webclient.py | 125 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 182 insertions(+), 3 deletions(-)

diff --git a/stravaweblib/model.py b/stravaweblib/model.py
index 1559b07..acef410 100644
--- a/stravaweblib/model.py
+++ b/stravaweblib/model.py
@@ -353,3 +353,63 @@ def from_dict(self, d):
         _dict_modify(d, "elevation_gain_raw", "elevation_gain")
 
         return super().from_dict(d)
+
+
+class ScrapedAthlete(LoadableEntity):
+    """
+    Represents Athlete data scraped from the website
+
+    The attributes are compatible with stravalib.model.Athlete where they exist
+    """
+    firstname = Attribute(str)
+    lastname = Attribute(str)
+    # Dynamically compute the display name in the same way Strava does
+    name = LazyLoaded(
+        Attribute(str),
+        fcn=lambda x: "{} {}".format(x.firstname or "", x.lastname or "").strip(),
+        property=True
+    )
+
+    profile = Attribute(str)
+    photos = EntityCollection(ScrapedActivityPhoto)
+    challenges = Attribute(list)
+
+    city = Attribute(str)
+    state = Attribute(str)
+    country = Attribute(str)
+    location = LocationAttribute()
+
+    bikes = LazyLoaded(EntityCollection(ScrapedBike), key="bikes")
+    shoes = LazyLoaded(EntityCollection(ScrapedShoe), key="shoes")
+
+    def load_attribute(self, key):
+        self.assert_bind_client()
+        if key == "bikes":
+            v = self.bind_client.get_all_bikes(self.id)
+        elif key == "shoes":
+            v = self.bind_client.get_all_shoes(self.id)
+        else:
+            return
+        return {key: v}
+
+    def from_dict(self, d):
+        # Merge geo subdict into the main dict
+        d.update(d.pop("geo", {}))
+
+        _dict_modify(d, "photo", "profile_medium")
+        _dict_modify(d, "photo_large", "profile")
+        _dict_modify(d, "first_name", "firstname")
+        _dict_modify(d, "last_name", "lastname")
+        _dict_modify(d, "gender", "sex")
+        _dict_modify(d, "lat_lng", "location")
+
+        # According to some code returned in the HTML, Strava computes the
+        # display name using "{firstname} {lastname}". Here we make an attempt
+        # to break the display name back up into its parts. This is only for
+        # compatibility with the stravalib API - you should always use obj.name
+        name = d.pop("name", None)
+        if name and "firstname" not in d and "lastname" not in d:
+            # total guess: assume more last names have spaces than first
+            d["firstname"], d["lastname"] = name.split(" ", 1)
+
+        return super().from_dict(d)
diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py
index 68d3dbe..5e38597 100644
--- a/stravaweblib/webclient.py
+++ b/stravaweblib/webclient.py
@@ -29,7 +29,9 @@
 }
 
 # Regexes for pulling information out of the activity details page
-PHOTOS_REGEX = re.compile(r"var photosJson\s*=\s*(\[.*\]);")
+PHOTOS_REGEX = re.compile(r"var\s+photosJson\s*=\s*(\[.*\]);")
+ATHLETE_REGEX = re.compile(r"var\s+currentAthlete\s*=\s*new\s+Strava.Models.CurrentAthlete\(({.*})\);")
+CHALLENGE_IDS_REGEX = re.compile(r"var\s+trophiesAnalyticsProperties\s*=\s*{.*challenge_id:\s*\[(\[[\d\s,]*\])\]")
 PAGE_VIEW_REGEX = re.compile(r"pageView\s*=\s*new\s+Strava.Labs.Activities.Pages.(\S+)PageView\([\"']?\d+[\"']?,\s*[\"']([^\"']+)")
 
 NON_NUMBERS = re.compile(r'[^\d\.]')
@@ -563,11 +565,16 @@ def get_route_data(self, route_id, fmt=DataFormat.GPX):
 
         return self._make_export_file(resp, route_id)
 
-    def get_all_bikes(self):
+    def get_all_bikes(self, athlete_id=None):
         """Scrape all bike information from Strava
 
         :yield: `ScrapedBike` objects
         """
+        # Return minimal information from the athlete page if this isn't the
+        # currently-logged in athlete.
+        if athlete_id is not None and int(athlete_id) != self.athlete_id:
+            yield from self.get_athlete(athlete_id).bikes
+            return
+
         __log__.debug("Getting all bike data")
         resp = self.request_get("athletes/{}/gear/bikes".format(self.athlete_id))
         if not resp.ok:
@@ -584,11 +591,16 @@ def get_all_bikes(self):
         except (TypeError, ValueError) as e:
             raise ScrapingError("Failed to parse bike data") from e
 
-    def get_all_shoes(self):
+    def get_all_shoes(self, athlete_id=None):
         """Scrape all shoe information from Strava
 
         :yield: `ScrapedShoe` objects
        """
+        # Return minimal information from the athlete page if this isn't the
+        # currently-logged in athlete.
+        if athlete_id is not None and int(athlete_id) != self.athlete_id:
+            yield from self.get_athlete(athlete_id).shoes
+            return
+
         __log__.debug("Getting all shoe data")
         resp = self.request_get("athletes/{}/gear/shoes".format(self.athlete_id))
         if not resp.ok:
@@ -608,6 +620,113 @@ def get_gear(self, gear_id):
         except StopIteration:
             raise KeyError("No gear with id '{}' found".format(gear_id))
 
+    def get_athlete(self, athlete_id=None):
+        """A scraping-based replacement for `stravalib.Client.get_athlete`"""
+        if athlete_id is None:
+            athlete_id = self.athlete_id
+
+        athlete_id = int(athlete_id)
+
+        __log__.debug("Getting athlete %s", athlete_id)
+        resp = self.request_get("athletes/{}".format(athlete_id))
+        if not resp.ok:
+            raise stravalib.exc.Fault("Failed to get athlete {}".format(athlete_id))
+
+        ret = {}
+        soup = BeautifulSoup(resp.text, 'html5lib')
+
+        for script in soup.find_all("script"):
+            # This method only works on the currently-logged in athlete but
+            # returns much more data.
+ if athlete_id == self.athlete_id and "Strava.Models.CurrentAthlete" in script.text: + m = ATHLETE_REGEX.search(script.text) + if not m: + __log__.error("Failed to extract detailed athlete data") + continue + try: + ret.update(json.loads(m.group(1))) + except (TypeError, ValueError) as e: + __log__.error("Failed to parse extracted athlete data", exc_info=True) + continue + + elif "var trophiesAnalyticsProperties" in script.text: + m = CHALLENGE_IDS_REGEX.search(script.text) + if not m: + __log__.error("Failed to extract completed challenges") + continue + try: + ret["challenges"] = json.loads(m.group(1)) + except (TypeError, ValueError) as e: + __log__.error("Failed to parse extracted challenge data", exc_info=True) + continue + + elif "var photosJson" in script.text: + # Exact same as activity pages + m = PHOTOS_REGEX.search(script.text) + if not m: + __log__.error("Failed to extract photo data from page") + break + try: + photos = json.loads(m.group(1)) + except (TypeError, ValueError) as e: + __log__.error("Failed to parse extracted photo data", exc_info=True) + break + ret["photos"] = [ScrapedActivityPhoto(**p) for p in photos] + + # Failed the detailed scrape or not getting the currently-logged in athlete + # (this method works for all athletes) + if "id" not in ret: + ret["id"] = athlete_id + # There are multiple headings depending on the level of access + for heading in soup.find_all("div", class_="profile-heading"): + name = heading.find("h1", class_="athlete-name") + if name: + ret["name"] = name.text.strip() + + location = heading.find("div", class_="location") + if location: + ret["city"], ret["state"], ret["country"] = [x.strip() for x in location.text.split(",")] + + profile = heading.find("img", class_="avatar-img") + if profile: + ret["profile"] = profile["src"] + + # Scrape basic gear info from the sidebar if not getting the logged + # in athlete. + # By providing minimal data for non-logged-in athletes, no more data + # will be lazy-loaded by the bikes and shoes attributes. This is what + # we want since the lazy-load would just call this function again. + # However, when getting the logged in athlete's gear, we don't want to + # set anything since the lazy-load will use the more detailed + # get_all_bikes/gear functions instead of this one. 
+        if athlete_id != self.athlete_id:
+            ret["bikes"] = []
+            ret["shoes"] = []
+            for gear in soup.select("div.section.stats.gear"):
+                if "bikes" in gear["class"]:
+                    type_ = "bikes"
+                    cls = ScrapedBike
+                elif "shoes" in gear["class"]:
+                    type_ = "shoes"
+                    cls = ScrapedShoe
+                else:
+                    continue
+
+                for row in gear.find("table").find_all("tr"):
+                    name, dist = row.find_all("td")
+                    link = name.find("a")
+                    gear_id = None
+                    if link and type_ == "bikes":
+                        gear_id = "b{}".format(link["href"].rsplit("/", 1)[-1])
+
+                    ret[type_].append(cls(
+                        id=gear_id,
+                        name=name.text.strip(),
+                        distance=int(float(NON_NUMBERS.sub('', dist.text.strip())) * 1000),
+                    ))
+
+        return ScrapedAthlete(bind_client=self, **ret)
+
 
 class WebClient(stravalib.Client):
     """
From e5c8cd67d4efb3401d74334cc8fae3a33db57af6 Mon Sep 17 00:00:00 2001
From: Carey Metcalfe
Date: Mon, 10 Jan 2022 13:46:17 -0500
Subject: [PATCH 20/23] WIP

- Tweak LazyLoaded
- Add scraping for challenges
- Tweak gear access

---
 stravaweblib/model.py     | 187 +++++++++++++++++++++++++++-----------
 stravaweblib/webclient.py | 115 ++++++++++++++++++++++-
 2 files changed, 244 insertions(+), 58 deletions(-)

diff --git a/stravaweblib/model.py b/stravaweblib/model.py
index acef410..551c3e5 100644
--- a/stravaweblib/model.py
+++ b/stravaweblib/model.py
@@ -5,8 +5,9 @@
 from stravalib.attributes import (Attribute, DateAttribute, TimestampAttribute,
                                   TimeIntervalAttribute, LocationAttribute)
-from stravalib.model import (BaseEntity, BoundEntity, LoadableEntity,
-                             EntityCollection, Bike as _Bike)
+from stravalib.model import (BaseEntity, BoundEntity, LoadableEntity as _LoadableEntity,
+                             IdentifiableEntity, EntityCollection, EntityAttribute,
+                             Athlete as _Athlete, Bike as _Bike)
 from stravalib import unithelper as uh
@@ -111,10 +112,18 @@ class that houses it (key).
                     cache is stored on the parent class. When this attribute is
                     requested and the key is not in the cache, the `load_attribute`
                     function on the parent class is called and the result is
-                    added to the cache. At this point, the key is popped out of
-                    the cache and set as the attribute variable. If the key is
-                    not in the cache, `None` is set as the value of the attribute.
+                    added to the cache. Any future accesses will return the value
+                    from the cache. If the key is not in the cache, `None` is
+                    returned.
         :param property: Don't store the result of the lazy load
+
+        Special cases:
+        - If a lazy-loaded attribute is None, lazy-loading will be attempted
+          each time it is accessed. This allows for null values to be updated
+          with new data.
+        - If the load_attribute function returns None for a property, it will
+          not be attempted again.
+
         """
         if not (bool(fcn) ^ bool(key)):
             raise ValueError("One of fcn or key (not both) is required")
@@ -129,24 +138,27 @@ class that houses it (key).
         )
 
     def __get__(self, obj, clazz):
-        if obj is not None and (self._property or obj not in self.data):
-            if self._fcn:
-                # Call the provided function to load the attribute
-                value = self._fcn(obj)
-            elif self._key:
-                if not hasattr(obj, "_lazyload_cache"):
-                    obj._lazyload_cache = {}
+        if obj is None or not (self._property or self.data.get(obj) is None):
+            return super().__get__(obj, clazz)
+
+        if self._fcn:
+            # Call the provided function to load the attribute
+            value = self._fcn(obj)
+            if value is not None and not self._property:
+                self.__set__(obj, value)
+            return value
+        elif self._key:
+            if not hasattr(obj, "_lazyload_cache"):
+                obj._lazyload_cache = {}
 
-                # Use obj.load_attribute() to ensure the object is in the cache
-                if self._key not in obj._lazyload_cache:
-                    obj._lazyload_cache.update(obj.load_attribute(self._key) or {})
-                value = obj._lazyload_cache.pop(self._key, None)
+            # Use obj.load_attribute() to ensure the object is in the cache
+            if self._key not in obj._lazyload_cache:
+                obj._lazyload_cache.update(obj.load_attribute(self._key) or {})
 
-            if self._property:
-                return value
+            # Don't set it on the object, keep accessing out of the cache
+            return obj._lazyload_cache.get(self._key, None)
 
-            self.__set__(obj, value)
-        return super().__get__(obj, clazz)
+        raise AssertionError("No fcn or key?")
 
     def __set__(self, obj, val):
         if self._property:
@@ -156,6 +168,13 @@ def __set__(self, obj, val):
         super().__set__(obj, val)
 
 
+# TODO: probably delete this
+class LoadableEntity(_LoadableEntity):
+
+    def load_attribute(self, key):
+        return {}
+
+
 class ScrapedGear(BaseEntity):
     """Represents gear scraped from Strava
 
@@ -219,20 +238,16 @@ def __repr__(self):
         )
 
 
-class _BikeData(LoadableEntity):
+class _ScrapedBikeData(LoadableEntity):
     """Mixin class to add weight and components to a Bike"""
-    frame_type = LazyLoaded(Attribute(FrameType), key="frame_type")
+
     components = LazyLoaded(EntityCollection(ScrapedBikeComponent), key="components")
     weight = LazyLoaded(Attribute(float, units=uh.kg), key="weight")
 
-    def load_attribute(self, _):
+    def load_attribute(self, key):
         """Expand the bike with more details using scraping"""
         self.assert_bind_client()
-
-        d = self.bind_client.get_bike_details(self.id)
-        # Upgrade the frame_type to the enum
-        _dict_modify(d, "frame_type", "frame_type", fcn=lambda x: FrameType.from_str(x))
-        return d
+        return self.bind_client.get_bike_details(self.id)
 
     def components_on_date(self, on_date):
         """Get bike components installed on the specified date
@@ -252,17 +267,28 @@ def components_on_date(self, on_date):
         ]
 
 
-class Bike(_BikeData, _Bike) :
+class Bike(_ScrapedBikeData, _Bike):
     __doc__ = _Bike.__doc__ + """
     Scraping adds weight and components attributes
     """
+    def from_object(self, b):
+        self.from_dict(b.to_dict())
+        return self
 
-class ScrapedBike(ScrapedGear, _BikeData):
+
+class ScrapedBike(ScrapedGear, _ScrapedBikeData):
     """Represents a bike scraped from Strava
 
     The attributes are compatible with stravalib.models.Bike where they exist.
     """
+    # NOTE: These are here to take advantage of the load_attribute function
+    #       of the _ScrapedBikeData class in case the ScrapedBike was
+    #       constructed from a regular bike without the attributes set.
+ frame_type = LazyLoaded(Attribute(FrameType), key="frame_type") + brand_name = LazyLoaded(Attribute(str), key="brand_name") + model_name = LazyLoaded(Attribute(str), key="model_name") + description = LazyLoaded(Attribute(str), key="description") class ScrapedActivityPhoto(BaseEntity): @@ -333,7 +359,10 @@ class ScrapedActivity(LoadableEntity): photos = LazyLoaded(EntityCollection(ScrapedActivityPhoto), key="photos") device_name = LazyLoaded(Attribute(str), key="device_name") - def load_attribute(self, _): + def load_attribute(self, key): + if key not in {"manual", "photos", "device_name"}: + return super().load_attribute(key) + self.assert_bind_client() return self.bind_client.get_extra_activity_details(self.id) @@ -355,14 +384,44 @@ def from_dict(self, d): return super().from_dict(d) -class ScrapedAthlete(LoadableEntity): - """ - Represents Athlete data scraped from the website +class ScrapedChallenge(IdentifiableEntity): + + url = Attribute(str) + name = Attribute(str) + subtitle = Attribute(str) + teaser = Attribute(str) + overview = Attribute(str) + badge_url = Attribute(str) + + start_date = TimestampAttribute() + end_date = TimestampAttribute() + + def trophy_url(self, percent_complete=100): + """Return a url for a trophy image for the percentage complete + + Note that not all challenges have images for all percentages. Using + 100 should always work. + """ + if not self.badge_url: + return + base, ext = self.badge_url.rsplit(".", 1) + return "{}-{}.{}".format(base, percent_complete, ext) + + def from_dict(self, d): + #_dict_modify(d, "title", "name") + _dict_modify(d, "description", "overview") + _dict_modify(d, "url", "badge_url") + _dict_modify(d, "share_url", "url") + return super().from_dict(d) + + +class _AthleteData(LoadableEntity): + """Mixin class to add photos, challenges, and a name to an Athlete""" + photos = LazyLoaded(EntityCollection(ScrapedActivityPhoto), key="photos") + challenges = LazyLoaded(Attribute(list), key="challenges") + bikes = LazyLoaded(EntityCollection(ScrapedBike), key="bikes") + shoes = LazyLoaded(EntityCollection(ScrapedShoe), key="shoes") - The attributes are compatible with stravalib.model.Athlete where they exist - """ - firstname = Attribute(str) - lastname = Attribute(str) # Dynamically compute the display name in the same way Strava does name = LazyLoaded( Attribute(str), @@ -370,27 +429,47 @@ class ScrapedAthlete(LoadableEntity): property=True ) - profile = Attribute(str) - photos = EntityCollection(ScrapedActivityPhoto) - challenges = Attribute(list) - - city = Attribute(str) - state = Attribute(str) - country = Attribute(str) - location = LocationAttribute() - - bikes = LazyLoaded(EntityCollection(ScrapedBike), key="bikes") - shoes = LazyLoaded(EntityCollection(ScrapedShoe), key="shoes") - def load_attribute(self, key): self.assert_bind_client() + + # TODO: bikes and shoes only returns scraping-based data if key == "bikes": - v = self.bind_client.get_all_bikes(self.id) + return {"bikes": self.bind_client.get_all_bikes(self.id)} elif key == "shoes": - v = self.bind_client.get_all_shoes(self.id) + return {"shoes": self.bind_client.get_all_shoes(self.id)} + elif key in {"photos", "challenges"}: + d = self.bind_client.get_athlete(self.id) + return { + "photos": d.photos, + "challenges": d.challenges, + } else: - return - return {key: v} + return super().load_attribute(key) + + +class Athlete(_AthleteData, _Athlete): + __doc__ = _Athlete.__doc__ + """ + Scraping adds photos, challenges, and name attributes + """ + def from_object(self, a): + 
self.from_dict(a.to_dict()) + return self + + +class ScrapedAthlete(_AthleteData): + """ + Represents Athlete data scraped from the website + + The attributes are compatible with stravalib.model.Athlete where they exist + """ + firstname = Attribute(str) + lastname = Attribute(str) + + profile = Attribute(str) + city = Attribute(str) + state = Attribute(str) + country = Attribute(str) + location = LocationAttribute() def from_dict(self, d): # Merge geo subdict into the main dict diff --git a/stravaweblib/webclient.py b/stravaweblib/webclient.py index 5e38597..4d89abc 100644 --- a/stravaweblib/webclient.py +++ b/stravaweblib/webclient.py @@ -4,6 +4,7 @@ from collections import namedtuple from datetime import datetime import functools +import html import json import logging import re @@ -16,7 +17,8 @@ from stravalib.model import Activity, Bike as _Bike from stravaweblib.model import (DataFormat, ScrapedShoe, Bike, ScrapedBike, ScrapedBikeComponent, ScrapedActivity, - ScrapedActivityPhoto, ScrapedAthlete) + ScrapedActivityPhoto, Athlete, ScrapedAthlete, + ScrapedChallenge, FrameType) __log__ = logging.getLogger(__name__) @@ -33,6 +35,8 @@ ATHLETE_REGEX = re.compile(r"var\s+currentAthlete\s*=\s*new\s+Strava.Models.CurrentAthlete\(({.*})\);") CHALLENGE_IDS_REGEX = re.compile(r"var\s+trophiesAnalyticsProperties\s*=\s*{.*challenge_id:\s*\[(\[[\d\s,]*\])\]") PAGE_VIEW_REGEX = re.compile(r"pageView\s*=\s*new\s+Strava.Labs.Activities.Pages.(\S+)PageView\([\"']?\d+[\"']?,\s*[\"']([^\"']+)") +CHALLENGE_REGEX = re.compile(r"var\s+challenge\s*=\s*new\s+Strava.Models.Challenge\(({.*})\);") +CHALLENGE_DATE_REGEX = re.compile(r"(\S{3} \d{2}, \d{4}) to (\S{3} \d{2}, \d{4})") NON_NUMBERS = re.compile(r'[^\d\.]') @@ -498,6 +502,8 @@ def get_bike_details(self, bike_id): # Strip non-number chars ("kg") # TODO: other units? 
v = float(NON_NUMBERS.sub('', v))
+            elif k == "frame_type":
+                v = FrameType.from_str(v)
             ret[k.lower()] = v
 
         # Get component data
@@ -610,6 +616,14 @@ def get_all_shoes(self, athlete_id=None):
         except (TypeError, ValueError) as e:
             raise ScrapingError("Failed to parse shoe data") from e
 
+    def get_all_gear(self):
+        """Scrape all gear information from Strava
+
+        :yield: `ScrapedBike` and `ScrapedShoe` objects
+        """
+        yield from self.get_all_bikes()
+        yield from self.get_all_shoes()
+
     def get_gear(self, gear_id):
         """A scraping-based replacement for `stravalib.Client.get_gear`"""
         try:
@@ -632,7 +646,10 @@ def get_athlete(self, athlete_id=None):
         if not resp.ok:
             raise stravalib.exc.Fault("Failed to get athlete {}".format(athlete_id))
 
-        ret = {}
+        ret = {
+            "photos": [],
+            "challenges": [],
+        }
         soup = BeautifulSoup(resp.text, 'html5lib')
 
         for script in soup.find_all("script"):
@@ -727,6 +744,71 @@ def get_athlete(self, athlete_id=None):
 
         return ScrapedAthlete(bind_client=self, **ret)
 
+    def get_challenge(self, challenge_id):
+        """Get data about a challenge"""
+        __log__.debug("Getting details for challenge %s", challenge_id)
+        resp = self.request_get("challenges/{}".format(challenge_id))
+        if not resp.ok:
+            raise stravalib.exc.Fault("Failed to get challenge {}".format(challenge_id))
+
+        data = {}
+        soup = BeautifulSoup(resp.text, 'html5lib')
+        react_data = soup.find("div", **{"data-react-class": "Show"})
+        if react_data:
+            # Extract data from the react version of the page
+            data_str = html.unescape(
+                react_data["data-react-props"]
+                .replace("&nbsp;", " ")
+                .replace("\n", "\\n")
+            )
+            try:
+                data = json.loads(data_str)
+            except (TypeError, ValueError) as e:
+                raise ScrapingError("Failed to parse extracted challenge data") from e
+
+            # Get the description
+            description_html = next(x for x in data["sections"] if x["title"] == "Overview")["content"][0]["text"].replace("&nbsp;", "")
+            data["description"] = BeautifulSoup(description_html, 'html5lib').text
+            data["name"] = data["header"]["name"]
+            data["subtitle"] = data["header"]["subtitle"]
+            data["teaser"] = data["summary"]["challenge"]["title"]
+            data["badge_url"] = data["header"]["challengeLogoUrl"]
+            data["share_url"] = "https://www.strava.com/challenges/{}".format(challenge_id)
+
+            m = CHALLENGE_DATE_REGEX.search(data["summary"]["calendar"]["title"])
+            if m:
+                try:
+                    data["start_date"], data["end_date"] = [
+                        datetime.strptime(x, "%b %d, %Y") for x in m.groups()
+                    ]
+                except ValueError:
+                    __log__.error("Failed to parse dates {}".format(m.groups()))
+        else:
+            # Look for the data in the older-style page
+            for script in soup.find_all("script"):
+                if "Strava.Models.Challenge" in script.text:
+                    break
+            else:
+                raise ScrapingError("Failed to scrape challenge data {}".format(challenge_id))
+
+            m = CHALLENGE_REGEX.search(script.text)
+            if not m:
+                raise ScrapingError("Failed to extract challenge data from page")
+
+            data_str = html.unescape(m.group(1))
+            try:
+                data = json.loads(data_str)
+            except (TypeError, ValueError) as e:
+                raise ScrapingError("Failed to parse extracted challenge data") from e
+
+            desc = soup.find("div", id="desc")
+            if desc:
+                data["description"] = desc.text
+
+        data["id"] = challenge_id
+
+        return ScrapedChallenge(**data)
+
 
 class WebClient(stravalib.Client):
     """
@@ -740,7 +822,7 @@ def __new__(cls, *_, **__):
         self = super().__new__(cls)
 
         # Prepend some docstrings with the parent classes one
-        for fcn in ("__init__", "get_gear"):
+        for fcn in ("__init__", "get_gear", "get_athlete"):
             getattr(cls, fcn).__doc__ = getattr(super(), fcn).__doc__ +
getattr(cls, fcn).__doc__
 
         # Delegate certain methods and properties to the scraper instance
@@ -775,15 +857,40 @@ def __init__(self, *args, **kwargs):
         if self._scraper.athlete_id != self.get_athlete().id:
             raise ValueError("API and web credentials are for different accounts")
 
+    def get_athlete(self, athlete_id=None):
+        """
+        Returned Athletes will have scraped attributes lazily added.
+        Also, when accessing the bikes attribute, more scraped data will be available
+        """
+        athlete = super().get_athlete(athlete_id)
+        # TODO: Should make the bind client this instance
+        #       That way scraping/API functions can be mixed
+        return Athlete(bind_client=self._scraper).from_object(athlete)
+
     def get_gear(self, gear_id):
         """
         Returned Bikes will have scraped attributes lazily added
         """
         gear = super().get_gear(gear_id)
         if isinstance(gear, _Bike):
-            return Bike(bind_client=self._scraper, **gear.to_dict())
+            # TODO: Should make the bind client this instance
+            #       That way scraping/API functions can be mixed
+            return Bike(bind_client=self._scraper).from_object(gear)
         return gear
 
+    def get_all_gear(self):
+        """Get all gear information from Strava
+
+        :yield: `stravalib.model.Bike` and `stravalib.model.Shoe` instances
+        """
+        athlete = self.get_athlete()
+        if athlete.bikes is None and athlete.shoes is None:
+            __log__.error("Failed to get gear data (missing profile:read_all scope?)")
+            return
+
+        for gear in athlete.bikes + athlete.shoes:
+            yield self.get_gear(gear.id)
+
     @staticmethod
     def _delegate(clazz, name):
         func = getattr(clazz, name)
From 856de245203b845f1a17a1b28472049ae5a48fd3 Mon Sep 17 00:00:00 2001
From: Carey Metcalfe
Date: Wed, 26 Jan 2022 16:01:03 -0500
Subject: [PATCH 21/23] Fix extracting data from script tags

---
BeautifulSoup v4.9.0 changed how `.text` works for `