Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Canalespy gsheet helper functions #961

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests-mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
python-version: ['3.8', '3.11']
limited-dependencies: ['','TRUE']

runs-on: macos-latest
runs-on: macos-12

steps:

Expand Down
152 changes: 152 additions & 0 deletions parsons/google/google_sheets.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@

import gspread
from google.oauth2.service_account import Credentials
from gspread.exceptions import APIError
from requests.exceptions import HTTPError, ReadTimeout
import time
import utilities

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -444,6 +448,154 @@ def format_cells(self, spreadsheet_id, range, cell_format, worksheet=0):
ws.format(range, cell_format)
logger.info("Formatted worksheet")

def attempt_gsheet_method(self, method, max=6, wait_time=15, **kwargs):
    """
    Call a GoogleSheets method or attribute, retrying through rate-limit errors.

    The Google Sheets API has notoriously strict rate limits (e.g. 60 calls per
    minute). This method retries the requested call, sleeping between attempts,
    to wait out rate limit errors instead of letting them derail a script.

    `Args:`
        method: str
            The name of the Parsons GoogleSheets method or attribute to be
            attempted. Dotted paths (e.g. ``"sheet_obj.title"``) are resolved
            attribute-by-attribute starting from ``self``.
        max: int
            How many retries to make before giving up - defaults to 6.
        wait_time: int
            Number of seconds to wait between attempts - defaults to 15.
        kwargs: dict
            Any arguments required by `method` - note that positional args will
            have to be named.
    `Returns:`
        Whatever `method` is supposed to return (the attribute's value when the
        resolved target is not callable).
    """

    def _ordinal(n):
        # Correct English ordinal suffix, including the 11th/12th/13th cases.
        if 10 <= n % 100 <= 13:
            return "th"
        return {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")

    # Resolve the (possibly dotted) method path from self on each attempt.
    nested_methods = method.split(".")

    for i in range(max + 1):
        try:
            # If the final target isn't callable, the API call happens while
            # resolving the attribute chain, not in the call below.
            final_method = self
            for m in nested_methods:
                final_method = getattr(final_method, m)

            # Using getattr allows the method/attribute to be user-provided
            if callable(final_method):
                return final_method(**kwargs)
            return final_method

        except (APIError, HTTPError, ReadTimeout, ConnectionError):
            logger.debug(f"trying to {method} for the {i}{_ordinal(i)} time")
            if i >= max:
                # Out of retries; surface the underlying API/network error.
                raise
            time.sleep(wait_time)

def combine_multiple_sheet_data(self, sheet_ids, worksheet_id=None):
    """
    Combines data from multiple Google Sheets into a Parsons Table.

    The spreadsheets will be treated as if they are concatenated, meaning
    columns would need to align positionally with matching data types. This
    method also adds spreadsheet_id and spreadsheet_title columns to the
    resulting table.

    `Args:`
        sheet_ids: str, list
            The IDs of the Google Spreadsheets with the data to be combined.
            Can be a single ID, a comma-separated string of IDs, or a list.
        worksheet_id: str (optional)
            The worksheet to read from each spreadsheet. If None, the first
            worksheet (ID = 0) is assumed.

    `Returns:` obj
        Parsons Table containing the concatenated data from all the sheets.
    """
    # Parse different possible sheet_ids types into a plain list of IDs.
    if isinstance(sheet_ids, list):
        sheet_id_list = sheet_ids
    elif isinstance(sheet_ids, str):
        # Handles both a single ID and a comma-separated string of IDs.
        sheet_id_list = [x.strip() for x in sheet_ids.split(",")]
    else:
        raise ValueError(f"{sheet_ids} is not a valid string or list of GSheet IDs")

    # Default to the first worksheet when none is specified.
    worksheet = 0 if worksheet_id is None else worksheet_id

    # Empty table to accumulate data from spreadsheets
    combined = Table()

    # Paths to temp files, to keep storage/memory in check for large lists
    temp_files = []

    logger.info(
        f"Found {len(sheet_id_list)} Spreadsheets. Looping to get data from each one."
    )

    for sheet_id in sheet_id_list:

        # Keep a lid on how many temp files result from materializing below
        if len(temp_files) > 1:
            utilities.files.close_temp_file(temp_files[0])
            temp_files.remove(temp_files[0])

        # Grab the sheet's data, retrying through rate-limit errors
        data = self.attempt_gsheet_method(
            "get_worksheet",
            max=10,
            wait_time=60,
            spreadsheet_id=sheet_id,
            worksheet=worksheet,
        )
        # Add the sheet ID as a column
        data.add_column("spreadsheet_id", sheet_id)

        # Retrieve sheet title (with retries to handle rate limits) and add it
        # as a column. The attribute must not use a double-underscore prefix:
        # name mangling would prevent attempt_gsheet_method from resolving it.
        self.sheet_obj = self.gspread_client.open_by_key(sheet_id)
        sheet_title = str(self.attempt_gsheet_method("sheet_obj.title"))
        del self.sheet_obj
        data.add_column("spreadsheet_title", sheet_title)

        # Accumulate and materialize
        combined.concat(data)
        temp_files.append(combined.materialize_to_file())

    if len(temp_files) > 1:
        utilities.files.close_temp_file(temp_files[0])
        temp_files.remove(temp_files[0])

    return combined

def read_sheet(self, spreadsheet_id, sheet_index=0):
# Deprecated method v0.14 of Parsons.

Expand Down
27 changes: 27 additions & 0 deletions test/test_google/test_google_sheets.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,3 +220,30 @@ def test_share_spreadsheet(self):
)
permissions = self.google_sheets.get_spreadsheet_permissions(self.spreadsheet_id)
self.assertIn("bob@bob.com", permissions["emailAddress"])

def test_combine_multiple_sheet_data_attempt_gsheet_method(self):
    # Build two small, identically-shaped tables and load each into its
    # own freshly created spreadsheet.
    table_one = Table(
        [
            {"first": "Bob", "last": "Smith"},
            {"first": "Sue", "last": "Doe"},
        ]
    )
    sheet_one = self.google_sheets.create_spreadsheet("parsons_test_01")
    self.google_sheets.overwrite_sheet(sheet_one, table_one)

    table_two = Table(
        [
            {"first": "Ted", "last": "Smith"},
            {"first": "Susan", "last": "Kerry"},
        ]
    )
    sheet_two = self.google_sheets.create_spreadsheet("parsons_test_02")
    self.google_sheets.overwrite_sheet(sheet_two, table_two)

    # Combining both sheets should yield one row per source row.
    combined = self.google_sheets.combine_multiple_sheet_data(
        [sheet_one, sheet_two]
    )

    self.assertEqual(
        combined.num_rows, table_one.num_rows + table_two.num_rows
    )