diff --git a/docs/io.rst b/docs/io.rst index c5ffcfe4..ffac2485 100644 --- a/docs/io.rst +++ b/docs/io.rst @@ -390,6 +390,29 @@ Avro files (fastavro) :start-after: begin_complex_schema :end-before: end_complex_schema +.. module:: petl.io.gsheet +.. _io_gsheet: + +Google Sheets (gspread) +^^^^^^^^^^^^^^^^^^^^^^^ + +.. warning:: + + This is an experimental feature. API and behavior may change between releases + with some possible breaking changes. + +.. note:: + + The following functions require `gspread + `_ to be installed, + e.g.:: + + $ pip install gspread + +.. autofunction:: petl.io.gsheet.fromgsheet +.. autofunction:: petl.io.gsheet.togsheet +.. autofunction:: petl.io.gsheet.appendgsheet + .. module:: petl.io.db .. _io_db: @@ -399,7 +422,7 @@ Databases .. note:: For reading and writing to databases, the following functions require - `SQLAlchemy ` its the database specific driver + `SQLAlchemy ` and the database specific driver to be installed along petl, e.g.:: $ pip install sqlalchemy @@ -433,10 +456,14 @@ in the source path of the file. $ pip install fsspec -The supported filesystems with their URI formats can be found in: +The supported filesystems with their URI formats can be found in the fsspec +documentation: + +- `Built-in Implementations `__ +- `Other Known Implementations `__ -- fsspec `Built-in Implementations ` -- fsspec `Other Known Implementations ` +Remote sources +^^^^^^^^^^^^^^ .. autoclass:: petl.io.remotes.RemoteSource .. 
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division

from petl.util.base import Table, iterdata
from petl.compat import text_type
from petl.errors import ArgumentError as PetlArgError


def _get_gspread_client(auth_info):
    """Return a ``gspread`` client built from `auth_info`.

    `auth_info` may be:

    - a ``gspread.Client`` instance, returned unchanged;
    - a ``dict``, passed to ``gspread.service_account_from_dict``;
    - ``None``, in which case ``gspread.service_account()`` is called;
    - a ``google.oauth2.service_account.Credentials`` instance, passed to
      ``gspread.authorize``.

    Raises :class:`petl.errors.ArgumentError` for any other value.
    """
    import gspread

    if isinstance(auth_info, gspread.Client):
        return auth_info
    if isinstance(auth_info, dict):
        return gspread.service_account_from_dict(auth_info)
    if auth_info is None:
        return gspread.service_account()

    # Import the class explicitly: a bare ``import google`` does not load the
    # ``google.oauth2.service_account`` submodule, so attribute access on the
    # package only works if something else happened to import it first.
    from google.oauth2.service_account import Credentials

    if isinstance(auth_info, Credentials):
        return gspread.authorize(auth_info)
    raise PetlArgError("gspread: Invalid account credentials")


def _open_spreadsheet(gspread_client, spreadsheet, open_by_key=False):
    """Open `spreadsheet` with `gspread_client` and return the workbook.

    When `open_by_key` is true the value is first tried as a spreadsheet key
    and, if gspread reports ``SpreadsheetNotFound``, as a spreadsheet title.
    Otherwise it is opened by title only.

    Raises :class:`petl.errors.ArgumentError` when `spreadsheet` is ``None``.
    """
    # validate up front so that None is never sent to the API as a key
    if spreadsheet is None:
        raise PetlArgError("gspread requires argument spreadsheet")
    if open_by_key:
        from gspread.exceptions import SpreadsheetNotFound
        try:
            return gspread_client.open_by_key(spreadsheet)
        except SpreadsheetNotFound:
            # not a known key: fall back to treating the value as a title
            return gspread_client.open(spreadsheet)
    return gspread_client.open(spreadsheet)


def _select_worksheet(wb, worksheet, find_or_create=False):
    """Return the worksheet of workbook `wb` designated by `worksheet`.

    `worksheet` may be ``None`` (first sheet), an ``int`` (sheet index) or a
    text string (sheet title). With `find_or_create` set, a missing title is
    created as a new 1x1 worksheet instead of being looked up.

    Raises :class:`petl.errors.ArgumentError` for any other `worksheet` type.
    """
    if worksheet is None:
        return wb.sheet1
    if isinstance(worksheet, int):
        return wb.get_worksheet(worksheet)
    if isinstance(worksheet, text_type):
        # use text_type for cross version compatibility
        sheetname = text_type(worksheet)
        if find_or_create:
            existing_titles = [wbs.title for wbs in wb.worksheets()]
            if sheetname not in existing_titles:
                return wb.add_worksheet(sheetname, 1, 1)
        return wb.worksheet(sheetname)
    raise PetlArgError("Only can find worksheet by name or by number")


def fromgsheet(
    credentials_or_client, spreadsheet, worksheet=None, cell_range=None,
    open_by_key=False
):
    """
    Extract a table from a google spreadsheet.

    The `credentials_or_client` are used to authenticate with the google apis.
    For more info, check `authentication`_.

    The `spreadsheet` can either be the key of the spreadsheet or its name.

    The `worksheet` argument can be omitted, in which case the first
    sheet in the workbook is used by default.

    The `cell_range` argument can be used to provide a range string
    specifying the top left and bottom right corners of a set of cells to
    extract. (i.e. 'A1:C7').

    Set `open_by_key` to `True` in order to treat `spreadsheet` as spreadsheet
    key.

    .. note::
        - Only the top level of google drive will be searched for the
          spreadsheet filename due to API limitations.
        - The worksheet name is case sensitive.

    Example usage follows::

        >>> from petl import fromgsheet
        >>> import gspread # doctest: +SKIP
        >>> client = gspread.service_account() # doctest: +SKIP
        >>> tbl1 = fromgsheet(client, 'example_spreadsheet', 'Sheet1') # doctest: +SKIP
        >>> tbl2 = fromgsheet(client, '9zDNETemfau0uY8ZJF0YzXEPB_5GQ75JV', open_by_key=True) # doctest: +SKIP

    This functionality relies heavily on the work by @burnash and his great
    `gspread module`_.

    .. _gspread module: http://gspread.readthedocs.io/
    .. _authentication: http://gspread.readthedocs.io/en/latest/oauth2.html
    """

    return GoogleSheetView(
        credentials_or_client,
        spreadsheet,
        worksheet,
        cell_range,
        open_by_key,
    )


class GoogleSheetView(Table):
    """Connects to a worksheet and iterates over its rows.

    The connection is made on every iteration, not at construction time.
    """

    def __init__(
        self, credentials_or_client, spreadsheet, worksheet, cell_range,
        open_by_key
    ):
        self.auth_info = credentials_or_client
        self.spreadsheet = spreadsheet
        self.worksheet = worksheet
        self.cell_range = cell_range
        self.open_by_key = open_by_key

    def __iter__(self):
        gspread_client = _get_gspread_client(self.auth_info)
        wb = _open_spreadsheet(gspread_client, self.spreadsheet, self.open_by_key)
        ws = _select_worksheet(wb, self.worksheet)
        # grab the range or grab the whole sheet
        if self.cell_range is not None:
            return self._yield_by_range(ws)
        return self._yield_all_rows(ws)

    def _yield_all_rows(self, ws):
        """Yield every row of worksheet `ws` as a tuple."""
        for row in ws.get_all_values():
            yield tuple(row)

    def _yield_by_range(self, ws):
        """Yield the rows of `ws` inside ``self.cell_range`` as tuples."""
        for row in ws.get_values(self.cell_range):
            yield tuple(row)


def togsheet(
    table, credentials_or_client, spreadsheet, worksheet=None, cell_range=None,
    share_emails=None, role="reader"
):
    """
    Write a table to a new google sheet.

    The `credentials_or_client` are used to authenticate with the google apis.
    For more info, check `authentication`_.

    The `spreadsheet` will be the title of the workbook created in google
    sheets. If there is a spreadsheet with the same title a new one will be
    created.

    If `worksheet` is specified, the first worksheet in the spreadsheet
    will be renamed to its value.

    The `cell_range` argument, when given, is passed to gspread as the
    `table_range` of the append operation.

    The spreadsheet will be shared with all emails in `share_emails` with
    `role` permissions granted. For more info, check `sharing`_.

    Returns: the spreadsheet key that can be used in `appendgsheet` further.


    .. _sharing: https://developers.google.com/drive/v3/web/manage-sharing

    .. note::
        The `gspread`_ package doesn't support serialization of `date` and
        `datetime` types yet.

    Example usage::

        >>> from petl import fromcolumns, togsheet
        >>> import gspread # doctest: +SKIP
        >>> client = gspread.service_account() # doctest: +SKIP
        >>> cols = [[0, 1, 2], ['a', 'b', 'c']]
        >>> tbl = fromcolumns(cols)
        >>> togsheet(tbl, client, 'example_spreadsheet') # doctest: +SKIP
    """

    gspread_client = _get_gspread_client(credentials_or_client)
    wb = gspread_client.create(spreadsheet)
    ws = wb.sheet1
    ws.resize(rows=1, cols=1)  # make smallest table possible
    # rename sheet if set
    if worksheet is not None:
        ws.update_title(title=worksheet)
    # gspread sends the values as a JSON payload, so materialise the table
    # (which may be a lazy petl table yielding tuples) as a list of lists
    rows = [list(row) for row in table]
    ws.append_rows(rows, table_range=cell_range)
    # specify the user account to share to
    if share_emails is not None:
        for user_email in share_emails:
            wb.share(user_email, perm_type="user", role=role)
    return wb.id


def appendgsheet(
    table, credentials_or_client, spreadsheet, worksheet=None,
    open_by_key=False, include_header=False
):
    """
    Append a table to an existing google sheet at either a new worksheet
    or the end of an existing worksheet.

    The `credentials_or_client` are used to authenticate with the google apis.
    For more info, check `authentication`_.

    The `spreadsheet` is the name of the workbook to append to.

    The `worksheet` is the title of the worksheet to append to or create when
    it does not exist yet. When omitted, the first worksheet is used.

    Set `open_by_key` to `True` in order to treat `spreadsheet` as spreadsheet
    key.

    Set `include_header` to `True` if you want the fieldnames to be appended
    as the first row; by default only the data rows are appended.

    Returns: the spreadsheet key.
    """
    gspread_client = _get_gspread_client(credentials_or_client)
    # be able to give filename or key for file
    wb = _open_spreadsheet(gspread_client, spreadsheet, open_by_key)
    # check to see if worksheet exists, if so append, otherwise create
    ws = _select_worksheet(wb, worksheet, True)
    source = table if include_header else iterdata(table)
    # gspread sends the values as a JSON payload: always pass a list of lists
    rows = [list(row) for row in source]
    ws.append_rows(rows)
    return wb.id


Table.togsheet = togsheet
Table.appendgsheet = appendgsheet
def _get_env_credentials():
    """Return the service account properties as a dict, or None.

    The value found by ``_get_gspread_credentials`` is returned as is when it
    is already a dict; when it is a text path, the json file is loaded.
    """
    creds = _get_gspread_credentials()
    if isinstance(creds, dict):
        return creds
    if isinstance(creds, text_type):
        with open(creds, encoding="utf-8") as json_file:
            return json.load(json_file)
    return None


def _get_gspread_client():
    """Return a gspread client, skipping the test on an API error."""
    credentials = _get_env_credentials()
    try:
        if credentials is None:
            gspread_client = gspread.service_account()
        else:
            gspread_client = gspread.service_account_from_dict(credentials)
    except gspread.exceptions.APIError as ex:
        # pytest.skip raises, so nothing after it would ever run
        pytest.skip("SKIPPED. to/from gspread authentication error: %s" % ex)
    return gspread_client


def _get_env_sharing_emails():
    """Return the emails found in the PETL_GSHEET_EMAIL* env vars."""
    emails = get_env_vars_named("PETL_GSHEET_EMAIL", remove_prefix=False)
    if emails is not None:
        return list(emails.values())
    return []


def _get_gspread_test_params():
    """Return a unique spreadsheet name, a client and the sharing emails."""
    filename = "test-{}".format(str(uuid.uuid4()))
    gspread_client = _get_gspread_client()
    emails = _get_env_sharing_emails()
    return filename, gspread_client, emails


def _test_to_fromg_sheet(table, sheetname, cell_range, expected):
    """Write `table` to a new spreadsheet, read it back and compare."""
    filename, gspread_client, emails = _get_gspread_test_params()
    spread_id = togsheet(
        table, gspread_client, filename, worksheet=sheetname, share_emails=emails
    )
    try:
        result = fromgsheet(
            gspread_client, filename, worksheet=sheetname, cell_range=cell_range
        )
        # make sure the expected result matches the result
        ieq(expected, result)
    finally:
        # clean up the created spreadsheet
        gspread_client.del_spreadsheet(spread_id)


def _test_append_from_gsheet(table_list, expected, sheetname=None):
    """Create a spreadsheet from the first table, append the others, compare."""
    filename, gspread_client, emails = _get_gspread_test_params()
    first_table = table_list[0]
    other_tables = table_list[1:]
    # create the spreadsheet and the 1st sheet
    spread_id = togsheet(
        first_table, gspread_client, filename, worksheet=sheetname,
        share_emails=emails
    )
    try:
        # append every table after the first one
        for extra_table in other_tables:
            appendgsheet(
                extra_table, gspread_client, spread_id, worksheet=sheetname,
                open_by_key=True
            )
        # read the result appended to the sheet
        result = fromgsheet(
            gspread_client, spread_id, worksheet=sheetname, open_by_key=True
        )
        # make sure the expected result matches the result
        ieq(expected, result)
    finally:
        # clean up the created spreadsheet
        gspread_client.del_spreadsheet(spread_id)


def teardown_function():
    # try to avoid: User rate limit exceeded.
    time.sleep(3)


# endregion

# region test cases data

TEST_TABLE = [
    ["foo", "bar"],
    ["A", "1"],
    ["B", "2"],
    ["C", "3"],
    ["D", "random_stuff-in+_名字"],
    ["é", "3/4/2012"],
    ["F", "6"],
]

# endregion

# region test cases execution


@found_gcp_credentials
def test_tofromgsheet_01_basic():
    _test_to_fromg_sheet(TEST_TABLE[:], None, None, TEST_TABLE[:])


@found_gcp_credentials
def test_tofromgsheet_02_uneven_row():
    # the row at index 2 gets a third value; on read back the shorter rows
    # are expected to come padded with an empty string
    test_table_t1 = [
        x + ["3"] if i in [2] else x for i, x in enumerate(TEST_TABLE[:])
    ]
    test_table_f1 = [x + [""] if len(x) < 3 else x for x in test_table_t1[:]]
    _test_to_fromg_sheet(test_table_t1, None, None, test_table_f1)


@found_gcp_credentials
def test_tofromgsheet_03_empty_table():
    _test_to_fromg_sheet((), None, None, ())


@found_gcp_credentials
def test_tofromgsheet_04_cell_range():
    test_table_f2 = [[x[1]] for x in TEST_TABLE[0:4]]
    _test_to_fromg_sheet(TEST_TABLE[:], None, "B1:B4", test_table_f2)


@found_gcp_credentials
def test_tofromgsheet_05_sheet_title():
    _test_to_fromg_sheet(TEST_TABLE[:], "random_stuff-in+_名字", None, TEST_TABLE[:])


@found_gcp_credentials
@pytest.mark.xfail(
    raises=TypeError,
    reason="When this stop failing, uncomment datetime.date in TEST1 and TEST2"
    )
def test_tofromgsheet_06_datetime_date():
    test_table_dt = [
        [x[0], datetime.date(2012, 5, 6)] if i in [5] else x
        for i, x in enumerate(TEST_TABLE[:])
    ]
    _test_to_fromg_sheet(test_table_dt[:], None, "B1:B4", test_table_dt[:])


@found_gcp_credentials
def test_tofromgsheet_07_open_by_key():
    filename, gspread_client, emails = _get_gspread_test_params()
    table = TEST_TABLE[:]
    spread_id = togsheet(table, gspread_client, filename, share_emails=emails)
    try:
        # read back using the key returned by togsheet
        result = fromgsheet(gspread_client, spread_id, open_by_key=True)
        # make sure the expected result matches the result
        ieq(table, result)
    finally:
        # clean up the created spreadsheet
        gspread_client.del_spreadsheet(spread_id)


@found_gcp_credentials
def test_tofromgsheet_08_recreate():
    filename, gspread_client, emails = _get_gspread_test_params()
    table1 = TEST_TABLE[:]
    table2 = [[x[0], text_type(i)] if i > 0 else x for i, x in enumerate(table1)]
    spread_id = togsheet(table1, gspread_client, filename, share_emails=emails)
    try:
        result1 = fromgsheet(gspread_client, spread_id, open_by_key=True)
        ieq(table1, result1)
        # calling togsheet again with the same title gives a second key
        spread_id2 = togsheet(table2, gspread_client, filename, share_emails=emails)
        try:
            result2 = fromgsheet(gspread_client, spread_id2, open_by_key=True)
            ieq(table2, result2)
        finally:
            gspread_client.del_spreadsheet(spread_id2)
    finally:
        # clean up the created spreadsheet
        gspread_client.del_spreadsheet(spread_id)


def _get_testcase_for_append():
    """Return two copies of TEST_TABLE and the rows expected after append."""
    table_list = [TEST_TABLE[:], TEST_TABLE[:]]
    # the header of the appended table is not written again
    expected = TEST_TABLE[:] + TEST_TABLE[1:]
    return table_list, expected


@found_gcp_credentials
def test_appendgsheet_10_double():
    table_list, expected = _get_testcase_for_append()
    _test_append_from_gsheet(table_list, expected)


@found_gcp_credentials
def test_appendgsheet_11_named_sheet():
    table_list, expected = _get_testcase_for_append()
    _test_append_from_gsheet(table_list, expected, sheetname="petl_append")


@found_gcp_credentials
def test_appendgsheet_12_other_sheet():
    filename, gspread_client, emails = _get_gspread_test_params()
    table = TEST_TABLE[:]
    # appendgsheet leaves the header out by default
    table2 = TEST_TABLE[1:]
    spread_id = togsheet(table, gspread_client, filename, share_emails=emails)
    try:
        appendgsheet(table, gspread_client, filename, worksheet="petl")
        # get the results from the 2 sheets
        result1 = fromgsheet(gspread_client, filename, worksheet=None)
        ieq(table, result1)
        result2 = fromgsheet(gspread_client, filename, worksheet="petl")
        ieq(table2, result2)
    finally:
        gspread_client.del_spreadsheet(spread_id)


# endregion