diff --git a/doc/source/io.rst b/doc/source/io.rst
index bf8776d4bc396..852e7e6392a09 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -3643,6 +3643,14 @@ data quickly, but it is not a direct replacement for a transactional database.
 
 You can access the management console to determine project id's by:
 
+As of 0.15.2, the gbq module has a function ``generate_bq_schema`` which
+will produce the dictionary representation of the BigQuery schema for a DataFrame.
+
+.. code-block:: python
+
+   df = pandas.DataFrame({'A': [1.0]})
+   gbq.generate_bq_schema(df, default_type='STRING')
+
 .. warning::
 
    To use this module, you will need a valid BigQuery account. See
diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
index eacccaa7cba92..b53d16ee7cec5 100644
--- a/doc/source/whatsnew/v0.15.2.txt
+++ b/doc/source/whatsnew/v0.15.2.txt
@@ -69,6 +69,7 @@ Enhancements
 - ``Timedelta`` arithmetic returns ``NotImplemented`` in unknown cases, allowing extensions by custom classes (:issue:`8813`).
 - ``Timedelta`` now supports arithemtic with ``numpy.ndarray`` objects of the appropriate dtype (numpy 1.8 or newer only) (:issue:`8884`).
 - Added ``Timedelta.to_timedelta64`` method to the public API (:issue:`8884`).
+- Added ``gbq.generate_bq_schema`` function to the gbq module (:issue:`8325`).
 
 .. _whatsnew_0152.performance:
 
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
index 20c1e9f591081..572a8be5c65e8 100644
--- a/pandas/io/gbq.py
+++ b/pandas/io/gbq.py
@@ -444,3 +444,31 @@ def to_gbq(dataframe, destination_table, project_id=None, chunksize=10000,
     dataset_id, table_id = destination_table.rsplit('.',1)
 
     connector.load_data(dataframe, dataset_id, table_id, chunksize, verbose)
+
+def generate_bq_schema(df, default_type='STRING'):
+    """ Given a passed DataFrame, generate the associated BigQuery schema.
+
+    Parameters
+    ----------
+    df : DataFrame
+    default_type : string
+        The default BigQuery type assigned to any column whose
+        dtype has no entry in the type mapping below.
+    """
+
+    type_mapping = {
+        'i': 'INTEGER',
+        'b': 'BOOLEAN',
+        'f': 'FLOAT',
+        'O': 'STRING',
+        'S': 'STRING',
+        'U': 'STRING',
+        'M': 'TIMESTAMP'
+    }
+
+    fields = []
+    for column_name, dtype in df.dtypes.iteritems():
+        fields.append({'name': column_name,
+                       'type': type_mapping.get(dtype.kind, default_type)})
+
+    return {'fields': fields}
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
index 0f595f75bc66f..2f79cc8ba1826 100644
--- a/pandas/io/tests/test_gbq.py
+++ b/pandas/io/tests/test_gbq.py
@@ -277,6 +277,17 @@ def test_google_upload_errors_should_raise_exception(self):
         with tm.assertRaises(gbq.UnknownGBQException):
             gbq.to_gbq(bad_df, 'pydata_pandas_bq_testing.new_test', project_id = PROJECT_ID)
 
+    def test_generate_bq_schema(self):
+
+        df = tm.makeMixedDataFrame()
+        schema = gbq.generate_bq_schema(df)
+
+        test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                  {'name': 'B', 'type': 'FLOAT'},
+                                  {'name': 'C', 'type': 'STRING'},
+                                  {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.assertEqual(schema, test_schema)
 
     @classmethod
     def tearDownClass(cls):
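
For reviewers, a minimal usage sketch of the new function once this patch is applied. The column names and expected output below are illustrative only (derived from the ``type_mapping`` in the patch); they are not part of the change itself.

.. code-block:: python

   # Assumes pandas 0.15.2 or later with this patch applied, where
   # ``generate_bq_schema`` lives in ``pandas.io.gbq``.
   import pandas as pd
   from pandas.io import gbq

   # Hypothetical column names; one column per dtype kind in the mapping.
   df = pd.DataFrame({'my_float': [1.0],
                      'my_int': [1],
                      'my_bool': [True],
                      'my_string': ['a'],
                      'my_time': [pd.Timestamp('2014-01-01')]})

   schema = gbq.generate_bq_schema(df, default_type='STRING')
   # ``schema`` is a dict of the form {'fields': [...]}, with one entry per
   # column, e.g. {'name': 'my_float', 'type': 'FLOAT'},
   # {'name': 'my_int', 'type': 'INTEGER'},
   # {'name': 'my_bool', 'type': 'BOOLEAN'},
   # {'name': 'my_string', 'type': 'STRING'},
   # {'name': 'my_time', 'type': 'TIMESTAMP'}.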