Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new datasets #434

Merged
merged 7 commits into from
Aug 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 1 addition & 14 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,6 @@ jobs:
SB_TEST_PGPORT: 5433
PYTEST_FLAGS: ${{ matrix.pytest_flags }}

# optional step for running bigquery tests ----
- name: Set up Cloud SDK
if: ${{(contains(github.ref, 'bigquery') || contains(github.ref, 'refs/tags')) && matrix.latest}}
uses: google-github-actions/setup-gcloud@v0
with:
project_id: siuba-tests
service_account_key: ${{ secrets.GCP_SA_KEY }}
export_default_credentials: true
- name: Test bigquery
if: ${{(contains(github.ref, 'bigquery') || contains(github.ref, 'refs/tags')) && matrix.latest}}
run: |
pip install git+https://github.com/googleapis/python-bigquery-sqlalchemy.git pandas-gbq==0.15.0

test-bigquery:
name: "Test BigQuery"
runs-on: ubuntu-latest
Expand All @@ -91,7 +78,7 @@ jobs:
python -m pip install -r requirements.txt
python -m pip install -r requirements-test.txt
python -m pip install pytest-parallel
python -m pip install sqlalchemy-bigquery==1.3.0 pandas-gbq==0.15.0
python -m pip install sqlalchemy-bigquery==1.4 pandas-gbq==0.17
python -m pip install .
- name: Set up Cloud SDK
uses: google-github-actions/setup-gcloud@v0
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
include siuba/data/*.csv
include siuba/data/*.csv.gz
include siuba/spec/series.yml
14 changes: 14 additions & 0 deletions siuba/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# siuba datasets

| name | R package |
| ---- | --------- |
| mtcars | [datasets] |
| penguins | [palmerpenguins] |
| penguins_raw | [palmerpenguins] |
| band_members | [dplyr] |
| band_instruments | [dplyr] |
| band_instruments2 | [dplyr] |

[datasets]: https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html
[dplyr]: https://dplyr.tidyverse.org/
[palmerpenguins]: https://github.com/allisonhorst/palmerpenguins/
68 changes: 48 additions & 20 deletions siuba/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,57 @@
import pandas as pd
import pkg_resources
# Names of the datasets exposed by this module. The module-level __dir__ and
# __getattr__ hooks both consult this list, so a dataset must be listed here
# to be reachable as an attribute.
__all__ = [
"mtcars",
"cars",
"penguins",
"penguins_raw",
"cars_sql",
"band_members",
"band_instruments",
"band_instruments2",
]

# mtcars ----------------------------------------------------------------------
_fname = pkg_resources.resource_filename("siuba.data", "mtcars.csv")
def __dir__():
    """Return the dataset names in ``__all__`` as this module's ``dir()`` listing."""
    return __all__

mtcars = pd.read_csv(_fname)
mtcars.__doc__ = """
mtcars data.

Source: Henderson and Velleman (1981), Building multiple regression models interactively. Biometrics, 37, 391–411.
def _load_data_csv_gz(name):
import pandas as pd
import pkg_resources

--- Original DataFrame docs below ---
""" + mtcars.__doc__
fname = pkg_resources.resource_filename("siuba.data", f"{name}.csv.gz")
return pd.read_csv(fname)


# cars ------------------------------------------------------------------------
cars = mtcars[["cyl", "mpg", "hp"]]
def _load_data_csv(name):
    """Read the uncompressed ``<name>.csv`` resource of ``siuba.data``.

    Args:
        name: Dataset name; ``.csv`` is appended to form the file name.

    Returns:
        The result of ``pandas.read_csv`` on the resolved file path.
    """
    # Imports stay inside the function so they run only when a dataset is
    # actually requested.
    from pandas import read_csv
    from pkg_resources import resource_filename

    return read_csv(resource_filename("siuba.data", f"{name}.csv"))

# cars_sql --------------------------------------------------------------------
import siuba.sql.utils as _sql_utils
from siuba.sql import LazyTbl as _LazyTbl
cars_sql = _LazyTbl(
_sql_utils.mock_sqlalchemy_engine("postgresql"),
"cars",
["cyl", "mpg", "hp"]
)

def _load_data_cars_sql():
    """Build the ``cars_sql`` example table.

    Returns:
        A ``LazyTbl`` for a table named ``"cars"`` with columns ``cyl``,
        ``mpg`` and ``hp``, constructed on the engine produced by
        ``siuba.sql.utils.mock_sqlalchemy_engine("postgresql")``.
    """
    import siuba.sql.utils as _sql_utils
    from siuba.sql import LazyTbl as _LazyTbl

    # Return the table: previously it was only bound to a local name, so the
    # function implicitly returned None and `siuba.data.cars_sql` was None.
    return _LazyTbl(
        _sql_utils.mock_sqlalchemy_engine("postgresql"),
        "cars",
        ["cyl", "mpg", "hp"],
    )


def __getattr__(name):
    """Load a dataset on attribute access to this module.

    Args:
        name: Attribute name being looked up.

    Returns:
        Whatever the matching loader returns: the ``cyl``/``mpg``/``hp``
        columns of mtcars for ``"cars"``, the result of
        ``_load_data_cars_sql()`` for ``"cars_sql"``, a plain-CSV load for
        the band datasets, and a gzipped-CSV load for every other name.

    Raises:
        AttributeError: If ``name`` is not listed in ``__all__``.
    """
    if name not in __all__:
        raise AttributeError(f"No dataset named: {name}")

    if name == "cars":
        # `cars` has no file of its own; it is a column subset of mtcars.
        full_table = _load_data_csv_gz("mtcars")
        return full_table[["cyl", "mpg", "hp"]]

    if name == "cars_sql":
        return _load_data_cars_sql()

    # The band datasets are stored uncompressed; everything else is .csv.gz.
    uncompressed = ("band_members", "band_instruments", "band_instruments2")
    loader = _load_data_csv if name in uncompressed else _load_data_csv_gz
    return loader(name)

# cars_sql --------------------------------------------------------------------
4 changes: 4 additions & 0 deletions siuba/data/band_instruments.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name,plays
John,guitar
Paul,bass
Keith,guitar
4 changes: 4 additions & 0 deletions siuba/data/band_instruments2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
artist,plays
John,guitar
Paul,bass
Keith,guitar
4 changes: 4 additions & 0 deletions siuba/data/band_members.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name,band
Mick,Stones
John,Beatles
Paul,Beatles
33 changes: 0 additions & 33 deletions siuba/data/mtcars.csv

This file was deleted.

Binary file added siuba/data/mtcars.csv.gz
Binary file not shown.
Binary file added siuba/data/penguins.csv.gz
Binary file not shown.
Binary file added siuba/data/penguins_raw.csv.gz
Binary file not shown.
10 changes: 10 additions & 0 deletions siuba/tests/test_data_imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import importlib

def test_data_imports():
    """Access every name in ``siuba.data.__all__`` to check that it loads."""
    import siuba.data

    # `import *` is not allowed inside a function, so walk the exported names
    # and fetch each dataset through attribute access instead.
    exported = siuba.data.__all__
    for dataset_name in exported:
        getattr(siuba.data, dataset_name)