Skip to content

Commit

Permalink
Script to check restricted datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
rebkwok committed May 23, 2024
1 parent 7985c0a commit 0d419d2
Show file tree
Hide file tree
Showing 5 changed files with 192 additions and 33 deletions.
17 changes: 17 additions & 0 deletions README.check_restricted_datasets.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Check Restricted datasets

Generate a report on the repos that use restricted datasets.


## Setup

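Follow the instructions in [README.repoupdater.md](README.repoupdater.md) to
clone all research repos into research/, or to pull the latest changes if they
are already cloned.

The script also looks up the project for each matching repo at
`http://localhost:8000/api/v2/repo/<repo-name>`, so a service exposing that
endpoint must be running locally before you start.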

## Usage

```
python check_restricted_datasets.py
```

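The script prints a report for each restricted dataset and writes a Markdown
version to `restricted_dataset_report.md` in the current working directory.
In the Markdown report, repos that use a dataset without being on its allow
list are shown in **bold**.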
77 changes: 77 additions & 0 deletions check_restricted_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from textops import find, cat, grepci
import os
from pathlib import Path
import glob
import requests

from opensafely.check import RESTRICTED_DATASETS, PERMISSIONS_URL, get_datasource_permissions

from repoupdater import BASE_PATH


def check():
    """Report which research repos use each restricted dataset.

    For every entry in ``RESTRICTED_DATASETS`` this:

    1. collects the repos whose permissions entry lists the dataset under
       ``"allow"`` (with the ``opensafely/`` prefix removed);
    2. searches the ``.py``, ``.sql`` and ``.ipynb`` files of every directory
       directly under ``BASE_PATH`` for the dataset's cohort-extractor
       function names and ehrQL table names;
    3. looks up the project for each matching repo via the API on
       ``localhost:8000``;
    4. prints the allowed and using repos for the dataset.

    Finally a Markdown version of the report is written to
    ``restricted_dataset_report.md`` in the current working directory. Repos
    that use a dataset without being on its allow list are shown in bold.

    Raises:
        requests.RequestException: if the project lookup fails or times out.
        KeyError: if the lookup response has no ``"name"`` or ``"url"`` key.
    """
    permissions = get_datasource_permissions(PERMISSIONS_URL)
    # The set of checked-out repos does not depend on the dataset, so list
    # the directory once rather than once per dataset.
    repo_dirs = sorted(glob.glob(os.path.join(BASE_PATH, "*")))

    restricted = {}
    for dataset in RESTRICTED_DATASETS:
        allowed = sorted(
            repo.replace("opensafely/", "")
            for repo, repo_permissions in permissions.items()
            if dataset.name in repo_permissions["allow"]
        )
        using = []
        restricted[dataset.name] = {"allowed": allowed, "using": using}

        # Raw f-strings: "\." in a non-raw literal is an invalid escape
        # sequence (a SyntaxWarning on recent Python versions).
        function_names = [
            rf"\.{fname}" for fname in dataset.cohort_extractor_function_names
        ]
        # In python files, check for `tablename.` usage; elsewhere the bare
        # table name is enough.
        python_table_names = [
            rf"{fname}\." for fname in dataset.ehrql_table_names
        ]
        other_table_names = list(dataset.ehrql_table_names)
        # The patterns only depend on the dataset, so build them once here
        # instead of once per repo. Dict order fixes the search order.
        patterns = {
            "py": "|".join(function_names + python_table_names),
            "sql": "|".join(function_names + other_table_names),
            "ipynb": "|".join(function_names + other_table_names),
        }

        for filep in repo_dirs:
            path = Path(filep)
            for ext, pattern in patterns.items():
                if not pattern:
                    # An empty regex matches every line and would flag every
                    # repo as using the dataset.
                    continue
                # textops pipeline: find matching files under the repo, read
                # them, and count the lines matching the pattern
                # (case-insensitively).
                found = filep | find(f'*.{ext}') | cat() | grepci(pattern)
                if found > 0:
                    # A timeout stops a stalled local API from hanging the
                    # whole report.
                    project = requests.get(
                        f"http://localhost:8000/api/v2/repo/{path.name}",
                        timeout=30,
                    ).json()
                    using.append(
                        {
                            "name": path.name,
                            "url": f"https://github.com/opensafely/{path.name}",
                            "project_name": project["name"],
                            "project_url": project["url"],
                        }
                    )
                    # One hit is enough to count the repo as using the
                    # dataset; skip the remaining extensions.
                    break

        print(f"\n============={dataset.name}============")
        print("\nALLOWED\n-------")
        print("\n".join(allowed))
        print("\nUSING\n-----")
        print(
            "\n".join(
                f"{repo['name']} - {repo['project_name']}" for repo in using
            )
        )

    lines = ["# Restricted Dataset Use"]
    for dataset_name, data in restricted.items():
        allowed_repos = set(data["allowed"])
        lines.append(f"## {dataset_name}")
        lines.append("### Allowed repos")
        lines.extend(
            f" - [{repo}](https://github.com/opensafely/{repo})"
            for repo in data["allowed"]
        )
        lines.append("### Repos using dataset")
        for repo_dict in data["using"]:
            # Bold marks repos that use the dataset without permission.
            if repo_dict["name"] not in allowed_repos:
                repo_name = f"**{repo_dict['name']}**"
            else:
                repo_name = repo_dict["name"]
            lines.append(
                f" - [{repo_name}]({repo_dict['url']}) "
                f"(project: [{repo_dict['project_name']}]({repo_dict['project_url']}))"
            )

    outpath = Path("restricted_dataset_report.md")
    # Explicit encoding so non-ASCII project names do not depend on the
    # platform default.
    outpath.write_text("\n".join(lines), encoding="utf-8")


if __name__ == "__main__":
    # Generate the report only when this file is run as a script.
    check()
3 changes: 2 additions & 1 deletion repoupdater.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ def update():

if os.path.exists(path):
os.chdir(path)
subprocess.run(["git", "checkout", "master"], check=True)
subprocess.run(["git", "checkout", "master"], check=False)
subprocess.run(["git", "checkout", "main"], check=False)
subprocess.run(["git", "pull"], check=True)
else:
subprocess.run(["git", "clone", repo.ssh_url, path], check=True)
Expand Down
2 changes: 2 additions & 0 deletions requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ pip-tools
google-cloud-bigquery
PyGithub
PyYaml
opensafely
python-textops3
126 changes: 94 additions & 32 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,38 +1,100 @@
#
# This file is autogenerated by pip-compile
# To update, run:
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements.in
# pip-compile
#
cachetools==4.1.1 # via google-auth
certifi==2020.6.20 # via requests
cffi==1.14.3 # via google-crc32c, pynacl
chardet==3.0.4 # via requests
click==7.1.2 # via pip-tools
deprecated==1.2.10 # via pygithub
google-api-core==1.22.2 # via google-cloud-bigquery, google-cloud-core
google-auth==1.21.3 # via google-api-core
google-cloud-bigquery==1.28.0 # via -r requirements.in
google-cloud-core==1.4.1 # via google-cloud-bigquery
google-crc32c==1.0.0 # via google-resumable-media
google-resumable-media==1.0.0 # via google-cloud-bigquery
googleapis-common-protos==1.52.0 # via google-api-core
idna==2.10 # via requests
pip-tools==5.3.1 # via -r requirements.in
protobuf==3.13.0 # via google-api-core, googleapis-common-protos
pyasn1-modules==0.2.8 # via google-auth
pyasn1==0.4.8 # via pyasn1-modules, rsa
pycparser==2.20 # via cffi
pygithub==1.55 # via -r requirements.in
pyjwt==2.4.0 # via pygithub
pynacl==1.5.0 # via pygithub
pytz==2020.1 # via google-api-core
pyyaml==5.3.1 # via -r requirements.in
requests==2.24.0 # via google-api-core, pygithub
rsa==4.6 # via google-auth
six==1.15.0 # via google-api-core, google-auth, google-cloud-bigquery, google-resumable-media, pip-tools, protobuf
urllib3==1.25.10 # via requests
wrapt==1.12.1 # via deprecated
addicted3==3.0.1
# via python-textops3
cachetools==4.1.1
# via google-auth
certifi==2020.6.20
# via requests
cffi==1.14.3
# via
# google-crc32c
# pynacl
chardet==3.0.4
# via
# python-textops3
# requests
click==7.1.2
# via pip-tools
deprecated==1.2.10
# via pygithub
google-api-core==1.22.2
# via
# google-cloud-bigquery
# google-cloud-core
google-auth==1.21.3
# via google-api-core
google-cloud-bigquery==1.28.0
# via -r requirements.in
google-cloud-core==1.4.1
# via google-cloud-bigquery
google-crc32c==1.0.0
# via google-resumable-media
google-resumable-media==1.0.0
# via google-cloud-bigquery
googleapis-common-protos==1.52.0
# via google-api-core
idna==2.10
# via requests
noattr==0.0.9
# via addicted3
opensafely==1.47.0
# via -r requirements.in
pip-tools==5.3.1
# via -r requirements.in
protobuf==3.13.0
# via
# google-api-core
# googleapis-common-protos
pyasn1==0.4.8
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.2.8
# via google-auth
pycparser==2.20
# via cffi
pygithub==1.55
# via -r requirements.in
pyjwt==2.4.0
# via pygithub
pynacl==1.5.0
# via pygithub
python-dateutil==2.9.0.post0
# via python-textops3
python-slugify==8.0.4
# via python-textops3
python-textops3==3.2.1
# via -r requirements.in
pytz==2020.1
# via google-api-core
pyyaml==5.3.1
# via -r requirements.in
requests==2.24.0
# via
# google-api-core
# pygithub
rsa==4.6
# via google-auth
six==1.15.0
# via
# google-api-core
# google-auth
# google-cloud-bigquery
# google-resumable-media
# pip-tools
# protobuf
# python-dateutil
text-unidecode==1.3
# via python-slugify
urllib3==1.25.10
# via requests
wrapt==1.12.1
# via deprecated

# The following packages are considered to be unsafe in a requirements file:
# pip
Expand Down

0 comments on commit 0d419d2

Please sign in to comment.