3 changes: 2 additions & 1 deletion .gitignore
@@ -20,4 +20,5 @@ htmlcov
 work/
 
 # credentials
-.credentials/
+.credentials/
+config/clickhouse/config.d/s3.xml
5 changes: 3 additions & 2 deletions config/clickhouse/config.d/config.xml
@@ -2,10 +2,11 @@
 <yandex>
   <logger>
     <level>warning</level>
-    <log>/var/log/clickhouse-server/clickhouse-server.log</log>
-    <errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>
     <size>1000M</size>
     <count>7</count>
+    <console>true</console>
+    <log remove="remove"/>
+    <errorlog remove="remove"/>
   </logger>
   <display_name>ot-genetics</display_name>
   <http_port>8123</http_port>
45 changes: 45 additions & 0 deletions config/config.yaml
@@ -4,6 +4,7 @@ log_level: ${LOG_LEVEL}
 release_uri: ${RELEASE_URI}
 scratchpad:
   release: ${RELEASE}
+  product: ${PRODUCT}
   data_source: ${RELEASE_URI}
   # temp file locations
   local_data: release_data
@@ -29,6 +30,7 @@ scratchpad:
   clickhouse_logs: /var/log/pos/clickhouse
   clickhouse_disk_name: ${CLICKHOUSE_DISK_NAME}
   clickhouse_disk_snapshot_name: ${CLICKHOUSE_DISK_SNAPSHOT_NAME}
+  clickhouse_backup_base_path: ${CLICKHOUSE_BACKUP_BASE_PATH}
   # bigquery
   bq_prod_project_id: open-targets-prod
   bq_parquet_path: ${BQ_DATA_SOURCE}
@@ -220,6 +222,49 @@ steps:
       gcp_disk_name: ${clickhouse_disk_name}
       gcp_snapshot_name: ${clickhouse_disk_snapshot_name}
       gcp_disk_zone: europe-west1-d
+  clickhouse_backup_all:
+    - name: clickhouse_start data loading instance
+      volume_data: ${clickhouse_data}
+      volume_logs: ${clickhouse_logs}
+      clickhouse_version: ${clickhouse_version}
+      clickhouse_database: ${database_namespace}
+    - name: explode backup clickhouse tables
+      requires:
+        - clickhouse_start data loading instance
+      foreach:
+        - associations_otf_target
+        - associations_otf_disease
+        - intervals
+        - literature_index
+        - literature
+        - targets
+        - ml_w2v
+      do:
+        - name: clickhouse_backup table ${each}
+          requires:
+            - clickhouse_start data loading instance
+          clickhouse_database: ${database_namespace}
+          table: ${each}
+          gcs_base_path: ${clickhouse_backup_base_path}
+  clickhouse_restore_all:
+    - name: clickhouse_create_database for restoration
+      clickhouse_database: ${database_namespace}
+    - name: explode restore clickhouse tables
+      requires:
+        - clickhouse_create_database for restoration
+      foreach:
+        - associations_otf_target
+        - associations_otf_disease
+        - intervals
+        - literature_index
+        - literature
+        - targets
+        - ml_w2v
+      do:
+        - name: clickhouse_restore table ${each}
+          clickhouse_database: ${database_namespace}
+          table: ${each}
+          gcs_base_path: ${clickhouse_backup_base_path}
   clickhouse_stop:
     - name: clickhouse_stop data loading instance
       clickhouse_database_name: ${database_namespace}
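
Reviewer note: the clickhouse_backup_all / clickhouse_restore_all steps fan out one task per table via foreach/do. Assuming the otter tasks wrap ClickHouse's native BACKUP/RESTORE statements against the S3-compatible GCS endpoint rendered by s3_config.tftpl (further down in this diff), each iteration amounts to roughly the sketch below; the helper is hypothetical and the real implementation lives in pos.tasks.

    # Hypothetical sketch only: one BACKUP statement per table, sent over
    # ClickHouse's HTTP interface (port 8123, per config.xml above). No
    # credentials appear in the SQL; they are resolved from the <s3>
    # endpoint block that s3_config.tftpl renders.
    import urllib.request

    CLICKHOUSE_URL = "http://localhost:8123"
    # ${clickhouse_backup_base_path} at runtime
    GCS_BASE_PATH = "https://storage.googleapis.com/opentargets-backup/clickhouse"
    TABLES = [
        "associations_otf_target", "associations_otf_disease", "intervals",
        "literature_index", "literature", "targets", "ml_w2v",
    ]

    def run_sql(sql: str) -> None:
        req = urllib.request.Request(CLICKHOUSE_URL, data=sql.encode())
        with urllib.request.urlopen(req) as resp:  # raises on HTTP errors
            resp.read()

    for table in TABLES:  # "ot" is the default database_namespace
        run_sql(f"BACKUP TABLE ot.{table} TO S3('{GCS_BASE_PATH}/{table}')")
        # Restore is symmetric:
        # run_sql(f"RESTORE TABLE ot.{table} FROM S3('{GCS_BASE_PATH}/{table}')")

Because the credentials are attached to the endpoint in ClickHouse's server config, the per-table steps only need to carry the GCS base path.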
3 changes: 3 additions & 0 deletions deployment/locals.tf
@@ -16,6 +16,7 @@ locals {
     LOG_LEVEL          = var.pos_log_level
     RELEASE_URI        = var.data_location_source
     RELEASE            = var.release_id
+    PRODUCT            = var.is_ppp == false ? "platform" : "ppp"
     RELEASE_FTP_OUTPUT = local.ftp_output_path
     RELEASE_GCS_OUTPUT = local.gcs_output_path
     OPENSEARCH_VERSION = var.open_search_image_tag
@@ -30,6 +31,7 @@ locals {
     DATABASE_NAMESPACE            = var.database_namespace
     CLICKHOUSE_DISK_NAME          = google_compute_disk.clickhouse_data_disk.name
     CLICKHOUSE_DISK_SNAPSHOT_NAME = "${google_compute_disk.clickhouse_data_disk.name}"
+    CLICKHOUSE_BACKUP_BASE_PATH   = var.clickhouse_backup_base_path
     BQ_DATA_SOURCE                = var.data_location_production
     # For templating reasons, we need to substitute the following variables with $${var_name}
     release = "$${release}"
@@ -55,6 +57,7 @@ locals {
     clickhouse_logs               = "$${clickhouse_logs}"
     clickhouse_disk_name          = "$${clickhouse_disk_name}"
     clickhouse_disk_snapshot_name = "$${clickhouse_disk_snapshot_name}"
+    clickhouse_backup_base_path   = "$${clickhouse_backup_base_path}"
     bq_prod_project_id            = "$${bq_prod_project_id}"
     bq_parquet_path               = "$${bq_parquet_path}"
     each                          = "$${each}"
17 changes: 16 additions & 1 deletion deployment/main.tf
@@ -10,6 +10,11 @@ resource "tls_private_key" "posvm" {
   rsa_bits  = 4096
 }
 
+# Create the HMAC key for the associated service account
+resource "google_storage_hmac_key" "key" {
+  service_account_email = "pos-service-account@open-targets-eu-dev.iam.gserviceaccount.com"
+}
+
 // Create a disk volume for Clickhouse data
 resource "google_compute_disk" "clickhouse_data_disk" {
   project = "open-targets-eu-dev"
@@ -88,7 +93,17 @@ resource "google_compute_instance" "posvm" {
       "pos_config.tftpl",
       local.yaml_config_variables
     )
-    # pos_run_script = file("run.sh")
+    s3_config = templatefile(
+      "s3_config.tftpl",
+      {
+        GCS_BASE_PATH = var.clickhouse_backup_base_path
+        ACCESS_KEY    = google_storage_hmac_key.key.access_id
+        SECRET_KEY    = google_storage_hmac_key.key.secret
+      }
+    )
+
+    google_storage_hmac_key_access_id = google_storage_hmac_key.key.access_id
+    google_storage_hmac_key_secret    = google_storage_hmac_key.key.secret
   }
   service_account {
     email = "pos-service-account@open-targets-eu-dev.iam.gserviceaccount.com"
41 changes: 41 additions & 0 deletions deployment/pos_config.tftpl
@@ -4,6 +4,7 @@ log_level: ${LOG_LEVEL}
 release_uri: ${RELEASE_URI}
 scratchpad:
   release: ${RELEASE}
+  product: ${PRODUCT}
   data_source: ${RELEASE_URI}
   # temp file locations
   local_data: release_data
@@ -29,6 +30,7 @@ scratchpad:
   clickhouse_logs: /var/log/pos/clickhouse
   clickhouse_disk_name: ${CLICKHOUSE_DISK_NAME}
   clickhouse_disk_snapshot_name: ${CLICKHOUSE_DISK_SNAPSHOT_NAME}
+  clickhouse_backup_base_path: ${CLICKHOUSE_BACKUP_BASE_PATH}
   # bigquery
   bq_prod_project_id: open-targets-prod
   bq_parquet_path: ${BQ_DATA_SOURCE}
@@ -220,6 +222,45 @@ steps:
       gcp_disk_name: ${clickhouse_disk_name}
       gcp_snapshot_name: ${clickhouse_disk_snapshot_name}
       gcp_disk_zone: europe-west1-d
+  clickhouse_backup_all:
+    - name: clickhouse_start data loading instance
+      volume_data: ${clickhouse_data}
+      volume_logs: ${clickhouse_logs}
+      clickhouse_version: ${clickhouse_version}
+      clickhouse_database: ${database_namespace}
+    - name: explode backup clickhouse tables
+      requires:
+        - clickhouse_start data loading instance
+      foreach:
+        - associations_otf_target
+        - associations_otf_disease
+        - intervals
+        - literature_index
+        - literature
+        - targets
+        - ml_w2v
+      do:
+        - name: clickhouse_backup table ${each}
+          requires:
+            - clickhouse_start data loading instance
+          clickhouse_database: ${database_namespace}
+          table: ${each}
+          gcs_base_path: ${clickhouse_backup_base_path}
+  clickhouse_restore_all:
+    - name: explode restore clickhouse tables
+      foreach:
+        - associations_otf_target
+        - associations_otf_disease
+        - intervals
+        - literature_index
+        - literature
+        - targets
+        - ml_w2v
+      do:
+        - name: clickhouse_restore table ${each}
+          clickhouse_database: ${database_namespace}
+          table: ${each}
+          gcs_base_path: ${clickhouse_backup_base_path}
   clickhouse_stop:
     - name: clickhouse_stop data loading instance
       clickhouse_database_name: ${database_namespace}
9 changes: 9 additions & 0 deletions deployment/s3_config.tftpl
@@ -0,0 +1,9 @@
+<clickhouse>
+  <s3>
+    <endpoint-name>
+      <endpoint>${GCS_BASE_PATH}</endpoint>
+      <access_key_id>${ACCESS_KEY}</access_key_id>
+      <secret_access_key>${SECRET_KEY}</secret_access_key>
+    </endpoint-name>
+  </s3>
+</clickhouse>
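
For context: the ${ACCESS_KEY}/${SECRET_KEY} pair comes from the google_storage_hmac_key resource added in main.tf; HMAC keys are how GCS's S3-compatible XML API authenticates, which is what lets ClickHouse treat the bucket as a plain S3 endpoint. A rough Python equivalent of the templatefile() rendering, with obviously fake credentials, would be (illustrative only):

    # Illustrative rendering of s3_config.tftpl with dummy values. The real
    # substitution happens in deployment/main.tf via templatefile(), and the
    # access id/secret come from google_storage_hmac_key.key.
    from string import Template

    TEMPLATE = """\
    <clickhouse>
      <s3>
        <endpoint-name>
          <endpoint>${GCS_BASE_PATH}</endpoint>
          <access_key_id>${ACCESS_KEY}</access_key_id>
          <secret_access_key>${SECRET_KEY}</secret_access_key>
        </endpoint-name>
      </s3>
    </clickhouse>
    """

    print(Template(TEMPLATE).substitute(
        GCS_BASE_PATH="https://storage.googleapis.com/opentargets-backup/clickhouse/",
        ACCESS_KEY="GOOG1EXAMPLEACCESSID",   # fake placeholder
        SECRET_KEY="fake+secret/value0000",  # fake placeholder
    ))

The rendered XML is what startup.sh (next file) pulls from the instance metadata key s3_config into config/clickhouse/config.d/s3_config.xml.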
5 changes: 3 additions & 2 deletions deployment/startup.sh
@@ -67,6 +67,7 @@ function install_packages() {
   chgrp -R google-sudoers /opt/pos_run.sh && chmod g+x /opt/pos_run.sh
   create_dir_for_group /var/log/pos/opensearch google-sudoers rwx
   create_dir_for_group /var/log/pos/clickhouse google-sudoers rwx
+  curl "http://metadata.google.internal/computeMetadata/v1/instance/attributes/s3_config" -H "Metadata-Flavor: Google" > /opt/platform-output-support/config/clickhouse/config.d/s3_config.xml
 }
 
 
@@ -121,7 +122,7 @@ function sync_data() {
 function opensearch_steps() {
   log "[INFO] Starting OpenSearch steps"
   uv_run opensearch_prep_all 300 && \
-  uv_run opensearch_load_all 100 > /var/log/open_search_load.log 2>&1 && \
+  uv_run opensearch_load_all 80 > /var/log/open_search_load.log 2>&1 && \
   opensearch_summary && \
   uv_run opensearch_stop 1 && \
   sync && \
@@ -148,7 +149,7 @@ function clickhouse_steps() {
 
 function copy_clickhouse_configs() {
   log "[INFO] Syncing ClickHouse configs"
-  cp -vR /opt/platform-output-support/config/clickhouse/config.d /mnt/clickhouse/
+  cp -vR /opt/platform-output-support/config/clickhouse/config.d/config.xml /mnt/clickhouse/config.d/config.xml
   cp -vR /opt/platform-output-support/config/clickhouse/users.d /mnt/clickhouse/
 }
 
12 changes: 9 additions & 3 deletions deployment/variables.tf
@@ -1,7 +1,7 @@
 variable "vm_pos_boot_disk_size" {
   description = "POS VM boot disk size, default '500GB'"
   type        = string
-  default     = 500
+  default     = 600
 }
 
 variable "vm_pos_machine_type" {
@@ -19,7 +19,7 @@ variable "pos_logs_path_root" {
 variable "clickhouse_data_disk_size" {
   description = "Clickhouse data disk size to deploy"
   type        = string
-  default     = "50"
+  default     = "200"
 }
 
 variable "clickhouse_snapshot_source" {
@@ -34,6 +34,12 @@ variable "clickhouse_tarball" {
   default     = false
 }
 
+variable "clickhouse_backup_base_path" {
+  description = "Base path in GCS bucket where ClickHouse backups will be stored"
+  type        = string
+  default     = "https://storage.googleapis.com/opentargets-backup/clickhouse/"
+}
+
 variable "database_namespace" {
   description = "Database namespace, default 'ot'"
   type        = string
@@ -43,7 +49,7 @@ variable "database_namespace" {
 variable "open_search_data_disk_size" {
   description = "Opensearch data disk size to deploy"
   type        = string
-  default     = "200"
+  default     = "400"
 }
 
 variable "open_search_snapshot_source" {
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -10,7 +10,7 @@ dependencies = [
     "google-cloud-bigquery>=3.34.0",
     "google-cloud-compute>=1.31.0",
     "opensearch-py>=3.0.0",
-    "opentargets-otter==25.0.2",
+    "opentargets-otter>=25.0.9",
     "orjson>=3.10.18",
     "ot-croissant",
     "polars>=1.31.0",
1 change: 1 addition & 0 deletions src/pos/core.py
@@ -3,5 +3,6 @@
 
 def main() -> None:
     runner = Runner('pos')
+    runner.start()
     runner.register_tasks('pos.tasks')
     runner.run()
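
A closing note on the core.py change: runner.start() is now invoked before task registration, presumably required by the otter >= 25.0.9 lifecycle (the version floor raised in pyproject.toml above). The resulting entrypoint, spelled out for clarity:

    # Resulting main() in src/pos/core.py after this PR.
    from otter import Runner  # assumed import path; the real import sits above the hunk

    def main() -> None:
        runner = Runner('pos')
        runner.start()  # new in this PR; presumably must precede register_tasks()
        runner.register_tasks('pos.tasks')
        runner.run()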