In [1]:
from IPython.display import display, HTML
# Inject a CSS rule that forces the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Load 2023 February 17 data

Duration of download at EBI:
```
2023-02-20 08:05:00.606227 connected to db engine to use db kooplex_ebi
2023-02-20 08:05:16.360970 # 2949880 known items
2023-02-20 08:05:17.998033 loaded file ./latest_acc_ELTE_17feb.tsv # 209783 samples
....
2023-02-20 13:08:43.099806 download threads exited
2023-02-20 13:08:43.133807 finished
```

Duration: **~ 304min**

In [2]:
# Container image (-i) and script (-c) handed to kj-submit in the submission cell below.
# NOTE(review): `command` is an absolute, user-specific path — confirm it still exists before rerunning.
image = 'image-registry.vo.elte.hu/jupyter-bioinf-v5'
command = '/v/projects/ebi-vcf-wfct0p/steger/coveo/ELTE/load-2023_02_17.sh'

In [3]:
# Mark the load script as executable; {command} is interpolated from the Python variable.
! chmod +x {command}

In [4]:
# Submit the load script as a job named `coveo-load` through kj-submit, using {image} and {command}.
# NOTE(review): `-p 2 -m 16G` presumably request 2 CPUs and 16G of memory, and the remaining flags
# select volumes/node/project — confirm against the kj-submit help before changing them.
! bash /etc/jobtools/kj-submit \
  -n coveo-load -i {image} -c {command} -p 2 -m 16G \
  -H -V coveo-archive --nodename veo2 --projects_rw -P ebi-vcf-wfct0p

Job submitted successfully!


Duration: **47 min**

# Append contents of `2023-02-17` snapshot

## 23. Feb '23.

In [5]:
import pandas as pd
import psycopg2
import time
import io
import datetime
import timeit

In [6]:
# `prefix` is prepended to table names to address the freshly loaded tables
# (e.g. {schema}.{prefix}unique_cov); `drop_suffix` is appended to the names of the
# scratch tables created by this notebook (e.g. dropme_cov_duplicates{drop_suffix}).
prefix = 'load_230217_'
drop_suffix = '_230217'

In [7]:
# Written as the first value of every merge_log row inserted by TimeLogCommit.
common_comment = '23-02-17 snapshots'

In [8]:
# Open the single connection and cursor shared by every later cell and by TimeLogCommit.
# NOTE(review): `host` and `password` are empty strings here — presumably redacted or
# supplied some other way; confirm before rerunning and never commit real credentials.
conn = psycopg2.connect(
    dbname='coveo',
    host='',
    user='public_loader',
    password='',
    application_name = 'steger loader_notebook'
)
C = conn.cursor()

In [9]:
# Schema that qualifies every table, index and materialized view touched below.
schema = 'datahub_0'

In [10]:
class TimeLogCommit:
    """Context manager that times a block, logs it to ``merge_log`` and commits.

    Relies on the notebook-level globals ``C`` (cursor), ``conn`` (connection),
    ``schema`` and ``common_comment``.

    Parameters
    ----------
    task : str
        Short label of the step; stored in the log row and printed.
    table_name : str, optional
        Table the step works on. A ``merge_log`` row is written only when this
        is truthy.
    commit : bool, default True
        Commit the connection after a successful block. When the block raises,
        the connection is rolled back instead (only if ``commit`` is true, so a
        caller managing its own transaction keeps control).
    verbose : bool, default True
        Print the elapsed time when the block ends.

    Attributes set while in use: ``t0`` (wall-clock start), ``start``
    (``timeit.default_timer`` reading at entry) and ``took`` (elapsed seconds).
    """

    def __init__(self, task, table_name = None, commit = True, verbose=True):
        self.table_name = table_name
        self.task = task
        self.verbose = verbose
        self.commit = commit

    def __enter__(self):
        self.t0 = datetime.datetime.now()
        self.start = timeit.default_timer()
        # Returning self allows `with TimeLogCommit(...) as timer:`.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.took = (timeit.default_timer() - self.start)
        t = f'on {self.table_name} ' if self.table_name else ' '

        if exc_type is not None:
            # The block failed: writing the log row or committing here would either
            # raise on an aborted transaction (hiding the real error) or persist
            # half-finished work. Undo instead and let the exception propagate.
            if self.commit:
                conn.rollback()
            if self.verbose:
                print(f'\n\033[38;5;196mCode block {self.task} {t}FAILED after:\t{self.took:.5f} seconds\033[0;0m')
            return False

        if self.table_name:
            # Values are passed as query parameters so quotes in them cannot break
            # the statement; the schema is an identifier and stays interpolated.
            C.execute(
                f"INSERT INTO {schema}.merge_log VALUES (%s, %s, %s, %s, %s);",
                (common_comment, self.table_name, self.task, self.t0, datetime.datetime.now()),
            )
        if self.commit:
            conn.commit()
        if self.verbose:
            print(f'\n\033[38;5;208mCode block {self.task} {t}took:\t{self.took:.5f} seconds\033[0;0m')
        return False

# Always check for any duplicates

In [11]:
# Store every runid that is present in both the live unique_cov table and the freshly
# loaded {prefix}unique_cov table in a scratch table. No table_name is passed, so no
# merge_log row is written, but the new table is committed.
# NOTE(review): SELECT ... INTO creates the scratch table, so rerunning this cell
# presumably fails until the table is dropped — confirm.
with TimeLogCommit(task = 'check cov duplicates'):
    sql = f"""
select luc.runid 
into {schema}.dropme_cov_duplicates{drop_suffix}
from {schema}.unique_cov uc
inner join {schema}.{prefix}unique_cov luc
on luc.runid = uc.runid 
    """
    C.execute(sql)


[38;5;208mCode block check cov duplicates  took:	2.46514 seconds[0;0m


In [12]:
# Preview the coverage duplicates scratch table; an empty frame means nothing overlaps.
cov_duplicates_query = f"select * from {schema}.dropme_cov_duplicates{drop_suffix}"
cov_duplicates = pd.read_sql(cov_duplicates_query, con = conn)
cov_duplicates.head()



Unnamed: 0,runid


In [13]:
# Store every runid that is present in both the live unique_vcf table and the freshly
# loaded {prefix}unique_vcf table in a scratch table. No table_name is passed, so no
# merge_log row is written, but the new table is committed.
# NOTE(review): SELECT ... INTO creates the scratch table, so rerunning this cell
# presumably fails until the table is dropped — confirm.
with TimeLogCommit(task = 'check vcf duplicates'):
    sql = f"""
select luv.runid 
into {schema}.dropme_vcf_duplicates{drop_suffix}
from {schema}.unique_vcf uv
inner join {schema}.{prefix}unique_vcf luv
on luv.runid = uv.runid 
    """
    C.execute(sql)


[38;5;208mCode block check vcf duplicates  took:	2.11582 seconds[0;0m


In [14]:
# Preview the VCF duplicates scratch table; an empty frame means nothing overlaps.
vcf_duplicates_query = f"select * from {schema}.dropme_vcf_duplicates{drop_suffix}"
vcf_duplicates = pd.read_sql(vcf_duplicates_query, con = conn)
vcf_duplicates.head()



Unnamed: 0,runid


# Eureka — there are no duplicates!

In [15]:
# Copy the loaded unique_cov rows into the live table, skipping runids recorded in the
# coverage duplicates scratch table. Logged to merge_log and committed by TimeLogCommit.
with TimeLogCommit(task = 'merge', table_name = 'unique_cov'):
    sql = f"""
insert into {schema}.unique_cov (runid, insertion_ts, snapshot, integrity)
select runid, insertion_ts, snapshot, integrity 
from {schema}.{prefix}unique_cov lc
where not lc.runid in (select runid from {schema}.dropme_cov_duplicates{drop_suffix});
"""
    C.execute(sql)


[38;5;208mCode block merge on unique_cov took:	7.94339 seconds[0;0m


In [16]:
# Bulk-append the loaded coverage rows to the live cov table. The FK to unique_cov is
# dropped before the insert and re-added afterwards; all three statements go out in one
# execute() call and are committed together when the block exits.
# The recorded run of this cell took ~5659 s; see the FIXME inside the SQL about the
# missing index on runid.
with TimeLogCommit(task = 'merge', table_name = 'cov'):
    sql = f"""
ALTER TABLE {schema}.cov DROP CONSTRAINT cov_runid_fkey;
insert into {schema}.cov (runid, pos, coverage)
select runid, pos, coverage 
from {schema}.{prefix}cov lc
where not lc.runid in (select runid from {schema}.dropme_cov_duplicates{drop_suffix});
ALTER TABLE {schema}.cov ADD CONSTRAINT cov_runid_fkey FOREIGN KEY (runid) REFERENCES {schema}.unique_cov(runid);
-- FIXME there is no index on runid!!!
    """
    C.execute(sql)


[38;5;208mCode block merge on cov took:	5658.86865 seconds[0;0m


In [17]:
# Copy the loaded unique_vcf rows into the live table, skipping runids recorded in the
# VCF duplicates scratch table. Logged to merge_log and committed by TimeLogCommit.
with TimeLogCommit(task = 'merge', table_name = 'unique_vcf'):
    sql = f"""
insert into {schema}.unique_vcf (runid, insertion_ts, snapshot, integrity)
select runid, insertion_ts, snapshot, integrity 
from {schema}.{prefix}unique_vcf lv
where not lv.runid in (select runid from {schema}.dropme_vcf_duplicates{drop_suffix});
    """
    C.execute(sql)


[38;5;208mCode block merge on unique_vcf took:	9.89836 seconds[0;0m


In [18]:
# Bulk-append the loaded vcf_key rows (minus duplicate runids) to the live table.
# The runid FK and the runid index are dropped first and recreated after the insert,
# all within a single execute() call that is committed when the block exits.
with TimeLogCommit(task = 'merge', table_name = 'vcf_key'):
    sql = f"""
ALTER TABLE {schema}.vcf_key DROP CONSTRAINT vcf_key_runid_fkey;
DROP INDEX {schema}.idx_vcf_key_runid;
insert into {schema}.vcf_key (key, runid, pos, ref, alt)
select key, runid, pos, ref, alt
from {schema}.{prefix}vcf_key lvk
where not lvk.runid in (select runid from {schema}.dropme_vcf_duplicates{drop_suffix});
ALTER TABLE {schema}.vcf_key ADD CONSTRAINT vcf_key_runid_fkey FOREIGN KEY (runid) REFERENCES {schema}.unique_vcf(runid);
CREATE INDEX idx_vcf_key_runid ON {schema}.vcf_key USING btree (runid);
    """
    C.execute(sql)


[38;5;208mCode block merge on vcf_key took:	1356.65778 seconds[0;0m


with TimeLogCommit(task = 'prepare mask'):
    # Collect the keys of loaded vcf_key rows whose runid is a known duplicate into a
    # scratch table, so dependent tables can be filtered by key.
    # The source table is qualified with {schema} like every other statement in this
    # notebook instead of a hard-coded schema name.
    sql = f"""
select key 
into {schema}.dropme_vcfk{drop_suffix}
from {schema}.{prefix}vcf_key lvk where lvk.runid in (select runid from {schema}.dropme_vcf_duplicates{drop_suffix});
    """
    C.execute(sql)

In [19]:
# Remove the key/af indexes and the key FK from vcf ahead of the bulk insert; they are
# recreated by the 'create fk' / 'create index' cells further down.
with TimeLogCommit(task = 'drop index', table_name = 'vcf'):
    sql = f"""
DROP INDEX IF EXISTS {schema}.idx_vcf_key;
DROP INDEX IF EXISTS {schema}.idx_vcf_af;
ALTER TABLE {schema}.vcf DROP CONSTRAINT vcf_key_fkey;
    """
    C.execute(sql)


[38;5;208mCode block drop index on vcf took:	0.00287 seconds[0;0m


with TimeLogCommit(task = 'prepare complementer'):
    # Anti-join: keep only the loaded vcf rows whose key is NOT in the dropme_vcfk
    # scratch table, and store them in lsv_complementer{drop_suffix}.
    # NOTE(review): this cell has no execution marker and depends on the
    # 'prepare mask' scratch table — presumably it was skipped because no
    # duplicates were found; confirm before relying on lsv_complementer.
    sql = f"""
select lsv.*
into {schema}.lsv_complementer{drop_suffix}
from {schema}.{prefix}vcf lsv
left outer join {schema}.dropme_vcfk{drop_suffix} dv 
on lsv."key" = dv."key" 
where dv."key" is null;
    """
    C.execute(sql)

In [20]:
# Append the loaded vcf rows to the live vcf table.
# NOTE(review): the active statement copies EVERY row of {prefix}vcf; the variant that
# reads from the duplicate-filtered lsv_complementer table is commented out inside the
# SQL. That is only safe while the duplicate check above comes back empty.
with TimeLogCommit(task = 'merge', table_name = 'vcf'):
    sql = f"""
-- insert into {schema}.vcf (key, qual, dp, af, sb, count_ref_forward_base, count_ref_reverse_base, count_alt_forward_base, count_alt_reverse_base, hrun, indel, nmd, major, ann_num)
-- select key, qual, dp, af, sb, count_ref_forward_base, count_ref_reverse_base, count_alt_forward_base, count_alt_reverse_base, hrun, indel, nmd, major, ann_num
-- from {schema}.lsv_complementer{drop_suffix};

insert into {schema}.vcf (key, qual, dp, af, sb, count_ref_forward_base, count_ref_reverse_base, count_alt_forward_base, count_alt_reverse_base, hrun, indel, nmd, major, ann_num)
select key, qual, dp, af, sb, count_ref_forward_base, count_ref_reverse_base, count_alt_forward_base, count_alt_reverse_base, hrun, indel, nmd, major, ann_num
from {schema}.{prefix}vcf;
    """
    C.execute(sql)


[38;5;208mCode block merge on vcf took:	1325.65676 seconds[0;0m


In [21]:
# Re-add the FK from vcf.key to vcf_key.key that was dropped before the bulk insert.
with TimeLogCommit(task = 'create fk', table_name = 'vcf'):
    sql = f"""
ALTER TABLE {schema}.vcf ADD CONSTRAINT vcf_key_fkey FOREIGN KEY (key) REFERENCES {schema}.vcf_key(key);
    """
    C.execute(sql)


[38;5;208mCode block create fk on vcf took:	295.07182 seconds[0;0m


In [22]:
# Recreate the btree index on vcf.key that was dropped before the bulk insert.
with TimeLogCommit(task = 'create index key', table_name = 'vcf'):
    sql = f"""
CREATE INDEX idx_vcf_key ON {schema}.vcf USING btree (key);
    """
    C.execute(sql)


[38;5;208mCode block create index key on vcf took:	890.81881 seconds[0;0m


In [23]:
# Recreate the btree index on vcf.af that was dropped before the bulk insert.
with TimeLogCommit(task = 'create index af', table_name = 'vcf'):
    sql = f"""
CREATE INDEX idx_vcf_af ON {schema}.vcf USING btree (af);
    """
    C.execute(sql)


[38;5;208mCode block create index af on vcf took:	590.55257 seconds[0;0m


In [25]:
# Roll back the connection's current transaction.
# NOTE(review): the execution counter skips a cell right before this one — presumably
# a statement failed and was removed; this call is not part of the normal merge flow.
conn.rollback()

In [26]:
# Append the loaded vcf_lof rows to the live table, with the key FK dropped during the
# insert and re-added afterwards (single execute() call, committed on exit).
# NOTE(review): the `;` right after the FROM clause ends the insert, so the
# duplicate-filtering join that follows is commented out and every loaded row is copied.
with TimeLogCommit(task = 'merge', table_name = 'vcf_lof'):
    sql = f"""
ALTER TABLE {schema}.vcf_lof DROP CONSTRAINT vcf_lof_key_fkey;
insert into {schema}.vcf_lof (key, lof)
select lvl.key, lof
from {schema}.{prefix}vcf_lof lvl
;-- left outer join {schema}.dropme_vcfk{drop_suffix} drp
-- on drp."key" = lvl."key" 
-- where drp."key" is null;
ALTER TABLE {schema}.vcf_lof ADD CONSTRAINT vcf_lof_key_fkey FOREIGN KEY (key) REFERENCES {schema}.vcf_key(key);
    """
    C.execute(sql)


[38;5;208mCode block merge on vcf_lof took:	455.90233 seconds[0;0m


In [27]:
# Append the loaded annotation_binding rows to the live table, with the key FK dropped
# during the insert and re-added afterwards (single execute() call, committed on exit).
# NOTE(review): the `;` right after the FROM clause ends the insert, so the
# duplicate-filtering join that follows is commented out and every loaded row is copied.
with TimeLogCommit(task = 'merge', table_name = 'annotation_binding'):
    sql = f"""
ALTER TABLE {schema}.annotation_binding DROP CONSTRAINT annotation_binding_key_fkey;
insert into {schema}.annotation_binding (key, gene_name, annotation_atom)
select lab.key, gene_name, annotation_atom
from {schema}.{prefix}annotation_binding lab
;-- left outer join {schema}.dropme_vcfk{drop_suffix} drp
-- on drp."key" = lab."key" 
-- where drp."key" is null;
ALTER TABLE {schema}.annotation_binding ADD CONSTRAINT annotation_binding_key_fkey FOREIGN KEY (key) REFERENCES {schema}.vcf_key(key);
    """
    C.execute(sql)


[38;5;208mCode block merge on annotation_binding took:	1467.11832 seconds[0;0m


In [28]:
# Remove the key FK and the hgvs_p / gene_name / key indexes from annotation ahead of
# the bulk insert; they are recreated by the cells that follow the merge.
with TimeLogCommit(task = 'drop index', table_name = 'annotation'):
    sql = f"""
ALTER TABLE {schema}.annotation DROP CONSTRAINT annotation_key_fkey;
DROP INDEX IF EXISTS {schema}.idx_annotation_hgvsp;
DROP INDEX IF EXISTS {schema}.idx_annotation_genename;
DROP INDEX IF EXISTS {schema}.idx_annotation_key;
    """
    C.execute(sql)


[38;5;208mCode block drop index on annotation took:	0.10367 seconds[0;0m


In [29]:
# Append the loaded annotation rows to the live annotation table.
# NOTE(review): the `;` right after the FROM clause ends the insert, so the
# duplicate-filtering join that follows is commented out and every loaded row is copied.
with TimeLogCommit(task = 'merge', table_name = 'annotation'):
    sql = f"""
insert into {schema}.annotation (key, annotation_impact, gene_name, feature_type, feature_id, transcript_biotype, rank_, hgvs_c, hgvs_p, cdna_pos, cdna_length, cds_pos, cds_length, aa_pos, aa_length, distance, errors_warnings_info)
select la.key, annotation_impact, gene_name, feature_type, feature_id, transcript_biotype, rank_, hgvs_c, hgvs_p, cdna_pos, cdna_length, cds_pos, cds_length, aa_pos, aa_length, distance, errors_warnings_info
from {schema}.{prefix}annotation la
;-- left outer join {schema}.dropme_vcfk{drop_suffix} drp
-- on drp."key" = la."key" 
-- where drp."key" is null;
    """
    C.execute(sql)


[38;5;208mCode block merge on annotation took:	2698.26016 seconds[0;0m


In [30]:
# Recreate the btree index on annotation.hgvs_p that was dropped before the bulk insert.
with TimeLogCommit(task = 'create index hgvs', table_name = 'annotation'):
    sql = f"""
CREATE INDEX idx_annotation_hgvsp ON {schema}.annotation USING btree (hgvs_p);
    """
    C.execute(sql)


[38;5;208mCode block create index hgvs on annotation took:	2105.43990 seconds[0;0m


In [31]:
# Recreate the btree index on annotation.gene_name that was dropped before the bulk insert.
with TimeLogCommit(task = 'create index gene_name', table_name = 'annotation'):
    sql = f"""
CREATE INDEX idx_annotation_genename ON {schema}.annotation USING btree (gene_name);
    """
    C.execute(sql)


[38;5;208mCode block create index gene_name on annotation took:	1069.58033 seconds[0;0m


In [32]:
# Recreate the btree index on annotation.key that was dropped before the bulk insert.
with TimeLogCommit(task = 'create index key', table_name = 'annotation'):
    sql = f"""
CREATE INDEX idx_annotation_key ON {schema}.annotation USING btree (key);
    """
    C.execute(sql)


[38;5;208mCode block create index key on annotation took:	1085.54829 seconds[0;0m


In [33]:
# Re-add the FK from annotation.key to vcf_key.key that was dropped before the bulk insert.
with TimeLogCommit(task = 'create fk', table_name = 'annotation'):
    sql = f"""
ALTER TABLE {schema}.annotation ADD CONSTRAINT annotation_key_fkey FOREIGN KEY (key) REFERENCES {schema}.vcf_key(key);
    """
    C.execute(sql)


[38;5;208mCode block create fk on annotation took:	130.57688 seconds[0;0m


# Make sure to update metadata and JHD

* [update meta](./update-meta.ipynb)
* [update JHD](./update-jhd.ipynb)

In [34]:
# Run a bare ANALYZE (no table argument) after the merges; logged with table_name '*'.
analyze_statement = "ANALYZE"
with TimeLogCommit(task = 'analyze', table_name = '*'):
    C.execute(analyze_statement)


[38;5;208mCode block analyze on * took:	1041.44251 seconds[0;0m


# Take care of materialized views

In [35]:
# Materialized views to refresh, in this order. Each refresh gets its own
# TimeLogCommit block, so it is timed, logged and committed individually.
materialized_views = [
    'aa_mutation',
    'app_new_cases_jhd',
    'app_country_samples',
    'app_country_samples_full',
    'app_human_meta_mv',
    'app_human_meta_mv_jhd',
    'lineage',
    'app_worldplot_data',
    'unique_ena_run_summary',
    'app_lineage',
    'app_variants_weekly',
    'n_content',
    'vcf_key_gene_s_af_gt_05',
    'vcf_key_selected',
    'test_background_sample_counts',
    # 'pcr_assay_results'
]

for mv in materialized_views:
    refresh_statement = f"REFRESH MATERIALIZED VIEW {schema}.{mv}"
    with TimeLogCommit(task = 'refresh', table_name = mv):
        C.execute(refresh_statement)
    


[38;5;208mCode block refresh on aa_mutation took:	1041.02856 seconds[0;0m

[38;5;208mCode block refresh on app_new_cases_jhd took:	30.52096 seconds[0;0m

[38;5;208mCode block refresh on app_country_samples took:	19.16254 seconds[0;0m

[38;5;208mCode block refresh on app_country_samples_full took:	19.27157 seconds[0;0m

[38;5;208mCode block refresh on app_human_meta_mv took:	24.53660 seconds[0;0m

[38;5;208mCode block refresh on app_human_meta_mv_jhd took:	23.99638 seconds[0;0m

[38;5;208mCode block refresh on lineage took:	340.26239 seconds[0;0m

[38;5;208mCode block refresh on app_worldplot_data took:	21.66840 seconds[0;0m

[38;5;208mCode block refresh on unique_ena_run_summary took:	496.99089 seconds[0;0m

[38;5;208mCode block refresh on app_lineage took:	53.62097 seconds[0;0m

[38;5;208mCode block refresh on app_variants_weekly took:	45.17494 seconds[0;0m

[38;5;208mCode block refresh on n_content took:	354.40675 seconds[0;0m

[38;5;208mCode block refresh o

In [44]:
# Roll back the connection's current transaction.
# NOTE(review): the execution counter jumps from 35 to 44 here and the refresh output
# above is cut off — presumably a refresh failed and was retried interactively; confirm
# that all listed materialized views were actually refreshed.
conn.rollback()