In [27]:
%load_ext autoreload
%autoreload 2

from opengsync_db import categories, DBHandler
from opengsync_db.ext.autosession import set_db
%load_ext opengsync_db.ext.autosession

from IPython.display import display
import pandas as pd
from dotenv import load_dotenv
import os

# Load variables from a dotenv file into the process environment so that the
# POSTGRES_* settings read from os.environ in the connection cell are available.
# NOTE(review): presumably picks up a `.env` next to the notebook — confirm the location.
load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The opengsync_db.ext.autosession extension is already loaded. To reload it, use:
  %reload_ext opengsync_db.ext.autosession


True

In [28]:
db = DBHandler()

# All connection settings come from environment variables, so no credentials
# are stored in the notebook itself.
connection_settings = {
    "user": os.environ["POSTGRES_USER"],
    "password": os.environ["POSTGRES_PASSWORD"],
    "host": os.environ["POSTGRES_SERVER_IP"],
    "port": os.environ["POSTGRES_PORT"],
    "db": os.environ["POSTGRES_DB"],
}
db.connect(**connection_settings)

# Hand the connected handler to the autosession extension
# (set_db is imported from opengsync_db.ext.autosession).
set_db(db)

LOG: Connected to DB 'postgresql+psycopg://bsf-opengsync.int.cemm.at:5432/opengsync_db'


### 0. Categories (Enums)

In [29]:
# Categories store statuses or types, e.g. LibraryType, ExperimentStatus, GenomeRef.
# Each member has an id that is unique within its own enum class.
# They are defined in services/opengsync-app/opengsync-db/opengsync_db/categories/
poly_a_rna_seq = categories.LibraryType.POLY_A_RNA_SEQ
covid_genome = categories.GenomeRef.COVID
(poly_a_rna_seq.id, poly_a_rna_seq.name, covid_genome.organism_tax_id)

(101, 'Poly-A RNA-Seq', 2697049)

### 1. Get All Libraries from Sequencing Run (Before demultiplexing flowcell)

In [None]:
# Look up the sequencing run by its name.
experiment = db.experiments["BSF_1776"]

# Print the experiment itself, then a few of its attributes, one per line.
print(experiment)
for experiment_attr in ("flowcell_type", "operator", "num_lanes", "status"):
    print(getattr(experiment, experiment_attr))

In [None]:
# This dataframe should be sufficient to demultiplex a flowcell
experiment_libraries = db.pd.get_flowcell(experiment_id=experiment.id)

# Preview up to 10 random rows.
# - DataFrame.sample raises ValueError when asked for more rows than the frame
#   holds (without replacement), so the sample size is capped at the frame length.
# - A fixed random_state keeps the preview identical across re-runs of the notebook.
experiment_libraries.sample(n=min(10, len(experiment_libraries)), random_state=0)

#### 1.1 Grouping by Sequencing Request

In [None]:
# Each Sequencing Request can have only one Requestor, so demultiplexed
# fastq-files should be grouped by 'seq_request_id'.
libraries_by_request = experiment_libraries.groupby("seq_request_id")
for seq_request_id, request_libraries in libraries_by_request:
    print(f"seq_request_id: {seq_request_id}")
    # Show only the first rows of each group to keep the output short.
    display(request_libraries.head(20))
    print("...\n")


#### 1.2 Filtering Library Type

In [None]:
# Boolean mask selecting the rows whose library type is WGS; show the first three matches.
is_wgs = experiment_libraries["library_type"] == categories.LibraryType.WGS
experiment_libraries[is_wgs].head(3)

### 2. Downstream Projects (after demultiplexing)

In [34]:
# Projects can be looked up by their identifier (string) ...
project_identifier = "BSA_1041"
project = db.projects[project_identifier]
# ... or, if no identifier has been assigned, by their ID (integer):
# project = db.projects[61]
project

Project(id: 61, identifier: BSA_1041)

In [None]:
# Samples of the project, as a dataframe
project_samples = db.pd.get_project_samples(project.id)
project_samples

In [None]:
# This dataframe should be enough to find all fastq-files generated by the demultiplexing step.
# A project can contain multiple sequencing requests, and libraries from multiple experiments.
project_libraries = db.pd.get_project_libraries(project.id)
project_libraries

In [None]:
# Sequencing Requests associated with the project, as a dataframe
project_seq_requests = db.pd.get_project_seq_requests(project.id)
project_seq_requests

### 3. Using ORM instead of Pandas Dataframe

In [None]:
# Fetch a single sequencing request as an ORM object (looked up by integer ID)
# and print a few of its attributes, one per line.
seq_request = db.seq_requests[1]
for seq_request_attr in ("contact_person", "comments", "data_delivery_mode"):
    print(getattr(seq_request, seq_request_attr))