In [None]:
import gzip
import itertools
import os
import pprint
import subprocess
import sys

import pandas as pd
from BCBio.GFF import GFFExaminer

from prot_db import bigtable_constants as btc, index_seq_files
from utils import file_util, notebook_util, fasta_util


# Never truncate long cell values (file paths) when rendering DataFrames.
pd.options.display.max_colwidth = None
# Project helper; presumably stretches the notebook to the full browser width (see utils.notebook_util).
notebook_util.disp_notebook_full_width()

In [None]:
# Root of the scrape to index. Alternative local location, kept for reference:
# SCRAPE_DIR = "/GeneGraphDB/data/mgnify_scrape_20220505"
SCRAPE_DIR = "gs://durrant_prot_pred_db/mgnify_scrape_20220505"

# Recursive glob patterns: one for protein FASTA files, one for GFF annotation files.
faa_glob, anno_glob = (
    os.path.join(SCRAPE_DIR, pattern)
    for pattern in ("**/*.faa.gz", "**/*annotations.gff.bgz")
)

# Resolve both patterns through the project's glob helper (FASTA first, then annotations).
faa_files, anno_files = (
    file_util.glob(glob_expr) for glob_expr in (faa_glob, anno_glob)
)



In [None]:
# (number of FASTA matches, number of annotation matches)
tuple(len(matches) for matches in (faa_files, anno_files))

In [None]:
# First match of each glob. NOTE: a bare expression in the middle of a cell is
# evaluated but not echoed by the notebook.
faa_files[0], anno_files[0]

# One frame per file kind, each tagged with a constant `type` column.
df_faa = pd.DataFrame(faa_files, columns=["path"]).assign(type="faa")
df_anno = pd.DataFrame(anno_files, columns=["path"]).assign(type="annotation")

# Stack the two frames; each keeps its own row labels, so labels repeat.
df_combo = pd.concat((df_faa, df_anno))
df_combo

# Tag every file with its containing directory, then walk the per-directory groups.
df_combo["dirname"] = df_combo["path"].apply(os.path.dirname)
for dirname, df_dir in df_combo.groupby("dirname"):
    # Loop body is currently a placeholder. Disabled check that would show
    # directories holding exactly two matched files:
#     if len(df_dir) == 2:
#         display(df_dir)
    pass



In [None]:
# One sample analysis used for the experiments below: its predicted-CDS FASTA
# and the matching GFF annotations live side by side in the same directory.
faa_fpath, anno_fpath = (
    "/GeneGraphDB/data/mgnify_scrape_20220505/studies/MGYS00002012/samples/ERS433542/analyses/MGYA00598832/" + fname
    for fname in (
        "ERZ1746111_FASTA_predicted_cds.faa.gz",
        "ERZ1746111_FASTA_annotations.gff.bgz",
    )
)
# Alternative annotation path, kept disabled:
# anno_fpath = "/GeneGraphDB/data/mgnify_scrape_20220505/studies/MGYS00002012/samples/ERS433542/analyses/MGYA00598832/derp"

In [None]:
# Print a "<contig id>@<begin>@<end>" key for the first five records of the FASTA file.
for seq in itertools.islice(fasta_util.open_fasta(faa_fpath), 0, 5):
    description = seq.description
    # Header fields are separated by " # "; the first three are read as id, begin, end.
    fields = description.split(" # ")
    seq_id, begin_idx, end_idx = fields[0:3]
    # Drop only the trailing "_<n>" index,
    # e.g. ERZ1746111.1-NODE-1-length-464040-cov-12.638829_3 -> ERZ1746111.1-NODE-1-length-464040-cov-12.638829
    # rsplit with maxsplit=1 cuts at the LAST underscore, so ids that contain
    # underscores themselves are not truncated at the first one.
    non_indexed_seq_id = seq_id.rsplit("_", 1)[0]
    seq_id_start_end = "@".join((non_indexed_seq_id, begin_idx, end_idx))
    print(seq_id_start_end)

In [None]:
from dataclasses import dataclass

@dataclass
class GffRow:
    """One feature line of a GFF file, held as nine raw string columns.

    Instances are built positionally from a tab-split line (see
    `parse_gff_file`), so the field order here must match the column order
    in the file. No type conversion is done: every field, including
    `start`, `end`, `score` and `phase`, stays as text.
    """
    seqid: str
    source: str
    type_: str  # trailing underscore keeps the name distinct from the builtin `type`
    start: str  # kept as text; joined verbatim into "<seqid>@<start>@<end>" keys
    end: str  # kept as text, see `start`
    score: str
    strand: str
    phase: str
    attributes: str  # raw, unparsed attribute column
    


In [None]:
def parse_gff_file(gff_file):
    """Parse a GFF3 annotation file into a list of `GffRow` records.

    The file is first made available locally through
    `file_util.tmp_copy_on_open`. Files ending in ".bgz" or ".gz" are
    decompressed on the fly with the standard `gzip` module; BGZF is a series
    of ordinary gzip members, which `gzip` reads transparently, so no
    rename + `gunzip` subprocess (and no shell string built from paths) is
    needed. Any other file is read as plain text.

    Args:
        gff_file: path of the GFF3 file, as accepted by
            `file_util.tmp_copy_on_open`.

    Returns:
        A list with one `GffRow` per feature line, in file order. Every
        column is kept as the raw string found in the file.

    Raises:
        AssertionError: if the file is empty or its first line is not
            "##gff-version 3".
        TypeError: if a feature line does not split into exactly the nine
            columns `GffRow` expects.
    """
    annotations_fname = os.path.basename(gff_file)

    with file_util.tmp_copy_on_open(gff_file, annotations_fname) as local_file:
        is_compressed = local_file.endswith((".bgz", ".gz"))
        opener = gzip.open if is_compressed else open
        # "rt" gives text lines for both the gzip and the plain-file case.
        with opener(local_file, "rt") as fh:
            gff_lines = [line.rstrip() for line in fh]

    assert gff_lines and gff_lines[0] == "##gff-version 3", (
        f"{gff_file} does not start with a '##gff-version 3' header"
    )

    gff_rows = []
    for line in gff_lines[1:]:
        if line == "##FASTA":
            # Everything after this directive is sequence data, not features.
            break
        if not line or line.startswith("#"):
            # Blank lines, comments and other "##" directives carry no feature.
            continue
        gff_rows.append(GffRow(*line.split("\t")))
    return gff_rows
    
    
# Parse the sample annotation file, then preview the first five rows.
gff_rows = parse_gff_file(anno_fpath)
gff_rows[:5]
    
    

In [None]:
# Print a "<seqid>@<start>@<end>" key for the first five GFF rows, in the same
# format as the keys built from the FASTA headers.
for row in gff_rows[:5]:
    seq_id_start_end = "@".join(
        getattr(row, column) for column in ("seqid", "start", "end")
    )
    print(seq_id_start_end)