In [None]:
import os.path as osp
import os
import argparse
from glob import glob
import pyfastx
import json
from datasets import Dataset, DatasetDict
from typing import Literal
from multiprocessing import cpu_count
import ijson
from shutil import disk_usage

def json_generator(json_path, key):
    """Stream the elements of one top-level array of a JSON file.

    The file is parsed incrementally with ``ijson`` instead of being loaded
    with ``json.load``, so the whole document is never held in memory.

    Args:
        json_path: Path to the JSON file.
        key: Name of the top-level member to read. Its elements are selected
            with the ijson prefix ``"<key>.item"``, so it must be an array.

    Yields:
        dict: ``{"ids": record}`` for each element of the array, in file order.
    """
    # Binary mode: ijson consumes bytes and decodes the JSON itself. A
    # text-mode handle forces it to re-encode every chunk it reads.
    with open(json_path, 'rb') as f:
        for record in ijson.items(f, f"{key}.item"):
            yield {"ids": record}
            
def parse_pyfastx_generator(fasta_fpath):
    """Yield one dict per record of a FASTA file.

    Args:
        fasta_fpath: Path of the FASTA file, handed to ``pyfastx.Fastx``.

    Yields:
        dict: ``index`` (0-based position of the record in the file),
        ``sequence``, ``accession`` and ``description``.
    """
    # pyfastx is a FASTA parser written in C; with comment=True each record
    # is unpacked below as a 3-tuple (accession, sequence, description).
    records = pyfastx.Fastx(fasta_fpath, comment=True)
    for position, (accession, seq, description) in enumerate(records):
        yield dict(
            index=position,
            sequence=seq,
            accession=accession,
            description=description,
        )


def make_dset_from_ids(ids_dataset: Dataset, seq_dset: Dataset, num_proc: int = cpu_count()) -> Dataset:
    """Replace each row of ids with the sequence rows those ids point to.

    ``ids_dataset`` is expected to come from a generator rather than from an
    in-memory dict, so that ``map`` works through temp files on disk instead
    of loading everything in memory.

    Args:
        ids_dataset: Dataset with an ``"ids"`` column used to index ``seq_dset``.
        seq_dset: Dataset holding the sequence records.
        num_proc: Number of worker processes passed to ``map``.

    Returns:
        The mapped dataset, with the ``"ids"`` column removed.
    """
    def fetch_rows(example):
        # Index the sequence dataset with this row's ids.
        return seq_dset[example["ids"]]

    return ids_dataset.map(fetch_rows, remove_columns="ids", num_proc=num_proc)

def create_hf_dataset(fasta_path: str,
                      splits_path: str,
                      dataset_type: Literal['clustered', 'unclustered'],
                      num_proc: int = cpu_count()) -> DatasetDict:
    """Build a Hugging Face dataset from a FASTA file and optional splits.

    Args:
        fasta_path: FASTA file parsed with ``parse_pyfastx_generator``.
        splits_path: JSON file describing the splits, or ``None`` to return
            the whole FASTA as a single ``Dataset``.
        dataset_type: ``'unclustered'`` - the JSON maps each split name to a
            list of row indices, loaded fully in memory. ``'clustered'`` -
            the JSON is streamed with ``json_generator`` for the fixed splits
            ``train``/``test``/``valid``/``rtest`` and each entry is used to
            index the sequence dataset. Ignored when ``splits_path`` is None.
        num_proc: Number of worker processes for the clustered ``map``.

    Returns:
        A ``DatasetDict`` with one entry per split, or the plain ``Dataset``
        when ``splits_path`` is ``None``.

    Raises:
        ValueError: If ``splits_path`` is given and ``dataset_type`` is not
            one of the two supported values.
    """
    # Fail before parsing the FASTA: previously an unknown type surfaced only
    # at the end, as an UnboundLocalError on `ds_dict`.
    if splits_path is not None and dataset_type not in ('clustered', 'unclustered'):
        raise ValueError(
            f"dataset_type must be 'clustered' or 'unclustered', got {dataset_type!r}"
        )

    ds = Dataset.from_generator(
        parse_pyfastx_generator,
        gen_kwargs={
            "fasta_fpath": fasta_path
        }
    )

    if splits_path is None:
        return ds

    if dataset_type == 'unclustered':
        # Loaded in memory. WARNING: may need to be streamed into a
        # DatasetDict, as is done for the clustered case.
        with open(splits_path, 'r') as f:
            splits = json.load(f)

        return DatasetDict({
            split: ds.select(indices) for split, indices in splits.items()
        })

    # dataset_type == 'clustered'
    splits = ['train', 'test', 'valid', 'rtest']

    # One generator-backed dataset of ids per split, so the splits JSON is
    # streamed instead of being read in one go.
    ids_dataset = DatasetDict(
        {
            split: Dataset.from_generator(
                json_generator,
                gen_kwargs={"json_path": splits_path, "key": split}
            ) for split in splits
        }
    )

    return make_dset_from_ids(ids_dataset=ids_dataset,
                              seq_dset=ds,
                              num_proc=num_proc)

def merge_and_create_hf_dataset(fasta_paths: list):
    """Bundle several FASTA files into one ``DatasetDict``, one entry per file.

    Each entry is keyed by the file name: the last ``/``-separated component
    of the path, cut at its first ``.``. Two paths that produce the same key
    collapse into one entry, and the later path wins.

    Args:
        fasta_paths: Paths of the FASTA files to load.

    Returns:
        ``DatasetDict`` mapping each derived name to its parsed dataset.
    """
    def dataset_name(path):
        return path.split("/")[-1].split(".")[0]

    return DatasetDict({
        dataset_name(path): Dataset.from_generator(
            parse_pyfastx_generator,
            gen_kwargs={"fasta_fpath": path},
        )
        for path in fasta_paths
    })


In [None]:
from pathlib import Path

def find_large_files(directory,threshold=50*1024**3):
    """Print every regular file under ``directory`` larger than ``threshold``.

    Args:
        directory: Root directory, searched recursively.
        threshold: Size limit in bytes (default 50 * 1024**3). Files strictly
            larger than this are reported.

    Returns:
        None. Findings and per-file errors are printed.
    """
    # Report the limit actually in use; the message used to say "50GB" even
    # when a different threshold was passed.
    threshold_gb = threshold / 1024**3
    for file in Path(directory).rglob('*'):
        try:
            if file.is_file() and file.stat().st_size > threshold:
                print(f"{file} is larger than {threshold_gb:g}GB")
        except OSError as e:
            # Best effort: report unreadable entries and keep scanning.
            print(f"Could not get size for {file}: {e}")

find_large_files("../data/gigaref_full/") # Check if any of the files are larger than 50GB

In [None]:
from huggingface_hub import upload_large_folder
from huggingface_hub import HfApi

# Read the token from the environment instead of hard-coding it in the notebook.
# With HF_TOKEN unset this passes None to HfApi.
api = HfApi(token=os.environ.get("HF_TOKEN"))
# api.create_repo(repo_id='samirchar/testing', repo_type='dataset', private=True)
# Check if repo already exists
# repo_exists = api.repo_exists(repo_id=repo_id, repo_type=repo_type)

# Call the method on `api` so the upload goes through the client configured
# above; the module-level `upload_large_folder` never received it.
# `multi_commits` is not a parameter of `upload_large_folder`, so passing it
# raised TypeError.
api.upload_large_folder(folder_path = "../data/rfdiffusion/",
                        repo_id = 'samirchar/testing',
                        repo_type = 'dataset'
                        )

In [None]:
import os
os.makedirs("/mnt/blob/hf_cache_test_not_writable/", exist_ok=True,mode=0o555)
os.chmod("/mnt/blob/hf_cache_test_not_writable/", 0o555)
os.environ["HF_HOME"] = "/mnt/blob/hf_cache_test_not_writable/"
from datasets import Dataset
from shutil import disk_usage
def foo_gen():
    """Yield 100 synthetic records shaped like the parsed FASTA rows.

    Every record has ``index`` ``i``, a ``sequence`` of 100 ``"A"``
    characters, ``accession`` ``"acc_<i>"`` and ``description`` ``"desc_<i>"``.
    """
    dummy_sequence = "A" * 100
    yield from (
        {
            "index": i,
            "sequence": dummy_sequence,
            "accession": f"acc_{i}",
            "description": f"desc_{i}",
        }
        for i in range(100)
    )

In [None]:
a = Dataset.from_generator(
    foo_gen
)

In [None]:
a.info.size_in_bytes

In [None]:
a.cache_files

In [None]:
vars(a)

In [None]:
from datasets import load_from_disk
ds = load_from_disk("../data/uniref90_202401/arrow/train")
ds.save_to_disk("/tmp_tests/uniref90_no_mp/")

In [None]:
import json
with open('../data/gigaref/no_singletons/clustered_splits.json','r') as f:
    # json.load parses the open file. json.dumps(f) tried to serialize the
    # file object itself and raised TypeError.
    splits = json.load(f)


In [None]:
splits = DatasetDict(
    {
        split: Dataset.from_generator(
            json_generator,
            gen_kwargs={"json_path": '../data/uniref50_202401/splits.json', "key": split}
            ) for split in splits
    }
)

In [None]:
splits

In [None]:
splits_path = '../data/uniref90_202401/clustered_splits.json'


In [None]:
id_set.map()

In [None]:
id_set = Dataset.from_json('../data/uniref90_202401/clustered_splits.json')

In [None]:
id_set[10]

In [None]:
id_set.cache_files

In [None]:
from collections import defaultdict
def foo(sample_ids):
    """Build one dummy record per id.

    Args:
        sample_ids: Iterable of ids.

    Returns:
        list[dict]: For each id ``i``, ``{"a": "A" * 500, "b": i, "c": i}``,
        in input order.
    """
    filler = 'A'*500
    return [{"a": filler, "b": sample_id, "c": sample_id} for sample_id in sample_ids]

In [None]:
id_set.map(lambda x: foo(x["sample_ids"]),
                    remove_columns="sample_ids",
                    num_proc=cpu_count())

In [None]:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed

set_seed(0)
torch.set_default_device("cuda")

model = AutoModelForCausalLM.from_pretrained('samirchar/test_dayhoff', subfolder = "jamba-170m-seqsam-36w")
tokenizer = AutoTokenizer.from_pretrained('samirchar/test_dayhoff', trust_remote_code=True)


inputs = tokenizer(tokenizer.bos_token, return_tensors="pt", return_token_type_ids=False)

outputs = model.generate(inputs['input_ids'],max_length=50,do_sample=True)
sequence = tokenizer.batch_decode(outputs,skip_special_tokens=True)
print(sequence)


In [None]:
# offsets = np.load('../data/uniref50_202401/lengths_and_offsets.npz')
import json

from datasets import Dataset, DatasetDict
import os.path as osp
import pyfastx
from typing import Literal
from multiprocessing import cpu_count
def parse_pyfastx_generator(fasta_fpath):
    """Iterate over a FASTA file and emit each record as a dict.

    Args:
        fasta_fpath: Path of the FASTA file, handed to ``pyfastx.Fastx``.

    Yields:
        dict: keys ``index`` (0-based record position), ``sequence``,
        ``accession`` and ``description``.
    """
    # FASTA parser written in C; opened with comment=True, each record is
    # unpacked below into three fields.
    reader = pyfastx.Fastx(fasta_fpath, comment=True)
    for idx, record in enumerate(reader):
        accession, seq, description = record
        yield {
            "index": idx,
            "sequence": seq,
            "accession": accession,
            "description": description,
        }

def make_dset_from_ids(ids: list, seq_dset: Dataset, num_proc: int = cpu_count()) -> Dataset:
    """Build a dataset whose rows are ``seq_dset`` lookups for each entry of ``ids``.

    Args:
        ids: One entry per output row; each entry is used to index ``seq_dset``.
        seq_dset: Dataset holding the sequence records.
        num_proc: Number of worker processes passed to ``map``.

    Returns:
        The mapped dataset, without the temporary ``"sample_ids"`` column.
    """
    def gather(example):
        # Index the sequence dataset with this row's ids.
        return seq_dset[example["sample_ids"]]

    # Wrap the ids in an in-memory dataset so they can be mapped in parallel.
    sample_ids = Dataset.from_dict({"sample_ids": ids})
    return sample_ids.map(gather, remove_columns="sample_ids", num_proc=num_proc)


def create_hf_dataset(fasta_path: str,
                      splits_path: str,
                      dataset_type: Literal['clustered', 'unclustered'],
                      num_proc: int = cpu_count()) -> DatasetDict:
    """Build a Hugging Face dataset from a FASTA file and optional splits.

    Args:
        fasta_path: FASTA file parsed with ``parse_pyfastx_generator``.
        splits_path: JSON file mapping split names to their entries, or
            ``None`` to return the whole FASTA as a single ``Dataset``.
        dataset_type: ``'unclustered'`` - each split is a list of row indices
            passed to ``select``. ``'clustered'`` - each split is a list whose
            entries are looked up through ``make_dset_from_ids``. Ignored
            when ``splits_path`` is ``None``.
        num_proc: Number of worker processes for the clustered ``map``.

    Returns:
        A ``DatasetDict`` with one entry per split in the JSON, or the plain
        ``Dataset`` when ``splits_path`` is ``None``.

    Raises:
        ValueError: If ``splits_path`` is given and ``dataset_type`` is not
            one of the two supported values.
    """
    # Previously an unknown type fell through both branches and silently
    # returned the unsplit dataset; reject it before doing any work.
    if splits_path is not None and dataset_type not in ('clustered', 'unclustered'):
        raise ValueError(
            f"dataset_type must be 'clustered' or 'unclustered', got {dataset_type!r}"
        )

    ds = Dataset.from_generator(
        parse_pyfastx_generator,
        gen_kwargs={
            "fasta_fpath": fasta_path
        }
    )

    if splits_path is None:
        return ds

    with open(splits_path, 'r') as f:
        splits = json.load(f)

    if dataset_type == 'unclustered':
        return DatasetDict({
            split: ds.select(indices) for split, indices in splits.items()
        })

    # dataset_type == 'clustered'
    return DatasetDict({
        split: make_dset_from_ids(ids=cluster_ids,
                                  seq_dset=ds,
                                  num_proc=num_proc)
        for split, cluster_ids in splits.items()
    })

In [None]:
from glob import glob


In [None]:
for idx,i in enumerate(parse_pyfastx_generator('../data/rfdiffusion/rfdiffusion_both_filter.fasta')):
    print(i)
    if idx==10:
        break
    

In [None]:
## CREATE SAMPLE DATASETS ##

# Toy split definition: each split is a flat list of record indices (0-9).
sample_splits = dict(train=[0,1,2,3,4],
                     valid=[5,6],
                     test=[7,8],
                     rtest=[9])

# Clustered variant of the same indices: each split is a list of index lists.
sample_clustered_splits = dict(train=[[0,1],[2,3]],
                               valid=[[4],[5,6]],
                               test=[[7,8]],
                               rtest=[[9]])

# Write both split definitions to disk.
for split_json_path, split_payload in [
    ('../data/uniref50_202401/sample_splits.json', sample_splits),
    ('../data/uniref90_202401/sample_clustered_splits.json', sample_clustered_splits),
]:
    with open(split_json_path,'w') as f:
        json.dump(split_payload,f)

fasta_dir = '../data/uniref50_202401/'
clustered_dir = '../data/uniref90_202401/'
sample_fasta = osp.join(fasta_dir,'consensus_sample.fasta')

# Unclustered sample dataset, saved next to the FASTA it was built from.
ds = create_hf_dataset(fasta_path = sample_fasta,
                       splits_path = osp.join(fasta_dir,'sample_splits.json'),
                       dataset_type = 'unclustered')
ds.save_to_disk(osp.join(fasta_dir,'parquets'))

# Clustered sample dataset: same FASTA, with the clustered splits and the
# output both under the uniref90 directory.
ds = create_hf_dataset(fasta_path = sample_fasta,
                       splits_path = osp.join(clustered_dir,'sample_clustered_splits.json'),
                       dataset_type = 'clustered')
ds.save_to_disk(osp.join(clustered_dir,'parquets'))

In [None]:
import datasets
import os.path as osp
import os
from datasets import Dataset, Sequence
from dataclasses import dataclass
from typing import Literal
from huggingface_hub import hf_hub_url
import numpy as np
import json
import pyfastx
from multiprocessing import cpu_count


_DESCRIPTION = """\
Dayhoff dataset
"""

# TODO: Add a link to an official homepage for the dataset here
_REPO_ID = "samirchar/DayhoffDataset"
_HOMEPAGE = f"https://huggingface.co/datasets/{_REPO_ID}"

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

#TODO: Add citation
_CITATION = ""

# TODO: Add link to the official dataset URLs here
# The HuggingFace Datasets library doesn't host the datasets but only points to the original files.
# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
_URLS = {
    
}


def parse_pyfastx_generator(fasta_fpath):
    """Generate the records of a FASTA file as dicts.

    Args:
        fasta_fpath: Path of the FASTA file, handed to ``pyfastx.Fastx``.

    Yields:
        dict: ``index`` (0-based position of the record), ``sequence``,
        ``accession`` and ``description``, in that key order.
    """
    field_names = ("index", "sequence", "accession", "description")
    # FASTA parser written in C; each record is unpacked as
    # (accession, sequence, description).
    parser = pyfastx.Fastx(fasta_fpath, comment=True)
    for idx, (accession, seq, description) in enumerate(parser):
        yield dict(zip(field_names, (idx, seq, accession, description)))

def make_dset_from_ids(ids: list, seq_dset: Dataset, num_proc: int = cpu_count()) -> Dataset:
    """Look up every entry of ``ids`` in ``seq_dset`` and return the results as a dataset.

    Args:
        ids: One entry per output row; each entry is used to index ``seq_dset``.
        seq_dset: Dataset holding the sequence records.
        num_proc: Number of worker processes passed to ``map``.

    Returns:
        The mapped dataset; the helper ``"sample_ids"`` column is removed.
    """
    def lookup(row):
        return seq_dset[row["sample_ids"]]

    ids_as_dataset = Dataset.from_dict({"sample_ids": ids})
    return ids_as_dataset.map(
        lookup,
        remove_columns="sample_ids",
        num_proc=num_proc,
    )

    
@dataclass
class ClusteredSequencesConfig(datasets.BuilderConfig):
        '''Config for the "clustered" configuration (clustered sequence datasets).'''
        # Configuration name, matched against self.config.name in the builder.
        name: str = "clustered"
        # Source dataset; the builder uses it as the repo subfolder to download from.
        dataset: Literal["uniref90_202401",
                         "gigaref"] = "uniref90_202401"

@dataclass
class SequencesConfig(datasets.BuilderConfig):
        '''Config for the "sequence" configuration (one example per sequence).'''
        # Configuration name, matched against self.config.name in the builder.
        name: str = "sequence"
        # Source dataset. The rfdiffusion_* values are handled by the builder as
        # single FASTA files without a splits file.
        dataset: Literal["uniref50_202401",
                         "uniref90_202401",
                         "rfdiffusion_both_filter",
                         "rfdiffusion_novelty",
                         "rfdiffusion_scrmsd",
                         "rfdiffusion_unfiltered"] = "uniref50_202401"

@dataclass
class MSAConfig(datasets.BuilderConfig):
        '''Config for the "msa" configuration (the builder does not implement it yet).'''
        # Configuration name, matched against self.config.name in the builder.
        name: str = "msa"
        # Source dataset for the MSAs.
        dataset: Literal["uniref50_202401",
                         "gigaref_with_singletons",
                         "gigaref_no_singletons"] = "uniref50_202401" #TODO: complete all possible datasets

# Name of the dataset usually matches the script name with CamelCase instead of snake_case
class DayhoffDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder exposing the Dayhoff FASTA files hosted under ``_REPO_ID``.

    The configuration is selected through ``self.config.name``:

    * ``"sequence"`` - one example per FASTA record.
    * ``"clustered"`` - one example per entry of the clustered splits JSON;
      every column holds a list with one value per looked-up record.
    * ``"msa"`` - declared in ``BUILDER_CONFIGS`` but not implemented yet.

    ``self.config.dataset`` selects which source dataset is downloaded.
    """

    VERSION = datasets.Version("1.1.0")
    DEFAULT_CONFIG_NAME = "sequence"  # Not mandatory; used when no config name is given.

    # Load a specific configuration with e.g.
    # datasets.load_dataset(<repo>, "clustered", dataset="gigaref")
    BUILDER_CONFIGS = [
        SequencesConfig(version=VERSION, description="sequence datasets"),
        ClusteredSequencesConfig(version=VERSION, description="Clustered sequence datasets"),
        MSAConfig(version=VERSION, description="MSA datasets"),
    ]

    # Values of ``config.dataset`` served as a single FASTA file with no splits JSON.
    _DATASETS_WITHOUT_SPLITS = (
        "rfdiffusion_both_filter",
        "rfdiffusion_novelty",
        "rfdiffusion_scrmsd",
        "rfdiffusion_unfiltered",
    )

    # (split exposed by `datasets`, key of that split inside the splits JSON)
    _SPLIT_KEYS = (
        (datasets.Split.TRAIN, "train"),
        (datasets.Split.VALIDATION, "valid"),
        (datasets.Split.TEST, "test"),
    )

    # Splits JSON downloaded for each configuration that has splits.
    _SPLITS_FILENAME = {
        "sequence": "sample_splits.json",
        "clustered": "sample_clustered_splits.json",
    }

    def _info(self):
        """Return the ``DatasetInfo`` (feature schema and metadata) for the active config.

        Raises:
            NotImplementedError: For configurations other than ``"sequence"``
                and ``"clustered"``.
        """
        if self.config.name == "sequence":
            features = datasets.Features(
                {
                    "index": datasets.Value("int32"),
                    "accession": datasets.Value("string"),
                    "sequence": datasets.Value("large_string"),
                    "description": datasets.Value("string"),
                }
            )
        elif self.config.name == "clustered":
            # Same fields as "sequence", each holding a list of values per example.
            features = datasets.Features(
                {
                    "indexes": Sequence(datasets.Value("int32")),
                    "accessions": Sequence(datasets.Value("string")),
                    "sequences": Sequence(datasets.Value("large_string")),
                    "descriptions": Sequence(datasets.Value("string")),
                }
            )
        else:
            raise NotImplementedError(f"{self.config.name} config not implemented yet")  # TODO: Implement msa config

        # The module-level description and homepage were defined but never
        # passed on; a dead local `homepage = ""` sat here instead.
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        """Download the files for the active config and declare its splits.

        Args:
            dl_manager: Download manager supplied by ``datasets``. Its
                ``download`` method is given a dict of URLs and returns the
                same dict with local paths.

        Returns:
            list[datasets.SplitGenerator]: A single TRAIN split for datasets
            without a splits file, otherwise TRAIN / VALIDATION / TEST. The
            ``gen_kwargs`` are forwarded to ``_generate_examples``.

        Raises:
            NotImplementedError: For configurations or datasets this builder
                does not know how to download.
        """
        config_name = self.config.name
        dataset = self.config.dataset

        if config_name == "sequence" and dataset in self._DATASETS_WITHOUT_SPLITS:
            # Only the rfdiffusion files have a known location. Fail clearly
            # instead of hitting an unbound `subfolder`.
            if "rfdiffusion" not in dataset:
                raise NotImplementedError(f"No subfolder is known for dataset {dataset!r}")

            file_paths = dl_manager.download({
                'data': hf_hub_url(repo_id=_REPO_ID, filename=f"{dataset}.fasta", subfolder="rfdiffusion", repo_type='dataset'),
            })

            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={
                        "fasta_path": file_paths['data'],
                        "splits_path": None,
                        "split": "train",
                    },
                )
            ]

        if config_name not in self._SPLITS_FILENAME:
            # Previously this fell off the end and returned None.
            raise NotImplementedError(f"{config_name} config not implemented yet")

        file_paths = dl_manager.download({
            'consensus': hf_hub_url(repo_id=_REPO_ID, filename="consensus_sample.fasta", subfolder=dataset, repo_type='dataset'),  # TODO: using _sample for now
            'splits': hf_hub_url(repo_id=_REPO_ID, filename=self._SPLITS_FILENAME[config_name], subfolder=dataset, repo_type='dataset'),
        })

        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "fasta_path": file_paths['consensus'],
                    "splits_path": file_paths['splits'],
                    "split": split_key,
                },
            )
            for split_name, split_key in self._SPLIT_KEYS
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, fasta_path, splits_path, split):
        """Yield ``(key, example)`` pairs for one split.

        Args:
            fasta_path: Local path of the downloaded FASTA file.
            splits_path: Local path of the splits JSON, or ``None`` when the
                dataset has no splits (the whole file is yielded).
            split: Key of the split inside the splits JSON.

        Yields:
            tuple: ``(key, example)`` where ``key`` is the running position of
            the example within the split, which makes it unique per example.

        Raises:
            NotImplementedError: For configurations other than ``"sequence"``
                and ``"clustered"``.
        """
        config_name = self.config.name
        if config_name not in ("sequence", "clustered"):
            raise NotImplementedError(f"{config_name} config not implemented yet")

        dataset = Dataset.from_generator(
            parse_pyfastx_generator,
            gen_kwargs={
                "fasta_fpath": fasta_path
            }
        )

        if splits_path is not None:
            with open(splits_path, 'r') as f:
                splits = json.load(f)

        if config_name == "sequence":
            if splits_path is not None:
                # Keep only the rows listed for this split.
                dataset = dataset.select(splits[split])
        else:
            # "clustered": look up each entry of the split, then rename the
            # columns to the plural names declared in `_info`.
            dataset = make_dset_from_ids(
                ids=splits[split],
                seq_dset=dataset
            ).rename_columns({
                "index": "indexes",
                "accession": "accessions",
                "sequence": "sequences",
                "description": "descriptions"
            })

        yield from enumerate(dataset)

In [None]:
# from datasets import DownloadManager

# dl_manager = DownloadManager()
# dl_manager.dow('https://huggingface.co/datasets/samirchar/DayhoffDataset/resolve/main/uniref50_202401/parquets/train/*')

In [None]:
hf_hub_url(repo_id=_REPO_ID, filename="parquets/",subfolder='uniref50_202401', repo_type='dataset')

In [None]:
# NOTE(review): scratch call resembling the download step in
# `DayhoffDataset._split_generators`. Neither `dl_manager` nor `self` is
# assigned at notebook level in the visible cells (the only `dl_manager`
# assignment is commented out), so this cell is expected to raise NameError
# on a fresh kernel - TODO remove or rewrite.
dl_manager.download({
                 'consensus':hf_hub_url(repo_id=_REPO_ID, filename="consensus_sample.fasta",subfolder=self.config.dataset, repo_type='dataset'), #TODO: using _sample for now
                 'splits':hf_hub_url(repo_id=_REPO_ID, filename="splits.json",subfolder=self.config.dataset,repo_type='dataset'),
                 'lengths_and_offsets':hf_hub_url(repo_id=_REPO_ID, filename="lengths_and_offsets.npz",subfolder=self.config.dataset,repo_type='dataset')
            })

In [None]:
# Number of files in the local dirs, and their paths.
local_dirs = ['../data/gigaref_full/with_singletons/']

local_file_names = []
for local_dir in local_dirs:
    # The recursive "**" glob returns directories (including the root) as
    # well as files, so filter before collecting.
    files = glob(osp.join(local_dir, "**"),recursive=True)
    # Store each file path with every 'data/' occurrence removed.
    local_file_names.extend([file.replace('data/','') for file in files if osp.isfile(file)])

# Count only regular files. The previous `len(files)` tally also counted the
# directories returned by the glob, so it disagreed with `local_file_names`.
num_local_files = len(local_file_names)


In [None]:
from datasets import load_dataset
from dotenv import load_dotenv
load_dotenv()
token = os.environ.get("HF_TOKEN")


In [None]:
ds_gigaref_singletons = load_dataset("microsoft/DayhoffDataset",
                  name="gigaref_only_singletons",
                  token=token,
                  streaming=True)

In [None]:
a = iter(ds_gigaref_singletons["train"])


In [None]:
next(a)

In [None]:
ds = load_dataset("microsoft/DayhoffDataset",
                  name="rfdiffusion",
                  split = "rfdiffusion_both_filter",
                  token=token)

In [None]:
ds_uniref90_test = load_dataset("microsoft/DayhoffDataset",
                  name="uniref90",
                #   split = "test",
                  token=token,
                  streaming=True)

In [None]:
next(iter(ds_uniref90_test["train"]))

In [None]:
ds_uniref50_test = load_dataset("microsoft/DayhoffDataset",
                  name="uniref50",
                #   split = "test",
                  token=token,
                  streaming=True)

In [None]:
next(iter(ds_uniref50_test["train"]))

In [None]:
ds.to_json("../data/rfdiffusion/jsonl/rfdiffusion_both_filter.jsonl.gz",lines=True,compression="gzip")

In [None]:
from datasets import load_dataset
ds = load_dataset("samirchar/DayhoffDataset",
                  "sequence",
                  dataset="rfdiffusion_both_filter",
                  trust_remote_code=True)

In [None]:
ds.save_to_disk("../data/rfdiffusion/rfdiffusion_both_filter_arrows")

In [None]:
ds[1]

In [None]:
# `DataLoader` is not imported anywhere else in the notebook, so this cell
# raised NameError.
from torch.utils.data import DataLoader

a_loader = DataLoader(a,batch_size=5)

In [None]:
for i in a_loader:
    print(i['sequence'])
    break

In [None]:
with open('../data/uniref50_202401/splits.json','r') as f:
    splits = json.load(f)

offsets = np.load('../data/uniref50_202401/lengths_and_offsets.npz')

In [None]:
with open('../data/uniref50_202401/splits.json','r') as f:
    splits = json.load(f)

In [None]:
splits.keys()