# Download CRyPTIC data
Unfortunately, there is a lot of data that needs to be downloaded... which is why it is convenient to do it in a separate notebook.

Here are all the scripts you need to do that.

In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

from google.colab import drive
drive.mount('/content/drive')

# Project root on the mounted Google Drive; every other path is built from it.
evo_general_dir = '/content/drive/MyDrive/EVO/'

# Sub-directories of the project root. Each keeps its trailing slash because
# paths in this notebook are assembled by plain string concatenation.
cryptic_dir = 'cryptic_data/'
vcfs_dir = 'vcfs/'

# File name of the CRyPTIC reuse table, stored directly under the project root.
cryptic_reuse_csv = 'CRyPTIC_reuse_table_20231208.csv'

Mounted at /content/drive


## Genotype data tables
See [this FTP directory](https://ftp.ebi.ac.uk/pub/databases/cryptic/release_june2022/reproducibility/data_tables/cryptic-analysis-group/). DATA_SCHEMA.pdf provides a good overview. Briefly, this section is for downloading everything BUT sample VCFs.

In [None]:
# files = ['VARIANTS.csv.gz', 'MUTATIONS.csv.gz', 'GENOMES.csv.gz']
files = ['GENOMES.csv.gz']
ftp_directory = 'ftp.ebi.ac.uk/pub/databases/cryptic/release_june2022/reproducibility/data_tables/cryptic-analysis-group/'
output_dir = evo_general_dir + cryptic_dir

for file_name in files:
  to_download = ftp_directory + file_name
  !wget -P $output_dir $to_download

## VCFs
Unlike the above, this section downloads sample VCFs.

In [None]:
# Download cryptic_reuse_csv
# !wget -P $evo_general_dir -q ftp://ftp.ebi.ac.uk/pub/databases/cryptic/release_june2022/reuse/CRyPTIC_reuse_table_20231208.csv

In [None]:
# Destination folder for the per-sample VCFs.
output_dir = evo_general_dir + vcfs_dir

# Reuse table; the download loop reads its UNIQUEID and VCF columns.
reuse_vcf = pd.read_csv(evo_general_dir + cryptic_reuse_csv)

# os.makedirs creates missing parent folders and is a no-op when the directory
# already exists, so neither a shell call nor an existence check is needed.
os.makedirs(output_dir, exist_ok=True)

In [None]:
def get_site(unique_id):
  """Return the site code, i.e. the second dot-separated field of a UNIQUEID."""
  # Only the field between the first two dots is needed, so stop splitting there.
  return unique_id.split('.', 2)[1]

for i, row in tqdm(reuse_vcf.iterrows()):
  sample_directory = output_dir + "site_" + get_site(row['UNIQUEID']) + '/'

  if not os.path.exists(sample_directory):
    !mkdir $sample_directory

  to_download = 'ftp.ebi.ac.uk/pub/databases/cryptic/release_june2022/reproducibility/' + row['VCF']

  file_name = row['VCF'].strip().split('/')[-1]
  if os.path.exists(sample_directory + file_name): continue

  !wget -P $sample_directory -q $to_download

## Reference genome gumpy pickle + WGS creation
Because it can take over a minute to load a Genome, it is best to pickle the reference genome.

However, we have discovered that loading a pickled Genome is not much faster than loading a VCF and "adding" it to a Genome. So, there's no need to pickle every M. tb genome.

**Conclusion:** Creating a Genome object takes too much time in general. The best thing to do is to store the whole sequence as a FASTA (i.e. just one string) and use "primers" to obtain the segments you are interested in.

In [2]:
# # Uncomment this code if you have not created a .gbk file!
# # GenBank has deprecated the .gbk format in favor of a new .gbff format
# # We can use Biopython to downgrade a .gbff file into a .gbk one
# # Download H37Rv .gbff file from NCBI Datasets: https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000195955.2/

# from Bio import SeqIO
# reference_file = evo_general_dir + 'h37rv_genebank_flatfile.gbff'
# SeqIO.convert(reference_file, 'genbank', evo_general_dir + 'h37rv_genebank.gbk', 'genbank')

# # This is important because .gbk works with gumpy, but not .gbff

In [3]:
!pip install gumpy

import gumpy as gp
import pickle

# print("Loading reference genome...")
# ref_genome = gp.Genome(evo_general_dir + 'h37rv_genebank.gbk', reference=True)
# pickle.dump(ref_genome, open(evo_general_dir + 'h37rv_genome.pkl', 'wb'))
# print("Done!")

# # how to load the data!
ref_genome = pickle.load(open(evo_general_dir + 'h37rv_genebank.pkl', 'rb'))

# how to obtain a reference genome
genome = "".join(ref_genome.nucleotide_sequence)

Collecting gumpy
  Downloading gumpy-1.2.7-py3-none-any.whl (48 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pysam (from gumpy)
  Downloading pysam-0.22.1-cp310-cp310-manylinux_2_28_x86_64.whl (22.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.0/22.0 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopython (from gumpy)
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Collecting pytest-cov (from gumpy)
  Downloading pytest_cov-5.0.0-py3-none-any.whl (21 kB)
Collecting coverage[toml]>=5.2.1 (from pytest-cov->gumpy)
  Downloading coverage-7.5.3-cp310-cp310-manylinux_2_5_x86_64.manyl

In [None]:
# Project root on the mounted Google Drive.
evo_general_dir = '/content/drive/MyDrive/EVO/'

# Sub-directories of the project root (trailing slashes are required because
# paths are assembled by string concatenation).
cryptic_dir = 'cryptic_data/'
vcfs_dir = 'vcfs/'
genomes_dir = 'genomes/'

# File name of the CRyPTIC reuse table, stored directly under the project root.
cryptic_reuse_csv = 'CRyPTIC_reuse_table_20231208.csv'

# Folder that holds the downloaded sample VCFs.
input_dir = f'{evo_general_dir}{vcfs_dir}'

# Reuse table, read from the project root.
reuse_table_path = f'{evo_general_dir}{cryptic_reuse_csv}'
reuse_vcf = pd.read_csv(reuse_table_path)

In [None]:
def get_site(unique_id):
  """Return the site code, i.e. the second dot-separated field of a UNIQUEID."""
  # Only the field between the first two dots is needed, so stop splitting there.
  return unique_id.split('.', 2)[1]

# Build one plain-text genome per sample by applying the sample's VCF to the
# reference genome, and store it under <genomes_dir>site_<site>/<UNIQUEID>.txt.
import gzip
import shutil

genomes_root = evo_general_dir + genomes_dir

# iterrows() has no length, so hand tqdm the row count to get a real progress bar.
for i, row in tqdm(reuse_vcf.iterrows(), total=len(reuse_vcf)):
  unique_id = row['UNIQUEID']
  site_name = 'site_' + get_site(unique_id)

  # check if the VCF file is present
  sample_directory = evo_general_dir + vcfs_dir + site_name + '/'
  assert os.path.exists(sample_directory), "Directory does not exist for sample " + unique_id

  file_name = row['VCF'].strip().split('/')[-1]
  file_path = sample_directory + file_name
  assert os.path.exists(file_path), "File does not exist for sample " + unique_id

  # check if the sample genome already exists
  site_genomes_dir = genomes_root + site_name + '/'
  output_file_path = site_genomes_dir + unique_id + '.txt'
  if os.path.exists(output_file_path): continue

  # create genome
  # The VCF is handed to gumpy uncompressed: write a temporary decompressed
  # copy next to the .gz, load it, and always delete the copy afterwards. The
  # compressed download itself is never modified.
  assert file_path.endswith('.gz'), "Expected a gzipped VCF for sample " + unique_id
  vcf_path = file_path[:-3]
  with gzip.open(file_path, 'rb') as compressed, open(vcf_path, 'wb') as plain:
    shutil.copyfileobj(compressed, plain)
  try:
    vcf = gp.VCFFile(vcf_path)
  finally:
    os.remove(vcf_path)

  # ref_genome is the pickled reference Genome loaded in the gumpy cell above.
  # A separate name keeps the notebook-level `genome` string intact.
  sample_genome = ref_genome + vcf
  genome_string = "".join(sample_genome.nucleotide_sequence).upper()

  # create a folder for the sample's site if it does not already exist
  # (makedirs also creates the genomes root on the first run)
  os.makedirs(site_genomes_dir, exist_ok=True)

  with open(output_file_path, 'w') as f: f.write(genome_string)

# Deprecated

### Converting embedding files to single sample files
Unfortunately, a file with 100 embeddings of shape (500, 4096) is 1.5 GB in size. To ease the load on computers with limited memory, this section takes embedding .npy files and splits them so that each sample gets its own file.

Generally, the .npy files have been generated in other notebooks (see: evo_scratchwork and evo_general, which should be in the same folder as the current notebook.)

In [None]:
# Location of the batched embedding files on the mounted Drive.
evo_general_dir = '/content/drive/MyDrive/EVO/'
embedding_dir = 'emb_embeddings_v1/'
embedding_sub_dir = 'embeds_1.0_left/'

# Root of this embedding run; input and output sub-directories hang off it.
general_path = f'{evo_general_dir}{embedding_dir}'

# Names of the entries found directly under the input sub-directory.
sub_dirs = os.listdir(f'{general_path}{embedding_sub_dir}')

In [None]:
# Load a single per-sample embedding file to inspect it.
sample_embedding_path = '/content/drive/MyDrive/EVO/rif_embeddings_v1/embeds_1.0_singles_last_index/site_02/site.02.subj.0001.lab.2014222001.iso.1.npy'
embed = np.load(sample_embedding_path)

In [None]:
# Show the shape of the loaded array (the recorded run printed (4096,)).
embed.shape

(4096,)

In [None]:
# Pick an embedding index to use. Create a file per embedding.
#
# For every .npy file under <general_path><embedding_sub_dir><sub_dir>/, save
# embed[0][emb_index] under the same file name in
# <general_path>embeds_1.0_singles_last_index/<sub_dir>/.
emb_index = -1
count = 0  # running total of input files seen across all sub-directories
for sub_dir in tqdm(sub_dirs):
  directory = general_path + embedding_sub_dir + sub_dir + '/'
  files = os.listdir(directory)
  count += len(files)

  # The output folder depends only on sub_dir, so build and create it once per
  # sub-directory instead of once per file.
  new_dir = general_path + 'embeds_1.0_singles_last_index/' + sub_dir + '/'
  if files:
    os.makedirs(new_dir, exist_ok=True)

  for f in tqdm(files):
    # Check before loading: an existing output is never overwritten, so
    # reading its input again would be wasted work.
    if os.path.exists(new_dir + f): continue

    embed = np.load(directory + f)
    np.save(new_dir + f, embed[0][emb_index])

## Miscellaneous old code

In [None]:
import os

# NOTE: out_sub_dir is assigned in a later cell of this section; run that cell first.
# Named singles_dir so the builtin dir() is not shadowed.
singles_dir = evo_general_dir + embedding_dir + out_sub_dir
files = os.listdir(singles_dir)
print(len(files))

# Clean-up kept for reference (removes the .npy files found while walking singles_dir):
# for (dirpath, dirnames, filenames) in walk(singles_dir):
#   for filename in filenames:
#     if not filename.endswith('.npy'): continue
#     os.remove(singles_dir + filename)

11


In [None]:
import numpy as np

# Project root on the mounted Drive and the embedding run being processed.
evo_general_dir = '/content/drive/MyDrive/EVO/'
embedding_dir = 'rif_embeddings_v1/'

# Batched embeddings are read from embedding_sub_dir; the per-sample files are
# written to out_sub_dir.
embedding_sub_dir = 'embeds_1.0/'
out_sub_dir = 'embeds_1.0_singles/'

# we use UNIQUEIDs to name our files
unique_ids_path = f'{evo_general_dir}{embedding_dir}unique_ids.npy'
unique_ids = np.load(unique_ids_path)

In [None]:
from os.path import exists
from tqdm import tqdm

In [None]:
def get_site(unique_id):
  """Return the site code, i.e. the second dot-separated field of a UNIQUEID."""
  # Only the field between the first two dots is needed, so stop splitting there.
  return unique_id.split('.', 2)[1]

# Collect the UNIQUEIDs whose per-sample .npy file is missing (missing_ids) or
# smaller than the size threshold (small_files).
MIN_EMBEDDING_FILE_BYTES = 1000000  # files below this many bytes are reported as small

missing_ids = []
small_files = []
# Loop and path variables are named so that the builtins dir() and id() stay usable.
singles_dir = evo_general_dir + embedding_dir + out_sub_dir
for unique_id in tqdm(unique_ids):
  file_name = singles_dir + "site_" + get_site(unique_id) + '/' + unique_id + '.npy'
  if not exists(file_name):
    missing_ids.append(unique_id)
    continue

  # Stat the file once and reuse the value for both the check and the report.
  file_size = os.path.getsize(file_name)
  if file_size < MIN_EMBEDDING_FILE_BYTES:
    print(file_size)
    small_files.append(unique_id)

100%|██████████| 12259/12259 [00:05<00:00, 2313.59it/s]


In [None]:
# Write one .npy file per UNIQUEID, pairing the i-th ID with the i-th embedding
# read from the batch files in the order given by `files`.
# NOTE: relies on `current_file`, `embeds`, `files` and get_embedding() already
# existing in the kernel; they are defined in the cells further down.
for unique_id in tqdm(unique_ids):
  out_dir = evo_general_dir + embedding_dir + out_sub_dir + 'site_' + get_site(unique_id) + '/'
  os.makedirs(out_dir, exist_ok=True)

  flag, embed = next(embeds)
  # (-1, None) marks the end of a batch file and is not an embedding. Move on
  # to the next file and read again, so that this ID receives a real embedding
  # instead of None and the following IDs stay aligned. The loop also steps
  # over batch files that contain no embeddings.
  while flag == -1:
    current_file += 1
    embeds = get_embedding(evo_general_dir + embedding_dir + embedding_sub_dir + files[current_file])
    flag, embed = next(embeds)

  np.save(out_dir + unique_id + '.npy', embed)

In [None]:
from os import walk

def get_embedding(file_name):
  """Generate the embeddings stored in one .npy file, followed by an end marker.

  Yields (1, embedding) for each item along the first axis of the loaded
  array, then a final (-1, None) once the file is exhausted.
  """
  loaded = np.load(file_name)
  yield from ((1, item) for item in loaded)

  # Sentinel: tells the caller to switch to the next file.
  yield -1, None

def get_site(unique_id):
  """Return the site code, i.e. the second dot-separated field of a UNIQUEID."""
  # Only the field between the first two dots is needed, so stop splitting there.
  return unique_id.split('.', 2)[1]

# get all of our embedding .npy files
# (file names only, in the order os.walk reports them)
# NOTE(review): the split loop pairs embeddings with unique_ids by position, so
# this order matters - confirm it matches the order the batches were written in.
files = [
  filename
  for _dirpath, _dirnames, filenames in walk(evo_general_dir + embedding_dir + embedding_sub_dir)
  for filename in filenames
  if filename.endswith('.npy')
]

In [None]:
# Write one .npy file per UNIQUEID, pairing the i-th ID with the i-th embedding
# read from the batch files in the order given by `files`.
current_file = 0
embeds = get_embedding(evo_general_dir + embedding_dir + embedding_sub_dir + files[current_file])
for unique_id in tqdm(unique_ids):
  out_dir = evo_general_dir + embedding_dir + out_sub_dir + 'site_' + get_site(unique_id) + '/'
  os.makedirs(out_dir, exist_ok=True)

  flag, embed = next(embeds)
  # (-1, None) marks the end of a batch file and is not an embedding. Move on
  # to the next file and read again, so that this ID receives a real embedding
  # instead of None and the following IDs stay aligned. The loop also steps
  # over batch files that contain no embeddings.
  while flag == -1:
    current_file += 1
    embeds = get_embedding(evo_general_dir + embedding_dir + embedding_sub_dir + files[current_file])
    flag, embed = next(embeds)

  np.save(out_dir + unique_id + '.npy', embed)

100%|██████████| 12262/12262 [56:55<00:00,  3.59it/s]
