In [1]:
import os
import subprocess
import sys

import numpy as np
import pandas as pd

# Extend the module search path with the directory two levels up (resolved
# against the current working directory) before the project-local import below.
# NOTE(review): presumably global_config lives in that directory - confirm.
sys.path.append( "../..")

from global_config import config

# Directory settings looked up by key in the project configuration.
# data_dir is the root under which the nextstrain files are read and written
# further down; results_dir is not used in the cells visible here.
results_dir = config.get_property("results_dir")
data_dir    = config.get_property("data_dir")

In [29]:
import requests
import re

# URLs of the zstd-compressed example sequences / metadata, one per serotype (1-4).
sero_seq_urls  = [f"https://raw.githubusercontent.com/nextstrain/dengue/main/example_data/sequences_denv{z}.fasta.zst" for z in range(1, 4+1)]
sero_meta_urls = [f"https://raw.githubusercontent.com/nextstrain/dengue/main/example_data/metadata_denv{z}.tsv.zst" for z in range(1, 4+1)]


def _download_and_decompress(url, zst_path):
    """Fetch a ``.zst`` file and leave only its decompressed copy on disk.

    Parameters
    ----------
    url : str
        Address of the zstd-compressed file.
    zst_path : str
        Local path (ending in ``.zst``) where the compressed download is
        stored temporarily.

    Returns
    -------
    str
        Path of the decompressed file (``zst_path`` without the ``.zst`` suffix).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    subprocess.CalledProcessError
        If the ``zstd`` command exits with a non-zero status.
    FileNotFoundError
        If the ``zstd`` executable is not available.
    """
    decompressed_path = os.path.splitext(zst_path)[0]

    # Check the file itself (not just the path string) so that re-running the
    # cell neither downloads again nor asks zstd to overwrite an existing file.
    if os.path.exists(decompressed_path):
        return decompressed_path

    response = requests.get(url, timeout=60)
    response.raise_for_status()  # do not write an HTML error page to disk
    with open(zst_path, "wb") as zst_file:
        zst_file.write(response.content)

    # Argument list instead of a shell string: no quoting problems with the
    # path, and check=True turns a failed decompression into an exception.
    subprocess.run(["zstd", "-d", zst_path], check=True)
    os.remove(zst_path)  # keep only the decompressed copy
    return decompressed_path


nextstrain_dir = os.path.join(data_dir, "nextstrain")
os.makedirs(nextstrain_dir, exist_ok=True)

for sero_number, (seq_url, meta_url) in enumerate(zip(sero_seq_urls, sero_meta_urls), start=1):
    # Sequences and metadata come from their own URLs and are skipped independently.
    _download_and_decompress(seq_url,  os.path.join(nextstrain_dir, f"sequences_denv{sero_number}.fasta.zst"))
    _download_and_decompress(meta_url, os.path.join(nextstrain_dir, f"metadata_denv{sero_number}.tsv.zst"))

def _read_fasta(fasta_path):
    """Parse a FASTA file into a ``{record name: sequence}`` dictionary.

    The record name is the first whitespace-delimited token after ``>``.
    Sequence lines belonging to one record are concatenated; blank lines are
    ignored and whitespace inside sequence lines is removed. If the same name
    appears in several headers, their sequences are concatenated under that
    name (same as the previous inline parser).

    Parameters
    ----------
    fasta_path : str
        Path of the FASTA file to read.

    Returns
    -------
    dict
        Record names mapped to their full sequence, in file order.

    Raises
    ------
    ValueError
        If a header has no name, or sequence data appears before any header.
    """
    chunks_by_name = {}
    current_name = None  # reset per file so nothing leaks from a previous file

    with open(fasta_path, "r") as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith(">"):
                header_tokens = line[1:].split()
                if not header_tokens:
                    raise ValueError(f"{fasta_path}:{line_number}: header without a name")
                current_name = header_tokens[0]
                chunks_by_name.setdefault(current_name, [])
                continue

            if current_name is None:
                raise ValueError(f"{fasta_path}:{line_number}: sequence data before the first header")
            chunks_by_name[current_name].append("".join(line.split()))

    # Join once per record instead of growing a string line by line.
    return {name: "".join(chunks) for name, chunks in chunks_by_name.items()}


# One frame per serotype, concatenated once at the end
# (DataFrame.append was removed in pandas 2.0 and is quadratic inside a loop).
serotype_frames = []
for sero_idx in range(1, 4+1):
    fasta_file = os.path.join(data_dir, "nextstrain", f"sequences_denv{sero_idx}.fasta")
    gene       = _read_fasta(fasta_file)

    sero_df = pd.DataFrame({
        "name":     list(gene.keys()),
        "serotype": [f"denv{sero_idx}"] * len(gene),
        "sequence": list(gene.values()),
    })
    serotype_frames.append(sero_df)

# The index is deliberately not reset: it restarts at 0 for every serotype,
# exactly as before, so index labels are not unique across the whole frame.
seq_df = pd.concat(serotype_frames)
seq_df["length"] = seq_df["sequence"].str.len()

## INFO FROM Bedford lab
# https://github.com/blab/dengue-antigenic-dynamics/blob/dependabot/pip/titer_model/implementation-nextstrain-augur/cvxopt-1.2.7/titer_model/implementation-nextstrain-augur/dengue/dengue.prepare.py
# Strain names to drop; the reason for each group is given in the trailing comment.
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated into a single (non-existent) strain name.
dropped_strains = [
    'DENV1/VIETNAM/BIDV992/2006', 'DENV1/FRANCE/00475/2008', 'DENV1/VIETNAM/BIDV3990/2008', 'DENV2/HAITI/DENGUEVIRUS2HOMOSAPIENS1/2016', # Probable recombinants
    'DENV2/AUSTRALIA/QML22/2015', # Suspiciously far diverged
    'DENV2/MALAYSIA/DKD811/2008', 'DENV2/MALAYSIA/P81407/1970', 'DENV2/SENEGAL/0674/1970', 'DENV2/SENEGAL/DAKAR0761/1974',                  # Sylvatic
    'DENV2/NIGERIA/IBH11234/1966', 'DENV2/NIGERIA/IBH11664/1966', 'DENV2/NIGERIA/IBH11208/1966', 'DENV2/SENEGAL/DAKARD75505/1999',          # Sylvatic
    'DENV2/SENEGAL/DAKAR141069/1999', 'DENV2/SENEGAL/DAKAR141070/1999', 'DENV2/GUINEA/PM33974/1981', 'DENV2/BURKINA_FASO/DAKAR2039/1980',   # Sylvatic
    'DENV2/COTE_D_IVOIRE/DAKAR578/1980', 'DENV2/COTE_D_IVOIRE/DAKAR510/1980', 'DENV2/MALAYSIA/SAB/2015', 'DENV2/TRINIDAD_AND_TOBAGO/NA/1953', # Sylvatic
    'DENV4/MALAYSIA/P731120/1973', 'DENV4/MALAYSIA/P215/1975', # Sylvatic
]

# One strain name per serotype, keyed by the same "denvN" labels that are
# written to seq_df["serotype"].
sanofi_vaccine_strains = dict(
    denv1='DENV1/THAILAND/PUO359/1980',
    denv2='DENV2/THAILAND/PUO218/1980',
    denv3='DENV3/THAILAND/PAH88188/1988',
    denv4='DENV4/INDONESIA/S1228/1978',
)


# Reference record per serotype: each "denvN" key maps to a dict whose single
# "metadata" entry holds strain, accession, date, host, country and region.
# Unknown values are stored as the string "NA"; dates use "XX" placeholders
# for month and day.
references = {
    "denv1": {
        "metadata": {
            "strain": "DENV1/NAURUISLAND/REFERENCE/1997",
            "accession": "NC_001477",
            "date": "1997-XX-XX",
            "host": "NA",
            "country": "Nauru",
            "region": "oceania",
        },
    },
    "denv2": {
        "metadata": {
            "strain": "DENV2/THAILAND/REFERENCE/1964",
            "accession": "NC_001474",
            "date": "1964-XX-XX",
            "host": "NA",
            "country": "Thailand",
            "region": "southeast_asia",
        },
    },
    "denv3": {
        "metadata": {
            "strain": "DENV3/SRI_LANKA/REFERENCE/2000",
            "accession": "NC_001475",
            "date": "2000-XX-XX",
            "host": "NA",
            "country": "Sri Lanka",
            "region": "south_asia",
        },
    },
    "denv4": {
        "metadata": {
            "strain": "DENV4/NA/REFERENCE/2003",
            "accession": "NC_002640",
            "date": "2003-XX-XX",
            "host": "NA",
            "country": "NA",
            "region": "NA",
        },
    },
}

Unnamed: 0,name,serotype,sequence,length
0,DENV1/INDIA/237/1962,denv1,atgcgatgtgtgggaataggcaacagagacttcgttgaaggcctgc...,1485
1,DENV1/CHINA/RL6/2013,denv1,atgcgatgcgtgggaataggcagtagggacttcgtggaaggactgt...,1485
2,DENV1/THAILAND/AILANDKPPKDV08491NC04528V0L/2007,denv1,atgcgatgcgtgggaataggcagcagggacttcgtggaaggactgt...,1485
3,DENV1/INDIA/EAIIMSDELHI1752/2010,denv1,tgggtgacgtatggtacgtgttctcagacaggcgaacaccgacggg...,1822
4,DENV1/FIJI/C/2012,denv1,atgcggtgtgtgggaataggaaacagagacttcgtggaaggactgt...,1485
...,...,...,...,...
45,DENV4/BRAZIL/GUSP0792/2013,denv4,atgcgatgcgtaggagtaggaaacagagactttgtggaaggagtct...,1485
46,DENV4/BRAZIL/GUSP1224/2013,denv4,atgcgatgcgtaggagttggaaacagagactttgtggaaggagtct...,1485
47,DENV4/VENEZUELA/BIDV1158/2007,denv4,gacaaggacagttccaaatcggaagcttgcttaacacagttctaac...,10606
48,DENV4/BRAZIL/GUSP0766/2013,denv4,atgcgatgcgtaggagtaggaaacagagactttgtggaaggagtct...,1485


In [35]:
# Display the rows whose sequence is longer than 2000 characters.
is_long_sequence = seq_df["length"] > 2000
seq_df[is_long_sequence]

Unnamed: 0,name,serotype,sequence,length
5,DENV1/BRAZIL/19RJ/2010,denv1,atgaacaaccaacggaaaaagacgggtcgaccgtctttcaatatgc...,2325
8,DENV1/NICARAGUA/BIDV641/2005,denv1,acaagaacagtttcgaatcggaagcttgcttaacgtagttctaaca...,10690
10,DENV1/NICARAGUA/BIDV621/2005,denv1,acaagaacagtttcgaatcggaagcttgcttaacgtagttctaaca...,10690
14,DENV1/SINGAPORE/K4138DK1/2005,denv1,agttgttagtctacgtggaccgacaagaacagtttcgaatcggaag...,10735
15,DENV1/NICARAGUA/BIDV622/2005,denv1,aagaacagtttcgaatcggaagcttgcttaatgtagttctaacagt...,7868
...,...,...,...,...
38,DENV4/SPAIN/BIDV3407/2001,denv4,ggacgccagttcggaagcttgcttaacacagttctaacagtttgtt...,10598
39,DENV4/PUERTO_RICO/BIDV2442/1998,denv4,gacaaggacagttccaaatcggaagcttgcttaacacagttctaac...,10606
41,DENV4/NEW_CALEDONIA/200409/2008,denv4,cggaagcttgcttaacacagttctaacagtttgttttaaatagaga...,10572
43,DENV4/PUERTO_RICO/11987/NA,denv4,atgaaccaacgaaaaaaggtggttagaccacctttcaatatgctga...,2552
