# Fauna

This is a deconstruction of parts of fauna. 

* https://github.com/nextstrain/fauna

Scripts:

**zika_upload.py**

```
python3 vdb/zika_upload.py \
  -db vdb \
  -v zika \
  --source genbank \
  --locus genome \
  --fname GenomicFastaResults.fasta
```

**zika_update.py**

```
python3 vdb/zika_update.py \
  -db vdb \
  -v zika \
  --update_citations
```

*check dependencies listed in requirements.txt*

In [1]:
# ==== Packages
import os, re, time, datetime, csv, sys
from Bio import SeqIO
from typing import NamedTuple
print("Packages available")

Packages available


## 1. Load an example dataset

Practice on 10 zika sequences. Pull from:

* https://www.viprbrc.org/brc/vipr_genome_search.spg?method=ShowCleanSearch&decorator=flavi_zika

## 2. Process the dataset

In [2]:
# === Input variables
# Path to the example FASTA file parsed further down in this notebook.
zika_fasta = "../example_data/small.fasta"

# From fauna
# TSV files holding "label -> fix" substitutions for strain names, locations
# and dates; each one is loaded with define_fixes_dict().
strain_fix_fname =  "zika_strain_name_fix.tsv"
location_fix_fname = "zika_location_fix.tsv"
date_fix_fname = "zika_date_fix.tsv"

# Original fauna configuration, kept for reference:
#virus_fasta_fields = {1:'strain', 3:'collection_date', 4: 'host', 5:'country'}
#sequence_fasta_fields = {0:'accession', 1:'strain'}
# Seems duplicative, replace with:
# Maps the 0-based position of each '|'-separated FASTA header field to the
# metadata key it is stored under. Position 2 is intentionally not mapped.
header_fasta_fields = {0:'accession', 1:'strain', 3:'collection_date', 4: 'host', 5:'country'}
# Open question: if we're ignoring field 2, then why pull from ViPR?

**Functions**

In [3]:
# vdb/upload.py
def define_fixes_dict(fname:str) -> dict[str,str]:
    '''
    Load a strain/location/date fixing tsv file into a dictionary.

    The file is tab-delimited. Lines starting with '#' are skipped; the first
    remaining line is the header and must provide the columns 'label' and
    'fix'. Backslash escape sequences written literally in a label
    (for example \\xe3) are interpreted, so labels can be spelled in pure ASCII.

    :param fname: path of the tsv file to read
    :return: dictionary mapping each (unescaped) label to its fix
    :raises OSError: if the file cannot be opened
    :raises KeyError: if the header lacks a 'label' or 'fix' column
    '''
    # The context manager closes the file even if parsing fails.
    with open(fname) as handle:
        data_lines = (row for row in handle if not row.startswith('#'))
        reader = csv.DictReader(data_lines, delimiter='\t')
        # latin-1 + backslashreplace keeps genuine non-ASCII characters intact
        # while still letting 'unicode-escape' interpret literal escapes; a
        # plain .encode() (UTF-8) would turn them into mojibake.
        return {
            line['label']
            .encode('latin-1', 'backslashreplace')
            .decode('unicode-escape'): line['fix']
            for line in reader
        }

def fixes_str(original_str:str, fixes_dict:"dict[str,str] | None"=None) -> str: # bug: Key based on strain
    '''
    Return the new strain name/location/date that will replace the original string.

    :param original_str: the value to look up
    :param fixes_dict: mapping of original value -> replacement; when omitted
        or empty, nothing is replaced
    :return: the replacement if original_str is a key of fixes_dict,
        otherwise original_str unchanged
    '''
    # Author's note from the signature line: the lookup key is the string
    # itself, whereas some fixes are meant to be keyed on the strain name.
    if not fixes_dict:
        return original_str
    return fixes_dict.get(original_str, original_str)

# vdb/zika_upload.py
# Ordered (old, new) substitutions applied to a raw strain name. Order matters:
# longer tokens are listed before their prefixes (e.g. 'Zika_virus' before 'Zika').
_STRAIN_NAME_REPLACEMENTS = (
    # virus name
    ('Zika_virus', ''), ('Zikavirus', ''), ('Zika virus', ''), ('Zika', ''), ('ZIKV', ''),
    # human host
    ('Human', ''), ('human', ''), ('H.sapiens_wt', ''), ('H.sapiens_tc', ''),
    ('Hsapiens_tc', ''), ('H.sapiens-tc', ''), ('Homo_sapiens', ''),
    ('Homo sapiens', ''), ('Hsapiens', ''), ('H.sapiens', ''),
    ('/Hu/', ''),
    # lineage suffixes
    ('_Asian', ''), ('_Asia', ''), ('_asian', ''), ('_asia', ''),
    # other suffixes
    ('_URI', ''), ('_SER', ''), ('_PLA', ''), ('_MOS', ''), ('_SAL', ''),
    # mosquito host spelling
    ('Aaegypti_wt', 'Aedes_aegypti'), ('Aedessp', 'Aedes_sp'),
    # punctuation and separators
    (' ', ''), ('\'', ''), ('(', ''), (')', ''), ('//', '/'), ('__', '_'),
    ('.', ''), (',', ''),
)

def fixes_strain_name(name, fixes_tsv:str="") -> str: # Since we can't decide if we want strain or name
    '''
    Normalise a Zika strain name.

    Steps, in order:
      1. replace the raw name if it is a label in the fixes table;
      2. apply the ordered substitutions in _STRAIN_NAME_REPLACEMENTS;
      3. drop one leading '/', '_' or '-';
      4. prefix purely numeric names with 'V' (ID must start with a letter);
      5. replace the cleaned name if it is a label in the fixes table.

    :param name: raw strain name from the FASTA header
    :param fixes_tsv: optional path to a label/fix tsv file, loaded with
        define_fixes_dict(); when empty, no table lookups are performed
    :return: the normalised strain name
    '''
    fixes_dict = define_fixes_dict(fixes_tsv) if fixes_tsv else {}

    # Exact-match lookup; names absent from the table pass through unchanged.
    name = fixes_dict.get(name, name)

    for old, new in _STRAIN_NAME_REPLACEMENTS:
        name = name.replace(old, new)
    name = re.sub(r'^[/_-]', '', name)

    try: # ID must start with letter
        name = 'V' + str(int(name))
    except ValueError:
        # Not a pure number: keep the name as it is.
        pass

    return fixes_dict.get(name, name)

# # vdb/parse.py Load data
# def parse_fasta_file(fasta, virus_fasta_fields, sequence_fasta_fields, **kwargs):
#     '''
#     Parse FASTA file with default header formatting
#     :return: list of documents(dictionaries of attributes) to upload
#     '''
#     header_fixes = False
#     if (kwargs["fasta_header_fix"]):
#         header_fixes = {}
#         try:
#             with open(kwargs["fasta_header_fix"], 'rU') as fh:
#                 for line in fh:
#                     if not line.startswith('#'):
#                         k, v = line.strip().split("\t")
#                         header_fixes[k] = v                
#         except IOError:
#             raise Exception(kwargs["fasta_header_fix"], "not found")
#     viruses = []
#     sequences = []
#     try:
#         handle = open(fasta, 'r')
#     except IOError:
#         raise Exception(fasta, "not found")
#     else:
#         for record in SeqIO.parse(handle, "fasta"):
#             if header_fixes:
#                 try:
#                     record.description = header_fixes[record.description]
#                 except KeyError:
#                     raise Exception(record.description, "not in header fix file. Fatal.")
#             content = list(map(lambda x: x.strip(), record.description.replace(">", "").split('|')))
#             v = {key: content[ii] if ii < len(content) else "" for ii, key in virus_fasta_fields.items()}
#             s = {key: content[ii] if ii < len(content) else "" for ii, key in sequence_fasta_fields.items()}
#             s['sequence'] = str(record.seq).lower()
#             #v = self.add_virus_fields(v, **kwargs)
#             #s = self.add_sequence_fields(s, **kwargs)
#             sequences.append(s)
#             viruses.append(v)
#         handle.close()
#     return (viruses, sequences)

# === Only fix casing on the Host?
def fix_casing(self, document): # JC
    '''
    Rewrite the 'host' entry of document in place using
    self.camelcase_to_snakecase.

    Entries that are missing or None are left untouched. Returns None.
    '''
    # Only 'host' is handled today; kept as a tuple so more fields can be added.
    cased_fields = ('host',)
    for name in cased_fields:
        if name not in document:
            continue
        current = document[name]
        if current is None:
            continue
        document[name] = self.camelcase_to_snakecase(current)

# ===== Main Method
# Load the three label -> fix tables from the tsv files named in the input
# variables. The file names carry no directory, so they are resolved against
# the current working directory.
fix_name_dict = define_fixes_dict(strain_fix_fname)  # tsv file in Input
fix_location_dict = define_fixes_dict(location_fix_fname)
fix_date_dict = define_fixes_dict(date_fix_fname)

# Sanity check of the loaded objects. These are bare expressions; the
# recorded cell output shows a single `dict`.
type(fix_name_dict)
type(fix_location_dict)
type(fix_date_dict)

# Original fauna equivalents for locations and dates, kept for reference:
# self.fix_location = self.define_location_fixes(self.location_fix_fname) # tsv file in input
# self.fix_date = self.define_date_fixes(self.date_fix_fname)

dict

In [4]:
def format_date(date_str):
    '''
    Format a collection date as YYYY-MM-DD, for example 2016-02-28.

    Underscores are treated as dashes (2002_04_25 -> 2002-04-25), single-digit
    months/days are zero-padded, and unknown parts are written as XX
    (2009-06 -> 2009-06-XX, 2009 -> 2009-XX-XX). Surrounding whitespace is
    ignored.

    :param date_str: raw date string, or None
    :return: the reformatted date string, or None when date_str is None,
        blank, or does not start with a recognisable date (in the last case a
        message is printed)
    '''
    # Historical note: the fauna original looped over the 'date',
    # 'collection_date' and 'submission_date' fields of a virus document;
    # choosing the fields is now the caller's responsibility.

    # Nothing to format.
    if date_str is None:
        return None
    date_str = date_str.strip()
    if not date_str:
        return None

    date_str = date_str.replace('_', '-')

    # ex. 2002-XX-XX or 2002-09-05
    if re.match(r'\d\d\d\d-(\d\d|XX)-(\d\d|XX)', date_str):
        return date_str

    # ex. 2002-2-4, 2002-02-4 or 2002-2-15: zero-pad month and day
    short_parts = re.match(r'^(\d\d\d\d)-(\d\d?)-(\d\d?)$', date_str)
    if short_parts:
        year, month, day = short_parts.groups()
        return year + "-" + month.zfill(2) + "-" + day.zfill(2)

    # ex. 2009 (Month and day unknown)
    if re.match(r'\d\d\d\d\s\(Month\sand\sday\sunknown\)', date_str):
        return date_str[0:4] + "-XX-XX"
    # ex. 2009-06 (Day unknown)
    if re.match(r'\d\d\d\d-\d\d\s\(Day\sunknown\)', date_str):
        return date_str[0:7] + "-XX"
    # ex. 2009-06
    if re.match(r'\d\d\d\d-\d\d', date_str):
        return date_str[0:7] + "-XX"

    # ex. 2009-6: keep the month instead of falling through to year-only
    year_month = re.match(r'^(\d\d\d\d)-(\d)$', date_str)
    if year_month:
        return year_month.group(1) + "-0" + year_month.group(2) + "-XX"

    # ex. 2009
    if re.match(r'\d\d\d\d', date_str):
        return date_str[0:4] + "-XX-XX"

    print("Couldn't reformat this date: " +
          date_str + ", setting to None")
    return None

In [5]:
# zika_fasta = "../example_data/small.fasta"

# zika_seqs = parse_fasta_file(zika_fasta, virus_fasta_fields, sequence_fasta_fields, fasta_header_fix = False)

# type(zika_seqs)
# map(lambda x: x, zika_seqs[1])

# seqs=zika_seqs[1]

# one=map(lambda x: x['strain'],seqs)
# print(one)
# #type(seqs["strain"])
# #print(seqs[['strain']])
# #print(zika_seqs[0])
# #print("\n\nstrain: ",zika_seqs[0][0]['strain'])
# #print("fix_strain_name output:", fixes_strain_name(zika_seqs[0][0]['strain']))

## 3. Upload to fauna (nope)

## 4. BioPython reorg

In [6]:
# Stream the example FASTA file record by record: split each '|'-separated
# header into metadata, clean the strain name and collection date, and print
# the intermediate results.
fname = zika_fasta

# Early exit if file not found
try:
    fhandle = open(fname, 'r')
except IOError as err:
    # Same exception type and arguments as before; the original error is
    # chained so the underlying cause stays visible in the traceback.
    raise Exception(fname, "not found") from err

# The context manager closes the handle when parsing finishes or fails.
with fhandle:
    for record in SeqIO.parse(fhandle, "fasta"):
        #print(record.id) # Header, Breaks at spaces!
        print(record.description) # Whole header
        content = [
            field.strip()
            for field in record.description
            .replace(" ", "_") # Deal with spaces
            .split('|')
        ]
        print(content)
        # Header positions missing from a short header become empty strings.
        metadata = {key: content[ii] if ii < len(content) else "" for ii, key in header_fasta_fields.items()}
        print("metadata=", metadata)
        metadata["strain"] = fixes_strain_name(metadata["strain"])
        # metadata["collection_date"] = fixes_str(metadata["strain"], fix_date_dict) # based on fixed strain name?
        # metadata["location"] = fixes_str(metadata["strain"], fix_location_dict)  # find where location is defined
        print("metadata[strain]=", metadata["strain"])

        # Hmm, was an obj method (checking for "date", "collection date", "submission date", seems too specialized...)
        # If you want to check for all dates, then check all fields for a XXXX-XX-XX or similar date format...
        metadata["collection_date"]=format_date(metadata["collection_date"])
        print("metadata[collection_date]=", metadata["collection_date"])

        #print(record.seq) # Sequence

        # Stream a fasta file, do not read all into memory
        # Append to metadata file
        # Append to sequence file
    
%whos

KY241742|ZIKV_SG_072|NA|2016_08_28|Human|Singapore|Asian|Zika_virus
['KY241742', 'ZIKV_SG_072', 'NA', '2016_08_28', 'Human', 'Singapore', 'Asian', 'Zika_virus']
metadata= {'accession': 'KY241742', 'strain': 'ZIKV_SG_072', 'collection_date': '2016_08_28', 'host': 'Human', 'country': 'Singapore'}
metadata[strain]= SG_072
metadata[collection_date]= 2016-08-28
MF098771|Mexico_Rus_12TVR_2017|NA|2017_01_30|Human|Russia|Asian|Zika_virus
['MF098771', 'Mexico_Rus_12TVR_2017', 'NA', '2017_01_30', 'Human', 'Russia', 'Asian', 'Zika_virus']
metadata= {'accession': 'MF098771', 'strain': 'Mexico_Rus_12TVR_2017', 'collection_date': '2017_01_30', 'host': 'Human', 'country': 'Russia'}
metadata[strain]= Mexico_Rus_12TVR_2017
metadata[collection_date]= 2017-01-30
MF098768|Dominican_Rep_Rus_7EGR_2016|NA|2016_08_25|Human|Russia|Asian|Zika_virus
['MF098768', 'Dominican_Rep_Rus_7EGR_2016', 'NA', '2016_08_25', 'Human', 'Russia', 'Asian', 'Zika_virus']
metadata= {'accession': 'MF098768', 'strain': 'Dominican_Re