# Remove duplicate genomes 

Prior to subsampling by metadata fields, I would like to remove sequences that are identical. To do so, I first identify duplicate sequences. Then, among the duplicates, I pick the ones that have the most complete date and correspond to the best host categories. For example, if there are identical sequences from a bird and the environment, I will choose the bird one. 

In [1]:
import Bio 
from Bio import SeqIO
import random

In [2]:
def return_date_resolution(date):
    """Classify how precisely a date string is specified.

    Every "-XX" placeholder is removed first, so "2015-XX-XX" is treated
    like "2015" and "2015-03-XX" like "2015-03".

    Args:
        date: date string, optionally containing "-XX" placeholders.

    Returns:
        "year", "month" or "day" when the trimmed string is 4, 7 or 10
        characters long respectively; None (after printing a warning)
        for any other length.
    """
    trimmed = date.replace("-XX", "")
    resolution_by_length = {4: "year", 7: "month", 10: "day"}

    date_resolution = resolution_by_length.get(len(trimmed))
    if date_resolution is None:
        # An unexpected length used to leave the result unbound and crash
        # with UnboundLocalError; warn and return None instead.
        print("what's up with this date ", trimmed)

    return date_resolution

In [3]:
def read_metadata_file(metadata_file):
    """Load a tab-delimited metadata file into a dictionary keyed by strain.

    Any line containing the text "originating_lab" is treated as a header
    and skipped, as are blank lines. Fields are read by fixed column index
    (0 strain, 2 accession, 3 date, 4 region, 5 country, 8 host_group,
    12 host_species, 13 domestic_wild, 14 annotation_method, 15 genbank_id).

    Args:
        metadata_file: path of the tab-delimited metadata file.

    Returns:
        dict mapping each strain to a dict of its metadata values, plus
        the derived "date_resolution" and "year" entries.

    Raises:
        IndexError: if a data line has fewer than 16 tab-separated fields.
    """
    output_dict = {}
    with open(metadata_file, "r") as infile:
        for line in infile:
            if "originating_lab" in line:
                continue
            if not line.strip():
                continue

            # Strip the line terminator so a value read from the final
            # column does not carry a trailing newline; split only once.
            fields = line.rstrip("\n").split("\t")

            strain = fields[0]
            date = fields[3]

            output_dict[strain] = {
                "accession": fields[2],
                "date": date,
                "region": fields[4],
                "country": fields[5],
                "domestic_wild": fields[13],
                "host_group": fields[8],
                "host_species": fields[12],
                "annotation_method": fields[14],
                "genbank_id": fields[15],
                "date_resolution": return_date_resolution(date),
                "year": date.split("-")[0],
            }
    return output_dict

In [4]:
def return_unique_sequences(fasta_file):
    """Group the records of a FASTA file by identical sequence.

    Args:
        fasta_file: FASTA input handed to SeqIO.parse.

    Returns:
        dict mapping each distinct sequence string to the list of record
        descriptions that carry exactly that sequence, in file order.
    """
    strains_by_sequence = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        # setdefault creates the list on first sight of a sequence
        strains_by_sequence.setdefault(str(record.seq), []).append(record.description)

    return strains_by_sequence

In [7]:
def return_seqs_within_seqs(unique_sequences_dict):
    """Print every sequence that is a substring of a different sequence.

    Diagnostic helper: it only prints and returns None. Note that a later
    cell in this notebook defines another function with the same name.

    Args:
        unique_sequences_dict: dict whose keys are sequence strings.
    """
    for seq in unique_sequences_dict:
        for s in unique_sequences_dict:
            # Dict keys are distinct, so the inequality test only skips the
            # trivial "sequence contains itself" match.
            if seq != s and seq in s:
                print(seq, "contained in ", s)
        

In [8]:
def return_best_host_category(strains, metadata_dict):
    """Return the preferred host groups among a set of strains.

    The distinct 'host_group' values of the strains are collected. When
    more than one group is present, "Environment", "Laboratoryderived"
    and "Nonhuman Mammal" are dropped; the result is therefore empty if
    every group present is one of those. With zero or one distinct group
    the list is returned untouched.

    Args:
        strains: strain names to look up.
        metadata_dict: dict mapping strain -> dict with a 'host_group' key.

    Returns:
        list of the remaining host groups (order not meaningful).
    """
    low_priority = ("Environment", "Laboratoryderived", "Nonhuman Mammal")
    distinct_hosts = list(set(metadata_dict[name]['host_group'] for name in strains))

    # nothing to choose between: keep whatever is there
    if len(distinct_hosts) <= 1:
        return distinct_hosts

    return [host for host in distinct_hosts if host not in low_priority]

In [9]:
def return_best_date(strains, metadata_dict):
    """Return the finest date resolution found among a set of strains.

    When the strains carry more than one distinct 'date_resolution', the
    preference is "day", then "month", and otherwise "year". When they all
    share a single value, that value is returned as is.

    Args:
        strains: strain names to look up (an empty list raises IndexError).
        metadata_dict: dict mapping strain -> dict with a
            'date_resolution' key.

    Returns:
        the chosen date resolution.
    """
    resolutions = list(set(metadata_dict[name]['date_resolution'] for name in strains))

    # only one resolution present: nothing to rank
    if len(resolutions) <= 1:
        return resolutions[0]

    for preferred in ("day", "month"):
        if preferred in resolutions:
            return preferred

    return "year"

In [10]:
def return_strains_with_best_attributes(strains, metadata_dict, hosts, best_date):
    """Select the strains whose host group and date resolution are preferred.

    A strain matches fully when its 'host_group' is in `hosts` and its
    'date_resolution' equals `best_date`; it matches partially when only
    one of the two holds.

    Args:
        strains: strain names to evaluate.
        metadata_dict: dict mapping strain -> dict with 'host_group' and
            'date_resolution' keys.
        hosts: collection of acceptable host groups.
        best_date: the preferred date resolution.

    Returns:
        the fully matching strains if there are any, otherwise the
        partially matching ones (input order is kept in both cases).
    """
    fully_matching = []
    partially_matching = []

    for name in strains:
        record = metadata_dict[name]
        host_ok = record['host_group'] in hosts
        date_ok = record['date_resolution'] == best_date

        if host_ok and date_ok:
            fully_matching.append(name)
        elif host_ok or date_ok:
            partially_matching.append(name)

    # fall back to partial matches only when nothing matched on both counts
    return fully_matching or partially_matching

In [41]:
def randomly_pick_strain(strains):
    """Return one element of `strains`, chosen uniformly at random.

    Args:
        strains: non-empty sequence of strain names (an empty one raises
            ValueError from the random index draw).

    Returns:
        the selected strain.
    """
    # draw an index in [0, len(strains)) from the module-level RNG
    return strains[random.randrange(len(strains))]

In [42]:
def pick_best(unique_sequences, metadata=None):
    """Choose one strain to represent each distinct sequence.

    For a sequence shared by several strains, the strains with the
    preferred host group and date resolution are shortlisted and one of
    them is picked at random. A sequence with a single strain keeps that
    strain.

    Args:
        unique_sequences: dict mapping sequence string -> list of strain
            names carrying that sequence.
        metadata: optional dict mapping strain -> metadata dict. When
            omitted, the module-level `metadata_dict` is used, which is
            what this function originally always relied on.

    Returns:
        dict mapping each sequence string to the single strain kept.
    """
    if metadata is None:
        # Backward-compatible fallback to the global set by the driver loop.
        metadata = metadata_dict

    best_unique_sequences = {}

    for sequence, strains in unique_sequences.items():
        if len(strains) > 1:
            # duplicate sequence: keep the strain with the best metadata
            hosts = return_best_host_category(strains, metadata)
            best_date = return_best_date(strains, metadata)
            candidates = return_strains_with_best_attributes(strains, metadata, hosts, best_date)
            strain_to_keep = randomly_pick_strain(candidates)
        else:
            strain_to_keep = str(strains[0])

        best_unique_sequences[sequence] = strain_to_keep

    return best_unique_sequences

In [43]:
def return_seqs_within_seqs(best_unique_sequences_dict):
    """Drop sequences that are substrings of another strain's sequence.

    A sequence is discarded when it occurs inside any key of the input
    whose strain name differs from its own; comparing strain names is what
    keeps a sequence from being matched against itself.

    Args:
        best_unique_sequences_dict: dict mapping sequence string -> strain.

    Returns:
        dict with the same sequence -> strain pairs, minus the nested
        sequences, in the original order.
    """
    def is_nested(candidate, candidate_strain):
        return any(
            candidate in other and candidate_strain != other_strain
            for other, other_strain in best_unique_sequences_dict.items()
        )

    return {
        sequence: strain
        for sequence, strain in best_unique_sequences_dict.items()
        if not is_nested(sequence, strain)
    }

In [44]:
def write_deduped(output_filename, unique_sequences_without_nested):
    """Write the deduplicated sequences to a FASTA file.

    The file is created or truncated, then one two-line record
    (">strain" followed by the sequence) is written per entry.

    Args:
        output_filename: path of the FASTA file to write.
        unique_sequences_without_nested: dict mapping sequence string ->
            strain name.
    """
    # Open once in write mode instead of truncating and then reopening the
    # file in append mode for every record; the output is the same.
    with open(output_filename, "w") as outfile:
        outfile.writelines(
            ">" + strain + "\n" + sequence + "\n"
            for sequence, strain in unique_sequences_without_nested.items()
        )

In [45]:
# Driver: deduplicate the sequences of each gene segment listed below.
# Input and output files all live in this (hard-coded) directory.
directory = "/Users/lmoncla/src/h5n1-host-classification/metadata-with-annotations/"
genes = ['pb2','pb1','pa','ha','np','na','mp','ns']

for gene in genes: 
    # per-gene input FASTA, output FASTA and metadata table
    fasta_file = directory + "sequences_h5n1_"+gene+"-full-genomes-only-2020-10-07.fasta"
    output_filename = directory + "sequences_h5n1_"+gene+"-full-genomes-deduped-2020-11-16.fasta"
    metadata_file = directory + "metadata_h5n1_"+gene+"-full-genomes-only-2020-10-07.txt"
    
    # NOTE: pick_best() reads metadata_dict as a module-level global, so this
    # name must be assigned before pick_best is called and must not be renamed.
    metadata_dict = read_metadata_file(metadata_file)
    # group strains by identical sequence
    unique_sequences = return_unique_sequences(fasta_file)
    # keep one strain per distinct sequence
    best_unique_sequences = pick_best(unique_sequences)
    # drop sequences contained within another strain's sequence
    unique_sequences_without_nested = return_seqs_within_seqs(best_unique_sequences)
    write_deduped(output_filename, unique_sequences_without_nested)