# Remove contamination from SAM/BAM file

### This notebook accompanies the paper "Illuminating Genetic Mysteries of the Dead Sea Scrolls"
#### Author: Moran Neuhof

The following notebook describes the code behind the removal of contamination in BAM files.
It follows the method described in Figure S4 and in the *"Filtration of mitochondrial sequences from contaminants"* section of the STAR methods.
The script receives a folder containing SAM/BAM files, with file names in the following format:
```<fragment name>.<species>.<suffix>```  
Where:  
```
suffix = 'mito.e2e.sam'
species = one of {'Bos_taurus', 'Capra_hircus', 'Ovis_aries'}, etc.
```

In [None]:
import os
import pysam
from collections import defaultdict

Parsing a CSV file, returning a list of lists.
An example for such a CSV file:
```
dss565,Ovis,Bos
dss565,Ovis,Capra
dss565,Ovis,Bos+Capra
dss565,Bos,Ovis
dss565,Bos,Capra
dss565,Bos,Ovis+Capra
dss565,Capra,Ovis
dss565,Capra,Bos
dss565,Capra,Ovis+Bos
```

In [None]:
def parse_csv(input_file):
    """
    Parse a CSV file with the following structure: fragment,organism,other_organisms

    Blank or whitespace-only lines (e.g. a trailing empty line at the end of
    the file) are skipped, so every returned entry comes from a non-empty line.

    Input: path of the CSV file
    Output: a list of split lines (list of lists of strings)
    """
    with open(input_file, 'r') as infile:
        # strip first, so that whitespace-only lines are recognised as empty
        stripped_lines = (line.strip() for line in infile)
        parsed_lines = [line.split(',') for line in stripped_lines if line]
    return parsed_lines

In [None]:
def clean_bam_file(fragment, organism, other_organisms, bam_path=None):
    """
    Filters reads from fragment.organism which are also present in one of fragment.other_organisms.
    The filtered file is saved in bam_path.

    Receives:
        fragment:         fragment name
        organism:         organism name (a key of organism_dict: "Bos", "Capra" or "Ovis")
        other_organisms:  a list of organisms to remove from fragment
        bam_path:         a path containing the SAM/BAM files; when None (default),
                          the module-level BAM_PATH is looked up at call time
    Returns:
        result:           a comma-separated string of the following fields:
                            fragment,organism,#reads,#kept_reads,#removed_reads
    Raises:
        KeyError:         if organism, or one of other_organisms, is not a key of organism_dict
    """
    if bam_path is None:
        # Resolved here rather than in the signature: a default written as
        # `bam_path=BAM_PATH` is evaluated when the function is defined, which
        # fails if BAM_PATH is only assigned afterwards.
        bam_path = BAM_PATH

    suffix = "mito.e2e.sam"
    organism_dict = {"Bos": "Bos_taurus",
                     "Capra": "Capra_hircus",
                     "Ovis": "Ovis_aries"}
    organism_full_name = organism_dict[organism]
    filename = os.path.join(bam_path, f"{fragment}.{organism_full_name}.{suffix}")

    # Joining a single-element list yields that element unchanged, so the same
    # expression names the output for one or for several filtered organisms.
    removed_label = '_'.join(other_organisms)
    output_filename = os.path.join(
        bam_path,
        f"{fragment}.{organism_full_name}.mito.cleaned_from_{removed_label}.bam")  # saving as BAM

    # collecting the read IDs found in the files of the other organisms
    reads_to_remove = []
    for other_organism in other_organisms:
        # preparing the other_organism filename
        other_organism_full_name = organism_dict[other_organism]
        other_organism_filename = os.path.join(bam_path, f"{fragment}.{other_organism_full_name}.{suffix}")

        # reading the other organism file, looking for reads
        with pysam.AlignmentFile(other_organism_filename, "r") as other_organism_samfile:
            reads_to_remove.extend(read.qname for read in other_organism_samfile.fetch())

    # A set gives constant-time membership tests in the filtering loop below;
    # the list is kept only so that the reported count stays the same as before.
    read_names_to_remove = set(reads_to_remove)

    # now open the input and output sam files and remove the reads we don't need:
    print(f"Removing {len(reads_to_remove)} reads from {filename}")
    read_num = 0  # total reads counter
    good_reads = 0  # reads kept counter
    with pysam.AlignmentFile(filename, "r") as input_samfile, \
            pysam.AlignmentFile(output_filename, "wb", template=input_samfile) as output_bamfile:
        # iterating over reads in the original file
        for read in input_samfile:
            read_num += 1
            if read.qname not in read_names_to_remove:  # keeping the read
                output_bamfile.write(read)
                good_reads += 1
        print(f"Kept {good_reads} out of {read_num}")

    # printing result
    result = f"{fragment},{organism},{read_num},{good_reads},{read_num-good_reads}"
    print(f"Reads written to {output_filename}")
    return result


Running the script requires a path of SAM/BAM files, and a CSV file (infile) in the format above.

In [None]:
# User configuration: fill in the two placeholders below before running this cell
# (as written, the empty right-hand sides are not valid Python).
BAM_PATH = # Enter the folder containing the SAM/BAM files you wish to filter
infile = # Enter the path of the CSV file (one "fragment,organism,other_organisms" entry per line)
output_stats_file = "removed_reads.csv"  # the per-line filtering results are written to this file

In [None]:
# Read every "fragment,organism,other_organisms" entry of the CSV file, filter
# the matching SAM file, and store one result line per entry in the stats file.
fragment_lines = parse_csv(infile)

with open(output_stats_file, 'w') as stats_outfile:
    for fragment, organism, other_organisms_str in fragment_lines:
        print(f"Filtering {fragment} from {other_organisms_str}:")
        # several organisms to filter are written as a '+'-separated list
        stats_line = clean_bam_file(fragment, organism, other_organisms_str.split('+'))
        print(stats_line, file=stats_outfile)

print("Done.")