In [118]:
import subprocess
import os
import pandas
import pickle
from pathlib import Path
from Bio.Blast import NCBIXML
from collections import defaultdict
from intervaltree import Interval, IntervalTree

In [210]:
def get_output_path(input_file, output_dir, results_dir):
    file_id = Path(input_file).stem
    cwd = Path(os.getcwd())
    output_path = cwd.joinpath(output_dir, file_id, results_dir)
    if not os.path.exists(output_path):
        print(f"Creating output path: {output_path}")
        os.makedirs(output_path)
    return output_path

In [200]:
def run_command(command):
    result = subprocess.run(command, check=True, capture_output=True, text=True)
    return result.stdout if result.returncode == 0 else result.stderr

In [201]:
def get_db_type(db_dir):
    """ Get the individual database for blast and the database type """
    command = ['blastdbcmd', '-list', f'{db_dir}', '-recursive',]
    output = run_command(command)
    dbs = []
    progs = []
    for line in output.split('\n')[:-1]: # last line is empty so don't iterate to it!
        db_info = line.split(' ')
        print(db_info)
        db_name = db_info[0]
        db_type = db_info[1].lower()
        dbs.append(db_name)
        if db_type == 'protein':
            progs.append('blastx')
        elif db_type == 'nucleotide':
            progs.append('blastn')
    return list(zip(dbs, progs))

In [202]:
test_inputs = []
for i in range(1,100):
    test_inputs.append(f'test_{i}.fasta')

In [203]:
dbs = get_db_type('dbs/')

['dbs/all_pf32_db/all_pf32_db', 'Protein']
['dbs/pf32_v3/pf32', 'Protein']
['dbs/plasmid_db/wp_db', 'Nucleotide']
['dbs/plasmid_db/wpdb', 'Nucleotide']


In [211]:
output = 'test_output'
for db in dbs:
    db_path = db[0]
    prog = db[1]
    results_dir = f'{Path(db_path).stem}'
    for file in test_inputs:
        get_output_path(file, output, results_dir)

Creating output path: /home/mf019/longread_pangenome/plasmid_id/test_output/test_1/all_pf32_db
Creating output path: /home/mf019/longread_pangenome/plasmid_id/test_output/test_2/all_pf32_db
Creating output path: /home/mf019/longread_pangenome/plasmid_id/test_output/test_3/all_pf32_db
Creating output path: /home/mf019/longread_pangenome/plasmid_id/test_output/test_4/all_pf32_db
Creating output path: /home/mf019/longread_pangenome/plasmid_id/test_output/test_5/all_pf32_db
Creating output path: /home/mf019/longread_pangenome/plasmid_id/test_output/test_6/all_pf32_db
Creating output path: /home/mf019/longread_pangenome/plasmid_id/test_output/test_7/all_pf32_db
Creating output path: /home/mf019/longread_pangenome/plasmid_id/test_output/test_8/all_pf32_db
Creating output path: /home/mf019/longread_pangenome/plasmid_id/test_output/test_9/all_pf32_db
Creating output path: /home/mf019/longread_pangenome/plasmid_id/test_output/test_10/all_pf32_db
Creating output path: /home/mf019/longread_pangen

In [179]:
for db in dbs:
    db_path = db[0]
    prog = db[1]
    print(db_path, prog)

dbs/all_pf32_db/all_pf32_db blastx
dbs/pf32_v3/pf32 blastx
dbs/plasmid_db/wp_db blastn
dbs/plasmid_db/wpdb blastn


In [11]:
pf_results_test = 'blast_results_v8/pf32_v3/ESI26H_pf32.xml'
wp_results_test = '/home/mf019/borrelia_plasmid_classifier_v3/output/blast_results_v7/whole_plasmid/ESI26H_whole_plasmid.xml'

In [12]:
output_test = 'test_outdir'

In [36]:
## Function to calculate overall percent identity and coverage using HSPs
#def calculate_percent_identity_and_coverage(intervals, alignment):
#    total_identities = 0
#    total_alignment_length = 0
#    covered_intervals = []
#    query_intervals = []
#    subject_hit_coords = []
#    hsp_counter = 0
#    interval_groups = set()
#    
#    for hsp in alignment.hsps:
#        total_identities += hsp.identities
#        total_alignment_length += hsp.align_length
#
#        # Add HSP interval to the hit location list. this is to check the annotation.
#        subj_start = hsp.sbjct_start
#        subj_end = hsp.sbjct_end
#
#        # Add 'em to our list of subject hit coords.
#        subject_hit_coords.append((subj_start, subj_end))
#
#        # Add abs HSP interval to the covered intervals list, take the min/max to make sure the coordinates are correct! (This is for raw coverage)
#        abs_subj_start = min(subj_start, subj_end)
#        abs_subj_end = max(subj_start, subj_end)
#
#        # Also add query coordinates so we can see where on the contig we are aligning.
#        query_start = hsp.query_start
#        query_end = hsp.query_end
#
#        # Add this to the query_intervals list
#        query_intervals.append((query_start,query_end))
#
#        # Add abs HSP interval to the covered intervals list
#        covered_intervals.append((abs_subj_start, abs_subj_end))
#        
#        interval_groups.add(intervals.add_interval(query_start, query_end))
#
#    # Merge overlapping intervals to calculate the covered length
#    merged_intervals = merge_intervals(covered_intervals)
#    covered_length = sum(end - start for start, end in merged_intervals)
#
#    # Calculate overall percent identity
#    overall_percent_identity = (total_identities / total_alignment_length) * 100 if total_alignment_length != 0 else 0
#
#    # Calculate coverage percentage
#    reference_length = alignment.length
#    coverage_percentage = (covered_length / reference_length) * 100 if reference_length != 0 else 0
#    
#    if len(interval_groups) == 1:
#        interval_groups = int(next(iter(interval_groups)))
#    
#    return interval_groups, overall_percent_identity, coverage_percentage, covered_length, covered_intervals, query_intervals, subject_hit_coords

# Function to calculate overall percent identity and coverage using HSPs
def calculate_percent_identity_and_coverage(alignment):
    total_identities = 0
    total_alignment_length = 0
    query_intervals = IntervalTree()
    subject_intervals = IntervalTree()

    for hsp in alignment.hsps:
        total_identities += hsp.identities
        total_alignment_length += hsp.align_length

        # Add HSP interval to the query and subject intervals
        query_start, query_end = sorted([hsp.query_start, hsp.query_end])
        subject_start, subject_end = sorted([hsp.sbjct_start, hsp.sbjct_end])
        
        query_intervals.addi(query_start, query_end)
        subject_intervals.addi(subject_start, subject_end)

    # Merge overlapping intervals to calculate the covered length
    covered_length = sum(interval.length() for interval in subject_intervals)
    
    # Calculate overall percent identity
    overall_percent_identity = (total_identities / total_alignment_length) * 100 if total_alignment_length != 0 else 0

    # Calculate coverage percentage
    reference_length = alignment.length
    coverage_percentage = (covered_length / reference_length) * 100 if reference_length != 0 else 0
    
    return {
        "overall_percent_identity": overall_percent_identity,
        "coverage_percentage": coverage_percentage,
        "covered_length": covered_length,
        "covered_intervals": [(interval.begin, interval.end) for interval in subject_intervals],
        "query_intervals": [(interval.begin, interval.end) for interval in query_intervals],
        "subject_hit_coords": [(hsp.sbjct_start, hsp.sbjct_end) for hsp in alignment.hsps]
    }

In [109]:
class IntervalGroupManager:
    def __init__(self):
        self.groups = []
        self.trees = []

    def get_groups(self):
        return self.groups

    def get_trees(self):
        return self.trees
    
    def add_interval(self, start, end):
        new_interval = Interval(start, end)
        for idx, tree in enumerate(self.trees):
            if tree.overlaps(start, end):
                tree.add(new_interval)
                self.groups[idx].append((start, end))
                return idx
        # If no existing tree can accommodate the new interval, create a new group, 
        # this indicates multiple homologous regions?
        new_tree = IntervalTree()
        new_tree.add(new_interval)
        self.trees.append(new_tree)
        self.groups.append([(start, end)])
        return len(self.groups) - 1

def get_name_from_acc(hit_id, parsing_dict):
    acc_id = hit_id.split('|')[1]
    name = parsing_dict[acc_id]['name']
    strain = parsing_dict[acc_id]['strain']
    return strain, name

def parse_hit_id(hit_id):
    gene_id_list = hit_id.strip().split('_')
    if len(gene_id_list) == 2:
        strain = gene_id_list[0]
        plasmid_id = gene_id_list[-1]
    elif len(gene_id_list) == 1: # this is the weird pdb case.
        strain = gene_id_list[0].split('|')[1]
        plasmid_id = gene_id_list[0].split('|')[-1]
    elif gene_id_list[0] == 'NE': # this is the NE_1234 strains single case, hate this too.
        strain = '_'.join(gene_id_list[0:2])
        plasmid_id = gene_id_list[2]
    else:
        strain = gene_id_list[0]
        plasmid_id = gene_id_list[1]
    return strain, plasmid_id     

def parse_blast_xml(xml_file, **kwargs):
    assembly_id = Path(xml_file).stem.split('_')[0]
    parsing_type = kwargs.get('parsing_type', 'general')
    
    with open(xml_file, 'r') as handle:
        records = NCBIXML.parse(handle)
        
        # Dictionary to store contig IDs and their interval group managers
        contig_interval_groups = kwargs.get('contig_interval_groups', defaultdict(IntervalGroupManager))
        print(parsing_type)
        parsing_pickle = '/home/mf019/borrelia_plasmid_classifier_v3/parsing_tables/blast_parsing_dict.pkl'
        parsing_dict = pickle.load(open(parsing_pickle, 'rb'))
        
        # set up dict for intervals
        blast_hits = defaultdict(dict)
        
        for record in records:
            keys = [
                "assembly_id", "contig_id", "plasmid_id", "plasmid_name", "strain", "query_length", 
                "ref_length", "overall_percent_identity", "coverage_percentage", 
                "covered_length", "covered_intervals", "query_intervals", "subject_hit_coords",
            ]
            contig_id = record.query
            query_length = record.query_length
            hsp_counter = 0
            
            if contig_id not in blast_hits:
                blast_hits[contig_id] = []
            hits = blast_hits[contig_id]
            
            if len(record.alignments) == 0:
                hits = dict(zip(keys, 'NaN'*len(keys)))
            for alignment in record.alignments:
                plasmid_id = alignment.hit_id
                ref_length = alignment.length
                
                
                if parsing_type == 'wp':
                    strain, plasmid_name = get_name_from_acc(plasmid_id, parsing_dict)
                elif parsing_type == 'pf32':
                    strain, plasmid_name = parse_hit_id(plasmid_id)
                else:
                    strain = 'unknown' # this needs work
                    plasmid_name = alignment.hit_id

                results = {
                    "assembly_id": assembly_id,
                    "contig_id": contig_id,
                    "plasmid_id": plasmid_id,
                    "plasmid_name": plasmid_name,
                    "strain": strain,
                    "query_length": query_length,
                    "ref_length": ref_length,
                }
                alignment_results = calculate_percent_identity_and_coverage(alignment)
                results.update(alignment_results)
                hits.append(results)
        return blast_hits

# Function to determine the best match
def get_best_match(matches, key):
    """Highest percent identity takes the cake. specify which feature to compare.
    ex: get_best_match(matches, "percent_identity")"""
    best_match = None
    best_score = -1
    for match in matches:
        if match[key] > best_score:
            best_score = match[key]
            best_match = match
    return best_match

def get_hits_table(hits):
    rows = []
    for contig, hits in hits.items():
        for hit in hits:
            rows.append(hit)
    return pandas.DataFrame(rows)

        #hits_names = [parse_hit_id(name)[1] for name in hits]
        #print(f'{asm_id}\t{contig}\t{hits_names}')
        #for contig, intervals in contig_hit_intervals.items():
        #    if len(intervals) > 1:
        #        sorted_interval = sorted(intervals)
        #        print(contig, sorted_interval)
        #        previous_interval = sorted_interval[0]
        #        multiple_non_ovlp = False
        #        for current_interval in sorted_interval[1:]: 
        #            if previous_interval.end < current_interval.begin:
        #                multiple_non_ovlp = True
        #            previous_interval = current_interval
        #        if multiple_non_ovlp:
        #            print(f"{contig} has multiple non-overlapping pf32 hits!")
        #    else:
        #        print(contig, intervals)
                

In [110]:
wp_hits = parse_blast_xml(wp_results_test, parsing_type='wp')
pf32_hits= parse_blast_xml(pf_results_test, parsing_type='pf32')
pf32_hits_df = get_hits_table(pf32_hits)
wp_hits_df = get_hits_table(wp_hits)

wp
pf32


In [111]:
wp_hits_df

Unnamed: 0,assembly_id,contig_id,plasmid_id,plasmid_name,strain,query_length,ref_length,overall_percent_identity,coverage_percentage,covered_length,covered_intervals,query_intervals,subject_hit_coords
0,ESI26H,contig000001,gb|CP124100.1|,chromosome,NE_5261,910396,910422,99.952394,101.286986,922139,"[(213508, 214315), (213184, 213667), (213670, ...","[(438371, 441568), (1, 910396), (213176, 21414...","[(1, 910420), (438379, 441576), (435133, 43833..."
1,ESI26H,contig000001,gb|CP124096.1|,chromosome,NE_5267,910396,910422,99.952178,101.286986,922139,"[(213183, 213990), (435132, 438330), (213345, ...","[(438371, 441568), (1, 910396), (213176, 21414...","[(1, 910420), (438378, 441575), (435132, 43833..."
2,ESI26H,contig000001,gb|CP124104.1|,chromosome,NE_5248,910396,910663,99.927423,101.218014,921755,"[(213179, 214148), (438374, 441571), (909788, ...","[(438371, 441568), (1, 909583), (213176, 21414...","[(1, 909646), (435128, 438326), (438374, 44157..."
3,ESI26H,contig000001,gb|CP124088.1|,chromosome,Am315,910396,910229,99.554979,101.491713,923807,"[(909447, 910227), (213287, 213932), (213125, ...","[(438371, 441568), (213176, 214145), (213338, ...","[(1, 909305), (434768, 437966), (438014, 44121..."
4,ESI26H,contig000001,gb|CP031412.1|,chromosome,MM1,910396,908512,99.290610,101.389965,921140,"[(213217, 214024), (904706, 907514), (213055, ...","[(213176, 214145), (905896, 908705), (336791, ...","[(1, 904621), (434870, 438062), (438118, 44131..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,ESI26H,contig000051,gb|AE001577.1|,cp32-4,B31,210,30299,100.000000,0.689792,209,"[(10178, 10387)]","[(1, 210)]","[(10178, 10387)]"
241,ESI26H,contig000051,gb|CP019759.1|,cp32-4,B31_NRZ,210,30300,100.000000,0.689769,209,"[(10179, 10388)]","[(1, 210)]","[(10179, 10388)]"
242,ESI26H,contig000051,gb|CP019757.1|,cp32-2,B31_NRZ,210,31245,100.000000,0.668907,209,"[(10153, 10362)]","[(1, 210)]","[(10153, 10362)]"
243,ESI26H,contig000051,gb|CP001572.1|,cp32-5,Bol26,210,30186,100.000000,0.692374,209,"[(10146, 10355)]","[(1, 210)]","[(10146, 10355)]"


In [112]:
pf32_hits_df

Unnamed: 0,assembly_id,contig_id,plasmid_id,plasmid_name,strain,query_length,ref_length,overall_percent_identity,coverage_percentage,covered_length,covered_intervals,query_intervals,subject_hit_coords
0,ESI26H,contig000001,NE_5261_chromosome_ParA_2,chromosome,NE_5261,910396,380,100.0,96.052632,365,"[(1, 366)]","[(368852, 369949)]","[(1, 366)]"
1,ESI26H,contig000001,B379_chromosome_ParA_2,chromosome,B379,910396,380,99.453552,96.052632,365,"[(1, 366)]","[(368852, 369949)]","[(1, 366)]"
2,ESI26H,contig000001,80a_chromosome_ParA_2,chromosome,80a,910396,380,99.453552,96.052632,365,"[(1, 366)]","[(368852, 369949)]","[(1, 366)]"
3,ESI26H,contig000001,B500_chromosome_ParA_2,chromosome,B500,910396,380,99.450549,95.526316,363,"[(1, 364)]","[(368852, 369943)]","[(1, 364)]"
4,ESI26H,contig000001,NE_5261_chromosome_ParA_1,chromosome,NE_5261,910396,295,100.0,99.661017,294,"[(1, 295)]","[(281792, 282676)]","[(1, 295)]"
5,ESI26H,contig000002,JD1_cp32-12_ParA_1,cp32-12,JD1,61321,260,99.615385,99.615385,259,"[(1, 260)]","[(49214, 49993)]","[(1, 260)]"
6,ESI26H,contig000002,ZS7_cp32-12_ParA_1,cp32-12,ZS7,61321,260,98.846154,99.615385,259,"[(1, 260)]","[(49214, 49993)]","[(1, 260)]"
7,ESI26H,contig000002,297_cp32-12,cp32-12,297,61321,260,98.841699,99.230769,258,"[(1, 259)]","[(49217, 49993)]","[(1, 259)]"
8,ESI26H,contig000002,N40_cp32-12_ParA_1,cp32-12,N40,61321,260,92.277992,98.461538,256,"[(1, 257)]","[(49217, 49993)]","[(1, 257)]"
9,ESI26H,contig000002,B31_cp32-5,cp32-5,B31,61321,257,100.0,99.610895,256,"[(1, 257)]","[(18558, 19328)]","[(1, 257)]"


In [108]:
pf_df = get_pf32_table('ESI26H', pf32_hits)
wp_df = get_wp_table('ESI26H', wp_hits)
merged = pandas.merge(wp_df, pf_df, on=['assembly', 'contig', 'homology_group'], how='outer')

NameError: name 'groups' is not defined

In [698]:
merged[merged['contig']=='contig000002']

Unnamed: 0,assembly,contig,contig_len,homology_group,wp_id,wp_name,wp_percent_identity,wp_query_coords,pf32_id,pf32_name,pf32_percent_identity,pf32_query_coords
1,ESI26H,contig000002,61321,0,CP001565.1,cp32-12,95.634673,"(1, 10018)",JD1_cp32-12_ParA_1,cp32-12,99.615385,"[(49214, 49993)]"
2,ESI26H,contig000002,61321,1,CP001565.1,cp32-12,95.634673,"(10019, 12990)",,,,
3,ESI26H,contig000002,61321,2,CP001565.1,cp32-12,95.634673,"(13472, 13919)",B31_cp32-5,cp32-5,100.0,"[(18558, 19328)]"
4,ESI26H,contig000002,61321,3,CP001565.1,cp32-12,95.634673,"(13829, 18366)",,,,
5,ESI26H,contig000002,61321,4,CP001565.1,cp32-12,95.634673,"(18633, 22346)",,,,
6,ESI26H,contig000002,61321,5,CP001565.1,cp32-12,95.634673,"(22432, 39900)",,,,


In [687]:
wp_hits['contig000002'][0]

[{'contig_id': 'contig000002',
  'plasmid_id': 'CP001565.1',
  'plasmid_name': 'cp32-12',
  'query_length': 61321,
  'ref_length': 31433,
  'overall_percent_identity': 95.63467288798344,
  'coverage_percentage': 99.99681863010213,
  'covered_positions': 31432,
  'covered_intervals': [(10018, 31433),
   (1, 17505),
   (1, 10017),
   (22311, 26898),
   (28477, 31433),
   (18267, 22039),
   (27531, 27987)],
  'query_intervals': [(1, 10018),
   (10019, 12990),
   (13472, 13919),
   (13829, 18366),
   (18633, 22346),
   (22432, 39900),
   (39901, 61321)],
  'subject_hit_coords': [(31433, 10018),
   (17505, 1),
   (10017, 1),
   (26898, 22311),
   (31433, 28477),
   (22039, 18267),
   (27987, 27531)]},
 {'contig_id': 'contig000002',
  'plasmid_id': 'CP019921.1',
  'plasmid_name': 'cp32-9-4',
  'query_length': 61321,
  'ref_length': 62238,
  'overall_percent_identity': 92.983426064249,
  'coverage_percentage': 72.71763231466308,
  'covered_positions': 45258,
  'covered_intervals': [(26761, 47

In [212]:
os.listdir('dbs_v3/wp')

['.ipynb_checkpoints',
 'wp.nto',
 'wp.nhd',
 'wp.ndb',
 'wp.not',
 'wp.njs',
 'wp.nog',
 'wp.nhi',
 'wp.ntf',
 'wp.nsq',
 'wp.nhr',
 'wp.nos',
 'wp.nin']

In [220]:
for file in os.listdir('dbs_v3/wp')[1:]:
    new_name = f'{file.split(".")[0]}_db.{file.split(".")[1]}'
    os.rename(f'dbs_v3/wp/{file}', f'dbs_v3/wp/{new_name}')
    

In [223]:
command = ['blastn', '-query', '/home/mf019/borrelia_plasmid_classifier_v3/assemblies/longread/annotation/URI111H/URI111H.fna', '-task', '"blastn"', '-db', 'dbs_v3/wp_v2/wp', '-out', '/home/mf019/longread_pangenome/plasmid_id/calls_v8/wp/xml_files/URI111H_blast_results.xml', '-evalue 1e-100', '-num_threads 4', '-outfmt 5', '-max_target_seqs 5', '-max_hsps', '10']
print(' '.join(command))

blastn -query /home/mf019/borrelia_plasmid_classifier_v3/assemblies/longread/annotation/URI111H/URI111H.fna -task "blastn" -db dbs_v3/wp_v2/wp -out /home/mf019/longread_pangenome/plasmid_id/calls_v8/wp/xml_files/URI111H_blast_results.xml -evalue 1e-100 -num_threads 4 -outfmt 5 -max_target_seqs 5 -max_hsps 10


In [222]:
get_db_type('dbs_v3')

['dbs_v3/pf32_v3/pf32', 'Protein']
['dbs_v3/wp_v2/wp', 'Nucleotide']


[('dbs_v3/pf32_v3/pf32', 'blastx'), ('dbs_v3/wp_v2/wp', 'blastn')]