In [None]:
import os
import csv
import glob
from Bio import SeqIO
from collections import defaultdict


In [None]:
# GenBank file for the B31 reference; passed to run_all() as reference_genbank.
b31_reference_genbank = '/Users/mf019/bioinformatics/longread_pangenome/ref/renamed_GCF_000008685.2.gbff'
# Directories used by the synteny check (absolute paths on the author's machine).
# Per-assembly annotations: run_all() reads <annotations_dir>/<sample_id>.gbff.
annotations_dir = '/Users/mf019/bioinformatics/longread_pangenome/assemblies/paired_assemblies/paired_only/longread/annotation'
# Alignment output: run_all() globs <alignments_dir>/*/align_coords.tsv.
alignments_dir = '/Users/mf019/bioinformatics/longread_pangenome/synteny-checking/pgv_mummer_output_2'
# Destination of the per-sample <sample_id>_B31_Synteny.tsv tables.
output_dir = '/Users/mf019/bioinformatics/longread_pangenome/synteny-checking/output'

# Okay, let's define some functions to parse the alignment TSV from MUMmer.
# We also need to parse the B31 genome and store it in a dictionary keyed by plasmid.

# Basically: {plasmid: {gene: {coords: (int, int), strand: int, name: str, ...}, ...}, ...}
# And the same thing for contigs.
# Then parse the alignment file and ask: given this assembly, this contig, and these coordinates that align with a reference plasmid,
# which genes fall within those coordinates on the contig, and which genes lie on the reference between the matching coordinates?

# :)

# We will also do this one assembly at a time. Probably also move the BLAST results into each assembly's directory so each assembly has everything in one place.

# That is the goal at least.

# I will go for a walk to think about what exactly I'm gonna do. Or rather, how exactly I'm gonna do this.

# ultimately I just need to *actually do it* but manually checking is probably the best way forward in the meantime.


In [None]:

# VS Code just deleted my function-definition cell,
# so the functions below had to be rewritten:

def parse_genbank(path):
    """Load every record of a GenBank file, keyed by record ID.

    Args:
        path: Location of the GenBank file handed to ``SeqIO.parse``.

    Returns:
        A ``defaultdict`` created without a default factory (so a missing
        ID still raises ``KeyError``) mapping ``record.id`` to the parsed
        record. When an ID occurs more than once, the last record wins.
    """
    by_id = defaultdict()
    by_id.update((rec.id, rec) for rec in SeqIO.parse(path, 'genbank'))
    return by_id

def parse_alignment(path):
    """Parse a tab-separated alignment table into a list of row dicts.

    The first line supplies the column names; every following non-blank
    line becomes one ``{column: value}`` dict with string values.

    Only spaces and line endings are trimmed from each line. Tabs are kept
    so that an empty first or last field still lines up with its column
    instead of shifting or dropping values.

    Args:
        path: Path of the TSV file to read.

    Returns:
        A list of dicts, one per data row, in file order. Returns an empty
        list when the file is empty or has no header. If a row has more
        values than there are columns, the extras are ignored; if it has
        fewer, the trailing columns are absent from that row's dict.
    """
    alignments = []
    with open(path, 'r') as infile:
        header_line = infile.readline()
        if not header_line.strip():
            # Empty file (or blank header): nothing to key the rows by.
            return alignments
        keys = header_line.strip(' \r\n').split('\t')
        for line in infile:
            if not line.strip():
                # Blank separator / trailing lines carry no alignment.
                continue
            values = line.strip(' \r\n').split('\t')
            alignments.append(dict(zip(keys, values)))
    return alignments

def is_within_range(feature, start, end):
    """Report whether a feature lies entirely inside ``start``..``end``.

    A feature qualifies only when its location starts at or after
    ``start`` and ends at or before ``end``; features that merely overlap
    one edge of the range are rejected (no truncation label is produced).

    Args:
        feature: Object exposing ``location.start`` and ``location.end``,
            both convertible with ``int()``.
        start: Lower bound of the range.
        end: Upper bound of the range.

    Returns:
        ``True`` when both bounds are respected, otherwise ``False``.

    NOTE(review): the feature positions and the bounds are compared as-is;
    confirm both use the same coordinate convention (0- vs 1-based).
    """
    location = feature.location
    begins_inside = start <= int(location.start)
    finishes_inside = end >= int(location.end)
    return begins_inside and finishes_inside

def get_coverage(length, coords):
    """Return the percentage of a sequence covered by a set of intervals.

    Each interval is inclusive at both ends and may be given in either
    orientation (start > end for reverse-strand alignments). Overlapping
    intervals are counted once.

    The intervals are sorted and merged rather than expanded into a set of
    individual positions, so memory use grows with the number of intervals
    instead of the number of covered bases.

    Args:
        length: Total length of the sequence the intervals refer to.
        coords: Iterable of ``(start, end)`` pairs (indexable with 0 and 1).

    Returns:
        ``covered_positions / length * 100`` as a float; ``0.0`` when
        ``coords`` is empty.

    Raises:
        ZeroDivisionError: If ``length`` is 0.
    """
    # Normalise orientation so every span is (low, high), then sweep left to right.
    spans = sorted(
        (min(pair[0], pair[1]), max(pair[0], pair[1])) for pair in coords
    )
    covered = 0
    reach = None  # highest position already counted
    for low, high in spans:
        if reach is None or low > reach:
            # Disjoint from everything counted so far.
            covered += high - low + 1
            reach = high
        elif high > reach:
            # Overlaps the current run; count only the new tail.
            covered += high - reach
            reach = high
    return (covered / length) * 100

def get_features_from_range(record, start, end):
    """Collect the CDS features of a record that sit fully inside a range.

    Args:
        record: Object whose ``features`` are iterated; each feature needs
            a ``type`` attribute and a location usable by
            ``is_within_range``.
        start: One end of the range.
        end: The other end of the range. The two ends are swapped when
            ``start > end`` (reverse-strand alignments).

    Returns:
        A list of the whole feature objects, in record order, that pass
        ``is_within_range`` and have ``type == 'CDS'``.
    """
    low, high = (end, start) if start > end else (start, end)
    selected = []
    for feature in record.features:
        if not is_within_range(feature, low, high):
            continue
        if feature.type == 'CDS':
            selected.append(feature)
    return selected

def get_genes_from_alignments(alignments, ref_dict, asm_dict):
    """Group alignments by contig and reference, attaching the genes they span.

    Args:
        alignments: Iterable of row dicts with the keys ``QUERY_ID``,
            ``QUERY_NAME``, ``QUERY_START``, ``QUERY_END``, ``QUERY_LENGTH``,
            ``REF_ID``, ``REF_NAME``, ``REF_START``, ``REF_END``,
            ``REF_LENGTH`` and ``IDENTITY``.
        ref_dict: Mapping of reference name to a record with ``seq`` and
            ``features``.
        asm_dict: Mapping of contig name to a record with ``seq`` and
            ``features``.

    Returns:
        A mapping ``{contig_name: {'contig_length': int,
        reference_name: [alignment_dict, ...]}}``. Each ``alignment_dict``
        has a ``'ref_aln'`` and an ``'asm_aln'`` entry holding coordinates,
        aligned length and the ``(locus_tag, product)`` tuples of the CDS
        features inside the aligned range. ``'percent_cov'`` is left at 0
        here; coverage is not computed per alignment in this function.

    Raises:
        KeyError: If a row lacks one of the columns above, or names a
            reference/contig that is missing from ``ref_dict``/``asm_dict``.
    """
    per_contig = defaultdict()
    for row in alignments:
        # The alignment table labels the reference as QUERY_* and the
        # assembly as REF_*, so the columns are crossed over on purpose.
        reference_id = row['QUERY_ID']  # read but not used further
        reference_name = row['QUERY_NAME']
        reference_from = int(row['QUERY_START'])
        reference_to = int(row['QUERY_END'])
        reference_aligned = int(row['QUERY_LENGTH'])  # aligned length
        reference_record = ref_dict[reference_name]
        reference_length = len(reference_record.seq)

        sample_id = row['REF_ID']  # read but not used further
        # Rewrite the contig name to the form used as asm_dict keys
        # (presumably e.g. 'contig000001' -> 'contig_1'; verify against the
        # annotation's record IDs).
        contig = row['REF_NAME'].replace("0000", "_").replace("_0", "_")
        contig_from = int(row['REF_START'])
        contig_to = int(row['REF_END'])
        contig_aligned = int(row['REF_LENGTH'])  # aligned length
        identity = row['IDENTITY']
        contig_record = asm_dict[contig]
        contig_length = len(contig_record.seq)

        reference_features = get_features_from_range(
            reference_record, reference_from, reference_to
        )
        contig_features = get_features_from_range(
            contig_record, contig_from, contig_to
        )
        reference_genes = simplify_genes_for_contig(reference_features)
        contig_genes = simplify_genes_for_contig(contig_features)

        if contig not in per_contig:
            fresh_entry = defaultdict(dict)
            fresh_entry['contig_length'] = contig_length
            per_contig[contig] = fresh_entry

        reference_side = {
            'ref_length': reference_length,
            'start': reference_from,
            'end': reference_to,
            'aln_length': reference_aligned,
            'percent_cov': 0,
            'features': reference_genes,
        }
        contig_side = {
            'start': contig_from,
            'end': contig_to,
            'aln_length': contig_aligned,
            # Identity is relative to the reference.
            'aln_identity': identity,
            'percent_cov': 0,
            'features': contig_genes,
        }
        per_contig[contig].setdefault(reference_name, []).append(
            {'ref_aln': reference_side, 'asm_aln': contig_side}
        )
    return per_contig

def simplify_genes_for_contig(features):
    """Reduce features to ``(locus_tag, product)`` tuples.

    Args:
        features: Iterable of objects whose ``qualifiers`` mapping holds a
            ``'locus_tag'`` list and a ``'product'`` list of strings.

    Returns:
        A list with one tuple per feature, in input order: the first locus
        tag with surrounding quote and square-bracket characters stripped
        (quotes first, then ``[``, then ``]``), and the product strings
        joined with single spaces.

    Raises:
        KeyError: If a feature has no ``'locus_tag'`` or ``'product'``.
    """
    return [
        (
            feature.qualifiers['locus_tag'][0].strip("'").strip('[').strip(']'),
            ' '.join(feature.qualifiers['product']),
        )
        for feature in features
    ]

def make_table_for_asm(genes_dict, output_path):
    """Write one TSV row per (contig, reference) pair of an assembly.

    Columns: contig id, contig length, reference name, reference length,
    percent of the contig covered by its alignments to that reference,
    percent of the reference covered, and the number of distinct genes
    (by locus tag) found inside the aligned ranges on the reference and on
    the contig.

    Gene counts are computed per reference: they do not carry over from
    one reference to the next on the same contig, and a gene spanned by
    several overlapping alignments is counted once.

    Args:
        genes_dict: Mapping shaped like the result of
            ``get_genes_from_alignments``:
            ``{contig: {'contig_length': int, ref_name: [alignment, ...]}}``.
        output_path: File to write; it is created or overwritten only after
            all rows have been built.
    """
    header_row = 'contig_id\tcontig_len\tref\tref_len\tcontig_cov\treference_cov\tgenes_on_ref\tgenes_on_contig\n'
    lines = [header_row]
    for contig, per_ref in genes_dict.items():
        for aln, hits in per_ref.items():
            # 'contig_length' is metadata stored next to the reference names.
            if aln == 'contig_length':
                continue
            if not hits:
                # No alignments recorded for this reference: nothing to report.
                continue
            contig_len = per_ref['contig_length']
            ref_len = hits[0]['ref_aln']['ref_length']
            aln_coords = [
                (hit['asm_aln']['start'], hit['asm_aln']['end']) for hit in hits
            ]
            ref_aln_coords = [
                (hit['ref_aln']['start'], hit['ref_aln']['end']) for hit in hits
            ]
            # Sets of locus tags, rebuilt for every reference.
            ref_genes = {
                gene[0] for hit in hits for gene in hit['ref_aln']['features']
            }
            contig_genes = {
                gene[0] for hit in hits for gene in hit['asm_aln']['features']
            }
            contig_cov = get_coverage(contig_len, aln_coords)
            reference_cov = get_coverage(ref_len, ref_aln_coords)
            lines.append(
                f'{contig}\t{contig_len}\t{aln}\t{ref_len}\t{contig_cov:.2f}\t{reference_cov:.2f}\t{len(ref_genes)}\t{len(contig_genes)}\n'
            )
    with open(output_path, 'w') as outfile:
        outfile.writelines(lines)

def run_all(alignments_dir, annotations_dir, reference_genbank, output_dir):
    """Build a synteny table for every sample that has an alignment file.

    For each ``<alignments_dir>/<sample_id>/align_coords.tsv`` the matching
    annotation ``<annotations_dir>/<sample_id>.gbff`` is parsed, the genes
    inside each alignment are collected, and the result is written to
    ``<output_dir>/<sample_id>_B31_Synteny.tsv``.

    Args:
        alignments_dir: Directory holding one sub-directory per sample.
        annotations_dir: Directory holding the per-sample ``.gbff`` files.
        reference_genbank: Path of the reference GenBank file.
        output_dir: Directory for the output tables; created if missing.

    Samples are processed in sorted path order. A sample whose annotation
    file does not exist is reported and skipped instead of aborting the
    whole run.
    """
    ref_dict = parse_genbank(reference_genbank)
    os.makedirs(output_dir, exist_ok=True)
    # Sorted so repeated runs visit the samples in the same order.
    alignment_files = sorted(glob.glob(f'{alignments_dir}/*/align_coords.tsv'))
    for alignment in alignment_files:
        # The sample ID is the name of the directory containing the TSV.
        sample_id = os.path.basename(os.path.dirname(alignment))
        print(f'parsing alignments for {sample_id}!')
        assembly = f'{annotations_dir}/{sample_id}.gbff'
        has_annotation = os.path.exists(assembly)
        print(f'gbff for {sample_id} exists: {has_annotation}')
        print(assembly)
        if not has_annotation:
            print(f'no gbff for {sample_id}, skipping')
            continue
        asm_dict = parse_genbank(assembly)
        alignments = parse_alignment(alignment)
        genes = get_genes_from_alignments(alignments, ref_dict, asm_dict)
        output_path = f'{output_dir}/{sample_id}_B31_Synteny.tsv'
        make_table_for_asm(genes, output_path)
        print('Finished! moving on')

# Run the whole pipeline with the paths configured above.
run_all(
    alignments_dir=alignments_dir,
    annotations_dir=annotations_dir,
    reference_genbank=b31_reference_genbank,
    output_dir=output_dir,
)

In [None]:
# Parse the UWI263P annotation on its own for inspection.
# NOTE(review): the variable name suggests this sample triggered an error — confirm.
error_gb = parse_genbank('/Users/mf019/bioinformatics/longread_pangenome/assemblies/paired_assemblies/paired_only/longread/annotation/UWI263P.gbff')

In [None]:
# Look up the record stored under the ID 'contig_1' (raises KeyError if absent);
# as the last expression of a notebook cell its value would be displayed.
error_gb['contig_1']