# Looking at ~~v4~~ parsed output to see why it's so BROKEN

In [10]:
import os
import glob
from rich.progress import Progress
from Bio import SeqIO

def process_results(results_dir):
    """Summarise the pairwise result entries found in ``results_dir``.

    Each directory entry name is split on ``_vs_`` into a source part and a
    target part. The text before the first underscore of each part is taken
    as that side's assembly identifier. The target part is everything after
    ``_vs_`` exactly as it appears in the entry name (no suffix is removed).

    Args:
        results_dir: Directory containing one entry per source/target pair.

    Returns:
        A 5-tuple ``(source_assemblies, target_assemblies, source_contigs,
        target_contigs, result_count)``. The first four are sets of strings
        and ``result_count`` is the number of entries processed.

    Raises:
        ValueError: If an entry name does not split into exactly two parts
            on ``_vs_``.
    """
    source_assemblies = set()
    target_assemblies = set()
    source_contigs = set()
    target_contigs = set()
    result_count = 0

    # List the directory once so the progress total and the loop always
    # describe the same set of entries.
    entries = os.listdir(results_dir)

    with Progress() as progress:
        task = progress.add_task('Getting results', total=len(entries))
        for result in entries:
            source_contig, target_contig = result.split('_vs_')

            # Derive each assembly from its own contig; previously the target
            # assembly was copied from the source contig, so the target set
            # could never differ from the source set.
            source_assembly = source_contig.split('_')[0]
            target_assembly = target_contig.split('_')[0]

            source_assemblies.add(source_assembly)
            target_assemblies.add(target_assembly)
            source_contigs.add(source_contig)
            target_contigs.add(target_contig)

            result_count += 1
            progress.update(task, advance=1)

        progress.update(task, total=result_count, completed=result_count, description='Finished processing')

    return source_assemblies, target_assemblies, source_contigs, target_contigs, result_count

def check_duplicates(file_path):
    """Collect the unique labels in a text file and report repeated ones.

    The file is read line by line; each line is stripped of surrounding
    whitespace and treated as one label (a blank line therefore counts as
    the empty-string label). Every repeat of an already-seen label is
    printed and recorded.

    Args:
        file_path: Path of the label file, one label per line.

    Returns:
        A tuple ``(labels, repeats)`` where ``labels`` is the set of distinct
        labels and ``repeats`` is a list with one item per repeated
        occurrence, in file order.
    """
    seen_labels = set()
    repeats = []

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            label = raw_line.strip()
            if label in seen_labels:
                # Second or later sighting of this label.
                print(f"DUPLICATE DETECTED!!!!!!{label}")
                repeats.append(label)
                continue
            seen_labels.add(label)

    return seen_labels, repeats

def check_matrix_results(matrix_cols):
    """Extract the source assemblies and source contigs from matrix labels.

    For every label, the source contig is the text before the first
    ``_vs_`` (the whole label when that separator is absent) and the source
    assembly is the text before the contig's first underscore. A progress
    bar is shown while the labels are processed.

    Args:
        matrix_cols: Sized iterable of label strings.

    Returns:
        A tuple ``(assemblies, contigs)`` of sets of strings.
    """
    assemblies_seen = set()
    contigs_seen = set()
    processed = 0

    with Progress() as bar:
        job = bar.add_task('Checking matrix results to determine what broke!!!!', total=len(matrix_cols))
        for processed, label in enumerate(matrix_cols, start=1):
            contig = label.partition('_vs_')[0]
            assembly = contig.partition('_')[0]
            assemblies_seen.add(assembly)
            contigs_seen.add(contig)
            bar.update(job, advance=1)
        # Mark the task as complete using the number of labels actually seen.
        bar.update(job, total=processed, completed=processed, description='Finished checking!!!')
    return assemblies_seen, contigs_seen

def find_missing_items(source_set, matrix_set):
    """Return the members of ``source_set`` that are absent from ``matrix_set``."""
    absent = source_set - matrix_set
    return absent

def print_missing_contig_lengths(missing_contigs, genbank_dir):
    """Print the sequence length of every record of each missing contig.

    For each contig, the file ``<genbank_dir>/<contig>.gbff`` is parsed as
    GenBank and one line is printed per record. When there are no missing
    contigs a short notice is printed instead.

    Args:
        missing_contigs: Collection of contig names (strings).
        genbank_dir: Directory holding one ``<contig>.gbff`` file per contig.

    Returns:
        None. All results are written to standard output.
    """
    print("getting the lengths of those missing contigs:")
    if not missing_contigs:
        print("NO MISSING CONTIGS!!!\n")
        return

    # Sets iterate in an arbitrary order; sorting keeps the report stable
    # from one run to the next.
    for contig in sorted(missing_contigs):
        # Stream the records instead of materialising them in a list first.
        for rec in SeqIO.parse(f'{genbank_dir}/{contig}.gbff', 'genbank'):
            print(f'{contig}, length: {len(rec.seq)}')
    print('\n')

def print_set_lengths(source_assemblies, target_assemblies, source_contigs, target_contigs):
    """Print the size of each of the four collections, grouped in pairs.

    The assembly counts are printed first and the contig counts second; a
    blank line follows each pair.
    """
    rows = (
        ('source_assemblies', source_assemblies, ''),
        ('target_assemblies', target_assemblies, '\n'),
        ('source_contigs', source_contigs, ''),
        ('target_contigs', target_contigs, '\n'),
    )
    for label, members, tail in rows:
        print(f'{label}: {len(members)}{tail}')

def print_matrix_lengths(matrix_source_assemblies, matrix_source_contigs):
    """Print how many assemblies and contigs were found in the parsed labels.

    A blank line is emitted after the contig count.
    """
    assembly_total = len(matrix_source_assemblies)
    print('Parsed assemblies: {}'.format(assembly_total))
    contig_total = len(matrix_source_contigs)
    print('Parsed contigs: {}'.format(contig_total), end='\n\n')

def print_missing_items(missing_assemblies, missing_contigs):
    """Print the missing assemblies and the missing contigs, one per line.

    Each collection is shown using its default string form; a blank line
    follows the contig line.
    """
    assemblies_line = 'missing assemblies: {}'.format(missing_assemblies)
    print(assemblies_line)
    contigs_line = 'missing contigs: {}'.format(missing_contigs)
    print(contigs_line, end='\n\n')

def scuffbuster(results_dir, v4_results_from_parsing, genbank_dir):
    """Compare the raw result entries against a parsed label file and report gaps.

    The steps run in a fixed order because each one prints to standard
    output: scan the results directory, load the label file (reporting
    duplicates), print the raw counts, derive and print the parsed counts,
    print which assemblies/contigs are missing from the parsed labels, and
    finally print the sequence lengths of the missing contigs.

    Args:
        results_dir: Directory of pairwise result entries.
        v4_results_from_parsing: Path of the label file to check.
        genbank_dir: Directory holding the per-contig GenBank files.
    """
    # Raw side: what exists in the results directory.
    src_asms, tgt_asms, src_contigs, tgt_contigs, _n_results = process_results(results_dir)

    # Parsed side: labels read from file; repeated labels are printed by the
    # helper and the returned list is not used further here.
    labels, _repeats = check_duplicates(v4_results_from_parsing)

    print_set_lengths(src_asms, tgt_asms, src_contigs, tgt_contigs)

    parsed_asms, parsed_contigs = check_matrix_results(labels)
    print_matrix_lengths(parsed_asms, parsed_contigs)

    # Anything present in the raw results but absent from the parsed labels.
    absent_asms = find_missing_items(src_asms, parsed_asms)
    absent_contigs = find_missing_items(src_contigs, parsed_contigs)
    print_missing_items(absent_asms, absent_contigs)

    print_missing_contig_lengths(absent_contigs, genbank_dir)

In [11]:
# Inputs for the diagnostic runs.
# Directory of pairwise result entries scanned by process_results().
results_dir = '/mnt/disks/big-results/asms_all_v_all/homology'
# Label files to check, one per version (relative paths).
parsed_v4 = 'v4/ava_homology_np_matrix_labels_v4.txt'
parsed_v5 = 'v5/ava_homology_np_matrix_labels_v5.txt'
# Directory from which '<contig>.gbff' files are read for missing contigs.
genbanks = '/home/mf019/longread_pangenome/synteny/asm_genbanks'

# Check the v4 label file against the raw results.
scuffbuster(results_dir, parsed_v4, genbanks)

Output()

Output()

source_assemblies: 49
target_assemblies: 49

source_contigs: 2942
target_contigs: 2942



Parsed assemblies: 49
Parsed contigs: 2931

missing assemblies: set()
missing contigs: {'UCT35H_contig000027', 'UCT35H_contig000025', 'URI48H_contig000022', 'UCT35H_contig000030', 'URI48H_contig000021', 'ESI26H_contig000036', 'UCT35H_contig000032', 'UCT50H_contig000077', 'UCT113H_contig000142', 'UCT113H_contig000183', 'URI36H_contig000037', 'ESI26H_contig000034'}

getting the lengths of those missing contigs:
UCT35H_contig000027, length: 281
UCT35H_contig000025, length: 315
URI48H_contig000022, length: 106
UCT35H_contig000030, length: 174
URI48H_contig000021, length: 108
ESI26H_contig000036, length: 376
UCT35H_contig000032, length: 152
UCT50H_contig000077, length: 142
UCT113H_contig000142, length: 182
UCT113H_contig000183, length: 139
URI36H_contig000037, length: 437
ESI26H_contig000034, length: 393




In [12]:
# Run the same check against the v5 label file; results_dir, parsed_v5 and
# genbanks must already be defined in this session.
scuffbuster(results_dir, parsed_v5, genbanks)

Output()

Output()

source_assemblies: 49
target_assemblies: 49

source_contigs: 2942
target_contigs: 2942



Parsed assemblies: 49
Parsed contigs: 2943

missing assemblies: set()
missing contigs: set()

getting the lengths of those missing contigs:
NO MISSING CONTIGS!!!

