In [1]:
import os
from shutil import rmtree

# Shell command template for the edcontigs.py script. The three '{}' placeholders
# are filled positionally; the cells below pass (reference FASTA, sequence FASTA,
# report file). Both stdout and stderr of the script are appended ('>> ... 2>&1')
# to the file given as the third argument.
EDCONTIGS_SCRIPT_CMD = '/home/diplomski-rad/racon-edcontigs/racon/scripts/edcontigs.py {} {} >> {} 2>&1'

# Calculate edit distance with the edcontigs script

## Get directories

In [2]:
def check_dir(dir_name):
    """Tell whether a directory name belongs to a racon-hax run.

    Args:
        dir_name: Directory name to test.

    Returns:
        True when the substring 'racon-hax' occurs in ``dir_name``,
        False otherwise.
    """
    return 'racon-hax' in dir_name

# Entries of the current working directory accepted by check_dir, in sorted order.
consensus_directories = list(filter(check_dir, sorted(os.listdir('./'))))

# Entries of the parent directory whose name contains 'fusobacterium', in sorted order.
assembly_directories = [
    entry for entry in sorted(os.listdir('../'))
    if 'fusobacterium' in entry
]

## Helper functions

In [3]:
import numpy as np

def calc_number_of_Ns(consensus_path, result_path):
    """Count 'N' characters in a FASTA file and append the total to a report.

    Lines that start with '>' (after stripping surrounding whitespace) are
    treated as headers and skipped. Every other line contributes its number
    of upper-case 'N' characters; lower-case 'n' is not counted.

    Args:
        consensus_path: Path of the FASTA file to scan.
        result_path: Path of the report file. It is opened in append mode,
            so it is created when missing and existing content is kept.

    Returns:
        None. One newline-terminated line of the form
        ``Number of Ns in consensus: <count>`` is appended to ``result_path``.
    """
    with open(consensus_path, 'r') as fasta_file:
        num_Ns = sum(
            line.count('N')
            for line in fasta_file
            if not line.strip().startswith('>')
        )

    with open(result_path, 'a') as report_file:
        # Terminate the record with a newline so that anything appended to the
        # report afterwards starts on its own line instead of being glued on.
        report_file.write('Number of Ns in consensus: {}\n'.format(num_Ns))

## Generate edit distance reports for fusobacterium

In [4]:
# Maps (strain, strain_type) -> path of the reference FASTA for that strain.
# strain_type is None for every strain except 'nucleatum', which has two entries.
_FUSOBACTERIUM_ROOT = '/home/data/oxford_nanopore/bacteria/fusobacterium'

references_dir = {
    strain_key: '{}/{}'.format(_FUSOBACTERIUM_ROOT, relative_fasta)
    for strain_key, relative_fasta in (
        (('gonidiaformans', None), 'gonidiaformans/f-gonidiaformans.fasta'),
        # NOTE(review): the file name is spelled 'f-motriferum' while the
        # directory is 'mortiferum' - kept as is; confirm against the file on disk.
        (('mortiferum', None), 'mortiferum/f-motriferum.fasta'),
        (('necrophorum', None), 'necrophorum/f-necrophorum.fasta'),
        (('nucleatum', '23726'), 'nucleatum-23726/f-nucleatum-23726.fasta'),
        (('nucleatum', '25586'), 'nucleatum-25586/f-nucleatum-25586.fasta'),
        (('periodonticum', None), 'periodonticum/f-periodonticum.fasta'),
        (('ulcerans', None), 'ulcerans/f-ulcerans.fasta'),
        (('varium', None), 'varium/f-varium.fasta'),
    )
}

### No cw

In [5]:
RESULT_DIR_PATH = '/home/diplomski-rad/results/ont/fusobacterium-dataset/'

# For every 'f-bact' consensus directory WITHOUT the 'cw' tag: append the
# edcontigs.py output and the N count of its consensus to the matching report.
for dir_name in consensus_directories:

    if 'f-bact' not in dir_name:
        continue
    if 'cw' in dir_name:
        continue

    strain_type = None

    # The directory name is split on '-'. Names containing 'nucleatum' carry one
    # extra field (the strain type) right after the strain; a name with any
    # other number of fields raises ValueError here.
    if 'nucleatum' not in dir_name:
        _, _, strain, neighbourhood_string, _, model_number, _, _ = dir_name.split('-')
    else:
        _, _, strain, strain_type, neighbourhood_string, _, model_number, _, _ = dir_name.split('-')

    print(strain, neighbourhood_string, strain_type, model_number)

    ref_path = references_dir[(strain, strain_type)]
    consensus_path = os.path.join('./', dir_name, 'consensus.fasta')

    # Reports live in a per-strain sub-directory ('<strain>' or '<strain>-<type>').
    result_path = os.path.join(
        RESULT_DIR_PATH,
        strain if strain_type is None else '{}-{}'.format(strain, strain_type),
        '{}-model-{}-all-contigs-racon-hax-report.txt'.format(neighbourhood_string, model_number)
    )

    # Edit distance calculation. os.system does not raise when the command
    # fails, so surface a non-zero status instead of dropping it silently.
    exit_status = os.system(EDCONTIGS_SCRIPT_CMD.format(ref_path, consensus_path, result_path))
    if exit_status != 0:
        print('WARNING: edcontigs.py returned status {} for {}'.format(exit_status, dir_name))

    # Calculating the number of Ns.
    calc_number_of_Ns(consensus_path, result_path)

gonidiaformans n15 None 11
gonidiaformans n15 None 23
gonidiaformans n15 None 24
gonidiaformans n20 None 11
gonidiaformans n20 None 23
gonidiaformans n20 None 24
mortiferum n15 None 11
mortiferum n15 None 23
mortiferum n15 None 24
mortiferum n20 None 11
mortiferum n20 None 23
mortiferum n20 None 24
necrophorum n15 None 11
necrophorum n15 None 23
necrophorum n15 None 24
necrophorum n20 None 11
necrophorum n20 None 23
necrophorum n20 None 24
nucleatum n15 23726 11
nucleatum n15 23726 23
nucleatum n15 23726 24
nucleatum n20 23726 11
nucleatum n20 23726 23
nucleatum n20 23726 24
nucleatum n15 25586 11
nucleatum n15 25586 23
nucleatum n15 25586 24
nucleatum n20 25586 11
nucleatum n20 25586 23
nucleatum n20 25586 24
periodonticum n15 None 11
periodonticum n15 None 23
periodonticum n15 None 24
periodonticum n20 None 11
periodonticum n20 None 23
periodonticum n20 None 24
ulcerans n15 None 11
ulcerans n15 None 23
ulcerans n15 None 24
ulcerans n20 None 11
ulcerans n20 None 23
ulcerans n20 None 2

### cw

In [6]:
RESULT_DIR_PATH = '/home/diplomski-rad/results/ont/fusobacterium-dataset/'

# For every 'f-bact' consensus directory WITH the 'cw' tag: append the
# edcontigs.py output and the N count of its consensus to the matching report.
for dir_name in consensus_directories:

    if 'f-bact' not in dir_name:
        continue

    if 'cw' not in dir_name:
        continue

    strain_type = None

    # The directory name is split on '-'. 'cw' names have one more trailing
    # field than the non-cw ones, and names containing 'nucleatum' carry an
    # extra strain-type field right after the strain. Any other field count
    # raises ValueError here.
    if 'nucleatum' not in dir_name:
        _, _, strain, neighbourhood_string, _, model_number, _, _, _ = dir_name.split('-')
    else:
        _, _, strain, strain_type, neighbourhood_string, _, model_number, _, _, _ = dir_name.split('-')

    print(strain, neighbourhood_string, strain_type, model_number)

    ref_path = references_dir[(strain, strain_type)]
    consensus_path = os.path.join('./', dir_name, 'consensus.fasta')

    # Reports live in a per-strain sub-directory ('<strain>' or '<strain>-<type>').
    result_path = os.path.join(
        RESULT_DIR_PATH,
        strain if strain_type is None else '{}-{}'.format(strain, strain_type),
        '{}-model-{}-cw-all-contigs-racon-hax-report.txt'.format(neighbourhood_string, model_number)
    )

    # Edit distance calculation. os.system does not raise when the command
    # fails, so surface a non-zero status instead of dropping it silently.
    exit_status = os.system(EDCONTIGS_SCRIPT_CMD.format(ref_path, consensus_path, result_path))
    if exit_status != 0:
        print('WARNING: edcontigs.py returned status {} for {}'.format(exit_status, dir_name))

    # Calculating the number of Ns.
    calc_number_of_Ns(consensus_path, result_path)

gonidiaformans n15 None 11
gonidiaformans n15 None 23
gonidiaformans n15 None 24
gonidiaformans n20 None 11
gonidiaformans n20 None 23
gonidiaformans n20 None 24
mortiferum n15 None 11
mortiferum n15 None 23
mortiferum n15 None 24
mortiferum n20 None 11
mortiferum n20 None 23
mortiferum n20 None 24
necrophorum n15 None 11
necrophorum n15 None 23
necrophorum n15 None 24
necrophorum n20 None 11
necrophorum n20 None 23
necrophorum n20 None 24
nucleatum n15 23726 11
nucleatum n15 23726 23
nucleatum n15 23726 24
nucleatum n20 23726 11
nucleatum n20 23726 23
nucleatum n20 23726 24
nucleatum n15 25586 11
nucleatum n15 25586 23
nucleatum n15 25586 24
nucleatum n20 25586 11
nucleatum n20 25586 23
nucleatum n20 25586 24
periodonticum n15 None 11
periodonticum n15 None 23
periodonticum n15 None 24
periodonticum n20 None 11
periodonticum n20 None 23
periodonticum n20 None 24
ulcerans n15 None 11
ulcerans n15 None 23
ulcerans n15 None 24
ulcerans n20 None 11
ulcerans n20 None 23
ulcerans n20 None 2

## For racon report

In [6]:
RESULT_DIR_PATH = '/home/diplomski-rad/results/ont/fusobacterium-dataset/'

# Append the edcontigs.py output and the N count of the racon assembly
# ('iter2.fasta') to '<strain>/racon-report.txt'.
for dir_name in assembly_directories:

    # NOTE(review): only directories containing 'varium' are processed; this
    # looks like a filter left over from a partial re-run - confirm it is intended.
    if 'varium' not in dir_name:
        continue

    splits = dir_name.split('-')

    # Second '-'-separated field is the strain; a third field, when the name
    # has exactly three, is the strain type.
    strain_type = None
    strain = splits[1]
    if len(splits) == 3:
        strain_type = splits[2]

    print(strain, strain_type)

    ref_path = references_dir[(strain, strain_type)]
    assembly_path = os.path.join('../', dir_name, 'iter2.fasta')

    result_path = os.path.join(
        RESULT_DIR_PATH,
        strain if strain_type is None else '{}-{}'.format(strain, strain_type),
        'racon-report.txt'
    )

    # Edit distance calculation. os.system does not raise when the command
    # fails, so surface a non-zero status instead of dropping it silently.
    exit_status = os.system(EDCONTIGS_SCRIPT_CMD.format(ref_path, assembly_path, result_path))
    if exit_status != 0:
        print('WARNING: edcontigs.py returned status {} for {}'.format(exit_status, dir_name))

    # Calculating the number of Ns.
    calc_number_of_Ns(assembly_path, result_path)

varium None


### Delete wrong data - no cw

In [12]:
RESULT_DIR_PATH = '/home/diplomski-rad/results/ont/fusobacterium-dataset/'

# Clean-up for the 'f-bact' consensus directories WITHOUT the 'cw' tag: remove
# the 'edit-dist-calc' directory and cut each report back to the lines that
# precede the first line starting with 'Running MUMmer'.
for dir_name in consensus_directories:

    if 'f-bact' not in dir_name:
        continue
    if 'cw' in dir_name:
        continue

    strain_type = None

    # The directory name is split on '-'. Names containing 'nucleatum' carry one
    # extra field (the strain type) right after the strain; a name with any
    # other number of fields raises ValueError here.
    if 'nucleatum' not in dir_name:
        _, _, strain, neighbourhood_string, _, model_number, _, _ = dir_name.split('-')
    else:
        _, _, strain, strain_type, neighbourhood_string, _, model_number, _, _ = dir_name.split('-')

    print(strain, neighbourhood_string, strain_type, model_number)

    ref_path = references_dir[(strain, strain_type)]
    consensus_path = os.path.join('./', dir_name, 'consensus.fasta')

    result_path = os.path.join(
        RESULT_DIR_PATH,
        strain if strain_type is None else '{}-{}'.format(strain, strain_type),
        '{}-model-{}-all-contigs-racon-hax-report.txt'.format(neighbourhood_string, model_number)
    )

    # Remove temporary dirs.
    tmp_dir = os.path.join('./', dir_name, 'edit-dist-calc')
    if os.path.exists(tmp_dir):
        rmtree(tmp_dir)

    # Remove previous results.
    with open(result_path) as f:
        lines = f.readlines()
    # Default to "keep everything": when no 'Running MUMmer' line exists there
    # is nothing to cut off. Starting from 0 here would empty such a report.
    index = len(lines)
    for i, line in enumerate(lines):
        if line.strip().startswith('Running MUMmer'):
            index = i
            break
    good_lines = lines[:index]
    os.remove(result_path)
    with open(result_path, 'w') as f:
        # Each kept line is written back stripped of surrounding whitespace.
        for line in good_lines:
            f.write('{}\n'.format(line.strip()))
    

gonidiaformans n15 None 11
gonidiaformans n15 None 23
gonidiaformans n15 None 24
gonidiaformans n20 None 11
gonidiaformans n20 None 23
gonidiaformans n20 None 24
mortiferum n15 None 11
mortiferum n15 None 23
mortiferum n15 None 24
mortiferum n20 None 11
mortiferum n20 None 23
mortiferum n20 None 24
necrophorum n15 None 11
necrophorum n15 None 23
necrophorum n15 None 24
necrophorum n20 None 11
necrophorum n20 None 23
necrophorum n20 None 24
nucleatum n15 23726 11
nucleatum n15 23726 23
nucleatum n15 23726 24
nucleatum n20 23726 11
nucleatum n20 23726 23
nucleatum n20 23726 24
nucleatum n15 25586 11
nucleatum n15 25586 23
nucleatum n15 25586 24
nucleatum n20 25586 11
nucleatum n20 25586 23
nucleatum n20 25586 24
periodonticum n15 None 11
periodonticum n15 None 23
periodonticum n15 None 24
periodonticum n20 None 11
periodonticum n20 None 23
periodonticum n20 None 24
ulcerans n15 None 11
ulcerans n15 None 23
ulcerans n15 None 24
ulcerans n20 None 11
ulcerans n20 None 23
ulcerans n20 None 2

### Delete wrong data - cw

In [9]:
RESULT_DIR_PATH = '/home/diplomski-rad/results/ont/fusobacterium-dataset/'

# Clean-up for the 'f-bact' consensus directories WITH the 'cw' tag: remove the
# 'edit-dist-calc' directory and cut each report back to the lines that precede
# the first line starting with 'Running MUMmer' (the whole report is kept when
# no such line exists).
for dir_name in consensus_directories:

    # Only names containing both 'f-bact' and 'cw' are handled in this cell.
    if not ('f-bact' in dir_name and 'cw' in dir_name):
        continue

    # Names containing 'nucleatum' carry an extra strain-type field right after
    # the strain; a name with an unexpected field count raises ValueError.
    strain_type = None
    if 'nucleatum' in dir_name:
        _, _, strain, strain_type, neighbourhood_string, _, model_number, _, _, _ = dir_name.split('-')
    else:
        _, _, strain, neighbourhood_string, _, model_number, _, _, _ = dir_name.split('-')

    print(strain, neighbourhood_string, strain_type, model_number)

    ref_path = references_dir[(strain, strain_type)]
    consensus_path = os.path.join('./', dir_name, 'consensus.fasta')

    # Per-strain report sub-directory: '<strain>' or '<strain>-<type>'.
    if strain_type is None:
        strain_dir = strain
    else:
        strain_dir = '{}-{}'.format(strain, strain_type)
    report_name = '{}-model-{}-cw-all-contigs-racon-hax-report.txt'.format(neighbourhood_string, model_number)
    result_path = os.path.join(RESULT_DIR_PATH, strain_dir, report_name)

    # Remove temporary dirs.
    tmp_dir = os.path.join('./', dir_name, 'edit-dist-calc')
    if not os.path.exists(tmp_dir):
        print('edit-dist-calc not found')
    else:
        print('Deleting edit-dist-calc')
        rmtree(tmp_dir)

    # Remove previous results.
    with open(result_path) as f:
        lines = f.readlines()

    # Position of the first 'Running MUMmer' line, or None when there is none.
    index = next(
        (i for i, line in enumerate(lines) if line.strip().startswith('Running MUMmer')),
        None,
    )
    if index is None:
        index = len(lines)
    else:
        print('Found line on', index)
    good_lines = lines[:index]

    # Rewrite the report with only the kept lines, each stripped of
    # surrounding whitespace.
    os.remove(result_path)
    with open(result_path, 'w') as f:
        f.writelines('{}\n'.format(line.strip()) for line in good_lines)
    

gonidiaformans n15 None 11
edit-dist-calc not found
Found line on 92
gonidiaformans n15 None 23
edit-dist-calc not found
Found line on 92
gonidiaformans n15 None 24
edit-dist-calc not found
Found line on 92
gonidiaformans n20 None 11
edit-dist-calc not found
Found line on 92
gonidiaformans n20 None 23
edit-dist-calc not found
Found line on 92
gonidiaformans n20 None 24
edit-dist-calc not found
Found line on 92
mortiferum n15 None 11
edit-dist-calc not found
Found line on 92
mortiferum n15 None 23
edit-dist-calc not found
Found line on 92
mortiferum n15 None 24
edit-dist-calc not found
Found line on 92
mortiferum n20 None 11
edit-dist-calc not found
Found line on 92
mortiferum n20 None 23
edit-dist-calc not found
Found line on 92
mortiferum n20 None 24
edit-dist-calc not found
Found line on 92
necrophorum n15 None 11
edit-dist-calc not found
Found line on 92
necrophorum n15 None 23
edit-dist-calc not found
Found line on 92
necrophorum n15 None 24
edit-dist-calc not found
Found line on 9

## Generate dnadiff again

In [11]:
# dnadiff invocation; placeholders in order: tools dir, output dir (used for the
# '-p' prefix), reference FASTA, consensus FASTA, directory of the 'err' file
# that stderr is appended to.
CONSENSUS_SUMMARY_CMD = '{}/mummer3.23/dnadiff -p {}/dnadiff-output {} {} ' \
                          '2>> {}/err'
# Copies the generated report over the result file (overwrites, does not append).
RESULT_CMD = 'cp {}/dnadiff-output.report {}'

RESULT_DIR_PATH = '/home/diplomski-rad/results/ont/fusobacterium-dataset/'
TOOLS_DIR = '/home/diplomski-rad/'

# For every 'f-bact' consensus directory WITHOUT the 'cw' tag: delete the old
# dnadiff files, run dnadiff again and copy its report to the result file.
for dir_name in consensus_directories:

    if 'f-bact' not in dir_name:
        continue
    if 'cw' in dir_name:
        continue

    strain_type = None

    # The directory name is split on '-'. Names containing 'nucleatum' carry one
    # extra field (the strain type) right after the strain; a name with any
    # other number of fields raises ValueError here.
    if 'nucleatum' not in dir_name:
        _, _, strain, neighbourhood_string, _, model_number, _, _ = dir_name.split('-')
    else:
        _, _, strain, strain_type, neighbourhood_string, _, model_number, _, _ = dir_name.split('-')

    print(strain, neighbourhood_string, strain_type, model_number)

    ref_path = references_dir[(strain, strain_type)]
    consensus_path = os.path.join('./', dir_name, 'consensus.fasta')

    result_path = os.path.join(
        RESULT_DIR_PATH,
        strain if strain_type is None else '{}-{}'.format(strain, strain_type),
        '{}-model-{}-all-contigs-racon-hax-report.txt'.format(neighbourhood_string, model_number)
    )

    # Remove old dnadiff files.
    dir_path = os.path.join('./', dir_name)
    for file_name in os.listdir(dir_path):
        if file_name.startswith('dnadiff'):
            os.remove(os.path.join(dir_path, file_name))

    # Generate new dnadiff files. os.system does not raise when a command
    # fails, so surface non-zero statuses instead of dropping them silently.
    dnadiff_status = os.system(CONSENSUS_SUMMARY_CMD.format(
        TOOLS_DIR,
        dir_path,
        ref_path,
        consensus_path,
        dir_path
    ))
    if dnadiff_status != 0:
        print('WARNING: dnadiff returned status {} for {}'.format(dnadiff_status, dir_name))

    copy_status = os.system(RESULT_CMD.format(
        dir_path,
        result_path
    ))
    if copy_status != 0:
        print('WARNING: copying the dnadiff report returned status {} for {}'.format(copy_status, dir_name))

gonidiaformans n15 None 11
gonidiaformans n15 None 23
gonidiaformans n15 None 24
gonidiaformans n20 None 11
gonidiaformans n20 None 23
gonidiaformans n20 None 24
mortiferum n15 None 11
mortiferum n15 None 23
mortiferum n15 None 24
mortiferum n20 None 11
mortiferum n20 None 23
mortiferum n20 None 24
necrophorum n15 None 11
necrophorum n15 None 23
necrophorum n15 None 24
necrophorum n20 None 11
necrophorum n20 None 23
necrophorum n20 None 24
nucleatum n15 23726 11
nucleatum n15 23726 23
nucleatum n15 23726 24
nucleatum n20 23726 11
nucleatum n20 23726 23
nucleatum n20 23726 24
nucleatum n15 25586 11
nucleatum n15 25586 23
nucleatum n15 25586 24
nucleatum n20 25586 11
nucleatum n20 25586 23
nucleatum n20 25586 24
periodonticum n15 None 11
periodonticum n15 None 23
periodonticum n15 None 24
periodonticum n20 None 11
periodonticum n20 None 23
periodonticum n20 None 24
ulcerans n15 None 11
ulcerans n15 None 23
ulcerans n15 None 24
ulcerans n20 None 11
ulcerans n20 None 23
ulcerans n20 None 2