In [1]:
import os
import re
from functools import reduce
from Bio import SeqIO
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool

In [36]:
# Collect every *.txt alignment summary in the mapping directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# every downstream list (and the CSV row order) reproducible between runs.
# os.path.splitext is used so that only a real ".txt" extension qualifies.
alignments = sorted(
    name
    for name in os.listdir('all_mito_mappings_reversed')
    if os.path.splitext(name)[1] == '.txt'
)

In [7]:
def get_alignment_total(cigar):
    """Return the summed length of all ``M`` operations in a CIGAR string.

    Only runs written as ``<digits>M`` are counted; every other operation
    (``I``, ``D``, ``S``, ``=``, ``X`` ...) is ignored.

    Args:
        cigar: CIGAR string, e.g. ``"10M2I5M"``.

    Returns:
        int: total number of ``M`` positions; ``0`` when the string contains
        no ``M`` operation.
    """
    # sum() over converted ints always yields an int: a single-run CIGAR no
    # longer comes back as a digit string and an M-less CIGAR no longer raises.
    return sum(int(run_length) for run_length in re.findall(r'(\d+)M', cigar))

In [8]:
def get_contig_length(fasta, contig):
    """Look up the sequence length of one record in a FASTA file.

    Args:
        fasta: path of the FASTA file to scan.
        contig: record id to look for.

    Returns:
        Length of the first record whose id equals ``contig``, or ``-1`` when
        no record in the file has that id.
    """
    with open(fasta, "r") as handle:
        # Lazy scan: parsing stops at the first matching record.
        matching_lengths = (
            len(entry.seq)
            for entry in SeqIO.parse(handle, "fasta")
            if entry.id == contig
        )
        return next(matching_lengths, -1)
    

In [17]:
# Hard-coded two-file subset: this rebinds `alignments`, replacing the list
# built earlier from the directory listing of all_mito_mappings_reversed.
# NOTE(review): looks like a debugging/re-run override for gp041 only — confirm,
# and drop this cell when every *.txt mapping should be processed.
alignments = ['gp041.1.chmM.reversed_contigs.txt', 'gp041.2.chmM.reversed_contigs.txt']

In [18]:
# Build the (assembly FASTA path, contig id) pair for every alignment file.
# File names look like "<sample>.<hap>.<...>.txt"; the contig id is the first
# whitespace-separated field on the first line of the file.
fasta_contigs = []
for alignment in alignments:
    # Context manager so the handle is closed instead of leaking per iteration.
    with open(f'all_mito_mappings_reversed/{alignment}', 'r') as handle:
        info = handle.read().splitlines()
    info_items = info[0].split()
    contig = info_items[0]

    name_fields = alignment.split('.')
    sample = name_fields[0]
    # Only the last character of the second field is used as the haplotype tag.
    hap = name_fields[1][-1]

    fasta = f"../all_assemblies/{sample}.bp.hap{hap}.p_ctg.fa"
    fasta_contigs.append((fasta, contig))

In [19]:
# Display the (FASTA path, contig id) pairs collected by the loop above.
fasta_contigs

[('../all_assemblies/gp041.bp.hap1.p_ctg.fa', 'h1tg000400c'),
 ('../all_assemblies/gp041.bp.hap2.p_ctg.fa', 'h2tg000373c')]

In [20]:
# Look up every contig length in parallel; starmap returns the lengths in the
# same order as fasta_contigs.
# The worker count is capped at the CPU count (one process per file does not
# scale to the full sample list) and floored at 1, because Pool(0) raises
# ValueError when fasta_contigs is empty.
n_workers = max(1, min(len(fasta_contigs), os.cpu_count() or 1))
with Pool(n_workers) as p:
    results = p.starmap(get_contig_length, fasta_contigs)

In [23]:
# Display the contig lengths returned by the pool (one per fasta_contigs entry).
results

[33140, 33140]

In [83]:
# Assemble one CSV row per alignment:
# [sample, hap, contig, aligned M total, contig length, aligned / length].
# zip() would silently truncate to the shortest input, so refuse to continue if
# the three lists are out of step (e.g. `results` left over from another run).
if not (len(fasta_contigs) == len(results) == len(alignments)):
    raise ValueError(
        "fasta_contigs, results and alignments differ in length; "
        "re-run the cells that build them before assembling the table"
    )

data = []
for (fasta_path, contig), length, alignment in zip(fasta_contigs, results, alignments):
    name_fields = alignment.split('.')
    sample = name_fields[0]
    # Last character only, matching how the FASTA path was derived from the name.
    hap = name_fields[1][-1]

    # Context manager so the handle is closed instead of leaking per iteration.
    with open(f'all_mito_mappings_reversed/{alignment}', 'r') as handle:
        info = handle.read().splitlines()
    # The CIGAR string is the last whitespace-separated field of the first line.
    cigar = info[0].split()[-1]

    # int() is a defensive normalisation in case the helper hands back a digit string.
    align_total = int(get_alignment_total(cigar))

    # get_contig_length reports -1 for a contig it could not find; record NaN
    # rather than a meaningless negative ratio in that case.
    fraction = align_total / length if length > 0 else float('nan')

    data.append([sample, hap, contig, align_total, length, fraction])

In [85]:
import pandas as pd

In [90]:
# Tabulate the collected rows and write them to disk (index column included).
csv_columns = ['sample', 'hap', 'contig', 'align_total', 'length', 'percentage']
align_table = pd.DataFrame(data, columns=csv_columns)
align_table.to_csv('align_information.csv')

In [91]:
# Display the current (FASTA path, contig id) pairs.
# NOTE(review): the saved output below lists many samples rather than only the
# two hard-coded gp041 files, so it presumably comes from a run over the full
# directory listing — confirm which run the exported CSV reflects.
fasta_contigs

[('../all_assemblies/gp011.bp.hap1.p_ctg.fa', 'h1tg000063l'),
 ('../all_assemblies/gp008.bp.hap1.p_ctg.fa', 'h1tg000060l'),
 ('../all_assemblies/gp038.bp.hap1.p_ctg.fa', 'h1tg000246l'),
 ('../all_assemblies/gp027.bp.hap1.p_ctg.fa', 'h1tg000067c'),
 ('../all_assemblies/gp024.bp.hap2.p_ctg.fa', 'h2tg000047l'),
 ('../all_assemblies/gp015.bp.hap2.p_ctg.fa', 'h2tg000065c'),
 ('../all_assemblies/gp020.bp.hap1.p_ctg.fa', 'h1tg000087l'),
 ('../all_assemblies/gp031.bp.hap2.p_ctg.fa', 'h2tg000088l'),
 ('../all_assemblies/gp033.bp.hap2.p_ctg.fa', 'h2tg000081c'),
 ('../all_assemblies/gp014.bp.hap1.p_ctg.fa', 'h1tg000051c'),
 ('../all_assemblies/gp041.bp.hap2.p_ctg.fa', 'h2tg000373c'),
 ('../all_assemblies/gp040.bp.hap1.p_ctg.fa', 'h1tg000123l'),
 ('../all_assemblies/gp016.bp.hap2.p_ctg.fa', 'h2tg000041c'),
 ('../all_assemblies/gp041.bp.hap1.p_ctg.fa', 'h1tg000400c'),
 ('../all_assemblies/gp006.bp.hap1.p_ctg.fa', 'h1tg000088l'),
 ('../all_assemblies/gp021.bp.hap1.p_ctg.fa', 'h1tg000082c'),
 ('../al

In [24]:
def remove_contig(fasta, contig):
    """Write a copy of an assembly FASTA with one contig left out.

    The copy is written to ``mito_filtered_assemblies/<basename of fasta>``;
    the directory is created when it does not exist yet. Each kept record is
    written as a ``>description`` header followed by its sequence on one line.

    The gp027 / hap2 assembly is skipped entirely (no output is written); this
    notebook writes that file in a separate cell.

    Args:
        fasta: path to the input FASTA; the file name is expected to look like
            ``<sample>.<...>.hap<N>.<...>`` (sample is the first dot-separated
            field, the haplotype is the last character of the third field).
        contig: id of the record to drop.

    Returns:
        None.
    """
    file_name = os.path.basename(fasta)
    name_fields = file_name.split('.')
    sample = name_fields[0]
    hap = name_fields[2][-1]

    if sample == 'gp027' and hap == '2':
        return

    # exist_ok=True keeps this safe when several pool workers race to create it.
    os.makedirs('mito_filtered_assemblies', exist_ok=True)

    with open(fasta, "r") as in_file, open(f'mito_filtered_assemblies/{file_name}', 'w') as outfile:
        for record in SeqIO.parse(in_file, "fasta"):
            if record.id != contig:
                outfile.write(f">{record.description}\n")
                outfile.write(f"{str(record.seq)}\n")
    
        
    
    

In [25]:
# Display the (FASTA path, contig id) pairs about to be passed to remove_contig.
fasta_contigs

[('../all_assemblies/gp041.bp.hap1.p_ctg.fa', 'h1tg000400c'),
 ('../all_assemblies/gp041.bp.hap2.p_ctg.fa', 'h2tg000373c')]

In [26]:
# Write the filtered copy of every assembly in parallel.
# remove_contig returns nothing, so the starmap result is deliberately not
# bound: assigning it to `results` would overwrite the contig lengths held
# under that name with a list of None.
# The worker count is capped at the CPU count and floored at 1, because
# Pool(0) raises ValueError when fasta_contigs is empty.
n_workers = max(1, min(len(fasta_contigs), os.cpu_count() or 1))
with Pool(n_workers) as p:
    p.starmap(remove_contig, fasta_contigs)

In [113]:
# One-off handling of the gp027 hap2 assembly: instead of dropping a whole
# contig, a fixed span is cut out of contig h2tg000012l and the two flanks are
# written as separate records ("...p1" and "...p2"). All other records are
# copied through unchanged.
# NOTE(review): 16569 presumably is the length of the mitochondrial insertion
# being excised, and 68872793 its 0-based start within the contig — confirm.
SPLIT_CONTIG_ID = 'h2tg000012l'
EXCISE_START = 68872793
EXCISE_LENGTH = 16569

os.makedirs('mito_filtered_assemblies', exist_ok=True)

with open('../all_assemblies/gp027.bp.hap2.p_ctg.fa', "r") as in_file, open('mito_filtered_assemblies/gp027.bp.hap2.p_ctg.fa', 'w') as outfile:
    for record in SeqIO.parse(in_file, "fasta"):
        if record.id == SPLIT_CONTIG_ID:
            sequence = record.seq
            excise_end = EXCISE_START + EXCISE_LENGTH
            # Slicing past the end would silently yield a truncated or empty
            # flank; fail loudly instead of writing a corrupt assembly.
            if len(sequence) < excise_end:
                raise ValueError(
                    f"{SPLIT_CONTIG_ID} is {len(sequence)} bp long, shorter than "
                    f"the end of the excised span ({excise_end})"
                )
            sequence1 = sequence[:EXCISE_START]
            sequence2 = sequence[excise_end:]

            outfile.write(f">{record.description}p1\n")
            outfile.write(f"{sequence1}\n")
            outfile.write(f">{record.description}p2\n")
            outfile.write(f"{sequence2}\n")
        else:
            outfile.write(f">{record.description}\n")
            outfile.write(f"{str(record.seq)}\n")