## Parse indels and export in BED format

In [778]:
import pandas as pd
import sys

# Species comes from the first command-line argument ("human", "mouse", or
# "macaque"). When there is no argument, or the argument does not name an
# existing table (presumably the case inside a Jupyter kernel, where argv
# holds kernel options - TODO confirm), fall back to the default species.
# Only these two failure modes are caught; parsing errors still surface.
default_species = 'mouse'
try:
	species = sys.argv[1]
	file_name = f"indels.denovo_chrM_{species}.tab"
	df = pd.read_table( file_name )
except (IndexError, OSError):
	species = default_species
	file_name = f"indels.denovo_chrM_{species}.tab"
	df = pd.read_table( file_name )

# Keep only relevant columns (explicit copy, so new columns are not written to a slice).
df = df[['CHROM','POS','REF','ALT','Mutation_ID']].copy()
df['Species'] = species

# Add indel size and type.
# NOTE(review): records whose REF and ALT have equal length (size 0) are labelled 'del'.
df['Indel_size'] = df['ALT'].str.len() - df['REF'].str.len()
df['Indel_type'] = df['Indel_size'].apply( lambda x: 'ins' if x > 0 else 'del' )

# Add BED format columns: a 1-bp, 0-based half-open interval covering POS.
df['Chrom'] = df['CHROM']
df['Start'] = df['POS'] - 1
df['End'] = df['POS']

# Create new mutation ID: <species>_<Mutation_ID>_<ins|del>.
df['New_Mutation_ID'] = df['Species'] + '_' + df['Mutation_ID'] + '_' + df['Indel_type']

# Remember the type before the helper columns are dropped, so the split below
# does not depend on substring matches inside the ID.
is_insertion = df['Indel_type'] == 'ins'

# Keep only the BED columns, selected by name rather than by position.
df = df[['Chrom','Start','End','New_Mutation_ID']]

# Save to BED file.
df.to_csv( f"{species}.indels.bed", sep='\t', index=False, header=False )

# Split into insertions and deletions.
df_ins = df[ is_insertion ]
df_del = df[ ~is_insertion ]
# Save to separate BED files.
df_ins.to_csv( f"{species}.indels.ins.bed", sep='\t', index=False, header=False )
df_del.to_csv( f"{species}.indels.del.bed", sep='\t', index=False, header=False )

## Get flanking sequences

### Run bedtools slop

In [779]:
import subprocess

slop_size = 200  # Number of base pairs to add as flanks
species_list = ["mouse", "macaque", "human"]

# chrM length (bp) written to the genome file that `bedtools slop -g` reads.
genome_sizes = {
	"mouse": 16300,
	"macaque": 16564,
	"human": 16569,
}

for species in species_list:
	genome_size = genome_sizes.get(species)
	if genome_size is None:
		print(f"Unknown species: {species}")
		continue

	# genome.txt is rewritten for every species; it is only read by the two
	# bedtools calls that follow.
	with open("genome.txt", "w") as f:
		f.write(f"chrM\t{genome_size}\n")
	print(f"Created genome.txt for {species} chrM")

	# Deletions first, then insertions.
	for indel_type in ("del", "ins"):
		slop_bed = f"{species}.indels.{indel_type}.slop{slop_size}.bed"
		# The context manager closes the output handle; check=True turns a
		# failing bedtools call into an error instead of leaving a silently
		# empty or truncated BED file behind.
		with open(slop_bed, "w") as slop_out:
			subprocess.run([
				"bedtools", "slop",
				"-i", f"{species}.indels.{indel_type}.bed",
				"-b", f"{slop_size}",
				"-g", "genome.txt"
			], stdout=slop_out, check=True)
		print(f"Created {slop_bed} with {slop_size} bp flanks")


Created genome.txt for mouse chrM
Created mouse.indels.del.slop200.bed with 200 bp flanks
Created mouse.indels.ins.slop200.bed with 200 bp flanks
Created genome.txt for macaque chrM
Created macaque.indels.del.slop200.bed with 200 bp flanks
Created macaque.indels.ins.slop200.bed with 200 bp flanks
Created genome.txt for human chrM
Created human.indels.del.slop200.bed with 200 bp flanks
Created human.indels.ins.slop200.bed with 200 bp flanks


### Run bedtools getfasta

In [780]:
# Reference chrM FASTA for each species.
fasta_dir = "../../data/refs/"
fasta_files = {
	"mouse": f"{fasta_dir}/mouse/NC_005089_9821insA.chrM.fa",
	"macaque": f"{fasta_dir}/rhesus_macaque/rheMac10.chrM.fasta",
	"human": f"{fasta_dir}/human/NC_012920.1.fa"
}

for sp in species_list:
	fasta_file = fasta_files[sp]
	# Insertions first, then deletions.
	for indel_type in ("ins", "del"):
		flank_fasta = f"{sp}.indels.{indel_type}.slop{slop_size}.fa"
		# check=True: stop on a bedtools failure rather than reporting a FASTA
		# file that was never (or only partially) written.
		subprocess.run([
			"bedtools", "getfasta",
			"-bed", f"{sp}.indels.{indel_type}.slop{slop_size}.bed",
			"-fi", fasta_file,
			"-fo", flank_fasta,
			"-name+"
		], check=True)
		print(f"Created {flank_fasta} with sequences")

Created mouse.indels.ins.slop200.fa with sequences
Created mouse.indels.del.slop200.fa with sequences
Created macaque.indels.ins.slop200.fa with sequences
Created macaque.indels.del.slop200.fa with sequences
Created human.indels.ins.slop200.fa with sequences
Created human.indels.del.slop200.fa with sequences


# Insertions:

## Parse FastA file sequence

In [781]:
from Bio import SeqIO
import pandas as pd
pd.set_option('display.max_rows', 20)

all_tables = []

for species in species_list:
	fasta_file = f"{species}.indels.ins.slop{slop_size}.fa"
	fasta_dict = {record.id: str(record.seq) for record in SeqIO.parse(fasta_file, "fasta")}
	if not fasta_dict:
		# Nothing to tabulate; the column operations below need at least one record.
		print(f"No records in {fasta_file}; skipping {species}")
		continue

	# Split every sequence right after the anchor base (the base at POS), so the
	# left flank ends with the anchor and the right flank starts just after it.
	# Fasta_ID looks like <species>_<pos>_<ref>_<alt>_<type>::<chrom>:<start>-<end>.
	# The anchor offset is taken from POS and the region start rather than from
	# the midpoint: the midpoint is only the anchor when the slopped window was
	# not clipped at a chromosome end.
	split_fasta_dict = {}
	for key, seq in fasta_dict.items():
		anchor = len(seq) // 2  # fallback when the ID cannot be parsed
		try:
			name, region = key.split('::', 1)
			offset = int(name.split('_')[1]) - 1 - int(region.rsplit(':', 1)[1].split('-')[0])
		except (IndexError, ValueError):
			offset = None
		if offset is not None and 0 <= offset < len(seq):
			anchor = offset
		split_fasta_dict[key] = (seq[:anchor+1], seq[anchor+1:])

	table = pd.DataFrame([
		{'Fasta_ID': k, 'Left_Flank': v[0], 'Right_Flank': v[1]}
		for k, v in split_fasta_dict.items()
	])

	# Extract Mutation_ID (<pos>_<ref>_<alt>) from Fasta_ID
	table['Mutation_ID'] = table['Fasta_ID'].str.split(':').str[0].str.split('_').str[1:4].str.join('_')

	table['Left_Flank'] = table['Left_Flank'].str.upper()
	table['Right_Flank'] = table['Right_Flank'].str.upper()
	# Reversed (not complemented) left flank, so index 0 is the base nearest the insertion point.
	table['Left_Flank_flipped'] = table['Left_Flank'].apply(lambda x: x[::-1])
	table[['Species', 'Position', 'Ref', 'Alt', 'Type', 'Region']] = table['Fasta_ID'].str.extract(r'([^_]+)_([^_]+)_([^_]+)_([^_]+)_([^:]+)::(.+)')
	# ALT minus its first base (the anchor) is the inserted sequence.
	table['Inserted_bases'] = table['Alt'].str[1:]
	table['Inserted_bases_len'] = table['Inserted_bases'].str.len()

	# Consider only the flank that is the same size as the insertion.
	table['Left_Flank_flipped_shortened'] = table.apply(
		lambda row: row['Left_Flank_flipped'][:row['Inserted_bases_len']], axis=1)
	table['Right_Flank_shortened'] = table.apply(
		lambda row: row['Right_Flank'][:row['Inserted_bases_len']], axis=1)
	table['Right_Flank_shortened_flipped'] = table['Right_Flank_shortened'].apply(lambda x: x[::-1])
	table['Left_Flank_flipped_shortened_unflipped'] = table['Left_Flank_flipped_shortened'].apply(lambda x: x[::-1])

	all_tables.append(table)

# Concatenate all tables into one DataFrame
table = pd.concat(all_tables, ignore_index=True)
table


Unnamed: 0,Fasta_ID,Left_Flank,Right_Flank,Mutation_ID,Left_Flank_flipped,Species,Position,Ref,Alt,Type,Region,Inserted_bases,Inserted_bases_len,Left_Flank_flipped_shortened,Right_Flank_shortened,Right_Flank_shortened_flipped,Left_Flank_flipped_shortened_unflipped
0,mouse_13053_T_TC_ins::chrM:12852-13253,TCATGCCTAGTAATCGGAAGCCTCGCCCTCACAGGAATACCATTCC...,CCCCCCCTAATCTCCATTAACGAAAATGACCCAGACCTCATAAACC...,13053_T_TC,TTTTGCGCCAAAACAATAACAATGCTTCATTTACTAAGCATACGAC...,mouse,13053,T,TC,ins,chrM:12852-13253,C,1,T,C,C,T
1,mouse_14640_A_AG_ins::chrM:14439-14840,GGACGAGGCTTATATTATGGATCATATACATTTATAGAAACCTGAA...,GGGGGCTTCTCAGTAGACAAAGCCACCTTGACCCGATTCTTCGCTT...,14640_A_AG,AGTTTAAGTAAGCTGATCCCAACAAGGTTATATACCCTACCGACTA...,mouse,14640,A,AG,ins,chrM:14439-14840,G,1,A,G,G,A
2,mouse_5171_G_GA_ins::chrM:4970-5371,CCGCGAGCCTTCAAAGCCCTAAGAAAACACACAAGTTTAACTTCTG...,AAAAAAAAAAATGGCGGTAGAAGTCTTAGTAGAGATTTCTCTACAC...,5171_G_GA,GCCGCCATCTTCATCTAACTTCGGTCATTATCCCATAAATCGACAA...,mouse,5171,G,GA,ins,chrM:4970-5371,A,1,G,A,A,G
3,mouse_9841_G_GT_ins::chrM:9640-10041,ATTTCTATTATTTGACCTAGAAATTGCTCTTCTACTTCCACTACCA...,ACTCATTAGATTATGATGATGTTCATAATTACCAATATGCCATCTA...,9841_G_GT,GCTTTAGTAATTAAAAAAAAATTTGATTAATGGTAAATGAGACAAG...,mouse,9841,G,GT,ins,chrM:9640-10041,T,1,G,A,A,G
4,mouse_12636_C_CA_ins::chrM:12435-12836,TGACTACCATCAGCAATAGAAGGCCCTACACCAGTTTCAGCACTAC...,AAAAAAATCATTGCCTTCTCTACATCAAGCCAACTAGGCCTGATAA...,12636_C_CA,CTACAGCAAAACCCACTCTCGTGTTTATCGACATTTATTACACCAA...,mouse,12636,C,CA,ins,chrM:12435-12836,A,1,C,A,A,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,human_16182_A_AC_ins::chrM:15981-16382,ATTAGCACCCAAAGCTAAGATTCTAATTTAAACTATTCTCTGTTCT...,ACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAAC...,16182_A_AC,AAACTACACCTAACCCAAAAATACATGATGTCCACCAGTTCATAAA...,human,16182,A,AC,ins,chrM:15981-16382,C,1,A,A,A,A
446,human_16188_T_TC_ins::chrM:15987-16388,ACCCAAAGCTAAGATTCTAATTTAAACTATTCTCTGTTCTTTCATG...,TCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCAC...,16188_T_TC,CCCCCAAAACTACACCTAACCCAAAAATACATGATGTCCACCAGTT...,human,16188,T,TC,ins,chrM:15987-16388,C,1,C,T,T,C
447,human_16188_T_TCC_ins::chrM:15987-16388,ACCCAAAGCTAAGATTCTAATTTAAACTATTCTCTGTTCTTTCATG...,TCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCAC...,16188_T_TCC,CCCCCAAAACTACACCTAACCCAAAAATACATGATGTCCACCAGTT...,human,16188,T,TCC,ins,chrM:15987-16388,CC,2,CC,TC,CT,CC
448,human_16356_T_TCTCGTCCCCATGGATGACCCCC_ins::chr...,GTACATAAAAACCCAATCCACATCAAAACCCCCTCCCCATGCTTAC...,TCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCA...,16356_T_TCTCGTCCCCATGGATGACCCCC,TCCCTAAACTGACATTACACGATACATGCCATTTACCGAAATACAT...,human,16356,T,TCTCGTCCCCATGGATGACCCCC,ins,chrM:16155-16556,CTCGTCCCCATGGATGACCCCC,22,TCCCTAAACTGACATTACACGA,TCTCGTCCCCATGGATGACCCC,CCCCAGTAGGTACCCCTGCTCT,AGCACATTACAGTCAAATCCCT


### Homopolymer?

In [782]:
# Homopolymer test on the start of a sequence (e.g. its first 2-5 bases).
def is_homopolymer(seq, n=3):
	"""Tell whether seq opens with a run of n identical characters.

	A sequence shorter than n never qualifies.
	"""
	if len(seq) < n:
		return False
	leading = seq[:n]
	return leading == n * seq[0]

# Flag flanks that begin with a homopolymer run next to the insertion point:
# the first bases of the right flank, and the first bases of the reversed left
# flank (i.e. the last bases of the left flank).
homopolymer_len = 3  # run length to test; adjust as needed
table['Right_Flank_homopolymer'] = table['Right_Flank'].apply(lambda x: is_homopolymer(x, n=homopolymer_len))
table['Left_Flank_flipped_homopolymer'] = table['Left_Flank_flipped'].apply(lambda x: is_homopolymer(x, n=homopolymer_len))

# Show both Right_Flank_homopolymer and Left_Flank_flipped_homopolymer in the same table
table[['Inserted_bases','Right_Flank', 'Right_Flank_homopolymer', 'Left_Flank_flipped', 'Left_Flank_flipped_homopolymer']]


Unnamed: 0,Inserted_bases,Right_Flank,Right_Flank_homopolymer,Left_Flank_flipped,Left_Flank_flipped_homopolymer
0,C,CCCCCCCTAATCTCCATTAACGAAAATGACCCAGACCTCATAAACC...,True,TTTTGCGCCAAAACAATAACAATGCTTCATTTACTAAGCATACGAC...,True
1,G,GGGGGCTTCTCAGTAGACAAAGCCACCTTGACCCGATTCTTCGCTT...,True,AGTTTAAGTAAGCTGATCCCAACAAGGTTATATACCCTACCGACTA...,False
2,A,AAAAAAAAAAATGGCGGTAGAAGTCTTAGTAGAGATTTCTCTACAC...,True,GCCGCCATCTTCATCTAACTTCGGTCATTATCCCATAAATCGACAA...,False
3,T,ACTCATTAGATTATGATGATGTTCATAATTACCAATATGCCATCTA...,False,GCTTTAGTAATTAAAAAAAAATTTGATTAATGGTAAATGAGACAAG...,False
4,A,AAAAAAATCATTGCCTTCTCTACATCAAGCCAACTAGGCCTGATAA...,True,CTACAGCAAAACCCACTCTCGTGTTTATCGACATTTATTACACCAA...,False
...,...,...,...,...,...
445,C,ACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAAC...,False,AAACTACACCTAACCCAAAAATACATGATGTCCACCAGTTCATAAA...,True
446,C,TCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCAC...,False,CCCCCAAAACTACACCTAACCCAAAAATACATGATGTCCACCAGTT...,True
447,CC,TCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCAC...,False,CCCCCAAAACTACACCTAACCCAAAAATACATGATGTCCACCAGTT...,True
448,CTCGTCCCCATGGATGACCCCC,TCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCA...,False,TCCCTAAACTGACATTACACGATACATGCCATTTACCGAAATACAT...,False


### Hamming distance between inserted bases and flanks

In [783]:
from difflib import SequenceMatcher

def hamming_similarity(s1, s2):
	"""Fraction of positions at which two equal-length strings agree.

	Returns None when the strings differ in length or are empty.
	"""
	length = len(s1)
	if length != len(s2) or length == 0:
		return None
	matches = 0
	for left_char, right_char in zip(s1, s2):
		if left_char == right_char:
			matches += 1
	return matches / length

# Similarity of the inserted bases to the equally long stretch of sequence
# taken from the end of the left flank (Hamming_Left) and from the start of
# the right flank (Hamming_Right).
flank_columns = {
	'Hamming_Left': 'Left_Flank_flipped_shortened_unflipped',
	'Hamming_Right': 'Right_Flank_shortened',
}
for hamming_col, flank_col in flank_columns.items():
	table[hamming_col] = table.apply(
		lambda row, flank_col=flank_col: hamming_similarity(row['Inserted_bases'], row[flank_col]), axis=1
	)

# Flag insertions whose inserted bases are identical to either flank.
matches_left = table['Hamming_Left'] == 1
matches_right = table['Hamming_Right'] == 1
table['Identical_to_flank'] = matches_left | matches_right

# Keep a copy of the insertion table for later cells.
table_insertions = table.copy()

shown_columns = ['Left_Flank_flipped_shortened_unflipped', 'Inserted_bases', 'Right_Flank_shortened', 'Hamming_Left', 'Hamming_Right']
table[shown_columns]

Unnamed: 0,Left_Flank_flipped_shortened_unflipped,Inserted_bases,Right_Flank_shortened,Hamming_Left,Hamming_Right
0,T,C,C,0.000000,1.000000
1,A,G,G,0.000000,1.000000
2,G,A,A,0.000000,1.000000
3,G,T,A,0.000000,0.000000
4,C,A,A,0.000000,1.000000
...,...,...,...,...,...
445,A,C,A,0.000000,0.000000
446,C,C,T,1.000000,0.000000
447,CC,CC,TC,1.000000,0.500000
448,AGCACATTACAGTCAAATCCCT,CTCGTCCCCATGGATGACCCCC,TCTCGTCCCCATGGATGACCCC,0.272727,0.363636


In [784]:
# Insertions longer than 2 bp whose similarity to each flank is below the
# cutoff (or undefined), listed from most to least similar.
hamming_cutoff = 1
longer_than_two = table['Inserted_bases_len'] > 2
left_below_cutoff = table['Hamming_Left'].isnull() | (table['Hamming_Left'] < hamming_cutoff)
right_below_cutoff = table['Hamming_Right'].isnull() | (table['Hamming_Right'] < hamming_cutoff)
shown_columns = ['Left_Flank_flipped_shortened_unflipped', 'Inserted_bases', 'Right_Flank_shortened', 'Hamming_Left', 'Hamming_Right']
(
	table.loc[longer_than_two & left_below_cutoff & right_below_cutoff, shown_columns]
	.sort_values(by=['Hamming_Left', 'Hamming_Right'], ascending=[False, False])
)

Unnamed: 0,Left_Flank_flipped_shortened_unflipped,Inserted_bases,Right_Flank_shortened,Hamming_Left,Hamming_Right
389,CCCC,CCCT,CTCC,0.750000,0.500000
390,CCCC,CCTC,TCCC,0.750000,0.500000
392,CCC,CTC,TCC,0.666667,0.333333
406,AAG,AAA,TTG,0.666667,0.000000
394,ATACT,ATGTT,AACCC,0.600000,0.200000
...,...,...,...,...,...
427,TAGAATT,ATTCCCC,AATTCCC,0.000000,0.714286
53,AAG,CCA,TTA,0.000000,0.333333
260,TGT,AAC,ACG,0.000000,0.333333
309,ACA,TTC,ACC,0.000000,0.333333


### Summary table using Hamming distance

In [785]:
def summarize_hamming(species, table=table, hamming_threshold=0.7):
	"""
	Count the insertions of one species by whether they are identical to a flank.

	An insertion is 'Identical' when Hamming_Left or Hamming_Right equals 1 and
	'Different' otherwise (partial matches therefore count as 'Different').

	Parameters
	----------
	species : value compared against the 'Species' column.
	table : DataFrame with 'Species', 'Inserted_bases_len', 'Hamming_Left' and
		'Hamming_Right' columns. The default is whatever `table` referred to
		when this function was defined, so pass it explicitly.
	hamming_threshold : not used by this function; kept so existing calls still work.

	Returns
	-------
	DataFrame indexed by 'Identical', 'Different', 'Total' with the columns
	'1-bp', '>1-bp', 'Total' and 'Species'.
	"""
	# Filter for the requested species. Boolean indexing already yields a new
	# frame, so the full table is not copied first.
	table = table[table['Species'] == species]

	# Similarity of exactly 1 to either flank.
	identical_to_flank = (
		(table['Hamming_Left'] == 1) | (table['Hamming_Right'] == 1)
	)
	single_base = table['Inserted_bases_len'] == 1
	multi_base = table['Inserted_bases_len'] > 1

	def size_class_counts(size_mask):
		"""Identical / different / all counts within one size class."""
		return [
			(size_mask & identical_to_flank).sum(),
			(size_mask & ~identical_to_flank).sum(),
			size_mask.sum()
		]

	summary_hamming = pd.DataFrame({
		'1-bp': size_class_counts(single_base),
		'>1-bp': size_class_counts(multi_base),
		'Total': [
			identical_to_flank.sum(),
			(~identical_to_flank).sum(),
			table.shape[0]
		],
		'Species': species
	}, index=['Identical', 'Different', 'Total'])

	return summary_hamming

# Build one summary per species and stack them under a 'Species' index level;
# the per-summary 'Species' column is then redundant and dropped.
summary_hamming_all = []
for sp in species_list:
	summary_hamming_all.append(summarize_hamming(sp, table=table))

summary_hamming_all_df = pd.concat(summary_hamming_all, keys=species_list, names=['Species'])
summary_hamming_all_df = summary_hamming_all_df.drop(columns=['Species'])
summary_hamming_all_df


Unnamed: 0_level_0,Unnamed: 1_level_0,1-bp,>1-bp,Total
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mouse,Identical,24,8,32
mouse,Different,18,8,26
mouse,Total,42,16,58
macaque,Identical,155,54,209
macaque,Different,76,39,115
macaque,Total,231,93,324
human,Identical,14,6,20
human,Different,30,18,48
human,Total,44,24,68


# Deletions:

## Parse FastA files

In [786]:
from Bio import SeqIO
import pandas as pd
pd.set_option('display.max_rows', 20)

mutation_type = 'del'  # 'ins' or 'del'
# Validate once, before any file is read.
if mutation_type != 'del':
	raise ValueError("Currently only 'del' mutation_type is supported.")

all_tables = []

for species in species_list:
	fasta_file = f"{species}.indels.{mutation_type}.slop{slop_size}.fa"
	fasta_dict = {record.id: str(record.seq) for record in SeqIO.parse(fasta_file, "fasta")}
	if not fasta_dict:
		# Nothing to tabulate; the column operations below need at least one record.
		print(f"No records in {fasta_file}; skipping {species}")
		continue

	# Split every sequence right after the anchor base (the base at POS), so the
	# left flank ends with the anchor and the right flank starts with the
	# deleted bases.
	# Fasta_ID looks like <species>_<pos>_<ref>_<alt>_<type>::<chrom>:<start>-<end>.
	# The anchor offset is taken from POS and the region start rather than from
	# the midpoint: the midpoint is only the anchor when the slopped window was
	# not clipped at a chromosome end.
	split_fasta_dict = {}
	for key, seq in fasta_dict.items():
		anchor = len(seq) // 2  # fallback when the ID cannot be parsed
		try:
			name, region = key.split('::', 1)
			offset = int(name.split('_')[1]) - 1 - int(region.rsplit(':', 1)[1].split('-')[0])
		except (IndexError, ValueError):
			offset = None
		if offset is not None and 0 <= offset < len(seq):
			anchor = offset
		split_fasta_dict[key] = (seq[:anchor+1], seq[anchor+1:])

	table = pd.DataFrame([
		{'Fasta_ID': k, 'Left_Flank': v[0], 'Right_Flank': v[1]}
		for k, v in split_fasta_dict.items()
	])

	# Extract Mutation_ID (<pos>_<ref>_<alt>) from Fasta_ID.
	table['Mutation_ID'] = table['Fasta_ID'].str.split(':').str[0].str.split('_').str[1:4].str.join('_')

	table['Left_Flank'] = table['Left_Flank'].str.upper()
	table['Right_Flank'] = table['Right_Flank'].str.upper()
	# Reversed (not complemented) left flank, so index 0 is the base nearest the deletion.
	table['Left_Flank_flipped'] = table['Left_Flank'].apply(lambda x: x[::-1])
	table[['Species', 'Position', 'Ref', 'Alt', 'Type', 'Region']] = table['Fasta_ID'].str.extract(r'([^_]+)_([^_]+)_([^_]+)_([^_]+)_([^:]+)::(.+)')
	# REF minus its first base (the anchor) is the deleted sequence.
	table['Deleted_bases'] = table['Ref'].str[1:]
	table['Deleted_bases_len'] = table['Deleted_bases'].str.len()
	# Right flank as it reads once the deleted bases are removed.
	table['Right_Flank_deleted'] = table.apply(
		lambda row: row['Right_Flank'][row['Deleted_bases_len']:] if row['Deleted_bases_len'] > 0 else row['Right_Flank'],
		axis=1
	)

	# Consider only the flank that is the same size as the deletion.
	table['Left_Flank_flipped_shortened'] = table.apply(
		lambda row: row['Left_Flank_flipped'][:row['Deleted_bases_len']], axis=1)
	table['Right_Flank_deleted_shortened'] = table.apply(
		lambda row: row['Right_Flank_deleted'][:row['Deleted_bases_len']], axis=1)
	table['Right_Flank_deleted_shortened_flipped'] = table['Right_Flank_deleted_shortened'].apply(lambda x: x[::-1])
	table['Left_Flank_flipped_shortened_unflipped'] = table['Left_Flank_flipped_shortened'].apply(lambda x: x[::-1])

	all_tables.append(table)

# Concatenate all tables into one DataFrame
table = pd.concat(all_tables, ignore_index=True)
table[['Deleted_bases_len', 'Left_Flank_flipped_shortened_unflipped', 'Deleted_bases', 'Right_Flank_deleted_shortened', 'Right_Flank_deleted']]


Unnamed: 0,Deleted_bases_len,Left_Flank_flipped_shortened_unflipped,Deleted_bases,Right_Flank_deleted_shortened,Right_Flank_deleted
0,1,A,T,T,TATATTCTCCAACAACAACGACAATCTAATTCCACTTATAGGCCTA...
1,58,GAAACAGGATCAAACAACCCAACAGGATTAAACTCAGATGCAGATA...,ACCCCTACTATACAATCAAAGATATCCTAGGTATCCTAATCATATT...,AACCCTAGTATTATTTTTCCCAGACATACTAGGAGACCCAGACAAC...,AACCCTAGTATTATTTTTCCCAGACATACTAGGAGACCCAGACAAC...
2,1,G,C,C,CGAAAAAAAAAAATGGCGGTAGAAGTCTTAGTAGAGATTTCTCTAC...
3,1,C,A,A,AAAAAAACCCACGATCAACTGAAGCAGCAACAAAATACTTCGTCAC...
4,3,ACC,AAT,AAT,AATAATTGGAGGCTTTGGAAACTGACTTGTCCCACTAATAATCGGA...
...,...,...,...,...,...
1092,1,A,C,C,CCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACT...
1093,2,CC,CT,TC,TCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCAC...
1094,1,C,T,T,TCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCAC...
1095,1,C,A,A,ACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAG...


### Hamming distance between deleted bases and flanks

In [787]:
from difflib import SequenceMatcher

def hamming_similarity(s1, s2):
	"""Share of aligned positions where s1 and s2 carry the same character.

	None is returned for strings of unequal length and for empty strings.
	"""
	size = len(s1)
	if size != len(s2) or size == 0:
		return None
	agreeing = [1 for first, second in zip(s1, s2) if first == second]
	return len(agreeing) / size

# Similarity of the deleted bases to the equally long stretch of sequence
# taken from the end of the left flank (Hamming_Left) and from the right flank
# just past the deletion (Hamming_Right).
flank_columns = {
	'Hamming_Left': 'Left_Flank_flipped_shortened_unflipped',
	'Hamming_Right': 'Right_Flank_deleted_shortened',
}
for hamming_col, flank_col in flank_columns.items():
	table[hamming_col] = table.apply(
		lambda row, flank_col=flank_col: hamming_similarity(row['Deleted_bases'], row[flank_col]), axis=1
	)

# Flag deletions whose deleted bases are identical to either flank.
matches_left = table['Hamming_Left'] == 1
matches_right = table['Hamming_Right'] == 1
table['Identical_to_flank'] = matches_left | matches_right

# Keep a copy of the deletion table for later cells.
table_deletions = table.copy()

shown_columns = ['Left_Flank_flipped_shortened_unflipped', 'Deleted_bases', 'Right_Flank_deleted_shortened', 'Hamming_Left', 'Hamming_Right', 'Identical_to_flank']
table[shown_columns]

Unnamed: 0,Left_Flank_flipped_shortened_unflipped,Deleted_bases,Right_Flank_deleted_shortened,Hamming_Left,Hamming_Right,Identical_to_flank
0,A,T,T,0.000000,1.000000,True
1,GAAACAGGATCAAACAACCCAACAGGATTAAACTCAGATGCAGATA...,ACCCCTACTATACAATCAAAGATATCCTAGGTATCCTAATCATATT...,AACCCTAGTATTATTTTTCCCAGACATACTAGGAGACCCAGACAAC...,0.241379,0.293103,False
2,G,C,C,0.000000,1.000000,True
3,C,A,A,0.000000,1.000000,True
4,ACC,AAT,AAT,0.333333,1.000000,True
...,...,...,...,...,...,...
1092,A,C,C,0.000000,1.000000,True
1093,CC,CT,TC,0.500000,0.000000,False
1094,C,T,T,0.000000,1.000000,True
1095,C,A,A,0.000000,1.000000,True


### Summary table using Hamming distance

In [None]:
def summarize_hamming(species, table=table, hamming_threshold=0.7):
	"""
	Count the deletions of one species by whether they are identical to a flank.

	A deletion is 'Identical' when Hamming_Left or Hamming_Right equals 1 and
	'Different' otherwise (partial matches therefore count as 'Different').
	This definition replaces the insertion version of the same name defined
	earlier in the notebook; it reads 'Deleted_bases_len' instead of
	'Inserted_bases_len'.

	Parameters
	----------
	species : value compared against the 'Species' column.
	table : DataFrame with 'Species', 'Deleted_bases_len', 'Hamming_Left' and
		'Hamming_Right' columns. The default is whatever `table` referred to
		when this function was defined, so pass it explicitly.
	hamming_threshold : not used by this function; kept so existing calls still work.

	Returns
	-------
	DataFrame indexed by 'Identical', 'Different', 'Total' with the columns
	'1-bp', '>1-bp', 'Total' and 'Species'.
	"""
	# Filter for the requested species. Boolean indexing already yields a new
	# frame, so the full table is not copied first.
	table = table[table['Species'] == species]

	# Similarity of exactly 1 to either flank.
	identical_to_flank = (
		(table['Hamming_Left'] == 1) | (table['Hamming_Right'] == 1)
	)
	single_base = table['Deleted_bases_len'] == 1
	multi_base = table['Deleted_bases_len'] > 1

	def size_class_counts(size_mask):
		"""Identical / different / all counts within one size class."""
		return [
			(size_mask & identical_to_flank).sum(),
			(size_mask & ~identical_to_flank).sum(),
			size_mask.sum()
		]

	summary_hamming = pd.DataFrame({
		'1-bp': size_class_counts(single_base),
		'>1-bp': size_class_counts(multi_base),
		'Total': [
			identical_to_flank.sum(),
			(~identical_to_flank).sum(),
			table.shape[0]
		],
		'Species': species
	}, index=['Identical', 'Different', 'Total'])

	return summary_hamming

# Summarize every species in turn, then stack the per-species frames under a
# 'Species' index level and drop the now-redundant 'Species' column.
summary_hamming_all = []
for sp in species_list:
	species_summary = summarize_hamming(sp, table=table)
	summary_hamming_all.append(species_summary)

summary_hamming_all_df = pd.concat(summary_hamming_all, keys=species_list, names=['Species'])
summary_hamming_all_df = summary_hamming_all_df.drop(columns=['Species'])
summary_hamming_all_df


Unnamed: 0_level_0,Unnamed: 1_level_0,1-bp,>1-bp,Total
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mouse,Identical,96,21,117
mouse,Different,35,50,85
mouse,Total,131,71,202
macaque,Identical,318,76,394
macaque,Different,118,304,422
macaque,Total,436,380,816
human,Identical,39,3,42
human,Different,7,30,37
human,Total,46,33,79


# Indels

In [789]:
# Stack deletions on top of insertions, keeping only the columns needed to
# join against the repeat annotation. Row labels are carried over unchanged
# from the two source tables.
indel_columns = ['Species', 'Mutation_ID', 'Identical_to_flank']
table_indels = pd.concat([
	table_deletions[indel_columns],
	table_insertions[indel_columns],
])
table_indels

Unnamed: 0,Species,Mutation_ID,Identical_to_flank
0,mouse,12343_AT_A,True
1,mouse,14806_CACCCCTACTATACAATCAAAGATATCCTAGGTATCCTAA...,False
2,mouse,5168_GC_G,True
3,mouse,4051_CA_C,True
4,mouse,5542_CAAT_C,True
...,...,...,...
445,human,16182_A_AC,False
446,human,16188_T_TC,True
447,human,16188_T_TCC,True
448,human,16356_T_TCTCGTCCCCATGGATGACCCCC,False


# Microsatellites/STRs

In [790]:
repeats = pd.read_table("indels.denovo_chrM_microsats.tab")

# Merge with the main indels table, keeping every row of table_indels.
# The join keys are spelled out so that a future shared column cannot silently
# change what is matched.
# NOTE(review): repeated (Species, Mutation_ID) rows on either side multiply
# rows in the result (the displayed output repeats mouse 5168_GC_G) - confirm
# whether de-duplication is wanted before counting.
merge_keys = ['Species', 'Mutation_ID']
table_indels_repeats = pd.merge(
	repeats[merge_keys + ['Change_in_length', 'Within_Repeat']],
	table_indels[merge_keys + ['Identical_to_flank']],
	on=merge_keys,
	how='right',
)

# Classify by the sign of the length change. Indels absent from the repeat
# table have no Change_in_length after the right join; for those, use the
# length change implied by the alleles in Mutation_ID (<pos>_<ref>_<alt>)
# instead of letting the missing value default to 'Deletion'.
id_parts = table_indels_repeats['Mutation_ID'].str.split('_')
allele_length_change = id_parts.str[2].str.len() - id_parts.str[1].str.len()
length_change = table_indels_repeats['Change_in_length'].fillna(allele_length_change)
table_indels_repeats['Mutation_type'] = length_change.apply(lambda x: 'Insertion' if x > 0 else 'Deletion')
table_indels_repeats

Unnamed: 0,Species,Mutation_ID,Change_in_length,Within_Repeat,Identical_to_flank,Mutation_type
0,mouse,12343_AT_A,-1,STR,True,Deletion
1,mouse,14806_CACCCCTACTATACAATCAAAGATATCCTAGGTATCCTAA...,-58,Non-STR,False,Deletion
2,mouse,5168_GC_G,-1,Non-STR,True,Deletion
3,mouse,5168_GC_G,-1,Non-STR,True,Deletion
4,mouse,5168_GC_G,-1,Non-STR,True,Deletion
...,...,...,...,...,...,...
2712,human,16188_T_TC,1,STR,True,Insertion
2713,human,16188_T_TCC,2,STR,True,Insertion
2714,human,16356_T_TCTCGTCCCCATGGATGACCCCC,22,Non-STR,False,Insertion
2715,human,16373_A_AC,1,Non-STR,False,Insertion


In [None]:
# Count indels by Species, Mutation_type and Identical_to_flank, with one
# column per Within_Repeat class.
summary_repeat_flank = (
	table_indels_repeats
	.groupby(['Species', 'Mutation_type', 'Identical_to_flank', 'Within_Repeat'])
	.size()
	.unstack(fill_value=0)
)
# Add a Total column.
summary_repeat_flank['Total'] = summary_repeat_flank.sum(axis=1)

# Order species as mouse, macaque, human; give the index levels readable names
# and the boolean flank labels readable values; then turn the index levels into
# ordinary columns. (The earlier trailing reset_index().rename(...) call was
# dropped: its result was never assigned or displayed.)
species_order = ['mouse', 'macaque', 'human']
summary_repeat_flank = (
	summary_repeat_flank
	.reindex(species_order, level='Species')
	.rename_axis(index={'Species': 'Species', 'Mutation_type': 'Mutation Type', 'Identical_to_flank': 'Identical to Flank', 'Within_Repeat': 'Within Repeat'})
	.rename({True: 'Identical', False: 'Not identical'})
	.reset_index()
)
summary_repeat_flank

Within_Repeat,Species,Mutation Type,Identical to Flank,Non-STR,STR,Total
0,mouse,Deletion,Not identical,63,39,102
1,mouse,Deletion,Identical,56,154,210
2,mouse,Insertion,Not identical,16,10,26
3,mouse,Insertion,Identical,3,86,89
4,macaque,Deletion,Not identical,298,337,635
5,macaque,Deletion,Identical,212,585,797
6,macaque,Insertion,Not identical,80,83,163
7,macaque,Insertion,Identical,92,420,512
8,human,Deletion,Not identical,29,12,41
9,human,Deletion,Identical,25,24,49
