info:
- plus (L) strand
- minus (H) strand 
- positions in the abasic sites table are 1-based (there are position 16299 in the table)

In [1]:
import ete3
from ete3 import PhyloTree
from Bio import SeqIO

import numpy as np
import pandas as pd

In [24]:
PATH_TO_DATA = '../../data/abasic_sites/'
path_to_refseq = PATH_TO_DATA + 'mm10_ChrM_oneline.fasta'
path_to_genbank = PATH_TO_DATA + 'NC_005089.1.gb'
path_to_abasic_data = PATH_TO_DATA + '41467_2022_33594_MOESM26_ESM.csv'

In [9]:
refseq = str(next(SeqIO.parse(path_to_refseq, 'fasta')).seq)
genbank = next(SeqIO.parse(path_to_genbank, 'genbank'))

assert genbank.seq == refseq

In [25]:
abasic = pd.read_csv(path_to_abasic_data, sep=';')
abasic.head()

Unnamed: 0,Chr(mm10),Start(mm10),End(mm10),NumberOfReads,Strand
0,chrM,68,69,1,+
1,chrM,69,70,2,+
2,chrM,70,71,1,+
3,chrM,74,75,4,+
4,chrM,81,82,2,+


In [16]:
genbank.features[-6:]

[SeqFeature(FeatureLocation(ExactPosition(15422), ExactPosition(16299), strand=1), type='D-loop'),
 SeqFeature(FeatureLocation(ExactPosition(15450), ExactPosition(15509), strand=1), type='misc_feature'),
 SeqFeature(FeatureLocation(ExactPosition(15514), ExactPosition(15558), strand=1), type='misc_feature'),
 SeqFeature(FeatureLocation(ExactPosition(16034), ExactPosition(16058), strand=1), type='misc_feature'),
 SeqFeature(FeatureLocation(ExactPosition(16088), ExactPosition(16104), strand=1), type='misc_feature'),
 SeqFeature(FeatureLocation(ExactPosition(16113), ExactPosition(16131), strand=1), type='misc_feature')]

In [17]:
DLOOP_START = 15423

In [66]:
df = pd.DataFrame([[i, x] for i, x in enumerate(genbank, 0)], columns=['site', 'nuc'])
df['down'] = [''] + list(df.nuc.values[:-1])
df['up'] = list(df.nuc.values[1:]) + ['']
df['triplet'] = df['down'].str.lower() + df['nuc'] + df['up'].str.lower()
# df = df[(df.triplet.str.len() == 3) & (df.site < DLOOP_START)]
df[['site', 'down', 'nuc', 'up', 'triplet']]

Unnamed: 0,site,down,nuc,up,triplet
0,0,,G,T,Gt
1,1,G,T,T,gTt
2,2,T,T,A,tTa
3,3,T,A,A,tAa
4,4,A,A,T,aAt
...,...,...,...,...,...
16294,16294,T,A,A,tAa
16295,16295,A,A,C,aAc
16296,16296,A,C,A,aCa
16297,16297,C,A,A,cAa


In [59]:
merged = df.merge(abasic, right_on='Start(mm10)', left_on='site')

In [60]:
merged.Strand.value_counts()

-    8422
+    5919
Name: Strand, dtype: int64

In [61]:
merged

Unnamed: 0,site,nuc,down,up,triplet,Chr(mm10),Start(mm10),End(mm10),NumberOfReads,Strand
0,2,T,T,A,tTa,chrM,2,3,2,-
1,3,A,T,A,tAa,chrM,3,4,1,-
2,4,A,A,T,aAt,chrM,4,5,2,-
3,5,T,A,G,aTg,chrM,5,6,4,-
4,6,G,T,T,tGt,chrM,6,7,2,-
...,...,...,...,...,...,...,...,...,...,...
14336,15408,T,T,A,tTa,chrM,15408,15409,1,-
14337,15409,A,T,A,tAa,chrM,15409,15410,2,-
14338,15410,A,A,A,aAa,chrM,15410,15411,2,-
14339,15411,A,A,C,aAc,chrM,15411,15412,1,-


In [64]:
counts = merged.groupby(['Strand', 'triplet']).NumberOfReads.agg(['sum', 'count'])

In [65]:
counts.loc['-']

Unnamed: 0_level_0,sum,count
triplet,Unnamed: 1_level_1,Unnamed: 2_level_1
aAa,661,269
aAc,2002,358
aAg,602,149
aAt,388,138
aCa,2196,250
...,...,...
tGt,136,56
tTa,1581,193
tTc,3308,185
tTg,1710,78
