In [1]:
import glob
import pandas as pd
from collections import Counter

## Main table with mutations (Kennedy)
- raw files from https://github.com/Kennedy-Lab-UW/Sanchez_Contreras_etal_2023/blob/main/data/Mouse_aging_mtDNA_summary.csv  
- paper: https://doi.org/10.7554/eLife.83395

In [5]:
# Destination of the merged, headerless mutation table built in the next cell.
outfilename = '../data/processed/mus_musculus/all_mut_mus.txt'

# Per-sample mutation files to merge. glob returns paths in arbitrary order,
# so they are sorted to keep the row order of the merged table reproducible.
filenames = sorted(glob.glob('../data/raw/mus_musculus/Sanchez_Contreras_etal_2023-main/mut_files/*.dcs.mut'))

In [56]:
import shutil

# Merge the data rows of all per-sample mutation files into one headerless
# file. The first line of every input (its header) is consumed and not copied;
# the raw header bytes are kept in `columns` for the following cell.
if not filenames:
    # Fail before opening the output in 'wb' mode, which would truncate it and
    # leave `columns` undefined for the cells below.
    raise FileNotFoundError('no input mutation files were found to merge')

columns = None
with open(outfilename, 'wb') as outfile:
    for filename in filenames:
        if filename == outfilename:
            # don't want to copy the output into the output
            continue
        with open(filename, 'rb') as readfile:
            header = readfile.readline()
            if columns is None:
                columns = header
            elif header != columns:
                # All rows are later labelled with a single header, so a file
                # with a different column layout would be silently mislabelled.
                raise ValueError(
                    'header of {!r} differs from the header of the first file'.format(filename)
                )
            # readline() already consumed the header; only data rows are copied.
            shutil.copyfileobj(readfile, outfile)

In [79]:
# Convert the raw header line (bytes, read in binary mode) into a list of
# column names: decode it, drop the line ending and split on tabs. Decoding is
# used instead of slicing the repr of the bytes object, which left a stray
# "\r" on the last name for CRLF files. The isinstance guard makes the cell
# safe to re-run once `columns` has already been converted to a list.
if isinstance(columns, bytes):
    columns = columns.decode('utf-8').rstrip('\r\n').split('\t')

In [97]:
# Load the merged, headerless, tab-separated mutation table; columns are
# numbered 0..n-1 until the real names are assigned.
df = pd.read_csv(outfilename, sep='\t', header=None)

In [98]:
# Label the headerless table with the column names recovered from the header
# line of the input files.
df.columns = columns

In [92]:
# Preview how each sample identifier breaks into its underscore-separated fields.
df['sample'].str.split('_')

0        [JW21-NP, SS31, C, D247, L191110, 1, S200114]
1        [JW21-NP, SS31, C, D247, L191110, 1, S200114]
2        [JW21-NP, SS31, C, D247, L191110, 1, S200114]
3        [JW21-NP, SS31, C, D247, L191110, 1, S200114]
4        [JW21-NP, SS31, C, D247, L191110, 1, S200114]
                             ...                      
89626       [YM7-MS, NT, R, D240, L191110, 1, S200114]
89627       [YM7-MS, NT, R, D240, L191110, 1, S200114]
89628       [YM7-MS, NT, R, D240, L191110, 1, S200114]
89629       [YM7-MS, NT, R, D240, L191110, 1, S200114]
89630       [YM7-MS, NT, R, D240, L191110, 1, S200114]
Name: sample, Length: 89631, dtype: object

In [100]:
# Split every sample identifier on '_' once and store its first three fields
# as MouseID, Treatment and Tissue (in that order).
sample_fields = df['sample'].str.split('_')
for field_position, column_name in enumerate(('MouseID', 'Treatment', 'Tissue')):
    df[column_name] = sample_fields.str[field_position]

In [109]:
# Derive the age group from the first letter of the mouse ID, using the same
# mapping as before: 'Y' -> 4.5, 'J' and 'O' -> 26.
# Age is stored as a float (not the strings '4.5' / '26') so that the numeric
# filters used further down (Age == 4.5, Age == 26) also work on this
# in-memory table, not only on the copy reloaded from CSV. IDs starting with
# any other letter get NaN instead of keeping the raw letter.
age_by_id_prefix = {'Y': 4.5, 'J': 26.0, 'O': 26.0}
df['Age'] = df['MouseID'].str[0].map(age_by_id_prefix)

In [111]:
# Build the substitution label "ref>alt" (e.g. "G>A") for every row; rows with
# a missing ref or alt get a missing label.
df['Mut'] = df['ref'].str.cat(df['alt'], sep='>')

In [116]:
# Uncomment to (re)generate the CSV that the "YOUNG|OLD" section below reloads with pd.read_csv.
# df.to_csv('../data/processed/mus_musculus/all_mut_mus.csv', index=False)

In [31]:
# Display the annotated mutation table (pandas abbreviates the middle rows).
df

Unnamed: 0,chr,start,end,sample,type,gene,ref,alt,depth,alt_count,VAF,3nt_context,5nt_context,coding_change,MouseID,Treatment,Tissue,Age,Mut
0,chrM,55,55,JW21-NP_SS31_C_D247_L191110_1_S200114,5'Flank,mt-Nd1,G,A,11422,1,0.000088,TGT,TTGTA,,JW21-NP,SS31,C,26.0,G>A
1,chrM,153,153,JW21-NP_SS31_C_D247_L191110_1_S200114,5'Flank,mt-Nd1,C,T,15166,1,0.000066,ACA,AACAT,,JW21-NP,SS31,C,26.0,C>T
2,chrM,204,204,JW21-NP_SS31_C_D247_L191110_1_S200114,5'Flank,mt-Nd1,G,A,16538,1,0.000060,AGA,AAGAC,,JW21-NP,SS31,C,26.0,G>A
3,chrM,225,225,JW21-NP_SS31_C_D247_L191110_1_S200114,5'Flank,mt-Nd1,C,T,15509,1,0.000064,CCC,CCCCC,,JW21-NP,SS31,C,26.0,C>T
4,chrM,302,302,JW21-NP_SS31_C_D247_L191110_1_S200114,5'Flank,mt-Nd1,T,C,19351,1,0.000052,ATT,AATTT,,JW21-NP,SS31,C,26.0,T>C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89626,chrM,15886,15886,YM7-MS_NT_R_D240_L191110_1_S200114,3'Flank,mt-Nd4,C,T,2971,1,0.000337,CCA,ACCAA,,YM7-MS,NT,R,4.5,C>T
89627,chrM,15912,15912,YM7-MS_NT_R_D240_L191110_1_S200114,3'Flank,mt-Nd4,G,A,3272,1,0.000306,TGG,TTGGT,,YM7-MS,NT,R,4.5,G>A
89628,chrM,16012,16012,YM7-MS_NT_R_D240_L191110_1_S200114,3'Flank,mt-Nd4,G,A,3356,1,0.000298,CGC,CCGCA,,YM7-MS,NT,R,4.5,G>A
89629,chrM,16020,16020,YM7-MS_NT_R_D240_L191110_1_S200114,3'Flank,mt-Nd4,C,T,3252,1,0.000308,CCA,CCCAA,,YM7-MS,NT,R,4.5,C>T


## Collect ALL mutations in intervals around (df_win) and within (df_win_within_sides) G4

In [2]:
# Window tables: intervals around the G4s (df_win) and intervals within the
# G4 sides (df_win_within_sides).
window_table_paths = (
    '../data/processed/mus_musculus/df_win_mm10.csv',
    '../data/processed/mus_musculus/df_win_within_sides_mm10.csv',
)
df_win, df_win_within_sides = map(pd.read_csv, window_table_paths)

In [25]:
# For every window in df_win, collect the substitution labels of the
# single-position variants (start == end) whose coordinate lies in
# [window_start, window_end). Each window yields one space-separated string.
#
# The labels are joined explicitly instead of calling str() on the NumPy
# array: NumPy abbreviates the printed form of arrays with more than 1000
# elements ("..."), which would silently drop mutations from dense windows.
# NOTE(review): window_end is excluded, exactly as with the original
# range(window_start, window_end) — confirm the window tables use half-open ends.
point_mutations = df.loc[df['start'] == df['end'], ['start', 'Mut']].dropna(subset=['Mut'])

Mut_win = []
for window_start, window_end in zip(df_win['window_start'], df_win['window_end']):
    in_window = (point_mutations['start'] >= window_start) & (point_mutations['start'] < window_end)
    Mut_win.append(' '.join(point_mutations.loc[in_window, 'Mut'].astype(str)))

In [26]:
# Wrap the per-window mutation strings in a one-column DataFrame (column
# label 0) so pandas string methods can be applied to them.
Mut_win_df = pd.DataFrame(Mut_win)

In [27]:
# Replace each window's string with the list of "X>Y" tokens found in it.
# [A-T] is a character range (any capital letter from A through T), not only
# the four bases.
Mut_win_df[0] = Mut_win_df[0].str.findall("[A-T]>[A-T]")

In [28]:
# Count the substitution labels of each window, then spread the counts into
# one column per label: labels absent from a window become 0 and the columns
# are ordered alphabetically.
window_label_counts = Mut_win_df[0].apply(Counter)
obsMutwin = (
    pd.DataFrame.from_records(window_label_counts)
    .fillna(0)
    .sort_index(axis=1)
)

In [29]:
# Append the per-window substitution counts to the window table; join aligns
# the two frames on their row index.
df_win_mus_mut = df_win.join(obsMutwin)

In [30]:
# Display the window table with the observed substitution counts attached.
df_win_mus_mut

Unnamed: 0,G4_id,side,number_of_win,window_start,window_end,sequence,A,C,G,T,...,A>T,C>A,C>G,C>T,G>A,G>C,G>T,T>A,T>C,T>G
0,1,up,91,106,115,AAATTACACA,6.0,2.0,0.0,2.0,...,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0
1,1,up,90,107,116,AATTACACAT,5.0,2.0,0.0,3.0,...,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0
2,1,up,89,108,117,ATTACACATG,4.0,2.0,1.0,3.0,...,1.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0
3,1,up,88,109,118,TTACACATGC,3.0,3.0,1.0,3.0,...,1.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,6.0,0.0
4,1,up,87,110,119,TACACATGCA,4.0,3.0,1.0,2.0,...,1.0,2.0,2.0,1.0,7.0,0.0,1.0,0.0,6.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4754,29,down,86,16215,16224,TTTAGTACTT,2.0,1.0,1.0,6.0,...,0.0,2.0,1.0,4.0,5.0,0.0,0.0,0.0,3.0,0.0
4755,29,down,87,16216,16225,TTAGTACTTG,2.0,1.0,2.0,5.0,...,0.0,2.0,1.0,4.0,5.0,0.0,0.0,0.0,3.0,0.0
4756,29,down,88,16217,16226,TAGTACTTGT,2.0,1.0,2.0,5.0,...,0.0,2.0,1.0,4.0,5.0,0.0,0.0,0.0,3.0,0.0
4757,29,down,89,16218,16227,AGTACTTGTA,3.0,1.0,2.0,4.0,...,0.0,2.0,1.0,4.0,6.0,1.0,1.0,0.0,3.0,0.0


In [33]:
# df_win_mus_mut.to_csv('../data/processed/mus_musculus/df_win_mus_mut.csv', index=False)

In [34]:
# For every window in df_win_within_sides, collect the substitution labels of
# the single-position variants (start == end) whose coordinate lies in
# [window_start, window_end). Each window yields one space-separated string.
#
# The labels are joined explicitly instead of calling str() on the NumPy
# array: NumPy abbreviates the printed form of arrays with more than 1000
# elements ("..."), which would silently drop mutations from dense windows.
# NOTE(review): window_end is excluded, exactly as with the original
# range(window_start, window_end) — confirm the window tables use half-open ends.
point_mutations_within = df.loc[df['start'] == df['end'], ['start', 'Mut']].dropna(subset=['Mut'])

Mut_win_within_sides = []
for window_start, window_end in zip(df_win_within_sides['window_start'], df_win_within_sides['window_end']):
    in_window = (
        (point_mutations_within['start'] >= window_start)
        & (point_mutations_within['start'] < window_end)
    )
    Mut_win_within_sides.append(' '.join(point_mutations_within.loc[in_window, 'Mut'].astype(str)))

In [35]:
# Wrap the per-window mutation strings in a one-column DataFrame (column
# label 0) so pandas string methods can be applied to them.
Mut_win_df_within_sides = pd.DataFrame(Mut_win_within_sides)

In [36]:
# Replace each window's string with the list of "X>Y" tokens found in it.
# [A-T] is a character range (any capital letter from A through T), not only
# the four bases.
Mut_win_df_within_sides[0] = Mut_win_df_within_sides[0].str.findall("[A-T]>[A-T]")

In [37]:
# Count the substitution labels of each window, then spread the counts into
# one column per label: labels absent from a window become 0 and the columns
# are ordered alphabetically.
within_sides_label_counts = Mut_win_df_within_sides[0].apply(Counter)
obsMutwin_within_sides = (
    pd.DataFrame.from_records(within_sides_label_counts)
    .fillna(0)
    .sort_index(axis=1)
)

In [38]:
# Append the per-window substitution counts to the within-sides window table;
# join aligns the two frames on their row index.
df_win_mus_mut_within_sides = df_win_within_sides.join(obsMutwin_within_sides)

In [39]:
# Display the within-sides window table with the observed substitution counts attached.
df_win_mus_mut_within_sides

Unnamed: 0,G4_id,side,number_of_win,window_start,window_end,sequence,A,C,G,T,...,A>T,C>A,C>G,C>T,G>A,G>C,G>T,T>A,T>C,T>G
0,1,within_start,1,206,215,CACCTTGCCT,1.0,5.0,1.0,3.0,...,0.0,7.0,3.0,3.0,8.0,2.0,2.0,0.0,2.0,0.0
1,1,within_start,2,207,216,ACCTTGCCTA,2.0,4.0,1.0,3.0,...,0.0,5.0,3.0,2.0,8.0,2.0,2.0,0.0,4.0,0.0
2,1,within_start,3,208,217,CCTTGCCTAG,1.0,4.0,2.0,3.0,...,0.0,5.0,3.0,2.0,8.0,2.0,2.0,0.0,4.0,0.0
3,1,within_start,4,209,218,CTTGCCTAGC,1.0,4.0,2.0,3.0,...,0.0,5.0,3.0,0.0,28.0,4.0,12.0,0.0,4.0,0.0
4,1,within_start,5,210,219,TTGCCTAGCC,1.0,4.0,2.0,3.0,...,0.0,3.0,4.0,1.0,28.0,4.0,12.0,0.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1354,29,within_end,43,16078,16087,TTTAACTCTC,2.0,3.0,0.0,5.0,...,2.0,10.0,1.0,12.0,0.0,0.0,0.0,0.0,19.0,1.0
1355,29,within_end,44,16077,16086,TTTTAACTCT,2.0,2.0,0.0,6.0,...,2.0,7.0,1.0,4.0,0.0,0.0,0.0,0.0,19.0,1.0
1356,29,within_end,45,16076,16085,ATTTTAACTC,3.0,2.0,0.0,5.0,...,2.0,7.0,1.0,4.0,0.0,0.0,0.0,0.0,8.0,1.0
1357,29,within_end,46,16075,16084,AATTTTAACT,4.0,1.0,0.0,5.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,1.0


In [41]:
# df_win_mus_mut_within_sides.to_csv('../data/processed/mus_musculus/df_win_mus_mut_within_sides.csv', index=False)

## Collect YOUNG and OLD mutations in intervals separately

In [4]:
# Reload the annotated mutation table from disk, replacing any in-memory `df`.
# NOTE(review): the df.to_csv(...) line that writes this file is commented out
# in the first section, so the CSV must already exist — confirm it is up to date.
df = pd.read_csv('../data/processed/mus_musculus/all_mut_mus.csv')

In [3]:
# Reload both window tables: intervals around the G4s (df_win) and intervals
# within the G4 sides (df_win_within_sides).
df_win, df_win_within_sides = (
    pd.read_csv(table_path)
    for table_path in (
        '../data/processed/mus_musculus/df_win_mm10.csv',
        '../data/processed/mus_musculus/df_win_within_sides_mm10.csv',
    )
)

In [42]:
# Young group (Age == 4.5): for every window in df_win, collect the
# substitution labels of the single-position variants (start == end) whose
# coordinate lies in [window_start, window_end), as one space-separated string.
#
# The labels are joined explicitly instead of calling str() on the NumPy
# array: NumPy abbreviates the printed form of arrays with more than 1000
# elements ("..."), which would silently drop mutations from dense windows.
# Age is coerced to a number so the filter also works when Age holds strings.
# NOTE(review): window_end is excluded, exactly as with the original
# range(window_start, window_end) — confirm the window tables use half-open ends.
is_young = pd.to_numeric(df['Age'], errors='coerce') == 4.5
young_point_mutations = df.loc[(df['start'] == df['end']) & is_young, ['start', 'Mut']].dropna(subset=['Mut'])

Mut_win_y = []
for window_start, window_end in zip(df_win['window_start'], df_win['window_end']):
    in_window = (
        (young_point_mutations['start'] >= window_start)
        & (young_point_mutations['start'] < window_end)
    )
    Mut_win_y.append(' '.join(young_point_mutations.loc[in_window, 'Mut'].astype(str)))

In [43]:
# Old group (Age == 26): for every window in df_win, collect the substitution
# labels of the single-position variants (start == end) whose coordinate lies
# in [window_start, window_end), as one space-separated string.
#
# The labels are joined explicitly instead of calling str() on the NumPy
# array: NumPy abbreviates the printed form of arrays with more than 1000
# elements ("..."), which would silently drop mutations from dense windows.
# Age is coerced to a number so the filter also works when Age holds strings.
# NOTE(review): window_end is excluded, exactly as with the original
# range(window_start, window_end) — confirm the window tables use half-open ends.
is_old = pd.to_numeric(df['Age'], errors='coerce') == 26
old_point_mutations = df.loc[(df['start'] == df['end']) & is_old, ['start', 'Mut']].dropna(subset=['Mut'])

Mut_win_o = []
for window_start, window_end in zip(df_win['window_start'], df_win['window_end']):
    in_window = (
        (old_point_mutations['start'] >= window_start)
        & (old_point_mutations['start'] < window_end)
    )
    Mut_win_o.append(' '.join(old_point_mutations.loc[in_window, 'Mut'].astype(str)))

In [44]:
# One-column frames (column label 0) holding the per-window mutation strings
# of the young and old groups.
Mut_win_df_y, Mut_win_df_o = (
    pd.DataFrame(window_strings) for window_strings in (Mut_win_y, Mut_win_o)
)

In [45]:
# In both age-group frames, replace each window's string with the list of
# "X>Y" tokens found in it.
for age_group_frame in (Mut_win_df_y, Mut_win_df_o):
    age_group_frame[0] = age_group_frame[0].str.findall("[A-T]>[A-T]")

In [46]:
# Per-window substitution counts for the young and old groups: one column per
# label, 0 where a label is absent from a window, columns ordered alphabetically.
obsMutwin_y, obsMutwin_o = (
    pd.DataFrame.from_records(label_lists.apply(Counter)).fillna(0).sort_index(axis=1)
    for label_lists in (Mut_win_df_y[0], Mut_win_df_o[0])
)

In [47]:
# Attach the young and old substitution counts to the window table; join
# aligns the frames on their row index.
df_win_mus_mut_y, df_win_mus_mut_o = (
    df_win.join(age_group_counts) for age_group_counts in (obsMutwin_y, obsMutwin_o)
)

In [51]:
# df_win_mus_mut_y.to_csv('../data/processed/mus_musculus/df_win_mus_mut_y.csv', index=False)
# df_win_mus_mut_o.to_csv('../data/processed/mus_musculus/df_win_mus_mut_o.csv', index=False)

In [52]:
# Young group (Age == 4.5): for every window in df_win_within_sides, collect
# the substitution labels of the single-position variants (start == end) whose
# coordinate lies in [window_start, window_end), as one space-separated string.
#
# The labels are joined explicitly instead of calling str() on the NumPy
# array: NumPy abbreviates the printed form of arrays with more than 1000
# elements ("..."), which would silently drop mutations from dense windows.
# Age is coerced to a number so the filter also works when Age holds strings.
# NOTE(review): window_end is excluded, exactly as with the original
# range(window_start, window_end) — confirm the window tables use half-open ends.
is_young_within = pd.to_numeric(df['Age'], errors='coerce') == 4.5
young_point_mutations_within = df.loc[
    (df['start'] == df['end']) & is_young_within, ['start', 'Mut']
].dropna(subset=['Mut'])

Mut_win_within_sides_y = []
for window_start, window_end in zip(df_win_within_sides['window_start'], df_win_within_sides['window_end']):
    in_window = (
        (young_point_mutations_within['start'] >= window_start)
        & (young_point_mutations_within['start'] < window_end)
    )
    Mut_win_within_sides_y.append(' '.join(young_point_mutations_within.loc[in_window, 'Mut'].astype(str)))

In [53]:
# Old group (Age == 26): for every window in df_win_within_sides, collect the
# substitution labels of the single-position variants (start == end) whose
# coordinate lies in [window_start, window_end), as one space-separated string.
#
# The labels are joined explicitly instead of calling str() on the NumPy
# array: NumPy abbreviates the printed form of arrays with more than 1000
# elements ("..."), which would silently drop mutations from dense windows.
# Age is coerced to a number so the filter also works when Age holds strings.
# NOTE(review): window_end is excluded, exactly as with the original
# range(window_start, window_end) — confirm the window tables use half-open ends.
is_old_within = pd.to_numeric(df['Age'], errors='coerce') == 26
old_point_mutations_within = df.loc[
    (df['start'] == df['end']) & is_old_within, ['start', 'Mut']
].dropna(subset=['Mut'])

Mut_win_within_sides_o = []
for window_start, window_end in zip(df_win_within_sides['window_start'], df_win_within_sides['window_end']):
    in_window = (
        (old_point_mutations_within['start'] >= window_start)
        & (old_point_mutations_within['start'] < window_end)
    )
    Mut_win_within_sides_o.append(' '.join(old_point_mutations_within.loc[in_window, 'Mut'].astype(str)))

In [54]:
# One-column frames (column label 0) holding the per-window mutation strings
# of the young and old groups for the within-sides windows.
Mut_win_df_within_sides_y, Mut_win_df_within_sides_o = (
    pd.DataFrame(window_strings)
    for window_strings in (Mut_win_within_sides_y, Mut_win_within_sides_o)
)

In [55]:
# In both age-group frames, replace each window's string with the list of
# "X>Y" tokens found in it.
for age_group_frame in (Mut_win_df_within_sides_y, Mut_win_df_within_sides_o):
    age_group_frame[0] = age_group_frame[0].str.findall("[A-T]>[A-T]")

In [56]:
# Per-window substitution counts for the young and old groups (within-sides
# windows): one column per label, 0 where a label is absent from a window,
# columns ordered alphabetically.
obsMutwin_within_sides_y, obsMutwin_within_sides_o = (
    pd.DataFrame.from_records(label_lists.apply(Counter)).fillna(0).sort_index(axis=1)
    for label_lists in (Mut_win_df_within_sides_y[0], Mut_win_df_within_sides_o[0])
)

In [57]:
# Attach the young and old substitution counts to the within-sides window
# table; join aligns the frames on their row index.
df_win_mus_mut_within_sides_y, df_win_mus_mut_within_sides_o = (
    df_win_within_sides.join(age_group_counts)
    for age_group_counts in (obsMutwin_within_sides_y, obsMutwin_within_sides_o)
)

In [74]:
# Make sure both age-specific within-sides tables carry a column for each of
# the twelve base substitutions. A substitution that was never observed gets a
# zero column (the young table has no T>G calls). Columns that already exist
# are left untouched, so real counts are never overwritten with 0.
bases = 'ACGT'
all_substitutions = [ref + '>' + alt for ref in bases for alt in bases if ref != alt]
for counts_table in (df_win_mus_mut_within_sides_y, df_win_mus_mut_within_sides_o):
    for substitution in all_substitutions:
        if substitution not in counts_table.columns:
            counts_table[substitution] = 0

In [76]:
# df_win_mus_mut_within_sides_y.to_csv('../data/processed/mus_musculus/df_win_mus_mut_within_sides_y.csv', index=False)
# df_win_mus_mut_within_sides_o.to_csv('../data/processed/mus_musculus/df_win_mus_mut_within_sides_o.csv', index=False)