In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from patsy import dmatrices
import statsmodels.api as sm

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [2]:
from bean.mapping._supporting_fn import _get_allele_from_alignment
from bean import Allele,Edit

In [3]:
edit_dict = {"A":"G", "T":"C"}
revcomp_dict = {"A":"T", "T":"A", "C":"G", "G":"C"}

In [4]:
var_id = "rs62084210_Maj_ABE_250"

### Load variant & amplicon info

In [5]:
var_info_tbl = pd.read_excel("../../resources/LDLvar/20221013_LDLvar_simpleZscores_credset.xlsx", index_col=0, header=1)

In [6]:
amplicon_info = pd.read_csv("../../resources/atac_seq/030123_ATACseq_info.csv", ).set_index("gRNA")

In [7]:
var_alleles = var_info_tbl.loc[var_id, ["A1", "A2"]].reset_index()
var_alleles['edited'] = var_alleles[var_id].map(lambda b: edit_dict[b] if b in edit_dict.keys() else np.nan)
var_alleles = var_alleles.loc[~var_alleles.edited.isnull()]
var_alleles.columns=["allele", "ref", "edited"]
var_chrom = int(var_info_tbl.loc[var_id, "CHR"])
var_pos = int(var_info_tbl.loc[var_id, 'position_hg19'])

In [8]:
amplicon_start = amplicon_info.loc[var_id, "Varpos_fw"]-4
amplicon_seq = amplicon_info.loc[var_id, "Amplicon_fw (reference)"].replace("N", "")
amplicon_len = len(amplicon_seq)
amplicon_end = amplicon_start + amplicon_len

In [9]:
phased_snps = pd.read_table(f"../../results/atac_seq/{var_id}_phasedSNPs.vcf", comment="#", header=None)
phased_snps.columns=["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "HepG2"]
phased_snps["GT"] = phased_snps.HepG2.map(lambda s: s.split(":")[0])
phased_snps = phased_snps.loc[phased_snps.GT.isin(["1|0", "0|1"]) & (phased_snps.FILTER == "PASS")]

In [10]:
def get_allele(row):
    ids = row.GT.split("|")
    alleles = [row[{"0":"REF", "1":"ALT"}[aid]] for aid in ids]
    return alleles

def _get_allele_from_alignment(
    ref_aligned: str,
    query_aligned: str,
    offset: int,
    strand: int,
    start_pos: int,
    end_pos: int,
    positionwise_quality: np.ndarray = None,
    quality_thres: float = -1,
):
    # Include N, no quality filter
    assert len(ref_aligned) == len(query_aligned)
    allele = Allele()
    ref_gaps = 0
    alt_gaps = 0
    alt_seq_len = len(query_aligned) - query_aligned.count("-")
    if positionwise_quality is None:
        # alt_position_is_good_quality = np.ones(alt_seq_len, dtype=bool)
        alt_position_is_good_quality = np.array(
            [c != "N" for c in query_aligned.replace("-", "")]
        )
    else:
        assert len(positionwise_quality) == alt_seq_len
        alt_position_is_good_quality = positionwise_quality > quality_thres
    for i in range(len(ref_aligned)):
        if ref_aligned[i] == query_aligned[i]:
            continue
        ref_base = ref_aligned[i]
        alt_base = query_aligned[i]
        if alt_base != "-":
            alt_base_is_good_quality = alt_position_is_good_quality[i - alt_gaps]
        else:
            alt_base_is_good_quality = True
        if ref_base == "-":
            ref_gaps += 1
        elif alt_base == "-":
            alt_gaps += 1
        ref_pos = i - ref_gaps
        allele.add(
            Edit(
                rel_pos=ref_pos,
                ref_base=ref_base,
                alt_base=alt_base,
                offset=offset,
                strand=strand,
            )
        )
    return allele

In [11]:
if len(phased_snps) == 1:
    raise ValueError("Cannot phase variant of interest.")

phased_snps['alleles'] = phased_snps.apply(get_allele, axis=1)
phased_snps['amplicon_pos'] = phased_snps.POS - var_pos + amplicon_start
phased_snps["amplicon_base"] = phased_snps.amplicon_pos.map(
    lambda i: amplicon_seq[i]
)

In [12]:
phased_snps_nonvar = phased_snps.loc[phased_snps.amplicon_pos != amplicon_start]

In [13]:
editable_phase = np.where(np.array([p in edit_dict.keys() for p in phased_snps.loc[phased_snps.amplicon_pos == amplicon_start, "alleles"].item()]))[0]

In [14]:
phases = pd.DataFrame(
    phased_snps_nonvar.alleles.tolist(), 
    columns=[f'phase{i}' for i in range(len(phased_snps_nonvar.alleles.tolist()[0]))]
).sum(axis=0)
'''
phase0    TT
phase1    CC
'''
#pd.DataFrame(df2["teams"].to_list(), columns=['team1', 'team2'])

'\nphase0    TT\nphase1    CC\n'

## Phase reads

In [15]:
def get_edit(row, target_pos):
    return _get_allele_from_alignment(row.Reference_Sequence, row.Aligned_Sequence, offset=0, strand=1, start_pos=target_pos-20, end_pos=target_pos+20)

def get_all_edits(row):
    return _get_allele_from_alignment(row.Reference_Sequence, row.Aligned_Sequence, offset=0, strand=1, start_pos=phased_snps.amplicon_pos.min()-1, end_pos = phased_snps.amplicon_pos.max()+1)
    

def filter_allele_by_pos_qual(allele, target_pos):
    # filters allele by variant included in list_pos or near target_pos
    edits = []
    n_out_edits = 0
    n_lq = 0
    n_crit_lq = 0
    for edit in allele.split(","):
        if edit == "": continue
        alt = edit.split(">")[-1]
        pos = int(edit.split(":")[0])
        if alt == "N": 
            n_lq += 1
            if abs(pos-target_pos) < 5:
                n_crit_lq += 1
        elif abs(pos-target_pos) < 5:
            edits.append(edit)
        else:
            n_out_edits += 1
    return n_out_edits, n_lq, n_crit_lq, ",".join(sorted(edits))

def filter_allele_by_pos(list_pos, allele, target_pos):
    edits = []
    for edit in allele.split(","):
        if edit == "": continue
        pos = int(edit.split(":")[0])
        if pos in list_pos or abs(pos-target_pos) < 5:
            edits.append(edit)
    return ",".join(sorted(edits))

def filter_out_indels(allele):
    edits = []
    for edit in allele.edits:
        if edit.ref_base != "-" and edit.alt_base != "-":
            edits.append(str(edit))
    return ",".join(sorted(edits))



def get_phase(allele, phased_snps, var_pos):
    phase = []
    allele_edits = {}
    if allele != "":
        edits = allele.split(",")
        for edit in edits:
            try:
                allele_edits[int(edit.split(":")[0])] = edit.split(":")[-1]
            except ValueError:
                print(edit)
    for pos in phased_snps.amplicon_pos.tolist():
        if pos == var_pos: continue
        if pos in allele_edits.keys():
            phase.append(allele_edits[pos].split(">")[-1])
        else:
            phase.append(phased_snps[['amplicon_pos', 'amplicon_base']].set_index("amplicon_pos").loc[pos, "amplicon_base"])
    return phase


def mask_seq(seq, mask_pos):
    seq_array = np.array(list(seq))
    seq_array[mask_pos] = "N"
    return "".join(seq_array.tolist())
    
def assign_phase(phases, allele_phase):
    match= np.where(phases == allele_phase[0])[0]
    if len(match) != 1:
        return -1
    return match[0]
def filter_out_indels(allele):
    edits = []
    for edit in allele.edits:
        if edit.ref_base != "-" and edit.alt_base != "-":
            edits.append(str(edit))
    return ",".join(sorted(edits))

def get_pos_edit_rate(aftbl):
    total_counts = aftbl['#Reads'].sum()
    aftbl = aftbl.loc[aftbl.be_allele != "",:].copy()
    aftbl['edits'] = aftbl.be_allele.map(lambda a: a.split(","))
    edit_tbl = aftbl.explode("edits")
    edit_tbl['pos'] = edit_tbl.edits.map(lambda e: int(e.split(":")[0]))
    per_pos_edits = edit_tbl.groupby("pos")["#Reads"].sum()
    per_pos_edit_rates = per_pos_edits / aftbl['#Reads'].sum()
    return np.where(per_pos_edit_rates > 0.5)[0]

def format_allele_freq_table(path, varpos_amplicon):
    aftbl = pd.read_csv(path, sep='\t')
    aftbl["allele"] = aftbl.apply(lambda row:get_all_edits(row),axis=1)
    aftbl["allele_around_target"] = aftbl.apply(lambda row:get_edit(row, varpos_amplicon),axis=1)
    aftbl['be_allele'] = aftbl.allele.map(filter_out_indels)
    aftbl['be_allele_around_target'] = aftbl.allele_around_target.map(filter_out_indels)
    aftbl['err'] = aftbl.be_allele_around_target.map(lambda a:filter_allele_by_pos_qual(a, varpos_amplicon)[0])
    aftbl['N'] = aftbl.be_allele_around_target.map(lambda a:filter_allele_by_pos_qual(a, varpos_amplicon)[1])
    aftbl['target_proximal_N'] = aftbl.be_allele_around_target.map(lambda a:filter_allele_by_pos_qual(a, varpos_amplicon)[2])
    aftbl['phased_allele'] = aftbl.be_allele.map(lambda a: filter_allele_by_pos(phased_snps.amplicon_pos.tolist(), a, varpos_amplicon))
    aftbl = aftbl.loc[(aftbl.err <= 1) & (aftbl.target_proximal_N == 0) & (aftbl.N <= 2)]
    aftbl['phase'] = aftbl.phased_allele.map(lambda a: get_phase(a, phased_snps, amplicon_start))
    aftbl["assigned_phase"] = aftbl.phase.map(lambda ap: assign_phase(phases, ap))
    return aftbl

def get_phased_edit_rate(aftbl):
    var_amplicon_base = phased_snps.loc[phased_snps.amplicon_pos == amplicon_start, "amplicon_base"].item()
    aftbl.loc[~aftbl.assigned_phase.isin(editable_phase), "edited"] = np.nan
    if var_amplicon_base == var_alleles.ref.item():
        aftbl.loc[aftbl.assigned_phase.isin(editable_phase),'edited'] = aftbl.loc[aftbl.assigned_phase.isin(editable_phase),:].phased_allele.map(lambda s: f"{amplicon_start}:{amplicon_start}:+:{var_alleles.ref}>{var_alleles.edited.item()}" in s)
    elif var_amplicon_base == var_alleles.edited.item():
        aftbl.loc[aftbl.assigned_phase.isin(editable_phase),'edited'] = aftbl.loc[aftbl.assigned_phase.isin(editable_phase),:].phased_allele.map(lambda s: f"{amplicon_start}:{amplicon_start}:+:{var_amplicon_base}>{var_alleles.ref.item()}" not in s)
    aftbl['edited_clean'] = aftbl['edited'].map(lambda s: False if np.isnan(s) else s)
    return aftbl.groupby(['assigned_phase', 'edited_clean'])['#Reads'].sum()

In [16]:
results = {}
for rep in tqdm([1,2,3, "C"]):
    for cond in tqdm(["Ser", "SS"]):
        exp_id = f"{rep}-{cond}_20-locus"
        atac_path = f"../../results/atac_seq/crispresso_runs_indiv_demuxed/CRISPResso_on_{rep}-{cond}_20-locus_ATAC_Seq.{var_id}/Alleles_frequency_table.txt"
        gdna_path = f"../../results/atac_seq/crispresso_runs_indiv_demuxed/CRISPResso_on_{rep}-{cond}_20-locus_GDNA.{var_id}/Alleles_frequency_table.txt"
        
        if not os.path.exists(atac_path):
            os.system(f"unzip ../../results/atac_seq/crispresso_runs_indiv_demuxed/CRISPResso_on_{rep}-{cond}_20-locus_ATAC_Seq.{var_id}/Alleles_frequency_table.zip -d ../../results/atac_seq/crispresso_runs_indiv_demuxed/CRISPResso_on_{rep}-{cond}_20-locus_ATAC_Seq.{var_id}/")
        aftbl_atac = format_allele_freq_table(atac_path, amplicon_start)
            
        if not os.path.exists(gdna_path):
            os.system(f"unzip ../../results/atac_seq/crispresso_runs_indiv_demuxed/CRISPResso_on_{rep}-{cond}_20-locus_GDNA.{var_id}/Alleles_frequency_table.zip -d ../../results/atac_seq/crispresso_runs_indiv_demuxed/CRISPResso_on_{rep}-{cond}_20-locus_GDNA.{var_id}/")
        aftbl_gdna = format_allele_freq_table(gdna_path, amplicon_start)
        results[exp_id] = [get_phased_edit_rate(aftbl_atac), get_phased_edit_rate(aftbl_gdna)]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
res = []
for k, dfs in results.items():
    catdf = dfs[0].to_frame().join(dfs[1].to_frame(), how='outer', rsuffix='gdna').fillna(0)
    print(catdf)
    catdf.columns = ["atac", "gdna"]
    print(catdf)
    cols = catdf.index.get_level_values(0)
    catdf = catdf.T
    catdf.columns=cols
    catdf['exp'] = k
    res.append(catdf)
results_df = pd.concat(res)

                             #Reads  #Readsgdna
assigned_phase edited_clean                    
-1             False            0.0           1
 0             False           82.0         305
 1             False          117.0         360
                              atac  gdna
assigned_phase edited_clean             
-1             False           0.0     1
 0             False          82.0   305
 1             False         117.0   360
                             #Reads  #Readsgdna
assigned_phase edited_clean                    
0              False             57         313
1              False             57         327
                             atac  gdna
assigned_phase edited_clean            
0              False           57   313
1              False           57   327
                             #Reads  #Readsgdna
assigned_phase edited_clean                    
0              False             80         258
1              False             75         324
           

In [18]:
results_df.columns = ["Unassigned", "Phase0", "Phase1", "exp"]
results_df['is_atac'] = results_df.index == 'atac'
results_df['phase0_frac'] = results_df.Phase0 / results_df.iloc[:,1:3].sum(axis=1)

In [19]:
results_df['exp'] = results_df.exp.map(lambda x: x.split("_")[0])
results_df[['rep', 'cond']] = results_df.exp.str.split("-", expand=True)

In [20]:
results_df

Unnamed: 0,Unassigned,Phase0,Phase1,exp,is_atac,phase0_frac,rep,cond
atac,0.0,82.0,117.0,1-Ser,True,0.41206,1,Ser
gdna,1.0,305.0,360.0,1-Ser,False,0.458647,1,Ser
atac,,57.0,57.0,1-SS,True,0.5,1,SS
gdna,,313.0,327.0,1-SS,False,0.489063,1,SS
atac,,80.0,75.0,2-Ser,True,0.516129,2,Ser
gdna,,258.0,324.0,2-Ser,False,0.443299,2,Ser
atac,0.0,30.0,49.0,2-SS,True,0.379747,2,SS
gdna,2.0,337.0,349.0,2-SS,False,0.491254,2,SS
atac,,204.0,210.0,3-Ser,True,0.492754,3,Ser
gdna,,280.0,258.0,3-Ser,False,0.520446,3,Ser


In [21]:
y, X = dmatrices('Phase0 + Phase1 ~ rep + cond + is_atac + cond*is_atac', data = results_df, return_type='dataframe')

In [22]:
glm_binom = sm.GLM(y, X, family=sm.families.Binomial())

In [23]:
glm_res = glm_binom.fit()

In [24]:
glm_res.summary()

0,1,2,3
Dep. Variable:,"['Phase0', 'Phase1']",No. Observations:,16.0
Model:,GLM,Df Residuals:,9.0
Model Family:,Binomial,Df Model:,6.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-55.723
Date:,"Wed, 14 Jun 2023",Deviance:,10.834
Time:,13:13:09,Pearson chi2:,10.8
No. Iterations:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.0825,0.058,-1.414,0.157,-0.197,0.032
rep[T.2],0.0013,0.072,0.018,0.986,-0.140,0.142
rep[T.3],0.1524,0.069,2.219,0.026,0.018,0.287
rep[T.C],0.0862,0.069,1.254,0.210,-0.049,0.221
cond[T.Ser],-0.0717,0.056,-1.281,0.200,-0.181,0.038
is_atac[T.True],-0.0623,0.084,-0.744,0.457,-0.226,0.102
cond[T.Ser]:is_atac[T.True],0.0330,0.111,0.298,0.766,-0.184,0.250


In [27]:
result_html = glm_res.summary().tables[1].as_html()
glm_res_tbl = pd.read_html(result_html, header=0, index_col=0)[0]

In [30]:
glm_res_tbl

Unnamed: 0,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.0825,0.058,-1.414,0.157,-0.197,0.032
rep[T.2],0.0013,0.072,0.018,0.986,-0.14,0.142
rep[T.3],0.1524,0.069,2.219,0.026,0.018,0.287
rep[T.C],0.0862,0.069,1.254,0.21,-0.049,0.221
cond[T.Ser],-0.0717,0.056,-1.281,0.2,-0.181,0.038
is_atac[T.True],-0.0623,0.084,-0.744,0.457,-0.226,0.102
cond[T.Ser]:is_atac[T.True],0.033,0.111,0.298,0.766,-0.184,0.25


In [31]:
glm_res_tbl.to_csv("rs62084210_Maj_ABE_250_caqtl_result.csv")

ns