In [None]:
# Project-relative directories; the trailing slash lets callers build paths with `+`.
path_to_data, path_to_scripts, path_to_images = "data/", "scripts/", "images/"

In [None]:

## Figure 1
import numpy as np
import matplotlib.pyplot as plt

## Heat map of the read count added to mobile transcripts (4 * N * q)
## over the simulated grid of read depths N and error rates q.

N = np.linspace(100,1000,20)
q = np.linspace(0.00001, 0.05, 20)

# indexing='ij' keeps N on the first axis and q on the second.
x, y = np.meshgrid(N, q, indexing='ij')
z = (x * y) * 4

fig, ax = plt.subplots()
mesh = ax.pcolor(x, y, z, shading='auto')
fig.colorbar(mesh, ax=ax, label="Added mobile reads")
ax.set_xlabel("N")
ax.set_ylabel("q")
fig.savefig(path_to_images + "added_reads.png",dpi=300)
plt.show()

In [None]:
## Figure 2: Evaluating using absolute reads to determine mobility

## Run the simulations
## Each `!` line is a shell escape that runs the script in a separate Python process.
## The base simulation is left commented out; only the hom1000 variant is run here.
## NOTE(review): the next cell reads data/N_q_sims.rep_0hom1000.csv, presumably written by this script - confirm.

#!python scripts/simulations.py
!python scripts/simulations_hom1000.py

In [None]:
import pandas as pd

# Load replicate 0 of the hom1000 simulation, keep the rows flagged as mobile and
# write the inspection columns, ordered by N then q, to the working directory.
df = pd.read_csv("data/N_q_sims.rep_0hom1000.csv")
df_mob = df.loc[df["mobile"]==True]
export_columns = ["N","q","eco2","Nhomo1","nhomo1","Nhomo2","nhomo2","log10BF"]
df_mob_sorted = df_mob.sort_values(["N","q"])
df_mob_sorted.loc[:, export_columns].to_csv("dfmob_hom1000.csv")

In [None]:
## Plot the absolute analysis
## Each line runs one plotting script in its own Python process via the notebook shell escape.
## NOTE(review): plot_bayes_added_reads.py is also run by a later cell of this notebook.
!python scripts/plot_abs_reads.py
!python scripts/plot_universal_reads.py
!python scripts/plot_bayes_reads.py
!python scripts/plot_bayes_added_reads.py

In [None]:
## Run the simulations for the Bayes analysis with the different added mobile reads
## Uncomment to run - this is slow!
#!python3 scripts/bayes_added_reads.py

In [None]:
## Plot the simulations for the Bayes analysis with the different added mobile reads
## NOTE(review): the simulation cell before this one is commented out, so this presumably plots previously saved results - confirm.
!python3 scripts/plot_bayes_added_reads.py

In [None]:
## Run differences between mobile and nonmobile populations
## Shell escape: the script runs in a separate process and shares no variables with this notebook.
!python3 scripts/mobile_nonmobile_diff.py

In [None]:

## Plot the mobile vs non-mobile differences (script name suggests it consumes the output of mobile_nonmobile_diff.py - confirm).
!python3 scripts/plot_mobile_nonmobile_diff.py

In [None]:
## Create an accuracy plot for the simulation output
## Metric definitions used for the accuracy plots:
#Precision = True Positive/Predicted Positive
#Recall = True Positive/ Actual Positive
# F1 = 2 * (precision * recall) / (precision + recall)
## One script per method (Bayes factor, universal pipeline, absolute reads), each run in its own process.
!python scripts/plot_accuracy_bayes.py
!python scripts/plot_accuracy_universal.py
!python scripts/plot_accuracy_abs.py



In [None]:
## Combine the Bayes factors of two simulation replicates and inspect the
## low-depth / low-error corner of the parameter grid.
path_to_data = "data/"
path_to_output = "images/"
df1 = pd.read_csv(path_to_data + "N_q_sims.rep_0.csv")
df2 = pd.read_csv(path_to_data +"N_q_sims.rep_1.csv")

for rep in (df1, df2):
    ## Apply data caps: limit log10BF to [-2, 2] (NaN values are left as NaN)
    rep["log10BF"] = rep["log10BF"].clip(lower=-2, upper=2)
    ## Change datapoints where mobile transcripts have no reads mapping to the other ecotype.
    ## A single .loc[rows, column] assignment is used because the chained form
    ## `rep["mobile"].loc[mask] = ...` writes to a temporary under pandas Copy-on-Write.
    rep.loc[(rep["mobile"]==True) & (rep["n_mobile"]==0), "mobile"] = False

## Sum the Bayes factor
## df_bayes is an alias of df1, so df1 also holds the summed values afterwards
## (rows of the two replicates are matched on the index).
df_bayes = df1
df_bayes["log10BF"] = df1["log10BF"] + df2["log10BF"]

## Rows with q < 0.01 and N < 200, restricted to the columns of interest
low_corner = (df_bayes["q"]<0.01) & (df_bayes["N"]<200)
test = df_bayes.loc[low_corner, ["mobile","N","q","n_mobile","Nhomo1","nhomo1","Nhomo2","nhomo2","log10BF"]]
test.loc[test["mobile"]==True]


In [None]:
## Accuracy plot for the Bayes-factor method on the hom1000 simulation (run as a separate process).
!python scripts/plot_accuracy_bayes_hom1000.py

In [None]:
## Script to run the code for the main simulations to be analysed by the 3 methods

import numpy as np
import pandas as pd
import itertools
import random

## Simulations that only create heterograft data and use the average error rate as the prior

## Bayes factors code
## Define the main functions

def safebeta(N,n, alpha,beta):
    """Evaluate a ratio of products over the integer steps below alpha and beta.

    The value returned is

        prod_{a=1}^{alpha-1} (n+a)/(N+a)
        * prod_{a=1}^{beta-1} (N-n+a)/(N+alpha+a)
        / (N+alpha)

    where a product is skipped (treated as 1) when alpha <= 1 or beta <= 1.
    Working with products of ratios avoids forming large intermediate factorials.
    """
    if alpha > 1:
        offsets = np.arange(1, alpha)
        term = np.prod((n + offsets) / (N + offsets))
    else:
        term = 1.0
    if beta > 1.0:
        offsets = np.arange(1, beta)
        term = term * np.prod((N - n + offsets) / (N + alpha + offsets))
    return term / (N + alpha)

# Wrap so that array arguments are broadcast and evaluated element by element.
safebeta = np.vectorize(safebeta)

## Function to calculate the posterior ratio
def fasterpostN2(Nhomo1,nhomo1,Nhomo2,nhomo2,N,n,nmax):
    """Return log10 of the weight ratio between the best N2 > 0 and N2 = 0.

    For every candidate N2 from 0 up to min(N, n + nmax) an unnormalised weight

        sum_{n2} safebeta(N-N2, n-N2+n2, alpha1, beta1) * safebeta(N2, n2, alpha2, beta2) / (N2 + 1)

    is computed, with alpha_k = nhomo_k + 1 and beta_k = Nhomo_k - nhomo_k + 1.

    Parameters
    ----------
    Nhomo1, nhomo1, Nhomo2, nhomo2 : counts used to build the two (alpha, beta) pairs
        (presumably total / error read counts of the two homografts - confirm).
    N, n : counts of the sample being tested; N is truncated to int.
    nmax : how far beyond n the candidate N2 values extend.

    Returns
    -------
    log10(weight[N2max] / weight[0]) where N2max is the N2 > 0 with the largest
    weight, or nan when all weights sum to zero. A zero weight at N2 = 0 makes
    the ratio infinite (numpy emits a divide warning).
    """
    N = int(N)
    alpha1 = nhomo1+1
    beta1 = Nhomo1-nhomo1+1
    alpha2 = nhomo2+1
    beta2 = Nhomo2-nhomo2+1
    # postN2[N2 + 1] stores the weight of N2; slot 0 is never written.
    postN2 = np.zeros(N+2)
    PN2max = -10.0
    N2max = 0
    for N2 in np.arange(0,min(N+1,n+nmax+1)):
        N2 = int(N2)
        i = N2 + 1
        n2_min = max(0,N2-n)
        n2_max = min(N-n, N2)
        n2_array = np.arange(n2_min, n2_max+1)
        postN2[i] = np.sum(safebeta(N-N2,n-N2+n2_array,alpha1,beta1) * safebeta(N2,n2_array,alpha2,beta2)) / i
        # Track the best candidate with N2 > 0 (N2 = 0 is the reference).
        if N2 > 0 and postN2[i] > PN2max:
            PN2max = postN2[i]
            N2max = N2

    sumpostN2=sum(postN2)
    if sumpostN2 == 0:
        ## If sumpost = 0, can't calculate BF. Return nan
        return np.nan
    postN2=postN2/sumpostN2
    return np.log10(postN2[N2max+1]/postN2[1]) # +1 because of the index

## Reads to add to mobile SNPs
mobile_add = 4

## Define the functions
## Create reads function
def create_reads(row, q):
    """Draw int(row) uniform random numbers and count how many fall below q.

    Returns the count as a plain int; uses numpy's global random state.
    """
    below_q = np.random.rand(int(row)) < q
    return int(below_q.sum())

## Function to assign transported SNP reads
def mobile_reads(N, mobile, q,mobile_add):
    """Return the reads to add for one SNP: int(mobile_add * q * N) if mobile, else 0.

    `mobile` acts as a 0/1 multiplier, so a False flag yields zero added reads.
    """
    extra_reads = mobile_add * (q * N)
    return int(extra_reads * mobile)

## Define parameters
transcript_no = 100

## Het reads
## Draw the read depth N and error rate q of every simulated transcript once
N = np.random.uniform(low = 10, high = 1000, size = transcript_no)
q = np.random.uniform(low = 0.0, high = 0.1, size = transcript_no)
## Confusion-matrix counts, one entry per Bayes-factor threshold
tp_list = []
tn_list = []
fp_list = []
fn_list = []
## Replicates to calculate ROC-AUC
## NOTE(review): reads and mobility labels are re-drawn for every threshold, so each
## ROC point comes from a different simulated dataset - confirm this is intended.
bf_thresh = np.linspace(-1,1, 10)
for thresh in bf_thresh:
    df = pd.DataFrame({"N": N, "q": q})

    ## Calculate the average error
    av_error = np.mean(q)
    ## Run analysis for each value
    df['eco2'] = df[['N','q']].apply(lambda x: create_reads(*x),axis=1)
    df["mobile"] = random.choices([True, False], weights=[0.5, 0.5], k=transcript_no)
    df['n_mobile'] = df[["N", "mobile", "q"]].apply(lambda x: mobile_reads(*x,mobile_add), axis=1)
    ## Add an extra read to make sure there are no zero values.
    ## Written as one .loc[rows, column] update: the chained form
    ## df["n_mobile"].loc[...] += 1 does not modify df under pandas Copy-on-Write.
    df.loc[df["mobile"]==True, "n_mobile"] += 1
    df["eco2"] = df["eco2"] + df["n_mobile"]
    ## Assume well-defined error rates
    Nhomo1 = int(1000 * (1 - av_error))
    nhomo1 = int(1000 * av_error)
    Nhomo2 = int(1000 * (1 - av_error))
    nhomo2 = int(1000 * av_error)
    df["log10BF"] = df[["N","eco2"]].apply(lambda x: fasterpostN2(Nhomo1, nhomo1, Nhomo2, nhomo2, *x, 10), axis=1)
    ## Sum the Bayes factor
    df_bayes = df
    #df_bayes["log10BF"] = df1["log10BF"] + df2["log10BF"]

    ## Bayesian analysis: a transcript is called mobile when log10BF >= thresh.
    ## Rows with a NaN Bayes factor satisfy neither comparison and are counted nowhere.
    is_mobile = df_bayes["mobile"]==True
    not_mobile = df_bayes["mobile"]==False
    called = df_bayes["log10BF"]>=thresh
    rejected = df_bayes["log10BF"]<thresh
    df_bayes["TP"] = (is_mobile & called).astype(int)
    df_bayes["TN"] = (not_mobile & rejected).astype(int)
    df_bayes["FP"] = (not_mobile & called).astype(int)
    df_bayes["FN"] = (is_mobile & rejected).astype(int)

    TP = df_bayes["TP"].sum()
    TN = df_bayes["TN"].sum()
    FP = df_bayes["FP"].sum()
    FN = df_bayes["FN"].sum()

    tp_list.append(TP)
    tn_list.append(TN)
    fp_list.append(FP)
    fn_list.append(FN)



In [None]:
## ROC points: FPR on the x axis, TPR on the y axis (one point per threshold)

tp_counts, fn_counts = np.array(tp_list), np.array(fn_list)
fp_counts, tn_counts = np.array(fp_list), np.array(tn_list)
tpr = tp_counts / (tp_counts + fn_counts)
fpr = fp_counts / (fp_counts + tn_counts)

import matplotlib.pyplot as plt

ax = plt.gca()
ax.plot(fpr, tpr, 'x')
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
plt.show()


In [None]:
# Show the ROC coordinates next to the thresholds that produced them.
for roc_values in (tpr, fpr, bf_thresh):
    display(roc_values)

In [None]:
## Run z-score analysis on the simulation data
## Get average error rates across all homograft datasets
## Plot individual error rates against each other for each SNP / exp - are the ones with the high BFs consistent in errors?
from os import listdir, mkdir, getcwd
from os.path import isfile, join, isdir


## Load in all of the homograft data
hompath = "../mrna-data-mining/bayesian_SNPs/raw_data/col_ler/homfiles/"
homfiles = [f for f in listdir(hompath) if isfile(join(hompath, f))]
homfiles = [f for f in homfiles if "Store" not in f]
col_hom = [f for f in homfiles if "Col" in f]
ler_hom = [f for f in homfiles if "Ler" in f]
display(homfiles)


def _mean_error_rates(filenames, foreign_column):
    """Return the mean per-SNP error rate of each homograft file as an array.

    For every file the three depth columns are coerced to numeric, SNPs with
    depth <= 0 or missing values are dropped, and the error of a SNP is
    (foreign_column + 1) / (depth + 2). `foreign_column` is the depth column
    of the ecotype that is counted as error for these files.
    """
    depth_columns = ["colDepth","lerDepth","depth"]
    rates = []
    for f in filenames:
        counts = pd.read_csv(hompath + f, delimiter="\t")[depth_columns]
        counts = counts.apply(pd.to_numeric, errors="coerce")
        # Build a new frame instead of mutating a filtered slice in place.
        counts = counts.loc[counts["depth"]>0].dropna()
        rates.append(((counts[foreign_column] + 1) / (counts["depth"] + 2)).mean())
    return np.array(rates)


## Col homografts: Ler-assigned reads are the errors
error_list = _mean_error_rates(col_hom, "lerDepth")
print(error_list.mean())
print(error_list.std())

## Ler homografts: Col-assigned reads are the errors
error_list = _mean_error_rates(ler_hom, "colDepth")
print(error_list.mean())
print(error_list.std())

In [None]:
## Plot the errors
## For each tissue, compare the per-SNP error rate of the first two Col homograft
## files. Every file goes through the same cleaning; the copy-pasted version
## skipped the depth > 0 filter for stem replicate 1 and rosette replicate 2.

def _load_homograft_counts(filename):
    """Read one homograft table and return only complete rows with depth > 0.

    The three depth columns are coerced to numeric first; rows with a missing
    value in any column are dropped.
    """
    depth_columns = ["colDepth","lerDepth","depth"]
    counts = pd.read_csv(hompath + filename, delimiter = "\t", low_memory = False)
    counts[depth_columns] = counts[depth_columns].apply(pd.to_numeric, errors="coerce")
    counts = counts.dropna()
    return counts.loc[counts["depth"]>0]


def _plot_replicate_errors(files, title, outfile):
    """Scatter the error rate of files[0] against files[1], save to outfile, return the merged frame.

    The two tables are inner-joined on "SNP"; the error of a SNP is
    (lerDepth + 1) / (depth + 2) in each replicate.
    """
    rep1 = _load_homograft_counts(files[0])
    rep2 = _load_homograft_counts(files[1])
    merged = pd.merge(rep1, rep2, on="SNP")
    merged["error1"] = (merged["lerDepth_x"] + 1) / (merged["depth_x"] + 2)
    merged["error2"] = (merged["lerDepth_y"] + 1) / (merged["depth_y"] + 2)

    plt.plot(merged["error1"], merged["error2"],'x')
    plt.xlabel("Error rep1")
    plt.ylabel("Error rep2")
    plt.title(title)
    plt.savefig(outfile,dpi=300)
    plt.show()
    return merged


col_hom_stem = [f for f in col_hom if "Stem" in f]
df_stem = _plot_replicate_errors(col_hom_stem, "Stem", "stem_error.png")

col_hom_rosette = [f for f in col_hom if "Rosette" in f]
df_rosette = _plot_replicate_errors(col_hom_rosette, "Rosette", "rosette_error.png")

col_hom_flower = [f for f in col_hom if "Flower" in f]
df_flower = _plot_replicate_errors(col_hom_flower, "Flower", "flower_error.png")

In [None]:
## Plot the z-score analysis (runs as a separate process).
## NOTE(review): the same script is run again two cells further down.
!python scripts/plot_zscores.py

In [None]:
## Run the hom_10 simulation variant (separate process; shares no state with the notebook).
!python scripts/simulations_hom_10.py

In [None]:

## Plot the z-scores for the hom10 variant (presumably reads the output of simulations_hom_10.py - confirm).
!python scripts/plot_zscores_hom10.py

In [None]:
## NOTE(review): repeats the plot_zscores.py run from an earlier cell - confirm whether both runs are needed.
!python scripts/plot_zscores.py

In [None]:
### What difference does taking only the "best" SNPs make?

## Create the simulated data: added in a high level of noise, and different error rates for each SNP in a transcript

## Select the "best" 1/2/3/4/5 etc SNPs and see how that changes the results

import pandas as pd
import numpy as np
import random

transcripts = 1000
## Create columns for each SNP
no_snps = 100
matrix_shape = [transcripts, no_snps]

## Create a dataframe of transcripts
transcript_ids = ["Transcript_" + str(t) for t in range(transcripts)]
df = pd.DataFrame({"transcripts": transcript_ids})

## Create matrix of error rates
## The random draw (errors between 0 and 0.1) is immediately replaced by a constant
## 0.01; it is kept so that the random number stream is unchanged.
errors = np.random.rand(*matrix_shape) / 10
errors = np.full(matrix_shape, 1.0) * 0.01
## Create matrix of read depths (integers from 1 to 99)
reads = np.random.randint(low = 1, high=100, size=matrix_shape)

## Create matrix of ns
def create_reads(row, q):
    """Draw int(row) uniform random numbers and count how many fall below q.

    Returns the count as a plain int; uses numpy's global random state.
    """
    below_q = np.random.rand(int(row)) < q
    return int(below_q.sum())

## Function to assign transported SNP reads
def mobile_reads(N, mobile, q,mobile_add):
    """Return the reads to add for one SNP: int(mobile_add * q * N) if mobile, else 0.

    `mobile` acts as a 0/1 multiplier, so a False flag yields zero added reads.
    """
    extra_reads = mobile_add * (q * N)
    return int(extra_reads * mobile)

## Vectorise the helpers so they run element-wise over the (transcripts x SNPs) matrices
create_reads, mobile_reads = np.vectorize(create_reads), np.vectorize(mobile_reads)

## Error reads of the heterograft
ns = create_reads(reads, errors)

## Create mobile values: one flag per transcript, repeated for each of its SNPs

## Add on mobile reads
mobile_add = 5
mobile_def = random.choices([True, False], weights=[0.5, 0.5], k=transcripts)
mobile_def = np.repeat(mobile_def,no_snps)
mobile = mobile_def.reshape(transcripts, no_snps)

n_mobs = mobile_reads(reads, mobile, errors, mobile_add)
ns = ns + n_mobs

## Create homograft data using the same error rates
## (depths are drawn for both homografts first, then the error reads)
hom_shape = [transcripts, no_snps]
hom1_reads = np.random.randint(low = 1, high=100, size=hom_shape)
hom2_reads = np.random.randint(low = 1, high=100, size=hom_shape)
hom1_n, hom2_n = create_reads(hom1_reads, errors), create_reads(hom2_reads, errors)

## Create the dataframe of Bayes Factors
## Add in the functions
import numpy as np
import pandas as pd
import os
import warnings
from tqdm import tqdm
import multiprocessing as mp
import sys

## Define the main functions
def safebeta(N,n, alpha,beta):
    """Evaluate a ratio of products over the integer steps below alpha and beta.

    The value returned is

        prod_{a=1}^{alpha-1} (n+a)/(N+a)
        * prod_{a=1}^{beta-1} (N-n+a)/(N+alpha+a)
        / (N+alpha)

    where a product is skipped (treated as 1) when alpha <= 1 or beta <= 1.
    Working with products of ratios avoids forming large intermediate factorials.
    """
    if alpha > 1:
        offsets = np.arange(1, alpha)
        term = np.prod((n + offsets) / (N + offsets))
    else:
        term = 1.0
    if beta > 1.0:
        offsets = np.arange(1, beta)
        term = term * np.prod((N - n + offsets) / (N + alpha + offsets))
    return term / (N + alpha)

# Wrap so that array arguments are broadcast and evaluated element by element.
safebeta = np.vectorize(safebeta)

## Function to calculate the posterior ratio
def fasterpostN2(Nhomo1,nhomo1,Nhomo2,nhomo2,N,n,nmax):
    """Return log10 of the weight ratio between the best N2 > 0 and N2 = 0.

    For every candidate N2 from 0 up to min(N, n + nmax) an unnormalised weight

        sum_{n2} safebeta(N-N2, n-N2+n2, alpha1, beta1) * safebeta(N2, n2, alpha2, beta2) / (N2 + 1)

    is computed, with alpha_k = nhomo_k + 1 and beta_k = Nhomo_k - nhomo_k + 1.

    Parameters
    ----------
    Nhomo1, nhomo1, Nhomo2, nhomo2 : counts used to build the two (alpha, beta) pairs
        (presumably total / error read counts of the two homografts - confirm).
    N, n : counts of the sample being tested; N is truncated to int.
    nmax : how far beyond n the candidate N2 values extend.

    Returns
    -------
    log10(weight[N2max] / weight[0]) where N2max is the N2 > 0 with the largest
    weight, or nan when all weights sum to zero. A zero weight at N2 = 0 makes
    the ratio infinite (numpy emits a divide warning).
    """
    N = int(N)
    alpha1 = nhomo1+1
    beta1 = Nhomo1-nhomo1+1
    alpha2 = nhomo2+1
    beta2 = Nhomo2-nhomo2+1
    # postN2[N2 + 1] stores the weight of N2; slot 0 is never written.
    postN2 = np.zeros(N+2)
    PN2max = -10.0
    N2max = 0
    for N2 in np.arange(0,min(N+1,n+nmax+1)):
        N2 = int(N2)
        i = N2 + 1
        n2_min = max(0,N2-n)
        n2_max = min(N-n, N2)
        n2_array = np.arange(n2_min, n2_max+1)
        postN2[i] = np.sum(safebeta(N-N2,n-N2+n2_array,alpha1,beta1) * safebeta(N2,n2_array,alpha2,beta2)) / i
        # Track the best candidate with N2 > 0 (N2 = 0 is the reference).
        if N2 > 0 and postN2[i] > PN2max:
            PN2max = postN2[i]
            N2max = N2

    sumpostN2=sum(postN2)
    if sumpostN2 == 0:
        ## If sumpost = 0, can't calculate BF. Return nan
        return np.nan
    postN2=postN2/sumpostN2
    return np.log10(postN2[N2max+1]/postN2[1]) # +1 because of the index

# Element-wise wrapper so the function can be applied to whole count matrices.
fasterpostN2 = np.vectorize(fasterpostN2)

## Iterate through every value in the dataframes
## fasterpostN2 was wrapped in np.vectorize above, so this call evaluates it
## element by element over the count matrices; the last argument (nmax = 10) is a scalar.
bfs = fasterpostN2(hom1_reads,hom1_n,hom2_reads,hom2_n,reads,ns,10)



In [None]:
## Create results using all the SNPs
## A transcript is called mobile when its Bayes factors summed over all SNPs reach 1.

summed_bfs = bfs.sum(axis=1)
# Optional replacement of undefined sums, currently disabled:
#summed_bfs[np.isnan(summed_bfs)] = 20
#summed_bfs[np.isinf(summed_bfs)] = -2
mobile_list = mobile[:,0]

is_mobile = mobile_list==True
not_mobile = mobile_list==False
# The two comparisons are kept separate: a NaN sum satisfies neither of them.
called = summed_bfs>=1
rejected = summed_bfs<1

tp = summed_bfs[called & is_mobile]
tn = summed_bfs[rejected & not_mobile]
fp = summed_bfs[called & not_mobile]
fn = summed_bfs[rejected & is_mobile]

print(*(len(group) for group in (tp, tn, fp, fn)))


In [None]:
## Can we improve this by only taking the best SNPs in the homograft data?
## Let's start with just the N values - choose the best from both homografts

## Check all of the different SNP combinations: for k = 1 .. all SNPs, sum the Bayes
## factors of the k SNPs with the highest homograft 1 read depth in each transcript.
import matplotlib.pyplot as plt

n_transcripts, n_snps = bfs.shape
# Start at 1: with k = 0 the slice [-0:] selects every SNP instead of none.
snps = range(1, n_snps + 1)

test_hom = hom1_reads.copy()
mobile_list = mobile[:,0]
tpr_values = []

for snp in snps:
    new_bfs = np.array([
        bfs[i, np.argpartition(test_hom[i], -snp)[-snp:]].sum()
        for i in range(n_transcripts)
    ])
    ## NOTE(review): inf -> 20 and nan -> -2 here, but a later cell uses the opposite mapping.
    new_bfs[np.isinf(new_bfs)] = 20
    new_bfs[np.isnan(new_bfs)] = -2
    tp = new_bfs[(new_bfs>=1) & (mobile_list==True) ]
    tn = new_bfs[(new_bfs<1) & (mobile_list==False) ]
    fp = new_bfs[(new_bfs>=1) & (mobile_list==False)]
    fn = new_bfs[(new_bfs<1) & (mobile_list==True) ]

    print(len(tp), len(tn), len(fp), len(fn))
    tpr = len(tp)/(len(tp)+len(fn))
    print("TPR", tpr)
    tpr_values.append(tpr)

plt.plot(list(snps), tpr_values, 'kx')
plt.xlabel("SNPs")
plt.ylabel("TPR")
plt.savefig("snps_summed.png",dpi=300)
plt.show()


In [None]:
## Sum the Bayes factors of the three SNPs with the highest combined homograft depth
test_hom = hom1_reads.copy()
hom_total = hom1_reads + hom2_reads
#test = np.stack((hom1_reads[i],hom2_reads[i]),axis=1)
new_bfs = np.array([
    bfs[i, np.argpartition(hom_total[i], -3)[-3:]].sum()
    for i in range(1000)
])

## NOTE(review): nan -> 20 and inf -> -2 here, the reverse of the mapping used in
## the neighbouring cells (inf -> 20, nan -> -2) - confirm which one is intended.
new_bfs[np.isnan(new_bfs)] = 20
new_bfs[np.isinf(new_bfs)] = -2
mobile_list = mobile[:,0]

is_mobile = mobile_list==True
not_mobile = mobile_list==False
called = new_bfs>=1
rejected = new_bfs<1
tp = new_bfs[called & is_mobile]
tn = new_bfs[rejected & not_mobile]
fp = new_bfs[called & not_mobile]
fn = new_bfs[rejected & is_mobile]

print(*(len(group) for group in (tp, tn, fp, fn)))

In [None]:
import matplotlib.pyplot as plt

## Only use SNPs whose homograft 1 read depth exceeds a cutoff, then classify each
## transcript from the Bayes factors summed over its retained SNPs.
## NOTE(review): the earlier version filtered the Bayes factors but then classified
## the `new_bfs` left behind by another cell and plotted against a stale `snp`;
## the per-transcript sum below is the presumed intent - confirm.
depth_cutoff = 50

test_hom = hom1_reads.copy()
deep_enough = test_hom > depth_cutoff
print(np.shape(test_hom))
print(int(deep_enough.sum()), "SNPs pass the depth cutoff")

test_bfs = bfs.copy()
test_bfs[np.isinf(test_bfs)] = 20
test_bfs[np.isnan(test_bfs)] = -2
# SNPs below the cutoff contribute 0 to the transcript's sum.
new_bfs = np.where(deep_enough, test_bfs, 0).sum(axis=1)
mobile_list = mobile[:,0]

tp = new_bfs[(new_bfs>=1) & (mobile_list==True) ]
tn = new_bfs[(new_bfs<1) & (mobile_list==False) ]
fp = new_bfs[(new_bfs>=1) & (mobile_list==False)]
fn = new_bfs[(new_bfs<1) & (mobile_list==True) ]

print(len(tp), len(tn), len(fp), len(fn))
tpr = len(tp)/(len(tp)+len(fn))
print("TPR", tpr)
plt.plot(depth_cutoff, tpr, 'kx')
plt.xlabel("Homograft read depth cutoff")
plt.ylabel("TPR")
plt.savefig("snps_summed_cutoff.png",dpi=300)
plt.show()

In [None]:
## Create the final results plots for the paper

## Simulation parameters: read depth and sequencing error rate
N, q = 100, 0.01

## Can't use  different values of N2 as that only covers the mobile transcripts. Doh!

## Use different values of SNPs - just create all of the values at once though.
no_SNPs = 20

## Number of simulated transcripts, and the list collecting one dataframe per SNP
transcripts = 1000
df_list = list()

## Create the mobile transcripts - same for each dataframe
mobile_def = random.choices([True, False], weights=[0.5, 0.5], k=transcripts)

## Function to create sequencing errors
def create_errors(row, q):
    """Draw int(row) uniform random numbers and count how many fall below q.

    Returns the count as a plain int; uses numpy's global random state.
    """
    below_q = np.random.rand(int(row)) < q
    return int(below_q.sum())

## Create the homograft datasets
## Homograft read depths are drawn once and shared by every SNP dataframe
Nhom1 = np.random.randint(low = 10, high = 100, size = transcripts)
Nhom2 = np.random.randint(low = 10, high = 100, size = transcripts)

column_names = ["Nhom1","Nhom2","Nhet","mobile"]
for snp in range(no_SNPs):

    ## Create the heterograft datasets (new depths for every SNP)
    Nhet = np.random.randint(low = 10, high = 100, size = transcripts)
    df = pd.DataFrame([Nhom1, Nhom2, Nhet, mobile_def]).T
    df.columns = column_names
    df["N2"] = 5

    ## Create the sequencing errors, in the order homograft 1, homograft 2, heterograft
    for count_column, depth_column in (("nhom1", "Nhom1"), ("nhom2", "Nhom2"), ("n", "Nhet")):
        df[count_column] = df[depth_column].apply(create_errors, args=(q,))

    ## Only mobile transcripts receive the 5 extra reads
    df["N2"] = df["N2"] * df["mobile"]
    df["n"] = df["n"] + df["N2"]

    ## Run Bayes analysis
    df["log10BF"] = df.apply(
        lambda row: fasterpostN2(row.Nhom1, row.nhom1, row.Nhom2, row.nhom2, row.Nhet, row.n, 10),
        axis=1,
    )
    df_list.append(df)

df_list

In [None]:
from functools import reduce


def _accuracy(called, rejected, is_mobile, not_mobile):
    """Return (TP + TN) / (TP + TN + FP + FN) computed from boolean masks.

    Rows that are neither called nor rejected (e.g. a NaN score) are counted nowhere.
    """
    tp = (called & is_mobile).sum()
    tn = (rejected & not_mobile).sum()
    fp = (called & not_mobile).sum()
    fn = (rejected & is_mobile).sum()
    return (tp + tn) / (tp + tn + fp + fn)


## Plot the accuracy for each number of SNPs
snp_counts = np.arange(1,20)
bf_scores, abs_scores, uni_scores = [], [], []
## Universal pipeline: a SNP needs more than `cutoff` reads in the heterograft
cutoff = 1

for snp in snp_counts:
    dfs = df_list[0:snp]
    n_used = len(dfs)

    for df_snp in dfs:
        ## Method A: any read at the SNP counts as a positive
        df_snp["abs"] = (df_snp["n"] > 0).astype(int)
        ## For the universal pipeline method, remove all values with non-zeros counts in the homograft data
        homograft_hits = (df_snp["nhom1"]>0)|(df_snp["nhom2"]>0)
        df_snp["universal"] = ((df_snp["n"] > cutoff) & ~homograft_hits).astype(int)

    ## Add together the dataframes: log10BF becomes the summed Bayes factor and
    ## "abs" / "universal" become the number of SNPs with a positive call.
    this_df = reduce(lambda x, y: x.add(y, fill_value=0), dfs)
    is_mobile = this_df["mobile"] > 0
    not_mobile = this_df["mobile"] == 0

    bf_accuracy = _accuracy(this_df["log10BF"]>=1, this_df["log10BF"]<1, is_mobile, not_mobile)

    ## Classification requires positive identification in all SNPs.
    ## One comparison is used; thresholding the column in two passes reset every
    ## transcript to 0 as soon as more than one SNP was used.
    abs_called = this_df["abs"] == n_used
    abs_accuracy = _accuracy(abs_called, ~abs_called, is_mobile, not_mobile)

    ## Universal pipeline: has to be identified in every SNP used
    uni_accuracy = _accuracy(this_df["universal"]==n_used, this_df["universal"]<n_used, is_mobile, not_mobile)

    bf_scores.append(bf_accuracy)
    abs_scores.append(abs_accuracy)
    uni_scores.append(uni_accuracy)

plt.plot(snp_counts, bf_scores, "kx", label = "BF")
plt.plot(snp_counts, abs_scores, "rx", label = "Method A")
plt.plot(snp_counts, uni_scores, "bx",label = "Method B")

plt.xlabel("No of SNPs")
plt.ylabel("Accuracy")
plt.xlim(0,20)
plt.ylim(0,1)
plt.xticks(np.arange(0, 20, step=2)) 
plt.legend()
plt.savefig("accuracy_comp.png",dpi=300)
plt.show()


In [None]:
## Change Method A and Method B so that they only require N SNPs

req_SNPs = 2

from functools import reduce


def _accuracy(called, rejected, is_mobile, not_mobile):
    """Return (TP + TN) / (TP + TN + FP + FN) computed from boolean masks.

    Rows that are neither called nor rejected (e.g. a NaN score) are counted nowhere.
    """
    tp = (called & is_mobile).sum()
    tn = (rejected & not_mobile).sum()
    fp = (called & not_mobile).sum()
    fn = (rejected & is_mobile).sum()
    return (tp + tn) / (tp + tn + fp + fn)


## Plot the accuracy for each number of SNPs
snp_counts = np.arange(1,20)
bf_scores, abs_scores, uni_scores = [], [], []

for snp in snp_counts:
    dfs = df_list[0:snp]

    for df_snp in dfs:
        ## Classification: any read at the SNP counts as a positive
        df_snp["abs"] = (df_snp["n"] > 0).astype(int)

        ## Universal pipeline: positive only when both homografts show no reads
        homograft_hits = (df_snp["nhom1"]>0)|(df_snp["nhom2"]>0)
        df_snp["universal"] = ((df_snp["n"] > 0) & ~homograft_hits).astype(int)

    ## Add together the dataframes: "abs" / "universal" become per-transcript
    ## counts of positive SNPs and log10BF the summed Bayes factor.
    this_df = reduce(lambda x, y: x.add(y, fill_value=0), dfs)
    is_mobile = this_df["mobile"] > 0
    not_mobile = this_df["mobile"] == 0

    bf_accuracy = _accuracy(this_df["log10BF"]>=1, this_df["log10BF"]<1, is_mobile, not_mobile)

    ## A transcript is called when at least req_SNPs of its SNPs are positive.
    ## The count is compared once: binarising the column first and then testing
    ## `>= req_SNPs` again can never be true for req_SNPs > 1.
    abs_called = this_df["abs"] >= req_SNPs
    abs_accuracy = _accuracy(abs_called, ~abs_called, is_mobile, not_mobile)

    uni_called = this_df["universal"] >= req_SNPs
    uni_accuracy = _accuracy(uni_called, ~uni_called, is_mobile, not_mobile)

    bf_scores.append(bf_accuracy)
    abs_scores.append(abs_accuracy)
    uni_scores.append(uni_accuracy)

plt.plot(snp_counts, bf_scores, "kx", label = "BF")
plt.plot(snp_counts, abs_scores, "rx", label = "Method A")
plt.plot(snp_counts, uni_scores, "bx",label = "Method B")

plt.xlabel("No of SNPs")
plt.ylabel("Accuracy")
plt.xlim(0,20)
plt.ylim(0,1)
plt.xticks(np.arange(0, 20, step=2)) 
plt.legend()
plt.savefig("accuracy_req_snps2_error_p01.png",dpi=300)
plt.show()
    

In [None]:
## Run again for 1 SNP, but different values for q

## Create the final results plots for the paper

## Read depth and the grid of error rates to sweep
N = 1000
qs = np.linspace(0.001,0.01,10)

## Number of simulated transcripts and a fresh list of dataframes
transcripts = 1000
df_list = list()

## Create the mobile transcripts - same for each dataframe.
## Each transcript's flag is repeated once per q value, so consecutive rows
## belong to the same transcript.
mobile_def = np.repeat(
    random.choices([True, False], weights=[0.5, 0.5], k=transcripts),
    len(qs),
)

## Function to create sequencing errors
def create_errors(row, q):
    """Draw int(row) uniform random numbers and count how many fall below q.

    Returns the count as a plain int; uses numpy's global random state.
    """
    below_q = np.random.rand(int(row)) < q
    return int(below_q.sum())

## Create the q values
## One row per (transcript, q): the q grid is tiled so that it lines up with the
## repeated mobile flags.
qvalues = np.tile(qs, transcripts)

## Create the homograft and heterograft datasets: a fixed depth of 1000 reads,
## repeated for every q value (random depths are not used here).
n_rows_per_transcript = len(qs)
Nhom1 = np.repeat(np.ones(transcripts) * 1000, n_rows_per_transcript)
Nhom2 = np.repeat(np.ones(transcripts) * 1000, n_rows_per_transcript)
Nhet = np.repeat(np.ones(transcripts) * 1000, n_rows_per_transcript)

df = pd.DataFrame([Nhom1, Nhom2, Nhet, mobile_def, qvalues]).T
df.columns = ["Nhom1","Nhom2","Nhet","mobile","q"]
## Reads added to a mobile transcript: five times the expected error count at depth 1000
df["N2"] = df["q"] * 1000 * 5

## Create the sequencing errors (homograft 1, homograft 2, then heterograft)
df["nhom1"] = [create_errors(depth, rate) for depth, rate in zip(df["Nhom1"], df["q"])]
df["nhom2"] = [create_errors(depth, rate) for depth, rate in zip(df["Nhom2"], df["q"])]
df["n"] = [create_errors(depth, rate) for depth, rate in zip(df["Nhet"], df["q"])]

## Non-mobile transcripts get no added reads
df["N2"] = df["N2"] * df["mobile"]
df["n"] = df["n"] + df["N2"]

## Run Bayes analysis
df["log10BF"] = df.apply(
    lambda row: fasterpostN2(row.Nhom1, row.nhom1, row.Nhom2, row.nhom2, row.Nhet, row.n, 10),
    axis=1,
)


In [None]:
req_SNPs = 1


def _confusion_accuracy(frame, suffix, called, rejected):
    """Add TP_/TN_/FP_/FN_<suffix> 0/1 columns to `frame` and return the accuracy.

    `called` / `rejected` are boolean masks of the positive and negative calls;
    a row is mobile when frame["mobile"] > 0 and non-mobile when it equals 0.
    """
    is_mobile = frame["mobile"] > 0
    not_mobile = frame["mobile"] == 0
    frame["TP_" + suffix] = (called & is_mobile).astype(int)
    frame["TN_" + suffix] = (rejected & not_mobile).astype(int)
    frame["FP_" + suffix] = (called & not_mobile).astype(int)
    frame["FN_" + suffix] = (rejected & is_mobile).astype(int)
    tp = frame["TP_" + suffix].sum()
    tn = frame["TN_" + suffix].sum()
    fp = frame["FP_" + suffix].sum()
    fn = frame["FN_" + suffix].sum()
    return (tp + tn) / (tp + tn + fp + fn)


## Plot the accuracy for each value for q
bf_scores, abs_scores, uni_scores = [], [], []
for q in qs:
    print(q)
    # Work on an explicit copy: adding columns to a filtered slice of df is
    # ambiguous (SettingWithCopyWarning) and chained writes may be lost.
    this_df = df[df["q"] == q].copy()

    ## Method A: any read counts as a positive
    this_df["abs"] = (this_df["n"] > 0).astype(int)

    ## Universal pipeline: positive only when both homografts show no reads
    homograft_hits = (this_df["nhom1"]>0)|(this_df["nhom2"]>0)
    this_df["universal"] = ((this_df["n"] > 0) & ~homograft_hits).astype(int)

    ## Calculate TP, TN, FP, FN for each method
    bf_accuracy = _confusion_accuracy(this_df, "bf", this_df["log10BF"]>=1, this_df["log10BF"]<1)
    abs_accuracy = _confusion_accuracy(this_df, "class", this_df["abs"]>0, this_df["abs"]==0)
    uni_accuracy = _confusion_accuracy(this_df, "universal", this_df["universal"]>=req_SNPs, this_df["universal"]<req_SNPs)

    bf_scores.append(bf_accuracy)
    abs_scores.append(abs_accuracy)
    uni_scores.append(uni_accuracy)

plt.plot(qs, bf_scores, "kx", label = "BF")
plt.plot(qs, abs_scores, "rx", label = "Method A")
plt.plot(qs, uni_scores, "bx",label = "Method B")

plt.xlabel("q")
plt.ylabel("Accuracy")
plt.legend()
plt.savefig("q_comp_1000N2.png",dpi=300)
plt.show()

In [None]:
## Tweak the code so that the analysis is added to the dataframes

path_to_data = "data/"
path_to_output = "images/"
df1 = pd.read_csv(path_to_data + "N_q_sims.rep_0.csv")
df2 = pd.read_csv(path_to_data +"N_q_sims.rep_1.csv")

df1_q=df1[(df1["q"]>0.01) & (df1["q"]<0.011)]
df2_q=df2[(df2["q"]>0.01) & (df2["q"]<0.011)]

print(df1_q)

## Create an output dataframe
## NOTE(review): the output is built from `df` (the simulation held in memory from an
## earlier cell), not from df1_q / df2_q loaded above - confirm this is intended.
df1_q_output = df.copy()

ns = set(df1_q["N"].to_list())

## Calls of the two read-count methods (0/1 per row)
method_a = (df1_q_output["n"] > 0).astype(int)
## Universal pipeline: positive only when both homografts show no reads
homograft_hits = (df1_q_output["nhom1"]>0)|(df1_q_output["nhom2"]>0)
method_b = ((df1_q_output["n"] > 0) & ~homograft_hits).astype(int)

is_mobile = df1_q_output["mobile"] > 0
not_mobile = df1_q_output["mobile"] == 0

## Add in the columns for the results.
## For every method: TP = called & mobile, TN = rejected & non-mobile,
## FP = called & non-mobile, FN = rejected & mobile. (The Method B false
## positive / false negative definitions used to be swapped.)
calls = {
    "bf": (df1_q_output["log10BF"]>=1, df1_q_output["log10BF"]<1),
    "Method_A": (method_a > 0, method_a == 0),
    "Method_B": (method_b >= 1, method_b < 1),
}
for suffix, (called, rejected) in calls.items():
    df1_q_output["TP_" + suffix] = (called & is_mobile).astype(int)
    df1_q_output["TN_" + suffix] = (rejected & not_mobile).astype(int)
    df1_q_output["FP_" + suffix] = (called & not_mobile).astype(int)
    df1_q_output["FN_" + suffix] = (rejected & is_mobile).astype(int)

## Now we are updating the output dataframe
df1_q_output["Method A"] = method_a
df1_q_output["Method B"] = method_b

## Run the plots: accuracy of each method at every simulated q
bf_scores, abs_scores, uni_scores = [], [], []
for n in qs:
    df_slice = df1_q_output[df1_q_output["q"]==n]
    scores = []
    for suffix in ("bf", "Method_A", "Method_B"):
        tp = df_slice["TP_" + suffix].sum()
        fp = df_slice["FP_" + suffix].sum()
        tn = df_slice["TN_" + suffix].sum()
        fn = df_slice["FN_" + suffix].sum()
        scores.append((tp + tn) / (tp + tn + fp + fn))
    bf_accuracy, abs_accuracy, uni_accuracy = scores
    bf_scores.append(bf_accuracy)
    abs_scores.append(abs_accuracy)
    uni_scores.append(uni_accuracy)

plt.plot(qs, bf_scores, "kx", label = "BF")
plt.plot(qs, abs_scores, "rx", label = "Method A")
plt.plot(qs, uni_scores, "bx",label = "Method B")

# The x axis holds the q values iterated above.
plt.xlabel("q")
plt.ylabel("Accuracy")
plt.legend()
#plt.savefig("N_comp_p01q.png",dpi=300)
plt.show()


In [None]:


## Plot one accuracy point per method for the rows in this_df and save the figure.
## NOTE(review): this cell assigns neither `this_df` nor `n`; it only runs if both
## are still in the kernel from another cell - confirm where they come from.
## NOTE(review): the columns read here end in _class and _universal, while the
## neighbouring cells fill columns ending in _Method_A and _Method_B - confirm
## that this_df really carries these columns.

## Bayes factor accuracy: (TP + TN) / all classified rows
tp = this_df["TP_bf"].sum()
fp = this_df["FP_bf"].sum()
tn = this_df["TN_bf"].sum()
fn = this_df["FN_bf"].sum()
bf_accuracy = (tp + tn) / (tp + tn + fp + fn)
plt.plot(n, bf_accuracy,"kx")

## Accuracy from the *_class columns (plotted in red, labelled "Method A" below)
tp = this_df["TP_class"].sum()
fp = this_df["FP_class"].sum()
tn = this_df["TN_class"].sum()
fn = this_df["FN_class"].sum()
abs_accuracy = (tp + tn) / (tp + tn + fp + fn)
plt.plot(n, abs_accuracy,"rx")

## Accuracy from the *_universal columns (plotted in blue, labelled "Method B" below)
tp = this_df["TP_universal"].sum()
fp = this_df["FP_universal"].sum()
tn = this_df["TN_universal"].sum()
fn = this_df["FN_universal"].sum()
uni_accuracy = (tp + tn) / (tp + tn + fp + fn)
plt.plot(n, uni_accuracy,"bx")

## The same three points are drawn again so that each carries a legend label
plt.plot(n, bf_accuracy, "kx", label = "BF")
plt.plot(n, abs_accuracy, "rx", label = "Method A")
plt.plot(n, uni_accuracy, "bx",label = "Method B")

plt.xlabel("N")
plt.ylabel("Accuracy")
plt.legend()
## Written to the current working directory
plt.savefig("N_comp_p01q.png",dpi=300)
plt.show()


In [None]:
## Display the df1_q_output table as this cell's output
df1_q_output

In [None]:
## Write df1_q_output to df1_N1000_full.csv (path is relative to the working directory)
df1_q_output.to_csv("df1_N1000_full.csv", index=None)

In [None]:
import random
import pandas as pd
import numpy as np
import itertools
import configparser
import json
from distutils.util import strtobool
## Script to run data and analysis in one go

## Function definitions
## Define the main functions
def safebeta(N, n, alpha, beta):
    """Evaluate the product-form term used by the posterior calculation.

    The value returned is::

        prod((n + a) / (N + a)              for a in np.arange(1, alpha))
      * prod((N - n + b) / (N + alpha + b)  for b in np.arange(1, beta))
      / (N + alpha)

    A product is taken to be 1 when its bound (alpha or beta) is not above 1.

    Parameters
    ----------
    N, n : numbers
        Totals entering the numerators and denominators above.
    alpha, beta : numbers
        Upper bounds (exclusive) of the two running products.

    Returns
    -------
    float
        The combined term. After the np.vectorize wrapping below, array
        arguments are broadcast and an array is returned.
    """
    # Factor governed by alpha
    alpha_term = 1.0
    if alpha > 1:
        steps = np.arange(1, alpha)
        alpha_term = np.prod((n + steps) / (N + steps))

    # Factor governed by beta
    beta_term = 1.0
    if beta > 1.0:
        steps = np.arange(1, beta)
        beta_term = np.prod((N - n + steps) / (N + alpha + steps))

    return alpha_term * beta_term / (N + alpha)

# Element-wise wrapper so callers can pass arrays for any argument
safebeta = np.vectorize(safebeta)

## Function to calculate the posterior ratio
def fasterpostN2(Nhomo1,nhomo1,Nhomo2,nhomo2,N,n,nmax):
    """Return log10 of the ratio between the best N2 > 0 weight and the N2 = 0 weight.

    For every candidate N2 from 0 up to min(N, n + nmax) a weight postN2 is
    accumulated from products of safebeta terms. The weights are normalised by
    their sum, and the result is log10(postN2[best N2 > 0] / postN2[N2 = 0]).

    Parameters
    ----------
    Nhomo1, nhomo1 : numbers
        Used to build alpha1 = nhomo1 + 1 and beta1 = Nhomo1 - nhomo1 + 1.
        Presumably total and error read counts of the first homograft - confirm.
    Nhomo2, nhomo2 : numbers
        Used to build alpha2 and beta2 in the same way.
    N : number
        Cast to int; sizes the work arrays and bounds the N2 loop.
    n : number
        Enters the bounds of the inner n2 range and the first safebeta term.
    nmax : number
        Caps the N2 loop at n + nmax.

    Returns
    -------
    float
        log10 ratio described above, or np.nan when the weights sum to zero.
        If no N2 > 0 is evaluated, N2max stays 0 and the ratio is 1 (log10 = 0).
    """
    N = int(N)
    alpha1 = nhomo1+1
    beta1 = Nhomo1-nhomo1+1
    alpha2 = nhomo2+1
    beta2 = Nhomo2-nhomo2+1
    # Entries are stored at index N2 + 1, so slot 0 is never written
    postN2 = np.zeros(N+2)
    postN2xN2 = np.zeros(N+2)
    # Running maximum over N2 > 0 and the N2 at which it occurs
    PN2max = -10.0
    N2max = 0
    for N2 in np.arange(0,min(N+1,n+nmax+1)):
        N2 = int(N2)
        i = N2 + 1
        # Range of n2 values summed over for this N2
        n2_min = max(0,N2-n)
        n2_max = min(N-n, N2)
        n2_array = np.arange(n2_min, n2_max+1)
        postN2[i] = np.sum(safebeta(N-N2,n-N2+n2_array,alpha1,beta1) * safebeta(N2,n2_array,alpha2,beta2))
        # Divide by i = N2 + 1
        postN2[i]=postN2[i]/i
        # N2 = 0 is excluded from the maximum search; it is the reference weight
        if (N2>0) & (postN2[i]>PN2max):
            PN2max = postN2[i]
            N2max = N2
        
        postN2xN2[i]=postN2[i]*N2
    
    sumpostN2=sum(postN2)
    if sumpostN2 != 0:
        # Normalise so the weights sum to one
        postN2=postN2/sumpostN2
        postN2xN2=postN2xN2/sumpostN2
        logBF21N2 = np.log10(postN2[N2max+1]/postN2[1]) # +1 because of the index
        meanN2=sum(postN2xN2)
        # NOTE(review): the list below is overwritten on the next line, so only
        # logBF21N2 is returned; meanN2 and N2max are discarded.
        results = [meanN2,N2max,logBF21N2]
        results = logBF21N2
    else:
        ## If sumpost = 0, can't calculate BF. Return nan
         # NOTE(review): as above, the list is overwritten by the scalar np.nan
         results = [np.nan, np.nan, np.nan]
         results = np.nan
    return results
# Element-wise wrapper: array arguments are broadcast and an array is returned
fasterpostN2 = np.vectorize(fasterpostN2)
## Function to create sequencing errors
def create_errors(row, q):
    """Simulate how many of ``row`` reads are errors at per-read rate ``q``.

    One uniform number in [0, 1) is drawn per read (``row`` is truncated to an
    int first); a read counts as an error when its draw is below ``q``.

    Returns the number of error reads as a Python int.
    """
    draws = np.random.rand(int(row))
    return int(np.count_nonzero(draws < q))

## Load the simulation settings from parameters.cfg
config = configparser.ConfigParser()
config.read('parameters.cfg')
section_name = "Simulation parameters"

## List-valued settings are stored as JSON arrays.
## N2_values is the multiplier applied to q * N when mobile reads are added.
N_values, Nhom1_values, Nhom2_values, q_values, N2_values = [
    json.loads(config.get(section_name, option))
    for option in ("N_values", "Nhom1_values", "Nhom2_values", "q_values", "N2_values")
]

## Homografts either share one fixed read depth or follow the heterograft depth
constant_Nhom = bool(strtobool(config.get(section_name, "constant_Nhom")))
if constant_Nhom:
    ## Only read (and only defined) when a fixed homograft depth is requested
    constant_Nhom_value = int(config.get(section_name, "constant_Nhom_value"))

## Flags that switch on random resampling of N, Nhom and q
random_N, random_Nhom, random_q = [
    bool(strtobool(config.get(section_name, option)))
    for option in ("random_N", "random_Nhom", "random_q")
]

## Number of simulated transcripts per parameter combination (integer)
no_transcripts = int(config.get(section_name, "no_transcripts"))

## Create iterations of all parameter values
## Only N, q and the N2 multiplier are crossed, so each tuple has exactly the
## three fields named in `columns` (crossing five lists while naming three
## columns makes pd.DataFrame raise a ValueError).
## NOTE(review): Nhom1_values / Nhom2_values are therefore not part of the grid;
## Nhom1 and Nhom2 are assigned just below from constant_Nhom_value or N.
## Confirm this is the intended design.
settings_list = [N_values, q_values, N2_values]
data = list(itertools.product(*settings_list))
df = pd.DataFrame(data, columns = ['N','q',"N2_func"])
## One copy of the full grid per transcript, stacked transcript after transcript
df = pd.concat([df]*no_transcripts)

if constant_Nhom:
    df["Nhom1"] = constant_Nhom_value
    df["Nhom2"] = constant_Nhom_value
else:
    df["Nhom1"] = df["N"]
    df["Nhom2"] = df["N"]

## Define our mobile transcripts - each transcript should have one unique definition which is kept the same for all parameter values
mobile_def = random.choices([True, False], weights=[0.5, 0.5], k=no_transcripts)
## Repeat each transcript's flag len(data) times so it lines up with the stacked grid copies
mobile_def = np.repeat(mobile_def, len(data))
df["mobile"] = mobile_def

## Apply random flags (if appropriate)
## Depth columns are redrawn as integers in [10, current value), in the order
## N, Nhom1, Nhom2; q is redrawn uniformly between 0 and its current value.
depth_columns_to_randomise = []
if random_N:
    depth_columns_to_randomise.append("N")
if random_Nhom:
    depth_columns_to_randomise.extend(["Nhom1", "Nhom2"])
for depth_column in depth_columns_to_randomise:
    df[depth_column] = df[depth_column].apply(
        lambda upper: np.random.randint(low = 10, high = upper)
    )
if random_q:
    df["q"] = df["q"].apply(lambda upper: np.random.uniform(low = 0, high = upper))

## Create the sequencing errors: one error count per row for each read depth
for count_column, depth_column in (("nhom1", "Nhom1"), ("nhom2", "Nhom2"), ("n", "N")):
    df[count_column] = df.apply(
        lambda row, depth_column=depth_column: create_errors(row[depth_column], row.q),
        axis = 1,
    )

## Added reads: N2_func * q * N on mobile rows, zero elsewhere
added_reads = df["N2_func"] * df["mobile"] * df["q"] * df["N"]
df["N2"] = added_reads
df["n"] = df["n"] + added_reads

## Run Bayes analysis
df["log10BF"] = df.apply(
    lambda row: fasterpostN2(row.Nhom1, row.nhom1, row.Nhom2, row.nhom2, row.N, row.n, 10),
    axis = 1,
)

## Run the analysis for Method A and Method B
## Add in the columns for the results: TP/TN/FP/FN flags for each method, all 0 to start
for method_suffix in ("bf", "Method_A", "Method_B"):
    for outcome in ("TP", "TN", "FP", "FN"):
        df[outcome + "_" + method_suffix] = 0

## All assignments below use df.loc[mask, column] so they write into df itself;
## the chained form df[column].loc[mask] = 1 can end up writing to a temporary
## Series and leave df unchanged (pandas Copy-on-Write).

## Now we are updating the output dataframe
## Method A: call a row mobile (1) whenever n > 0
df["Method A"] = df["n"]
df.loc[df["Method A"]>0, "Method A"] = 1

## Universal pipeline
## Method B: as Method A, but reset to 0 when either homograft shows any reads
df["Method B"] = df["n"]
df.loc[df["Method B"]>0, "Method B"] = 1
df.loc[(df["nhom1"]>0)|(df["nhom2"]>0), "Method B"] = 0

## Bayes factor: a row is called mobile when log10BF >= 1
df.loc[(df["log10BF"]>=1) & (df["mobile"] > 0), "TP_bf"] = 1
df.loc[(df["log10BF"]<1) & (df["mobile"] == 0), "TN_bf"] = 1
df.loc[(df["log10BF"]>=1) & (df["mobile"] == 0), "FP_bf"] = 1
df.loc[(df["log10BF"]<1) & (df["mobile"] > 0), "FN_bf"] = 1

df.loc[(df["Method A"]>0) & (df["mobile"] > 0), "TP_Method_A"] = 1
df.loc[(df["Method A"]==0) & (df["mobile"] == 0), "TN_Method_A"] = 1
df.loc[(df["Method A"]>0) & (df["mobile"] == 0), "FP_Method_A"] = 1
df.loc[(df["Method A"]==0) & (df["mobile"] > 0), "FN_Method_A"] = 1

df.loc[(df["mobile"]>0) & (df["Method B"]>=1), "TP_Method_B"] = 1
df.loc[(df["mobile"]==0) & (df["Method B"]<1), "TN_Method_B"] = 1
## A false positive is a non-mobile row that was called mobile; a false negative
## is a mobile row that was not called (these two masks were previously swapped).
df.loc[(df["mobile"]==0) & (df["Method B"]>=1), "FP_Method_B"] = 1
df.loc[(df["mobile"]>0) & (df["Method B"]<1), "FN_Method_B"] = 1

In [None]:
import matplotlib.pyplot as plt
## Plot the data
## Choose the parameters of interest: can only vary one parameter

fixed_q = 0.01
fixed_N2 = 5

## Choose the correct section of the dataframe
## NOTE(review): q is matched by exact float equality - confirm that the q
## values in df are exactly the configured ones (no random_q resampling).
df_slice = df[(df["N2_func"] == fixed_N2) & (df["q"]==fixed_q)]

## Get the N values (we are varying N); sorted so the x positions are deterministic
Ns = sorted(set(df_slice["N"].to_list()))

## (column suffix, matplotlib format string, legend label) for each method
methods = [("bf", "kx", "BF"), ("Method_A", "rx", "Method A"), ("Method_B", "bx", "Method B")]
accuracy = {suffix: [] for suffix, marker, label in methods}

for N in Ns:
    ## Choose the same hom values
    this_df = df_slice[(df_slice["N"]==N) & (df_slice["Nhom1"]==N) & (df_slice["Nhom2"]==N)]
    for suffix, marker, label in methods:
        tp = this_df["TP_" + suffix].sum()
        fp = this_df["FP_" + suffix].sum()
        tn = this_df["TN_" + suffix].sum()
        fn = this_df["FN_" + suffix].sum()
        total = tp + tn + fp + fn
        ## No classified rows for this N (e.g. the homograft depth differs from N):
        ## store NaN, which matplotlib skips, instead of evaluating 0/0.
        accuracy[suffix].append((tp + tn) / total if total else float("nan"))

## One plot call per method: the legend gets exactly one entry per method and
## no point has to be drawn twice just to attach a label.
for suffix, marker, label in methods:
    plt.plot(Ns, accuracy[suffix], marker, label = label)

plt.xlabel("N")
plt.ylabel("Accuracy")
plt.legend()
#plt.savefig("N_comp_p01q.png",dpi=300)
plt.show()

In [1]:
import bf_calculator.baymobil as baymob
import plot_data as plotdata

## Build the simulated data through the baymobil module and plot it against "q".
## NOTE(review): the recorded output of this cell is an AttributeError
## (bf_calculator.baymobil had no attribute create_simulated_data), so in that
## run `df` was not reassigned and plot_data was never reached - confirm the
## function name exported by the module.
df = baymob.create_simulated_data()
plotdata.plot_data(df,"q")

AttributeError: module 'bf_calculator.baymobil' has no attribute 'create_simulated_data'

In [None]:
## Display df as this cell's output
## NOTE(review): the preceding cell's recorded run failed before assigning df,
## so this may show a frame left over from an earlier cell - confirm.
df
