In [None]:
## Reanalyse the Thieme data based on the Excel spreadsheet.
## Choose an error value

## Question: do we see any consistency of function in the "false positives" between Thieme and grapevine?

import pandas as pd

df = pd.read_excel("thieme_original_data.xlsx", sheet_name="SNP in all transcripts")

In [None]:
import baymobil as baymob

exps = [col for col in df.columns if "*" in col]
output = []
## Calculate the BF for each entry in each experiment, summing over the SNPs

def calculate_bf(row):
    bf_list = []
    ## Separate into ints
    ## Check if there are multiple values
    if "," in str(row.N):
        N_list = row.N.split(",")
        n_list = row.n.split(",")
        ## Remove the poxy "NA" values (why? why? why?)
        N_list = [int(n) for n in N_list if "NA" not in n]
        n_list = [int(n) for n in n_list if "NA" not in n]
    else:
        try:
            N_list = [int(row.N)]
            n_list = [int(row.n)]
        except:
            print("NAN")
            return "NA"
    
    for N, n in zip(N_list, n_list):
        bf = baymob.fasterpostN2(500,1,500,1,int(N) + int(n), int(n), 10)
        ## Cap the BF values
        if bf[2] > 2: bf[2] = 2
        if bf[2] <-2: bf[2] = -2
        bf_list.append(bf[2])
    return sum(bf_list)

for exp in exps:
    ind = df.columns.get_loc(exp)
    df_subset = df.iloc[:,ind:ind +2]
    df_subset["Transcript"] = df["Transcript"]
    df_subset["Mobile"] = df.iloc[:,ind+7]
    ## Get rid of NaNs (but not NAs, because we hate you.)
    df_subset.dropna(inplace=True)
    df_subset = df_subset[1:] #take the data less the header row
    ## Get the sampling location
    star_index = exp.index("*")
    sample = exp[star_index-2:star_index]
    if sample[0] == "C":
        type = "Col"
    if sample[0] == "P":
        type = "Ped"
    if sample[1] == "r":
        tissue = "root"
    if sample[1] == "s":
        tissue = "shoot"
    if type == "Col":
        df_subset.columns = ["N","n", "Transcript","Mobile"]
    if type == "Ped":
        df_subset.columns = ["n","N", "Transcript","Mobile"]
    df_subset["log10BF"] = df_subset.apply(lambda x: calculate_bf(x), axis= 1)
    output.append(df_subset)


In [None]:
output_exp = []
for df, exp in zip(output, exps):
    print(exp)
    print("Total Thieme find to be mobile:")
    print(len(df[df["Mobile"] == "mobile"]))
    print("Total BF find to be mobile:")
    print(len(df[df["log10BF"] >=1]))
    df["exp"] = exp
    output_exp.append(df)

In [None]:
df_output = pd.concat(output_exp)
df_output["Gene"] = df_output["Transcript"].apply(lambda x: x.split(".")[0])
thieme_mobile = set(df_output[df_output["Mobile"]=="mobile"]["Gene"].to_list())
bf_mobile = set(df_output[df_output["log10BF"]>=1]["Gene"].to_list())

print(len(thieme_mobile))
print(len(bf_mobile))

In [None]:
df_output[df_output["Transcript"]=="AT1G01010.1"]

In [None]:
display(df_output[(df_output["Mobile"]=="mobile")])

df_output.to_csv("thieme_1_500.csv", index=None)

In [None]:
pd.set_option("display.max_rows",100)

df_sorted = df_output.sort_values(by="log10BF", ascending = False)

display(df_sorted[(df_sorted["Mobile"] == "mobile")].head(99))
display(df_sorted[(df_sorted["Mobile"] == "mobile")].head(99)[["Transcript","exp","Total reads","Mobile reads","No. of SNPs","log10BF"]])

In [None]:
## Check the homograft

df_snp = pd.read_csv("raw_data/thieme/snp_ref.csv")

snps = df_snp[df_snp["transcripts"]=="AT3G45140"]["SNP"].to_list()

df_hom1 = pd.read_csv("raw_data/thieme/homfiles/C-C-shoot-FN.txt", delimiter="\t")
df_hom1[df_hom1["SNP"].isin(snps)]

df_hom2 = pd.read_csv("raw_data/thieme/homfiles/P-P-shoot-FN.txt", delimiter="\t")
df_hom2[df_hom2["SNP"].isin(snps)]

In [None]:
## Sum the number of endogenous : distal reads per transcript

def sumN(row):
    if "," in str(row):
        N_list = row.split(",")
        ## Remove the poxy "NA" values (why? why? why?)
        N_list = [int(n) for n in N_list if "NA" not in n]
    else:
        try:
            N_list = [int(row)]
        except:
            print("NAN")
            return "NA"
    return sum(N_list)

def countN(row):
    if "," in str(row):
        N_list = row.split(",")
        ## Remove the poxy "NA" values (why? why? why?)
        N_list = [int(n) for n in N_list if "NA" not in n]
    else:
        try:
            N_list = [int(row)]
        except:
            print("NAN")
            return "NA"
    return len(N_list)
    
df_output["sum_N"] = df_output["N"].apply(lambda x: sumN(x))
df_output["sum_n"] = df_output["n"].apply(lambda x: sumN(x))

df_output["No. of SNPs"] = df_output["N"].apply(lambda x: countN(x))
df_output

In [None]:
ans = df_output[(df_output["Mobile"] == "mobile") & ((df_output["sum_n"] / (df_output["sum_N"] + df_output["sum_n"])) < (1 / 500))]


overs = df_output[(df_output["Mobile"] == "mobile") & ((df_output["sum_n"] / (df_output["sum_N"] + df_output["sum_n"])) > (1 / 500))]

ans = ans[~ans["Gene"].isin(overs["Gene"].to_list())]
len(set(ans["Gene"].to_list()))
## Need the ones that only appear once - or all occurences have the low ratio. Why is nothing simple??/
## Select those with reads above 1: 500, and take those out of the list

In [186]:
## Load in the new snp list

df_snp = pd.read_csv("new_thieme_og_snps.csv", index_col=None)
df_snp

Unnamed: 0,Transcript,snp_list,SNP
0,AT1G50920.1,220,Chr1_18870656
1,AT1G50920.1,1351,Chr1_18871787
2,AT1G51380.1,209,Chr1_19048090
3,AT1G51380.1,987,Chr1_19049605
4,AT1G51380.1,1376,Chr1_19050086
...,...,...,...
54648,AT5G25560.4,1361,Chr5_8899341
54649,AT5G25560.4,1418,Chr5_8899398
54650,AT5G25560.4,1465,Chr5_8899445
54651,AT5G25560.4,1466,Chr5_8899446


In [6]:
## Convert df into a multi-level dataframe
import pandas as pd
df = pd.read_excel("thieme_original_data.xlsx", sheet_name="SNP in all transcripts", header=[0,1])
cols = df.columns.to_list()
exp_cols = [col for col in cols if "*" in col[0]]
## For each set up, we only want to keep COL base and PED base
## loop through the exp_cols list and keep those that contain "PED base counts" or "COL base counts"

exp_cols_cp = [col for col in cols if "COL base " in col[1] or "PED base " in col[1]]
df_exps = df[exp_cols_cp]

df_exps


  warn(msg)


Unnamed: 0_level_0,Cr*Ps r1,Cr*Ps r1,Cr*Ps r2,Cr*Ps r2,Cr*Ps r3,Cr*Ps r3,PrCs* r1,PrCs* r1,PrCs* r2,PrCs* r2,...,CrPs* -N r3,CrPs* -N r3,CrPs* -P r1,CrPs* -P r1,CrPs* -P r2,CrPs* -P r2,CrPs* -P r3,CrPs* -P r3,Pr*Cs FN_root,Pr*Cs FN_root
Unnamed: 0_level_1,COL base counts per informative SNP site (NA in cases of no read confirming either of two alleles),PED base counts per informative SNP site,COL base counts per informative SNP site,PED base counts per informative SNP site,COL base counts per informative SNP site,PED base counts per informative SNP site,COL base counts per informative SNP site,PED base counts per informative SNP site,COL base counts per informative SNP site,PED base counts per informative SNP site,...,COL base counts per informative SNP site,PED base counts per informative SNP site,COL base counts per informative SNP site,PED base counts per informative SNP site,COL base counts per informative SNP site,PED base counts per informative SNP site,COL base counts per informative SNP site,PED base counts per informative SNP site,COL base counts per informative SNP site,PED base counts per informative SNP site
0,"707, 597","0, 0","672, 643","0, 0","668, 661","0, 0","388, 405","0, 0","732, 612","0, 0",...,"0, 0","322, 203","0, 0","299, 164","0, 0","310, 192","0, 0","153, 110","4, 0","937, 1178"
1,"66, 77, 53, 89, 25","0, 0, 0, 0, 0","91, 99, 60, 97, 22","0, 0, 0, 0, 0","93, 89, 59, 98, 45","0, 0, 0, 0, 0","51, 54, 52, 58, 21","0, 0, 0, 0, 0","89, 111, 55, 102, 29","0, 0, 0, 0, 0",...,"0, 0, 0, 0, 0","37, 44, 19, 53, 6","0, 0, 0, 0, 0","52, 55, 21, 35, 8","0, 0, 0, 0, 0","44, 61, 33, 56, 15","NA, 0, 0, 0, 0","NA, 17, 14, 11, 3","0, 0, 0, 0, 0","87, 134, 135, 123, 54"
2,"24, 9, 6","0, 0, 0","30, 7, 5","0, 0, 0","35, 11, 6","0, 0, 0","13, 7, NA","0, 0, NA","17, 8, 6","0, 0, 0",...,"0, 0, NA","19, 13, NA","0, 0, NA","21, 10, NA","0, 0, NA","20, 12, NA","0, 0, NA","17, 5, NA","0, 0, 0","131, 125, 30"
3,348,0,366,0,329,0,258,0,440,0,...,0,268,0,291,0,309,0,137,0,1187
4,"49, 72, 49, 49, 45, 49, 37, 31, 28, 30, 33, 31...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","60, 91, 63, 61, 57, 61, 52, 34, 34, 41, 40, 37...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","53, 101, 76, 73, 58, 63, 52, 36, 33, 33, 30, 3...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","39, 46, 32, 32, 34, 34, 33, 23, 24, 25, 25, 26...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","86, 135, 92, 89, 72, 76, 62, 48, 42, 49, 55, 5...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",...,"0, 0, 0, 0, 0, 0, 0, NA, NA, NA, NA, NA, NA, N...","52, 50, 46, 43, 22, 19, 16, NA, NA, NA, NA, NA...","0, 0, 0, 0, 0, 0, NA, NA, NA, NA, NA, 0, 0, 0,...","30, 40, 36, 36, 18, 18, NA, NA, NA, NA, NA, 4,...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, ...","32, 41, 42, 35, 24, 17, 13, 4, 4, 4, 6, 3, NA,...","0, 0, 0, 0, NA, NA, 0, NA, NA, NA, 0, 0, 0, 0,...","15, 21, 24, 22, NA, NA, 10, NA, NA, NA, 8, 7, ...","0, 0, 0, 0, 0, 0, 0, NA, NA, NA, 0, 0, 0, 0, 0...","86, 103, 93, 84, 52, 52, 18, NA, NA, NA, 5, 3,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12062,"159, 249, 219, 395, 1154, 1382, 1386, 574, 919...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","197, 287, 240, 364, 1213, 1415, 1438, 590, 867...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","306, 449, 361, 387, 1363, 1716, 1779, 627, 981...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","63, 96, 79, 156, 493, 578, 617, 239, 364, 363,...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","138, 205, 157, 226, 772, 959, 985, 363, 618, 6...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",...,"NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","NA, 43, 31, 134, 377, 295, 294, 93, 28, 79, 12...","0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","33, 43, NA, 87, 271, 242, 243, 89, 30, 97, 144...","NA, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","NA, 61, NA, 122, 334, 266, 268, 88, 39, 124, 1...","NA, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","NA, 44, NA, 71, 232, 182, 196, 56, 24, 64, 99,...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","52, 43, 49, 237, 530, 579, 494, 649, 197, 377,..."
12063,,,6,0,5,0,3,0,6,0,...,0,3,,,,,,,,
12064,"4, NA","0, NA","6, NA","0, NA","3, NA","0, NA","141, 123","0, 0","244, 234","0, 0",...,"0, 0","567, 373","0, 0","572, 437","0, 0","443, 304","0, 0","1705, 1255","0, 3","12, 8"
12065,,,,,,,28,0,30,0,...,0,275,0,196,0,301,0,1048,,


In [7]:
## Get just the experiments
exp_list = [exp[0] for exp in exp_cols]
exp_list = list(set(exp_list))
exp_list

['Cr*Ps r1',
 'PrCs* FN_flower',
 'PrCs* r1',
 'Pr*Cs r1',
 'CrPs* -N r1',
 'CrPs* -P r1',
 'CrPs* r2',
 'PrCs* FN_stemUpper',
 'CrPs* -N r3',
 'CrPs* -P r2',
 'Cr*Ps -N r3',
 'Cr*Ps -N r1',
 'Cr*Ps r3',
 'PrCs* r2',
 'CrPs* r3',
 'CrPs* r1',
 'Cr*Ps r2',
 'Cr*Ps -N r2',
 'PrCs* r3',
 'Pr*Cs FN_root',
 'Pr*Cs r2',
 'PrCs* FN_stemLower',
 'Pr*Cs r3',
 'CrPs* -N r2',
 'Cr*Ps -P r1',
 'Cr*Ps -P r3',
 'CrPs* -P r3',
 'PrCs* FN_rosette',
 'Cr*Ps -P r2']

In [8]:
## Add in the transcript information
df_info = df[["Transcript","Informative positions"]]
## Drop level 1
df_info.columns = df_info.columns.droplevel(1)

df_list = []
for exp in exp_list:
    df_subset = df_exps[exp].copy()
    df_subset["rep"] = exp
    df_subset = pd.concat([df_info, df_subset], axis=1)
    cols = df_subset.columns.to_list()
    col_col = [col for col in cols if "COL base" in col]
    col_ped = [col for col in cols if "PED base" in col]
    df_subset.rename(columns = {col_col[0]:"COL base", col_ped[0]:"PED base"}, inplace=True)
    df_list.append(df_subset)

df_list[0]


Unnamed: 0,Transcript,Informative positions,COL base,PED base,rep
0,AT1G50920.1,"220, 1351","707, 597","0, 0",Cr*Ps r1
1,AT1G73440.1,"122, 158, 345, 421, 813","66, 77, 53, 89, 25","0, 0, 0, 0, 0",Cr*Ps r1
2,AT1G51380.1,"209, 987, 1376","24, 9, 6","0, 0, 0",Cr*Ps r1
3,AT1G10950.1,221,348,0,Cr*Ps r1
4,AT1G31870.1,"100, 246, 273, 289, 363, 368, 395, 457, 469, 4...","49, 72, 49, 49, 45, 49, 37, 31, 28, 30, 33, 31...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",Cr*Ps r1
...,...,...,...,...,...
12062,AT5G25560.4,"108, 189, 231, 417, 469, 593, 614, 715, 971, 9...","159, 249, 219, 395, 1154, 1382, 1386, 574, 919...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",Cr*Ps r1
12063,AT5G45790.2,1252,,,Cr*Ps r1
12064,ATCG00490.1,"864, 1116","4, NA","0, NA",Cr*Ps r1
12065,ATCG00680.1,222,,,Cr*Ps r1


In [9]:
## Convert the 3 middle columns to lists and explode the dataframe
def convert_to_list(row):
    snp_list = []
    ## Separate into ints
    ## Check if there are multiple values
    if "," in str(row):
        snp_list = row.split(",")
    else:
        try:
            snp_list = [int(row)]
        except:
            return ["NA"]

    return snp_list

for df in df_list:
    df["COL base"] = df["COL base"].apply(lambda x: convert_to_list(x))
    df["PED base"] = df["PED base"].apply(lambda x: convert_to_list(x))
    df["Informative positions"] = df["Informative positions"].apply(lambda x: convert_to_list(x))
    df.explode(["COL base","PED base","Informative positions"])



In [10]:
len(df_list)

29

In [11]:
df_list_new = []
for df in df_list:
    df_list_new.append(df.explode(["Informative positions","COL base","PED base"]))

In [12]:
len(df_list_new)

29

In [13]:
## Add in the SNP information
df_snp = pd.read_csv("new_thieme_og_snps.csv", index_col=None)
df_list_snps = []

for df in df_list_new:
    df["Informative positions"] = pd.to_numeric(df["Informative positions"], errors='coerce')
    new_df = pd.merge(df, df_snp,  left_on=["Transcript","Informative positions"], right_on = ["Transcript","snp_list"])
    new_df["COL base"] = pd.to_numeric(new_df["COL base"], errors='coerce')
    new_df["PED base"] = pd.to_numeric(new_df["PED base"], errors='coerce')
    ## Let's update the column names in a slightly cheating fashion
    if new_df["COL base"].sum() > new_df["PED base"].sum():
        new_df.rename(columns = {"COL base":"N","PED base":"n"}, inplace=True)
        ## Get if its root or shoot
        new_df["type"] = "Col"
        if "Cs" in new_df["rep"][0]:
            new_df["tissue"] = "shoot"
        if "Cr" in new_df["rep"][0]:
            new_df["tissue"] = "root"
    else:
        new_df.rename(columns = {"PED base":"N","COL base":"n"}, inplace=True)
        new_df["type"] = "Ped"
        if "Ps" in new_df["rep"][0]:
            new_df["tissue"] = "shoot"
        if "Pr" in new_df["rep"][0]:
            new_df["tissue"] = "root"
    df_list_snps.append(new_df)


In [14]:
df_list_snps[0]

Unnamed: 0,Transcript,Informative positions,N,n,rep,snp_list,SNP,type,tissue
0,AT1G50920.1,220,707.0,0.0,Cr*Ps r1,220,Chr1_18870656,Col,root
1,AT1G50920.1,1351,597.0,0.0,Cr*Ps r1,1351,Chr1_18871787,Col,root
2,AT1G51380.1,209,24.0,0.0,Cr*Ps r1,209,Chr1_19048090,Col,root
3,AT1G51380.1,987,9.0,0.0,Cr*Ps r1,987,Chr1_19049605,Col,root
4,AT1G51380.1,1376,6.0,0.0,Cr*Ps r1,1376,Chr1_19050086,Col,root
...,...,...,...,...,...,...,...,...,...
54648,AT5G25560.4,2225,61.0,0.0,Cr*Ps r1,2225,Chr5_8901116,Col,root
54649,AT5G25560.4,2240,62.0,0.0,Cr*Ps r1,2240,Chr5_8901131,Col,root
54650,AT5G25560.4,2249,53.0,0.0,Cr*Ps r1,2249,Chr5_8901140,Col,root
54651,AT5G25560.4,2252,49.0,0.0,Cr*Ps r1,2252,Chr5_8901143,Col,root


In [15]:
## Time to add in the homograft data! Huzzah!

ccroot = pd.read_csv("raw_data/thieme/homfiles/C-C-Root-FN.txt",delimiter = "\t", low_memory = False)
pproot = pd.read_csv("raw_data/thieme/homfiles/P-P-Root-FN.txt",delimiter = "\t", low_memory = False)
ccshoot = pd.read_csv("raw_data/thieme/homfiles/C-C-Shoot-FN.txt",delimiter = "\t", low_memory = False)
ppshoot = pd.read_csv("raw_data/thieme/homfiles/P-P-Shoot-FN.txt",delimiter = "\t", low_memory = False)

ccroot['depth'] = pd.to_numeric(ccroot['depth'], errors='coerce')
ccroot['lerDepth'] = pd.to_numeric(ccroot['lerDepth'], errors='coerce')
ccroot['rawDepth'] = pd.to_numeric(ccroot['rawDepth'], errors='coerce')

ccroot = ccroot.dropna(subset=['depth', 'lerDepth', 'rawDepth'])
ccroot = ccroot[ccroot["depth"]>0]

## Repeat all the above code for pproot
pproot['depth'] = pd.to_numeric(pproot['depth'], errors='coerce')
pproot['colDepth'] = pd.to_numeric(pproot['colDepth'], errors='coerce')
pproot['rawDepth'] = pd.to_numeric(pproot['rawDepth'], errors='coerce')

pproot = pproot.dropna(subset=['depth', 'colDepth', 'rawDepth'])
pproot = pproot[pproot["depth"]>0]

## Repeat above for ccshoot

ccshoot['depth'] = pd.to_numeric(ccshoot['depth'], errors='coerce')
ccshoot['lerDepth'] = pd.to_numeric(ccshoot['lerDepth'], errors='coerce')
ccshoot['rawDepth'] = pd.to_numeric(ccshoot['rawDepth'], errors='coerce')

ccshoot = ccshoot.dropna(subset=['depth', 'lerDepth', 'rawDepth'])
ccshoot = ccshoot[ccshoot["depth"]>0]

## Repeat above for ppshoot

ppshoot['depth'] = pd.to_numeric(ppshoot['depth'], errors='coerce')
ppshoot['colDepth'] = pd.to_numeric(ppshoot['colDepth'], errors='coerce')
ppshoot['rawDepth'] = pd.to_numeric(ppshoot['rawDepth'], errors='coerce')

ppshoot = ppshoot.dropna(subset=['depth', 'colDepth', 'rawDepth'])
ppshoot = ppshoot[ppshoot["depth"]>0]

In [16]:
## Need to sum across the replicates before adding the homograft data
df_bayes_exp = []
for df in df_list_snps:
    if "FN" not in df["rep"][0]:
        df["exp"] = df["rep"].apply(lambda x: x.split(" ")[0])
    else:
        df["exp"] = df["rep"]
    df_bayes_exp.append(df)

In [17]:
df_bayes_exp[0]

Unnamed: 0,Transcript,Informative positions,N,n,rep,snp_list,SNP,type,tissue,exp
0,AT1G50920.1,220,707.0,0.0,Cr*Ps r1,220,Chr1_18870656,Col,root,Cr*Ps
1,AT1G50920.1,1351,597.0,0.0,Cr*Ps r1,1351,Chr1_18871787,Col,root,Cr*Ps
2,AT1G51380.1,209,24.0,0.0,Cr*Ps r1,209,Chr1_19048090,Col,root,Cr*Ps
3,AT1G51380.1,987,9.0,0.0,Cr*Ps r1,987,Chr1_19049605,Col,root,Cr*Ps
4,AT1G51380.1,1376,6.0,0.0,Cr*Ps r1,1376,Chr1_19050086,Col,root,Cr*Ps
...,...,...,...,...,...,...,...,...,...,...
54648,AT5G25560.4,2225,61.0,0.0,Cr*Ps r1,2225,Chr5_8901116,Col,root,Cr*Ps
54649,AT5G25560.4,2240,62.0,0.0,Cr*Ps r1,2240,Chr5_8901131,Col,root,Cr*Ps
54650,AT5G25560.4,2249,53.0,0.0,Cr*Ps r1,2249,Chr5_8901140,Col,root,Cr*Ps
54651,AT5G25560.4,2252,49.0,0.0,Cr*Ps r1,2252,Chr5_8901143,Col,root,Cr*Ps


In [18]:
df_bayes = pd.concat(df_bayes_exp)
## Need to separate by experiment type
df_bayes["condition"] = "FN"
df_bayes.loc[df_bayes["rep"].str.contains("-N"),"condition"] = "N"
df_bayes.loc[df_bayes["rep"].str.contains("-P"),"condition"] = "P"
df_bayes



Unnamed: 0,Transcript,Informative positions,N,n,rep,snp_list,SNP,type,tissue,exp,condition
0,AT1G50920.1,220,707.0,0.0,Cr*Ps r1,220,Chr1_18870656,Col,root,Cr*Ps,FN
1,AT1G50920.1,1351,597.0,0.0,Cr*Ps r1,1351,Chr1_18871787,Col,root,Cr*Ps,FN
2,AT1G51380.1,209,24.0,0.0,Cr*Ps r1,209,Chr1_19048090,Col,root,Cr*Ps,FN
3,AT1G51380.1,987,9.0,0.0,Cr*Ps r1,987,Chr1_19049605,Col,root,Cr*Ps,FN
4,AT1G51380.1,1376,6.0,0.0,Cr*Ps r1,1376,Chr1_19050086,Col,root,Cr*Ps,FN
...,...,...,...,...,...,...,...,...,...,...,...
54648,AT5G25560.4,2225,42.0,0.0,Cr*Ps -P r2,2225,Chr5_8901116,Col,root,Cr*Ps,P
54649,AT5G25560.4,2240,41.0,0.0,Cr*Ps -P r2,2240,Chr5_8901131,Col,root,Cr*Ps,P
54650,AT5G25560.4,2249,36.0,0.0,Cr*Ps -P r2,2249,Chr5_8901140,Col,root,Cr*Ps,P
54651,AT5G25560.4,2252,38.0,0.0,Cr*Ps -P r2,2252,Chr5_8901143,Col,root,Cr*Ps,P


In [19]:

df_bayes_grouped = df_bayes.groupby(["SNP","condition","tissue","type","exp"], as_index=False).agg({"Transcript": "first", "tissue":"first","type":"first","N": "sum", "n": "sum"})
df_bayes_grouped

Unnamed: 0,SNP,condition,exp,Transcript,tissue,type,N,n
0,Chr1_1000114,FN,Cr*Ps,AT1G03910.1,root,Col,178.0,0.0
1,Chr1_1000114,FN,Pr*Cs,AT1G03910.1,root,Ped,187.0,0.0
2,Chr1_1000114,FN,Pr*Cs FN_root,AT1G03910.1,root,Ped,235.0,0.0
3,Chr1_1000114,FN,PrCs*,AT1G03910.1,shoot,Col,133.0,0.0
4,Chr1_1000114,FN,PrCs* FN_flower,AT1G03910.1,shoot,Col,223.0,0.0
...,...,...,...,...,...,...,...,...
584397,Chr5_9987837,FN,CrPs*,AT5G27950.1,shoot,Ped,25.0,0.0
584398,Chr5_9987837,N,Cr*Ps,AT5G27950.1,root,Col,43.0,0.0
584399,Chr5_9987837,N,CrPs*,AT5G27950.1,shoot,Ped,0.0,0.0
584400,Chr5_9987837,P,Cr*Ps,AT5G27950.1,root,Col,32.0,0.0


In [20]:
set(df_bayes_grouped["exp"].to_list())

{'Cr*Ps',
 'CrPs*',
 'Pr*Cs',
 'Pr*Cs FN_root',
 'PrCs*',
 'PrCs* FN_flower',
 'PrCs* FN_rosette',
 'PrCs* FN_stemLower',
 'PrCs* FN_stemUpper'}

In [21]:
## Time to add it all together!

df_bayes_croot = df_bayes_grouped[(df_bayes_grouped["tissue"]=="root") & (df_bayes_grouped["type"]=="Col")]
df_bayes_proot = df_bayes_grouped[(df_bayes_grouped["tissue"]=="root") & (df_bayes_grouped["type"]=="Ped")]
df_bayes_cshoot = df_bayes_grouped[(df_bayes_grouped["tissue"]=="shoot") & (df_bayes_grouped["type"]=="Col")]
df_bayes_pshoot = df_bayes_grouped[(df_bayes_grouped["tissue"]=="shoot") & (df_bayes_grouped["type"]=="Ped")]

hom1 = ccroot.copy()
hom2 = pproot.copy()
hom1.rename(columns = {"depth":"Nh1","lerDepth":"nh1"}, inplace=True)
hom2.rename(columns = {"depth":"Nh2","colDepth":"nh2"}, inplace=True)
df_bayes_croot = df_bayes_croot.merge(hom1[["SNP","Nh1","nh1"]], on = "SNP")
df_bayes_croot = df_bayes_croot.merge(hom2[["SNP","Nh2","nh2"]], on = "SNP")

hom1 = pproot.copy()
hom2 = ccroot.copy()
hom1.rename(columns = {"depth":"Nh1","colDepth":"nh1"}, inplace=True)
hom2.rename(columns = {"depth":"Nh2","lerDepth":"nh2"}, inplace=True)
df_bayes_proot = df_bayes_proot.merge(hom1[["SNP","Nh1","nh1"]], on = "SNP")
df_bayes_proot = df_bayes_proot.merge(hom2[["SNP","Nh2","nh2"]], on = "SNP")

hom1 = ccshoot.copy()
hom2 = ppshoot.copy()
hom1.rename(columns = {"depth":"Nh1","lerDepth":"nh1"}, inplace=True)
hom2.rename(columns = {"depth":"Nh2","colDepth":"nh2"}, inplace=True)
df_bayes_cshoot = df_bayes_cshoot.merge(hom1[["SNP","Nh1","nh1"]], on = "SNP")
df_bayes_cshoot = df_bayes_cshoot.merge(hom2[["SNP","Nh2","nh2"]], on = "SNP")

hom1 = ppshoot.copy()
hom2 = ccshoot.copy()
hom1.rename(columns = {"depth":"Nh1","colDepth":"nh1"}, inplace=True)
hom2.rename(columns = {"depth":"Nh2","lerDepth":"nh2"}, inplace=True)
df_bayes_pshoot = df_bayes_pshoot.merge(hom1[["SNP","Nh1","nh1"]], on = "SNP")
df_bayes_pshoot = df_bayes_pshoot.merge(hom2[["SNP","Nh2","nh2"]], on = "SNP")

df_bayes_final = pd.concat([df_bayes_croot, df_bayes_proot, df_bayes_cshoot, df_bayes_pshoot])
df_bayes_final

Unnamed: 0,SNP,condition,exp,Transcript,tissue,type,N,n,Nh1,nh1,Nh2,nh2
0,Chr1_1000114,FN,Cr*Ps,AT1G03910.1,root,Col,178.0,0.0,201.0,0.0,125.0,0.0
1,Chr1_1000114,N,Cr*Ps,AT1G03910.1,root,Col,117.0,0.0,201.0,0.0,125.0,0.0
2,Chr1_1000114,P,Cr*Ps,AT1G03910.1,root,Col,84.0,0.0,201.0,0.0,125.0,0.0
3,Chr1_10010338,FN,Cr*Ps,AT1G28470.1,root,Col,153.0,0.0,75.0,0.0,43.0,0.0
4,Chr1_10010338,N,Cr*Ps,AT1G28470.1,root,Col,402.0,0.0,75.0,0.0,43.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
125974,Chr5_9987395,N,CrPs*,AT5G27950.1,shoot,Ped,286.0,0.0,76.0,0.0,316.0,0.0
125975,Chr5_9987395,P,CrPs*,AT5G27950.1,shoot,Ped,205.0,0.0,76.0,0.0,316.0,0.0
125976,Chr5_9987837,FN,CrPs*,AT5G27950.1,shoot,Ped,25.0,0.0,15.0,0.0,52.0,0.0
125977,Chr5_9987837,N,CrPs*,AT5G27950.1,shoot,Ped,0.0,0.0,15.0,0.0,52.0,0.0


In [25]:
import baymobil as baymob

## WE seem to have some dodgy values where n is greater than N - maybe check those out
df_bayes_final = df_bayes_final[df_bayes_final["n"] <= df_bayes_final["N"]]
df_results = baymob.run_bayes(df_bayes_final, 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')


INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=53482), Label(value='0 / 53482')))…

In [26]:
df_results.to_csv("thieme_new_analysis.csv", index=None)

In [31]:
## Cap the log10BF in df_results between -2 and 2
df_results["log10BF"] = df_results["log10BF"].apply(lambda x: 2 if x > 2 else x)
df_results["log10BF"] = df_results["log10BF"].apply(lambda x: -2 if x < -2 else x)

df_results_grouped = df_results.groupby(["Transcript","condition","exp","type","tissue"]).sum().reset_index()
df_results_grouped[df_results_grouped["log10BF"]>=1]

  df_results_grouped = df_results.groupby(["Transcript","condition","exp","type","tissue"]).sum().reset_index()


Unnamed: 0,Transcript,condition,exp,type,tissue,N,n,Nh1,nh1,Nh2,nh2,nmax,meanN2,N2max,log10BF
2067,AT1G06680.1,FN,Pr*Cs FN_root,Ped,root,19.0,10.0,80.0,0.0,93.0,0.0,10,9.966324,10,2.000000
3258,AT1G09560.1,FN,Pr*Cs FN_root,Ped,root,29922.0,83.0,22715.0,11.0,13272.0,1.0,90,57.131826,70,4.667092
3303,AT1G09640.1,FN,Pr*Cs,Ped,root,2123.0,16.0,1569.0,1.0,2389.0,1.0,10,12.588884,15,1.481299
4080,AT1G11580.1,FN,PrCs* FN_rosette,Col,shoot,2289.0,43.0,3515.0,1.0,1785.0,2.0,60,36.237985,43,7.475608
4299,AT1G12010.1,FN,PrCs* FN_rosette,Col,shoot,53.0,8.0,405.0,0.0,3.0,0.0,30,12.093939,9,3.114719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88666,AT5G65220.1,FN,Pr*Cs FN_root,Ped,root,153.0,6.0,179.0,0.0,209.0,0.0,10,4.800331,6,1.198098
89217,AT5G66570.1,FN,Pr*Cs FN_root,Ped,root,354.0,140.0,204.0,1.0,603.0,0.0,20,137.047010,139,4.000000
89223,AT5G66570.1,N,Cr*Ps,Col,root,349.0,12.0,603.0,0.0,204.0,1.0,20,10.610598,12,3.076923
89225,AT5G66570.1,P,Cr*Ps,Col,root,171.0,15.0,603.0,0.0,204.0,1.0,20,14.589373,15,4.000000
