In [1]:
## Import statements
import pandas as pd
import baymobil as baymob
from pandarallel import pandarallel
# Set up pandarallel (with progress bars switched off) before any
# DataFrame.parallel_apply call is made further down in the notebook.
pandarallel.initialize(progress_bar = False)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
## Define functions
## Function to calculate Bayes Factors for the Thieme data, based on a set error rate (the error rate is defined as 1 in "error")

def calculate_bf(row, error):
    """
    Sum the capped log10 Bayes factors over every SNP position of one row.

    Parameters
    ----------
    row : object with ``endo`` and ``distal`` attributes
        Each attribute holds either a single integer-like value or a
        comma-separated string of counts in which "NA" marks a missing entry.
    error : int
        Error rate expressed as "1 in error"; passed to
        ``baymob.fasterpostN2`` as both Nh1 and Nh2 (with nh1 = nh2 = 1).

    Returns
    -------
    number or str
        The sum of the per-position values ``bf[2]``, each clipped to the
        range [-2, 2]. Returns 0 when every position is skipped, and the
        string "NA" when a single (non comma-separated) value cannot be
        converted to an integer.
    """
    if "," in str(row.endo):
        # Pair the counts position by position BEFORE discarding missing
        # entries. Filtering each column on its own would shift the remaining
        # values whenever "NA" appears in only one of the two columns, and the
        # Bayes factor would then be computed from counts of different SNPs.
        pairs = [
            (int(endo_token), int(distal_token))
            for endo_token, distal_token in zip(row.endo.split(","), row.distal.split(","))
            if "NA" not in endo_token and "NA" not in distal_token
        ]
    else:
        try:
            pairs = [(int(row.endo), int(row.distal))]
        except ValueError:
            # Single value that is not an integer (e.g. NaN)
            return "NA"

    # Calculate the Bayes Factor for each pair of N and n
    bf_list = []
    for endo, distal in pairs:
        N = endo + distal
        n = distal
        bf = baymob.fasterpostN2(error, 1, error, 1, N, n, 10)
        # Cap the BF values between -2 and 2 so that no single position dominates the sum
        bf_list.append(min(max(bf[2], -2), 2))

    return sum(bf_list)

In [None]:
## Read the Excel table from the supplementary material (https://www.nature.com/articles/nplants201525#Sec15).
## The cell runs in ~ 14 secs.
df = pd.read_excel(
    "../Data/thieme_original_data.xlsx",
    sheet_name="SNP in all transcripts",
    header=[0, 1],
)

# Flatten the two header rows into single "upper:lower" column labels
flat_labels = [":".join(header_levels).strip() for header_levels in df.columns.values]
df.columns = flat_labels

## Remove every column whose label contains "Status."
extra_status_columns = df.filter(like="Status.").columns
df = df.drop(columns=extra_status_columns).copy()

  warn(msg)


In [4]:
## Data taken from: "Endogenous Arabidopsis messenger RNAs transported to distant tissues" Thieme et al. 2015
## Code to run for table 1 - runs in < 1 sec.

## Experiment names are the part before ":" of every column label containing an asterisk
exps_split = [label.split(":")[0] for label in df.columns if "*" in label]

## De-duplicate while keeping the spreadsheet order. A set would make the order of the
## experiments (and hence of the columns and of the saved rows) change between kernel restarts.
exps = list(dict.fromkeys(exps_split))
df_thieme_list = []

## The columns we are using per experiment: COL base counts, PED base counts, mobility status
for exp in exps:
    col = exp + ":COL"
    ped = exp + ":PED"
    mobile = exp + ":Status"

    col_base_counts = df.filter(like=col)
    ped_base_counts = df.filter(like=ped)
    mobile_status = df.filter(like=mobile)

    # Rename columns; this raises if a filter did not match exactly one column
    col_base_counts.columns = [col]
    ped_base_counts.columns = [ped]
    mobile_status.columns = [mobile]

    df_thieme_list.append(pd.concat([ped_base_counts, col_base_counts, mobile_status], axis=1))

# Convert df_thieme_list to a DataFrame
df_thieme = pd.concat(df_thieme_list, axis=1)

# Assign the transcript identifiers as a Series; squeeze leaves a DataFrame behind
# (and the assignment fails) if more than one column matches "Transcript:".
df_thieme["ID"] = df.filter(like="Transcript:").squeeze(axis=1)

display(df_thieme.head())

Unnamed: 0,PrCs* FN_stemUpper:PED,PrCs* FN_stemUpper:COL,PrCs* FN_stemUpper:Status,CrPs* -P r1:PED,CrPs* -P r1:COL,CrPs* -P r1:Status,Pr*Cs FN_root:PED,Pr*Cs FN_root:COL,Pr*Cs FN_root:Status,Cr*Ps -N r2:PED,...,Pr*Cs r2:PED,Pr*Cs r2:COL,Pr*Cs r2:Status,CrPs* -N r1:PED,CrPs* -N r1:COL,CrPs* -N r1:Status,Cr*Ps -P r2:PED,Cr*Ps -P r2:COL,Cr*Ps -P r2:Status,ID
0,"0, 0","564, 914",not mobile,"299, 164","0, 0",not mobile,"937, 1178","4, 0",mobile,"0, 0",...,"470, 597","0, 0",not mobile,"456, 276","0, 0",not mobile,"0, 0","268, 232",not mobile,AT1G50920.1
1,"0, 0, 0, 0, 0","95, 147, 114, 108, 32",not mobile,"52, 55, 21, 35, 8","0, 0, 0, 0, 0",not mobile,"87, 134, 135, 123, 54","0, 0, 0, 0, 0",not mobile,"0, 0, 0, 0, 0",...,"30, 35, 28, 79, 22","0, 0, 0, 0, 0",not mobile,"50, 70, 21, 61, 7","0, 0, 0, 0, 0",not mobile,"0, 0, 0, 0, 0","63, 73, 38, 76, 18",not mobile,AT1G73440.1
2,"0, 0, 0","40, 32, 6",not mobile,"21, 10, NA","0, 0, NA",not mobile,"131, 125, 30","0, 0, 0",not mobile,"0, 0, 0",...,"27, 35, 15","0, 0, 0",not mobile,"33, 17, NA","0, 0, NA",not mobile,"0, 0, NA","20, 8, NA",not mobile,AT1G51380.1
3,0,820,not mobile,291,0,not mobile,1187,0,not mobile,0,...,222,0,not mobile,286,0,not mobile,0,192,not mobile,AT1G10950.1
4,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","37, 131, 102, 80, 32, 31, 24, 18, 19, 20, 23, ...",not mobile,"30, 40, 36, 36, 18, 18, NA, NA, NA, NA, NA, 4,...","0, 0, 0, 0, 0, 0, NA, NA, NA, NA, NA, 0, 0, 0,...",not mobile,"86, 103, 93, 84, 52, 52, 18, NA, NA, NA, 5, 3,...","0, 0, 0, 0, 0, 0, 0, NA, NA, NA, 0, 0, 0, 0, 0...",not mobile,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",...,"25, 33, 30, 28, 21, 13, 11, 9, 9, 7, 14, 12, 1...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",not mobile,"38, 42, 45, 43, 29, 26, 22, 10, 8, 7, 11, 12, ...","0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",not mobile,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","29, 79, 64, 72, 60, 61, 49, 30, 34, 32, 26, 22...",not mobile,AT1G31870.1


In [5]:
## Run for 2 different error rates: 1 in 1000 and 1 in 10000. This cell runs in ~ 23 mins.
output1000_list = []
output10000_list = []

## Set to True to print the per-experiment mobile counts while the cell runs
show_per_experiment = False

for exp in exps:

    ## Decide from the experiment name which ecotype's counts are "endo" and which are "distal"
    if "Ps*" in exp or "Pr*" in exp:
        count_roles = {exp + ":PED": "endo", exp + ":COL": "distal"}
    elif "Cs*" in exp or "Cr*" in exp:
        count_roles = {exp + ":COL": "endo", exp + ":PED": "distal"}
    else:
        ## Without "endo"/"distal" columns calculate_bf cannot run, so stop with a clear message
        ## instead of failing later inside the parallel workers.
        raise ValueError(f"Cannot tell endo from distal counts for experiment {exp!r}")

    ## Rename the experiment-specific columns to the generic names read by calculate_bf
    exp_counts = df_thieme.filter(like=exp).rename(
        columns={exp + ":Status": "Status", **count_roles}
    )

    for error, output_list in ((1000, output1000_list), (10000, output10000_list)):
        output = exp_counts.copy()
        output["log10BF"] = output.parallel_apply(calculate_bf, args=(error,), axis=1)
        output["exp"] = exp

        ## Drops rows with a missing count or status. NOTE(review): calculate_bf returns the
        ## string "NA" for values it cannot parse, which dropna does not treat as missing.
        output.dropna(inplace=True)

        if show_per_experiment:
            print("Experiment: ", exp)
            print(f"Error = 1 in {error}")
            print("Total Thieme find to be mobile:")
            print(len(output[output["Status"] == "mobile"]))
            print("Total BF find to be mobile:")
            print(len(output[output["log10BF"] >= 1]))

        output_list.append(output)
    

In [6]:
## Stack the per-experiment results into one table per error rate and attach the transcript information

df_output1000 = pd.concat(output1000_list)
df_output10000 = pd.concat(output10000_list)

for stacked_results in (df_output1000, df_output10000):
    ## The assignment aligns df_thieme["ID"] on the row index kept by pd.concat
    stacked_results["ID"] = df_thieme["ID"]
    ## "Transcript" is the part of the ID before the first "."
    stacked_results["Transcript"] = stacked_results["ID"].apply(
        lambda full_id: full_id.partition(".")[0]
    )

display(df_output1000.head())

Unnamed: 0,distal,endo,Status,log10BF,exp,ID,Transcript
0,"0, 0","564, 914",not mobile,-4,PrCs* FN_stemUpper,AT1G50920.1,AT1G50920
1,"0, 0, 0, 0, 0","95, 147, 114, 108, 32",not mobile,-10,PrCs* FN_stemUpper,AT1G73440.1,AT1G73440
2,"0, 0, 0","40, 32, 6",not mobile,-6,PrCs* FN_stemUpper,AT1G51380.1,AT1G51380
3,0,820,not mobile,-2,PrCs* FN_stemUpper,AT1G10950.1,AT1G10950
4,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","37, 131, 102, 80, 32, 31, 24, 18, 19, 20, 23, ...",not mobile,-70,PrCs* FN_stemUpper,AT1G31870.1,AT1G31870


In [7]:
## Write both result tables to disk as CSV
for results_table, csv_path in (
    (df_output1000, "../Data/thieme_bf_1000.csv"),
    (df_output10000, "../Data/thieme_bf_10000.csv"),
):
    results_table.to_csv(csv_path)

In [8]:
## For each error rate, print the number of transcripts with a log10BF >= 1, and the number found mobile in Thieme et al. 2015

## Import the data
df_output1000 = pd.read_csv("../Data/thieme_bf_1000.csv")
df_output10000 = pd.read_csv("../Data/thieme_bf_10000.csv")

## Same summary for both error rates; each block of output is labelled with its error rate
for error, results in ((1000, df_output1000), (10000, df_output10000)):
    ## Sets of transcripts, so a transcript flagged in several experiments is counted once
    thieme_mobile = set(results.loc[results["Status"] == "mobile", "Transcript"])
    bf_mobile = set(results.loc[results["log10BF"] >= 1, "Transcript"])

    print(f"Error rate 1 in {error}")
    print(f"Thieme mobile: {len(thieme_mobile)}")
    print(f"logBF >= 1: {len(bf_mobile)}")

Thieme mobile: 2006
logBF>=1: 551
Thieme mobile: 2006
logBF >= 1: 920


In [9]:
## Same procedure as for the Thieme data, now applied to the grapevine data

df = pd.read_excel(
    "../Data/Vitis_vinifera_mobile_mRNA.xlsx",
    sheet_name="Dataset S2",
    skiprows=1,
    header=[0, 1],
)

# Flatten the two header rows into single "upper:lower" column labels
df.columns = [":".join(header_levels).strip() for header_levels in df.columns.values]

## Keep the columns whose label contains "V. girdiana scion-1"
df_vgir = df.filter(like="V. girdiana scion-1").copy()

df_vgir["SNP"] = df["Gene ID:Unnamed: 0_level_1"]

## Replace every "-" entry with 0
df_vgir.replace("-", 0, inplace=True)

## Give the two read-count columns the generic names "endo" and "distal"
vgir_column_names = {
    "V. girdiana scion-1:No. source reads": "endo",
    "V. girdiana scion-1:No. reads": "distal",
}
df_vgir.rename(columns=vgir_column_names, inplace=True)

## N is the sum of both read counts, n is the "distal" count
distal_counts = df_vgir["distal"]
df_vgir["N"] = df_vgir["endo"] + distal_counts
df_vgir["n"] = distal_counts

display(df_vgir.head())



  df_vgir.replace("-", 0, inplace=True)


Unnamed: 0,V. girdiana scion-1:RPKM,endo,distal,V. girdiana scion-1:No. unique reads,SNP,N,n
0,11.2,0,0,0,GSVIVG01000021001,0,0
1,4.9,0,0,0,GSVIVG01000034001,0,0
2,8.98,0,0,0,GSVIVG01000035001,0,0
3,21.6,0,0,0,GSVIVG01000037001,0,0
4,14.34,0,0,0,GSVIVG01000040001,0,0


In [10]:
## Run the Bayes factor calculation for error rates of 1 in 1000 and 1 in 10000 - runs in less than 10 secs.
error1 = 1000
error2 = 10000

vgir_runs = {}
for error_rate, consistent_label in (
    (error1, "No. consistent with error of 0.1%:"),
    (error2, "No. consistent with error of 0.01%:"),
):
    ## Nh1 and Nh2 are both set to the error rate, nh1 and nh2 to 1
    df_vgir["Nh1"] = error_rate
    df_vgir["nh1"] = 1
    df_vgir["Nh2"] = error_rate
    df_vgir["nh2"] = 1

    ## Convert N and n to integers
    df_vgir["N"] = df_vgir["N"].astype(int)
    df_vgir["n"] = df_vgir["n"].astype(int)

    bayes_table = baymob.run_bayes(df_vgir,10)
    vgir_runs[error_rate] = bayes_table

    ## Rows returned by run_bayes, then how many of them have log10BF below 1
    n_rows = len(bayes_table)
    n_supported = len(bayes_table[bayes_table["log10BF"]>=1])
    print("Numbers of mobile found in paper:")
    print(n_rows)
    print(consistent_label)
    print(n_rows - n_supported)

df_vgir1000 = vgir_runs[error1]
df_vgir10000 = vgir_runs[error2]

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=113), Label(value='0 / 113'))), HB…

Numbers of mobile found in paper:
1130
No. consistent with error of 0.1%:
945
INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=113), Label(value='0 / 113'))), HB…

Numbers of mobile found in paper:
1130
No. consistent with error of 0.01%:
384
