In [32]:
## Import statements
import pandas as pd
import baymobil as baymob
from pandarallel import pandarallel

# Initialise pandarallel (progress bars switched off); the scoring loop further
# down calls DataFrame.parallel_apply, which needs this to have been run first.
pandarallel.initialize(progress_bar = False)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [33]:
## Define functions
## Function to calculate Bayes Factors for the Thieme data, based on a set error rate (the error rate is defined as 1 in "error")

def calculate_bf(row, error):
    """
    Sum the capped Bayes-factor values for one row of read counts.

    Parameters
    ----------
    row : object exposing ``endo`` and ``distal`` attributes
        Each attribute is either a single count or a comma-separated string
        of counts; missing entries in such a string contain "NA".
    error : int
        Forwarded to ``baymob.fasterpostN2`` as its first and third argument
        (the error rate is taken to be 1 in ``error``).

    Returns
    -------
    float, int or str
        The sum, over every usable (endo, distal) pair, of element ``[2]`` of
        the ``fasterpostN2`` result clamped to the range [-2, 2] (callers
        store this as ``log10BF``). Returns the string "NA" when a
        single-valued row cannot be converted to integers, and 0 when a
        comma-separated row contains no usable pair.
    """
    if "," in str(row.endo):
        # Pair the values position by position BEFORE discarding missing
        # entries. Filtering each list on its own would shift the remaining
        # values and pair counts that belong to different positions.
        endo_tokens = str(row.endo).split(",")
        distal_tokens = str(row.distal).split(",")
        pairs = [
            (int(endo_token), int(distal_token))
            for endo_token, distal_token in zip(endo_tokens, distal_tokens)
            # upper() so that a stringified float NaN ("nan") is also skipped
            if "NA" not in endo_token.upper() and "NA" not in distal_token.upper()
        ]
    else:
        try:
            pairs = [(int(row.endo), int(row.distal))]
        except ValueError:
            # Non-numeric single value (e.g. "NA" or NaN): nothing to score
            return "NA"

    total = 0
    for endo, distal in pairs:
        N = endo + distal
        n = distal
        bf = baymob.fasterpostN2(error, 1, error, 1, N, n, 10)
        # Cap each contribution between -2 and 2 before summing
        total += min(max(bf[2], -2), 2)

    return total

In [34]:
## Read the supplementary Excel table: https://www.nature.com/articles/nplants201525#Sec15
df = pd.read_excel(
    "../Data/thieme_original_data.xlsx",
    sheet_name="SNP in all transcripts",
    header=[0, 1],
)

# Flatten the two header rows into single "top:bottom" column names
df.columns = [':'.join(levels).strip() for levels in df.columns.values]

## Remove every column whose flattened name contains "Status."
# NOTE(review): presumably these are duplicated status columns - confirm against the sheet
extra_status_columns = df.filter(like="Status.").columns
df = df.drop(columns=extra_status_columns).copy()

  warn(msg)


In [35]:
## Data taken from: "Endogenous Arabidopsis messenger RNAs transported to distant tissues" Thieme et al. 2015
## Code to run for table 1

## Experiment columns are the ones with a "*" in their flattened name;
## the experiment label is the part before the first ":"
exps_split = [name.split(":")[0] for name in df.columns if "*" in name]

## Sorted so the experiment order (and therefore the column order here and the
## row order of everything built from it) is the same on every run; iterating
## a bare set of strings can give a different order after a kernel restart.
exps = sorted(set(exps_split))
df_thieme_list = []

## The columns we are using per experiment: PED base counts, COL base counts, status
for exp in exps:
    col = exp + ":COL"
    ped = exp + ":PED"
    mobile = exp + ":Status"

    for label in (ped, col, mobile):
        # filter() matches on substring; assigning a one-element column list
        # raises if the label did not match exactly one column.
        selected = df.filter(like=label).copy()
        selected.columns = [label]
        df_thieme_list.append(selected)

# Combine everything side by side into a single DataFrame
df_thieme = pd.concat(df_thieme_list, axis=1)
df_thieme["ID"] = df.filter(like="Transcript:")

In [37]:
def _score_experiment(counts, exp, error):
    """
    Score one experiment's rows with calculate_bf for a given error rate.

    counts : DataFrame with "endo", "distal" and "Status" columns.
    exp    : experiment label, stored in the "exp" column.
    error  : forwarded to calculate_bf (error rate of 1 in `error`).

    Returns a new DataFrame; `counts` is left untouched.
    """
    scored = counts.copy()
    scored["log10BF"] = scored.parallel_apply(calculate_bf, args=(error,), axis=1)
    scored["exp"] = exp
    scored.dropna(inplace=True)
    # calculate_bf returns the string "NA" for unusable rows. Those rows are
    # kept (dropna above does not treat the string as missing), but the column
    # is made numeric so that comparisons such as `>= 1` work in memory too.
    scored["log10BF"] = pd.to_numeric(scored["log10BF"], errors="coerce")
    return scored


output1000_list = []
output10000_list = []

for exp in exps:

    ## Work out which genotype is endogenous and which is distal
    if "Ps*" in exp or "Pr*" in exp:
        endo_col, distal_col = exp + ":PED", exp + ":COL"
    elif "Cs*" in exp or "Cr*" in exp:
        endo_col, distal_col = exp + ":COL", exp + ":PED"
    else:
        # Without endo/distal columns the scoring below cannot run, so stop
        # here with a message that names the offending experiment.
        raise ValueError(f"Cannot tell which genotype is endogenous for experiment {exp!r}")

    counts = df_thieme.filter(like=exp).rename(
        columns={exp + ":Status": "Status", endo_col: "endo", distal_col: "distal"}
    )

    output1000 = _score_experiment(counts, exp, 1000)
    output10000 = _score_experiment(counts, exp, 10000)

    ## Uncomment the below to see the results for each experiment

    #print("Experiment: ", exp)

    #print("Error = 1 in 1000")
    #print("Total Thieme find to be mobile:")
    #print(len(output1000[output1000["Status"] == "mobile"]))
    #print("Total BF find to be mobile:")
    #print(len(output1000[output1000["log10BF"] >= 1]))

    #print("Error = 1 in 10000")
    #print("Total Thieme find to be mobile:")
    #print(len(output10000[output10000["Status"] == "mobile"]))
    #print("Total BF find to be mobile:")
    #print(len(output10000[output10000["log10BF"] >= 1]))

    output1000_list.append(output1000)
    output10000_list.append(output10000)
    

In [131]:
## Stack the per-experiment results (one frame per error rate) and attach the transcript information

df_output1000 = pd.concat(output1000_list)
df_output10000 = pd.concat(output10000_list)

for combined in (df_output1000, df_output10000):
    # Assignment aligns on the row index, so each row picks up the ID of the
    # df_thieme row with the same index label.
    combined["ID"] = df_thieme["ID"]
    # Transcript = the part of the ID before the first "."
    combined["Transcript"] = combined["ID"].apply(lambda identifier: identifier.split(".", 1)[0])

In [134]:
## Write both processed tables to disk (row index included, as before)
for frame, csv_path in (
    (df_output1000, "../Data/thieme_bf_1000.csv"),
    (df_output10000, "../Data/thieme_bf_10000.csv"),
):
    frame.to_csv(csv_path)

In [None]:
## For each error rate, print the number of transcripts with log10BF >= 1 and the number reported as mobile in Thieme et al. 2015

## Re-load the saved tables
df_output1000 = pd.read_csv("../Data/thieme_bf_1000.csv")
df_output10000 = pd.read_csv("../Data/thieme_bf_10000.csv")

## 1 in 1000
# Sets are used so that each transcript is counted once
thieme_mobile = set(
    df_output1000.loc[df_output1000["Status"] == "mobile", "Transcript"].tolist()
)
bf_mobile = set(
    df_output1000.loc[df_output1000["log10BF"] >= 1, "Transcript"].tolist()
)

print(f"Thieme mobile: {len(thieme_mobile)}")
print(f"logBF>=1: {len(bf_mobile)}")

## 1 in 10000
thieme_mobile = set(
    df_output10000.loc[df_output10000["Status"] == "mobile", "Transcript"].tolist()
)
bf_mobile = set(
    df_output10000.loc[df_output10000["log10BF"] >= 1, "Transcript"].tolist()
)

print(f"Thieme mobile: {len(thieme_mobile)}")
print(f"logBF >= 1: {len(bf_mobile)}")

In [27]:
## Same process as above, but for the grapevine data
import pandas as pd

df = pd.read_excel("../Data/Vitis_vinifera_mobile_mRNA.xlsx", sheet_name="Dataset S2", skiprows=1, header=[0, 1])

# Merge the two levels of the header into a single level with a more meaningful separator
df.columns = [':'.join(col).strip() for col in df.columns.values]

## Get the columns whose header contains "V. girdiana scion-1"
# NOTE(review): the previous comment called this the V. girdiana rootstock,
# while the header says scion - confirm which is intended.
df_vgir = df.filter(like="V. girdiana scion-1").copy()

df_vgir["SNP"] = df["Gene ID:Unnamed: 0_level_1"]

## Replace the dashes with 0s
# Done value by value rather than with DataFrame.replace(..., inplace=True):
# replace() on object columns relies on silent downcasting, which pandas has
# deprecated and which produced a FutureWarning when this cell was run.
for column in df_vgir.columns:
    df_vgir[column] = df_vgir[column].map(lambda value: 0 if value == "-" else value)

df_vgir = df_vgir.rename(columns={"V. girdiana scion-1:No. source reads": "endo", "V. girdiana scion-1:No. reads": "distal"})

## Make the read counts explicitly numeric; this raises straight away if any
## non-numeric text other than "-" is present in the count columns.
for count_column in ("endo", "distal"):
    df_vgir[count_column] = pd.to_numeric(df_vgir[count_column])

df_vgir["N"] = df_vgir["endo"] + df_vgir["distal"]
df_vgir["n"] = df_vgir["distal"]



Index(['Gene ID:Unnamed: 0_level_1', 'Annotation:Unnamed: 1_level_1',
       'Mobile RNAs detected in the 8 different graft tissues as listed on the right:Unnamed: 2_level_1',
       'V. girdiana scion-1:RPKM', 'V. girdiana scion-1:No. source reads',
       'V. girdiana scion-1:No. reads', 'V. girdiana scion-1:No. unique reads',
       'V. palmata scion-2:RPKM', 'V. palmata scion-2:No. source reads',
       'V. palmata scion-2:No. reads', 'V. palmata scion-2:No. unique reads',
       'V. vinifera cv. 'Riesling' scion (pH 5.5)-3:RPKM',
       'V. vinifera cv. 'Riesling' scion (pH 5.5)-3:No. source reads',
       'V. vinifera cv. 'Riesling' scion (pH 5.5)-3:No. reads',
       'V. vinifera cv. 'Riesling' scion (pH 5.5)-3:No. unique reads',
       'V. vinifera cv. 'Riesling' scion (pH 6.5)-4:RPKM',
       'V. vinifera cv. 'Riesling' scion (pH 6.5)-4:No. source reads',
       'V. vinifera cv. 'Riesling' scion (pH 6.5)-4:No. reads',
       'V. vinifera cv. 'Riesling' scion (pH 6.5)-4:No. uni

  df_vgir.replace("-", 0, inplace=True)


In [31]:
error1 = 1000
error2 = 10000

## Run for 1 in 1000

# Both Nh columns get the error denominator, both nh columns get 1
for prior_column, prior_value in (("Nh1", error1), ("nh1", 1), ("Nh2", error1), ("nh2", 1)):
    df_vgir[prior_column] = prior_value

## Make sure N and n are integers
for count_column in ("N", "n"):
    df_vgir[count_column] = df_vgir[count_column].astype(int)

df_vgir1000 = baymob.run_bayes(df_vgir, 10)
print("Numbers of mobile found in paper:")
print(len(df_vgir1000))
print("No. consistent with error of 0.1%:")
# Rows that do NOT reach log10BF >= 1
supported_1000 = df_vgir1000[df_vgir1000["log10BF"] >= 1]
print(len(df_vgir1000) - len(supported_1000))

## Run for 1 in 10000

for prior_column, prior_value in (("Nh1", error2), ("nh1", 1), ("Nh2", error2), ("nh2", 1)):
    df_vgir[prior_column] = prior_value

## Make sure N and n are integers
for count_column in ("N", "n"):
    df_vgir[count_column] = df_vgir[count_column].astype(int)

df_vgir10000 = baymob.run_bayes(df_vgir, 10)
print("Numbers of mobile found in paper:")
print(len(df_vgir10000))
print("No. consistent with error of 0.01%:")
supported_10000 = df_vgir10000[df_vgir10000["log10BF"] >= 1]
print(len(df_vgir10000) - len(supported_10000))

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=113), Label(value='0 / 113'))), HB…

Numbers of mobile found in paper:
1130
No. consistent with error of 0.1%:
945
INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=113), Label(value='0 / 113'))), HB…

Numbers of mobile found in paper:
1130
No. consistent with error of 0.01%:
384
