In [8]:
import pandas as pd
import difflib
import matplotlib.pyplot as plt
import os
import numpy as np

# Global matplotlib text styling for every figure drawn below:
# 7 pt font size, Helvetica font family.
plt.rcParams.update({
    'font.size': 7,
    'font.family': 'Helvetica',
})

# Names of the measured cell lines, kept as three separate groups.
cell_lines_subset = [
    "HEK293T",
    "HeLa",
    "SKNSH",
    "MCF7",
    "HUH7",
    "A549",
]
cell_lines_rest = [
    "HaCaT",
    "JEG3",
    "Tera1",
    "PC3",
]
cell_lines_additional = ["K562"]

# All measured cell lines as one new list, in the order subset -> rest -> additional.
cell_lines_measured = [*cell_lines_subset, *cell_lines_rest, *cell_lines_additional]

In [9]:
# Load the tab-separated RNA expression table (read below via its
# 'Cell line', 'Gene name' and 'nTPM' columns).
df = pd.read_csv(
    '../microrna_data/mRNA_data/rna_cellline.tsv',
    delimiter='\t',
)

In [11]:
# Distinct cell-line names that occur in the expression table.
cell_lines_data = df.loc[:, 'Cell line'].unique()

In [12]:
# Upper-case both name lists so that the fuzzy name matching that follows
# is case-insensitive.
cell_lines_data = [name.upper() for name in cell_lines_data]
cell_lines_measured = [name.upper() for name in cell_lines_measured]

In [13]:
def _closest_dataset_name(name, candidates, cutoff=0.8):
    """Return the entry of ``candidates`` most similar to ``name``.

    Uses ``difflib.get_close_matches``; returns ``None`` when no candidate
    reaches the similarity ``cutoff``. Only the single best hit is requested
    (``n=1``) because that is the only one that is used.
    """
    hits = difflib.get_close_matches(name, candidates, n=1, cutoff=cutoff)
    return hits[0] if hits else None


# measured cell-line name -> best-matching name in the expression table
# (None when nothing is similar enough); the lookup runs once per name.
matches = {measured_name: _closest_dataset_name(measured_name, cell_lines_data)
           for measured_name in cell_lines_measured}

In [14]:
# Invert `matches` for renaming purposes: expression-table name -> measured name.
# Measured names without a match carry the placeholder value None in `matches`;
# they are skipped so that None never becomes a key of the rename map.
matches_invert = {dataset_name: measured_name
                  for measured_name, dataset_name in matches.items()
                  if dataset_name is not None}

In [15]:
# Upper-case the cell-line names in the table so they are comparable with the
# upper-cased names stored in `matches`. The vectorised .str accessor passes
# missing values through as NaN instead of raising like x.upper() would.
df['Cell line'] = df['Cell line'].str.upper()

# Keep only the rows whose cell line was matched to a measured cell line.
# None placeholders (measured names without a match) are excluded from the
# lookup list so they cannot select rows with a missing cell-line name.
matched_names = [name for name in matches.values() if name is not None]
df = df[df['Cell line'].isin(matched_names)]

In [16]:
# Rename the cell lines in the 'Cell line' column according to `matches_invert`
# (expression-table name -> measured name) ...
df = df.replace(to_replace={'Cell line': matches_invert})
# ... and use the renamed column as the row index (the column itself is kept).
df.index = df.loc[:, 'Cell line']

In [17]:
def _rows_for_gene(gene_name):
    """Return an independent copy of the rows of ``df`` whose 'Gene name' is ``gene_name``.

    The explicit ``.copy()`` matters because the 'nTPM' column of some of these
    frames is overwritten later in the notebook; assigning into a bare
    boolean-mask selection makes pandas emit a SettingWithCopyWarning.
    """
    return df[df['Gene name'] == gene_name].copy()


# one expression table per gene of interest
ago1 = _rows_for_gene('AGO1')
ago2 = _rows_for_gene('AGO2')
ago3 = _rows_for_gene('AGO3')
ago4 = _rows_for_gene('AGO4')
snd1 = _rows_for_gene('SND1')
fmr1 = _rows_for_gene('FMR1')

In [18]:
# All per-gene expression tables collected in one list
# (order: AGO1, AGO2, AGO3, AGO4, SND1, FMR1).
component_dfs = [
    ago1,
    ago2,
    ago3,
    ago4,
    snd1,
    fmr1,
]

# AGO1 and AGO2 expression vs. fitted scaling factor

In [19]:
import pickle

# Load the fitted per-cell-line scale dictionary.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('../outputs/3_fitting/combined_dataset/combined_dataset_scale_dict_wo_crosstalk.pkl', 'rb') as f:
    scale_dict = pickle.load(f)

# Upper-case the keys and replace every stored value v by 10**v
# (presumably the stored values are log10 scale factors -- confirm against the fitting code).
scale_dict = {
    name.upper(): 10**stored_value
    for name, stored_value in scale_dict.items()
}
# One row per key, with a single column called 'scale'.
scale_df = pd.DataFrame.from_dict(scale_dict, orient='index', columns=['scale'])

In [20]:
# Output folder for every figure saved by this notebook. exist_ok=True makes
# the call a no-op when the folder is already there, so no separate
# existence check is needed.
plot_folder = '../plots/13_guess_scale_factor/'
os.makedirs(plot_folder, exist_ok=True)

In [None]:
# pearsonr is imported here and used by the later context-stability cells;
# in this cell Pearson's r (not r^2) is computed with np.corrcoef.
from scipy.stats import pearsonr

# plot the fitted scaling factor against AGO1 expression
plt.figure(figsize=(2.4, 2))
curr_df = ago1
# only cell lines that have both an expression value and a fitted scale
cell_lines_present = curr_df.index.intersection(scale_df.index)
expression = curr_df.loc[cell_lines_present, 'nTPM']
fitted_scale = scale_df.loc[cell_lines_present, 'scale']
plt.scatter(expression, fitted_scale)

# label the individual points; the offsets shift the text away from the marker
for txt in cell_lines_present:
    plt.annotate(txt, (curr_df.loc[txt, 'nTPM']+0.5, scale_df.loc[txt, 'scale']-0.035))

# Pearson correlation coefficient between expression and fitted scale
r = np.corrcoef(expression, fitted_scale)[0, 1]
plt.title(f'AGO1 expression, r = {r:.2f}', fontsize=8)
plt.xlabel('expression (nTPM)')
plt.ylabel('fitted scaling factor')
plt.xlim(3, 20)
plt.ylim(0.4, 2.2)
plt.tight_layout()
# save as vector and raster image; `ext` avoids shadowing the builtin `format`
for ext in ['.svg', '.png']:
    plt.savefig(plot_folder + 'AGO1_expression_vs_scale_factor' + ext, dpi=300)

In [None]:
# plot the fitted scaling factor against AGO2 expression
plt.figure(figsize=(2.4, 2))
curr_df = ago2
# only cell lines that have both an expression value and a fitted scale
cell_lines_present = curr_df.index.intersection(scale_df.index)
expression = curr_df.loc[cell_lines_present, 'nTPM']
fitted_scale = scale_df.loc[cell_lines_present, 'scale']
plt.scatter(expression, fitted_scale)

# label the individual points; the offsets shift the text away from the marker
for txt in cell_lines_present:
    plt.annotate(txt, (curr_df.loc[txt, 'nTPM']+0.5, scale_df.loc[txt, 'scale']-0.045))

# Pearson correlation coefficient between expression and fitted scale
r = np.corrcoef(expression, fitted_scale)[0, 1]
plt.title(f'AGO2 expression, r = {r:.2f}', fontsize=8)
plt.xlabel('expression (nTPM)')
plt.ylabel('fitted scaling factor')
plt.xlim(8, 28)
plt.ylim(0.4, 2.2)
plt.tight_layout()
# save as vector and raster image; `ext` avoids shadowing the builtin `format`
for ext in ['.svg', '.png']:
    plt.savefig(plot_folder + 'AGO2_expression_vs_scale_factor' + ext, dpi=300)

In [None]:
# Display the table behind the most recent plot; curr_df was last assigned in
# the AGO2 cell above (assuming the cells are run top to bottom).
curr_df

# Look at context stability

In [76]:
# Read the relative context stability table; the first CSV column becomes
# the index and all remaining values are cast to float.
relative_stability = pd.read_csv(
    "../outputs/6_context_impact/relative_context_stability.csv",
    index_col=0,
).astype(float)
# name the single data column "scale"
relative_stability.columns = ["scale"]

# Map every value x to 1 / 10**x, then upper-case the index labels
# (the keys of the fitted scale table are upper-cased as well).
context_scaling_df = 1/10**relative_stability
context_scaling_df.index = context_scaling_df.index.str.upper()

In [None]:
plt.figure(figsize=(2.4, 2))

# NOTE: `cell_lines_present` is reused from the preceding expression plot cell.
# Both tables have a single column named 'scale'; select it explicitly so that
# plain 1-D data is passed to the plotting and statistics calls.
context_scale = context_scaling_df.loc[cell_lines_present, 'scale']
fitted_scale = scale_df.loc[cell_lines_present, 'scale']

plt.scatter(context_scale, fitted_scale)
r = pearsonr(context_scale.values, fitted_scale.values)[0]
# root-mean-square deviation between the two; computed for inspection, not shown in the figure
rmsd = np.sqrt(np.mean((context_scale.values - fitted_scale.values)**2))
# identity line as a visual reference
plt.plot([0, 2], [0, 2], 'k--')

# label each data point; the x coordinate is read as a scalar
# (.loc[txt, 'scale']) rather than as a one-element Series (.loc[txt])
for txt in cell_lines_present:
    plt.annotate(txt, (context_scaling_df.loc[txt, 'scale']+0.08, scale_df.loc[txt, 'scale']-0.05))

plt.title(f'r = {r:.2f}', fontsize=8)
plt.xlabel('1/mean(context stabilities)')
plt.ylabel('fitted scaling factor')
plt.tight_layout()
# save as vector and raster image; `ext` avoids shadowing the builtin `format`
for ext in ['.svg', '.png']:
    plt.savefig(plot_folder + 'context_stability_vs_scale_factor' + ext, dpi=300)

# Combine the two to guess the scale factor

In [None]:
# Express AGO2 and AGO1 expression relative to the HEK293T row, so that
# HEK293T itself becomes 1.
for gene_df in (ago2, ago1):
    gene_df['nTPM'] = gene_df['nTPM'] / gene_df.loc['HEK293T', 'nTPM']

In [83]:
# Element-wise product of the AGO2 'nTPM' values and the context scaling
# values, both taken for the cell lines in `cell_lines_present` (same order).
rel_ago2_expression = ago2.loc[cell_lines_present, "nTPM"].values
context_scale_values = context_scaling_df.loc[cell_lines_present, "scale"].values
combined_stability = rel_ago2_expression*(context_scale_values)

In [None]:
plt.figure(figsize=(2.4, 2))

# fitted scale factors for the same cell lines (and order) as `combined_stability`
fitted_scale = scale_df.loc[cell_lines_present, 'scale']

plt.scatter(combined_stability, fitted_scale)
r = pearsonr(combined_stability, fitted_scale.values)[0]
# root-mean-square deviation between calculated and fitted values;
# computed for inspection, not shown in the figure
rmsd = np.sqrt(np.mean((combined_stability - fitted_scale.values)**2))
# identity line as a visual reference
plt.plot([0, 2], [0, 2], 'k--')

# label each data point; `combined_stability` is ordered like `cell_lines_present`
for calculated, txt in zip(combined_stability, cell_lines_present):
    plt.annotate(txt, (calculated+0.08, scale_df.loc[txt, 'scale']-0.05))

plt.title(f'scaling factor = rel. Ago2 expression / mean(context stabilities)\nr = {r:.2f}', fontsize=7)
plt.xlabel('calculated scaling factor')
plt.ylabel('fitted scaling factor')
plt.tight_layout()
# save as vector and raster image; `ext` avoids shadowing the builtin `format`
for ext in ['.svg', '.png']:
    plt.savefig(plot_folder + 'combined_stability_vs_scale_factor' + ext, dpi=300)