In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from scipy.stats import mannwhitneyu ## for those wonderful p-values!

## Nature journal settings: Arial font and a fixed ten-colour palette.
## Each palette entry is a '#'-prefixed 8-hex-digit string ending in 'FF'.
plt.rcParams.update({"font.family": "Arial"})
colors = [
    "#E64B35FF",
    "#3C5488FF",
    "#00A087FF",
    "#4DBBD5FF",
    "#F39B7FFF",
    "#8491B4FF",
    "#91D1C2FF",
    "#DC0000FF",
    "#7E6148FF",
    "#B09C85FF",
]
# `colors` is also indexed directly by the plotting cells further down.
sns.set_palette(sns.color_palette(colors))


In [2]:
# - write function to create m, mp for each pair
# - for each pair, plot: interpolation curve, percent-to-match-TL vs percent-used, efficiency gain vs percent-used
# - together, plot the efficiency gain in the nice box plot (for the 3 chosen pairs)

In [3]:
# Read the run_2d results and the raw-overlap results, then join them row by row
# on (source, target, percent-heldout, seed).
d = pd.read_csv('run_2d/analysis/results.csv')
raw_overlap = pd.read_csv('../2023-08-31/run_2d_raw_overlap/analysis/results.csv')
merge_keys = ['source', 'target', 'percent-heldout', 'seed']

# Derived columns: percent-used is the complement of percent-heldout,
# pair is "<source>_<target>".
d = d.merge(raw_overlap, on=merge_keys).assign(**{
    'percent-used': lambda frame: 100 - frame['percent-heldout'],
    'pair': lambda frame: frame['source'] + '_' + frame['target'],
})
d.head()

Unnamed: 0,source,target,percent-heldout,seed,transfer,raw,target_only,raw_overlap,percent-used,pair
0,REP,GDSC,20,0,0.81071,0.501466,0.806638,0.575767,80,REP_GDSC
1,REP,GDSC,20,1,0.818809,0.521551,0.817445,0.601614,80,REP_GDSC
2,REP,GDSC,20,2,0.812328,0.513712,0.80402,0.58236,80,REP_GDSC
3,REP,GDSC,20,3,0.827824,0.50319,0.815933,0.588492,80,REP_GDSC
4,REP,GDSC,20,4,0.829023,0.508852,0.808236,0.589836,80,REP_GDSC


In [4]:
def compute_efficiency_gain(pair_df):
    """Build the gain table ``m`` and the interpolation table ``mp`` for one pair.

    Parameters
    ----------
    pair_df : pandas.DataFrame
        Rows of ``d`` belonging to a single (source, target) pair. Must contain
        the columns 'source', 'target', 'pair', 'percent-heldout',
        'percent-used', 'seed', 'transfer' and 'target_only'.

    Returns
    -------
    m : pandas.DataFrame
        One row per input row, with the per-level mean of 'target_only'
        ('target_only_mean') plus the derived columns 'percent-to-match-TL'
        and 'efficiency-gain'.
    mp : pandas.DataFrame
        The distinct (pair, target_only_mean, percent-used) rows, sorted by
        'target_only_mean'; these are the knots used for the interpolation.
    """
    # mean target-only score at each percent-used level
    targ_mean_df = (
        pair_df.groupby('percent-used', as_index=False)['target_only']
        .mean()
        .rename(columns={'target_only': 'target_only_mean'})
    )

    # attach the level-wise mean to every row; .copy() so the column assignments
    # below act on an independent frame instead of a slice of the merge result
    m_cols = ['source', 'target', 'pair', 'percent-heldout', 'percent-used', 'seed', 'transfer', 'target_only_mean']
    m = pair_df.merge(targ_mean_df, on='percent-used', validate='many_to_one')[m_cols].copy()

    # interpolation knots; np.interp needs increasing x-coordinates, hence the sort
    mp = (
        m[['pair', 'target_only_mean', 'percent-used']]
        .drop_duplicates()
        .sort_values('target_only_mean')
    )

    # Map each transfer score onto the target-only curve to read off the
    # percent-used that target-only training needs to reach the same score.
    # np.interp holds the end values constant outside the knot range, so scores
    # beyond the best target-only mean saturate at that knot's percent-used.
    m['percent-to-match-TL'] = np.interp(m['transfer'], mp['target_only_mean'], mp['percent-used'])
    m['efficiency-gain'] = m['percent-to-match-TL'] / m['percent-used']
    return m, mp


m_list = []
mp_list = []
# Group on the real key columns rather than splitting the 'pair' string on '_',
# which breaks as soon as a dataset name itself contains an underscore.
# sort=False keeps the pairs in order of first appearance.
for (source, target), df in d.groupby(['source', 'target'], sort=False):
    m, mp = compute_efficiency_gain(df)
    m_list.append(m)
    mp_list.append(mp)

# concatenate so that you have one large m frame and one large mp frame
m_full = pd.concat(m_list, ignore_index=True)
mp_full = pd.concat(mp_list, ignore_index=True)

# every input row must survive, and every pair must contribute one knot per level
assert len(m_full) == len(d)
assert len(mp_full) == d['percent-used'].nunique() * len(mp_list)

In [5]:
# Write the combined tables next to the results file, without the index column.
m_fn = 'run_2d/analysis/m.csv'
mp_fn = 'run_2d/analysis/mp.csv'
for frame, out_path in ((m_full, m_fn), (mp_full, mp_fn)):
    frame.to_csv(out_path, index=False)

In [None]:
# ==== MAIN CODE ENDS HERE ==== #

In [None]:
# Scratch walkthrough: repeat the per-pair steps for one hard-coded pair
source = "GDSC"
target = "REP"
pair_mask = (d['source'] == source) & (d['target'] == target)
df = d.loc[pair_mask]

# mean target-only score at each percent-used level
targ_mean_df = (
    df[['percent-used', 'target_only']]
    .groupby(['percent-used'])
    .mean()
    .reset_index()
    .rename(columns={'target_only': 'target_only_mean'})
)

# m: the pair's rows with the level-wise target-only mean attached,
# trimmed to the columns listed in m_cols
m_cols = ['source', 'target', 'pair', 'percent-heldout', 'percent-used', 'seed', 'transfer', 'target_only_mean']
m = df.merge(targ_mean_df, on='percent-used', validate='many_to_one')[m_cols]

In [None]:
# Preview the first rows of the single-pair table m.
m.head()

In [None]:
# Interpolation knots: the distinct (pair, target_only_mean, percent-used) rows,
# ordered by increasing target_only_mean.
mp = (
    m[['pair', 'target_only_mean', 'percent-used']]
    .drop_duplicates()
    .sort_values('target_only_mean')
)
mp.head()

In [None]:
# Read each transfer score off the target-only curve (score -> percent-used),
# then express it relative to the percent-used actually spent.
m['percent-to-match-TL'] = np.interp(
    x=m['transfer'],
    xp=mp['target_only_mean'],
    fp=mp['percent-used'],
)
m['efficiency-gain'] = m['percent-to-match-TL'].div(m['percent-used'])

In [None]:
# Visual check of the interpolation: knots as dots, np.interp evaluated on
# 101 evenly spaced scores between the smallest and largest knot as a line.
x = mp['target_only_mean'].to_numpy()
y = mp['percent-used'].to_numpy()
x1 = np.linspace(x.min(), x.max(), 101)
y1 = np.interp(x1, x, y)
plt.plot(x, y, 'o')
plt.plot(x1, y1, '-')

In [None]:
# percent-used spent vs. percent-used that target-only training needs to match transfer
plt.scatter(m['percent-used'], m['percent-to-match-TL'], alpha=0.2)

In [None]:
# percent-used spent vs. efficiency gain (percent-to-match-TL / percent-used)
plt.scatter(m['percent-used'], m['efficiency-gain'], alpha=0.2)

In [None]:
# ===== PLOT CODE IS DONE FOR NOW ====== #

In [None]:
# Efficiency gain = percent-to-match-TL / percent-used.
# NOTE(review): this repeats the assignment already made in the earlier gains cell.
m['efficiency-gain'] = m['percent-to-match-TL'] / m['percent-used']

In [None]:
# Preview m again, now that the efficiency-gain column has been assigned.
m.head()

In [None]:
# Same percent-used vs. percent-to-match-TL scatter, drawn without transparency
plt.scatter(m['percent-used'], m['percent-to-match-TL'])

In [None]:
# Same percent-used vs. efficiency-gain scatter, drawn without transparency
plt.scatter(m['percent-used'], m['efficiency-gain'])

In [None]:
# Stand-alone np.interp demo: sample sin at 10 points on [0, 2*pi], linearly
# interpolate onto 50 points, and plot samples ('o') against the interpolant ('-x').
# Note: this cell rebinds x and y.
import matplotlib.pyplot as plt

x = np.linspace(0, 2 * np.pi, 10)
y = np.sin(x)

xvals = np.linspace(0, 2 * np.pi, 50)
yinterp = np.interp(x=xvals, xp=x, fp=y)

plt.plot(x, y, 'o')
plt.plot(xvals, yinterp, '-x')
plt.show()

In [None]:
# Display xnew.
# NOTE(review): in the visible notebook xnew is first assigned in a later cell
# (the line-plot cell), so on a fresh top-to-bottom run this cell raises NameError.
xnew

In [None]:
# Show the rows of m whose percent-used equals 5.
m.loc[m['percent-used'] == 5]

In [None]:
# JUST WORK WITH ONE DATA PAIR AT A TIME
source = "GDSC"
target = "REP"
df = d.loc[(d.source == source) & (d.target == target)]

count_expts = 'percent-used'

# Mean and standard deviation of each metric at every percent-used level.
# The *_std_df frames and raw_mean_df are only referenced by the commented-out
# errorbar/scatter variants kept below.
trans_std_df = df[[count_expts, 'transfer']].groupby([count_expts]).std().reset_index()
trans_mean_df = df[[count_expts, 'transfer']].groupby([count_expts]).mean().reset_index()

targ_std_df = df[[count_expts, 'target_only']].groupby([count_expts]).std().reset_index()
targ_mean_df = df[[count_expts, 'target_only']].groupby([count_expts]).mean().reset_index()

raw_std_df = df[[count_expts, 'raw_overlap']].groupby([count_expts]).std().reset_index()
raw_mean_df = df[[count_expts, 'raw_overlap']].groupby([count_expts]).mean().reset_index()

# Plot
plt.rcParams.update({"font.size":12}) ## Set fontsize

# No plt.clf() before creating the figure: with no current figure it would
# create an empty one that is then shown alongside the real plot.
fig, ax = plt.subplots(figsize=(7,4))

# horizontal shift applied per series so markers at the same level do not overlap
offset = 0.15

i=0
#plt.errorbar(x=(raw_mean_df[count_expts] + (i-1)*offset ), y=raw_mean_df["raw_overlap"], yerr=raw_std_df["raw_overlap"], color=colors[i], fmt='-o', label='raw_overlap', zorder=0)
#plt.scatter(x=(raw_mean_df[count_expts] + (i-1)*offset ), y=raw_mean_df["raw_overlap"], color=colors[i], zorder=2)

i=1
#plt.errorbar(x=(trans_mean_df[count_expts] + (i-1)*offset ), y=trans_mean_df["transfer"], yerr=trans_std_df["transfer"], color=colors[i], fmt='-o', label='transfer', zorder=0)
ax.scatter(x=(trans_mean_df[count_expts] + (i-1)*offset ), y=trans_mean_df["transfer"], color=colors[i], zorder=2)
# Dense grid on which both mean curves are linearly interpolated; later cells
# reuse xnew, trans_ynew and targ_ynew.
# NOTE(review): the 5..90 range is hard-coded; np.interp holds the end values
# constant wherever the grid extends past the observed percent-used levels.
xnew = np.linspace(5, 90, num=10001)
trans_ynew = np.interp(xnew, trans_mean_df[count_expts], trans_mean_df["transfer"])
ax.plot(xnew, trans_ynew, '-', color=colors[i], label='transfer', zorder=0)

i=2
#plt.errorbar(x=(targ_mean_df[count_expts] + (i-1)*offset ), y=targ_mean_df["target_only"], yerr=targ_std_df["target_only"], color=colors[i], fmt='-o', label='target_only', zorder=0)
ax.scatter(x=(targ_mean_df[count_expts] + (i-1)*offset ), y=targ_mean_df["target_only"], color=colors[i], zorder=2)
targ_ynew = np.interp(xnew, targ_mean_df[count_expts], targ_mean_df["target_only"])
ax.plot(xnew, targ_ynew, '-', color=colors[i], label='target-only', zorder=0)

ax.legend(fontsize=16)
ax.set_xticks(sorted(d[count_expts].unique()))
ax.set_xlabel(count_expts)
ax.set_ylabel("Pearson Correlation")
ax.set_title("source: " + source + ", target: " + target)
#plt.savefig("plots/line-" + source + "-" + target + ".png", bbox_inches="tight")
plt.show()


In [None]:
# Converting to percent-used is good and helpful.
# Also, before interpolating, add a (0, 0) point (0 percent-used --> 0 correlation).
# I need to take the Pearson correlation as x and percent-used as y, and interpolate from there.
# Also, bound the domain used for the subtraction so that both curves cover the same correlations.

In [None]:
# The two interpolated curves with the axes swapped:
# correlation on x, percent-used on y.
for i, curve, curve_label in ((1, trans_ynew, 'transfer'), (2, targ_ynew, 'target-only')):
    plt.plot(curve, xnew, color=colors[i], label=curve_label, zorder=0)

plt.legend(fontsize=16)
plt.yticks(sorted(d[count_expts].unique()))
plt.ylabel(count_expts)
plt.xlabel("Pearson Correlation")
plt.title("source: " + source + ", target: " + target)

In [None]:
# Difference between the two interpolated curves at each xnew position.
# NOTE(review): the x-values plotted are targ_ynew - trans_ynew, while the legend
# and x-label read 'expts saved' / "Pearson Correlation" — confirm the labels.
i = 3
curve_gap = targ_ynew - trans_ynew
plt.plot(curve_gap, xnew, color=colors[i], label='expts saved', zorder=0)

plt.legend(fontsize=16)
plt.yticks(sorted(d[count_expts].unique()))
plt.ylabel(count_expts)
plt.xlabel("Pearson Correlation")
plt.title("source: " + source + ", target: " + target)

In [None]:
# create list of x-values from .61 --> 1
# for ea. x value, find index for targ
# for ea. x value, find index for transfer
# use indices to get corresponding percentages

In [None]:
def get_indices_from_values(vals, arr):
    """Locate, for each threshold, the smallest element of ``arr`` that reaches it.

    Parameters
    ----------
    vals : iterable of float
        Thresholds, processed in the given order.
    arr : array-like
        1-D values to search.

    Returns
    -------
    list of int
        For each threshold ``v``, the index of the first occurrence of the
        smallest element with ``arr[k] >= v``. Processing stops at the first
        threshold that no element reaches, so the list may be shorter than
        ``vals``.
    """
    arr = np.asarray(arr)
    idx = []
    for v in vals:
        candidates = np.flatnonzero(arr >= v)
        if candidates.size == 0:
            # nothing reaches this threshold: stop and return what was found so far
            return idx
        # Keep exactly one position per threshold. Appending every position that
        # equals the minimum yields ragged entries when arr has repeated values
        # (e.g. flat stretches of an interpolated curve), which cannot be used
        # to index another array.
        idx.append(int(candidates[np.argmin(arr[candidates])]))
    return idx

# 40 evenly spaced correlation thresholds on [0.61, 1], looked up on each
# interpolated curve (target-only first, then transfer).
vals = np.linspace(0.61, 1, 40)
targ_idx, trans_idx = (
    get_indices_from_values(vals, curve) for curve in (targ_ynew, trans_ynew)
)

In [None]:
# Plot, against the transfer correlation, the gap between the xnew positions
# selected by targ_idx and by trans_idx, scaled by TOTAL_EXPTS / 100.
# NOTE(review): TOTAL_EXPTS is not assigned anywhere in the visible notebook —
# define it before running this cell.
# NOTE(review): trans_idx[:-1] presumably trims trans_idx to the length of
# targ_idx; confirm the two lists differ by exactly one entry for these vals.
#plt.plot(targ_ynew[targ_idx], xnew[targ_idx])
#plt.plot(trans_ynew[trans_idx], xnew[trans_idx])
plt.plot(trans_ynew[trans_idx[:-1]], (xnew[targ_idx] - xnew[trans_idx[:-1]]) * TOTAL_EXPTS / 100.0)
plt.xlabel("pearson correlation")
plt.ylabel("# experiments saved")

In [None]:
# Smallest value of targ_ynew that is strictly greater than 0.2.
np.min(targ_ynew[targ_ynew > .2])

In [None]:
# Positions where targ_ynew equals this literal exactly (float equality).
# NOTE(review): the literal looks pasted from a previous output — it will stop
# matching if the data or the xnew grid changes.
np.where(targ_ynew == 0.2002708028550556)

In [None]:
# xnew at position 116.
# NOTE(review): hard-coded index, presumably taken from the np.where lookup — verify.
xnew[116]

In [None]:
# Display trans_ynew, the transfer means interpolated onto xnew.
trans_ynew