In [1]:
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import pandas as pd

import sys
sys.path.append('../complete_run/')
from utils.seq import DHS_COLORS, CANONICAL_ORDER, COMPONENT_CLASS_NAMES

In [2]:
def get_component_name(component):
    """Look up the class name for a zero-based component index.

    The index is shifted by one and searched for in ``CANONICAL_ORDER``;
    the position of the first match is used to index
    ``COMPONENT_CLASS_NAMES``.

    Raises ``IndexError`` when ``component + 1`` does not occur in
    ``CANONICAL_ORDER``.
    """
    canonical_label = component + 1
    match_positions = np.where(CANONICAL_ORDER == canonical_label)[0]
    first_match = match_positions[0]
    return COMPONENT_CLASS_NAMES[first_match]

In [None]:
# Overlay one curve per component: the percentage of tuned sequences with a
# FIMO hit, plotted against the tuning iteration recorded in hits.txt.
TOTAL_SEQS = 100  # divisor that turns a hit count into a percentage

plt.figure(figsize=(20, 15))

# Alternative ordering kept for reference: for comp_idx in CANONICAL_ORDER - 1:
for comp_idx in range(16):
    hits_path = f'../tuning/fimo_analysis/{comp_idx}/hits.txt'
    hits_df = pd.read_csv(hits_path, sep='\t', header=None)
    hits_df.columns = ['iter', 'hits']
    # The last six characters of each label are dropped before the int conversion.
    hits_df['iter'] = np.array([int(raw_label[:-6]) for raw_label in hits_df['iter']])
    hits_df = hits_df.sort_values('iter').reset_index(drop=True)

    pct_with_hit = (hits_df['hits'] / TOTAL_SEQS) * 100
    plt.plot(
        hits_df['iter'],
        pct_with_hit,
        linewidth=6,
        c=DHS_COLORS[comp_idx],
        label=get_component_name(comp_idx),
    )

# The legend is anchored outside the right edge of the axes.
lgd = plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=24)
plt.xticks(fontsize=24)
plt.yticks(range(0, 64, 8), fontsize=24)
plt.xlabel('Tuning iterations', fontsize=30)
plt.ylabel('Percent sequences with at least one FIMO "hit"', fontsize=30)
plt.title('Change in known motif FIMO matches during tuning', fontsize=35)

# plt.savefig('../figures/fimo_hits_during_tuning_deep_PB20200612.png', bbox_extra_artists=(lgd,), bbox_inches='tight')



In [None]:
# Two stacked panels (top: 'loss' traces, bottom: 'softmax' traces). For every
# component the per-sequence traces are read from disk, averaged across
# sequences, and drawn as one line per component. The figure is then saved.
N_COMPONENTS = 16
TOTAL_SEQS = 100   # trace files per component and label (0.txt .. 99.txt)
ITERS = 10000      # values expected in each trace file
TRACE_LABELS = ['loss', 'softmax']

fig, ax = plt.subplots(2, 1, figsize=(20, 35))
steps = np.arange(ITERS)

for component in range(N_COMPONENTS):
    history = {label: np.zeros((TOTAL_SEQS, ITERS)) for label in TRACE_LABELS}

    for i, label in enumerate(TRACE_LABELS):
        for j in range(TOTAL_SEQS):
            path = f'../tuning/optimization_analysis/{component}/{label}/{j}.txt'
            history[label][j] = np.loadtxt(path)

        # Mean over sequences at every iteration. A +/-1 std band could be added
        # with ax[i].fill_between(steps, means - stds, means + stds, ...) using
        # stds = history[label].std(axis=0).
        means = history[label].mean(axis=0)
        ax[i].plot(steps, means, c=DHS_COLORS[component], label=get_component_name(component))

# Axis decoration does not depend on the component, so it is done once per
# panel after all lines exist instead of being rebuilt on every iteration.
legends = []
for panel in ax:
    panel.set_xlabel('Tuning iterations', fontsize=24)
    leg = panel.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=24)
    legends.append(leg)

    # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
    # removed the old spelling; support both.
    handles = leg.legend_handles if hasattr(leg, 'legend_handles') else leg.legendHandles
    for legobj in handles:
        legobj.set_linewidth(4.0)

ax[0].set_title('Pre-softmax layer loss over the tuning process', fontsize=30)
ax[0].set_ylabel('Pre-softmax layer node value', fontsize=24)
ax[1].set_title('Softmax prediction probability over the tuning process', fontsize=30)
ax[1].set_ylabel('Softmax layer prediction probability', fontsize=24)

# The legends are anchored outside the axes; without a tight bounding box that
# includes them they are cut off in the saved image.
plt.savefig('../figures/optimization_analysis_preliminary_PB20200624.png',
            bbox_extra_artists=tuple(legends),
            bbox_inches='tight')

In [5]:
# Per-component convergence figures. Left axis: mean tuning loss across
# sequences with a +/-1 std band. Right axis: percentage of tuned sequences with
# a FIMO hit, plus a dashed reference level for the real training data starting
# at the first iteration where the tuned percentage exceeds it. One PNG is
# written per component.
N_COMPONENTS = 16
ITERS = 10000      # loss values expected in each trace file
TOTAL_SEQS = 100   # tuned sequences per component (one loss trace each)
# Denominator of the real-data hit rate. The previous expression reused ITERS
# for this; 10000 is presumably the number of real training sequences scanned
# -- TODO confirm against the pipeline that wrote fimo_hits_real_data.
REAL_SEQS = 10000

for component in range(N_COMPONENTS):
    component_name = get_component_name(component).replace('/', '_')
    col = DHS_COLORS[component]

    # Only the loss traces are drawn in this figure, so the softmax traces are
    # not read from disk here.
    loss_history = np.zeros((TOTAL_SEQS, ITERS))
    for j in range(TOTAL_SEQS):
        path = f'../tuning/optimization_analysis/{component}/loss/{j}.txt'
        loss_history[j] = np.loadtxt(path)

    means = loss_history.mean(axis=0)
    stds = loss_history.std(axis=0)

    hits_df = pd.read_csv(f'../tuning/fimo_analysis/{component}/hits.txt', sep='\t', header=None)
    hits_df.columns = ['iter', 'hits']
    # The last six characters of each label are dropped before the int conversion.
    hits_df['iter'] = np.array([int(x[:-6]) for x in hits_df['iter']])
    hits_df = hits_df.sort_values('iter').reset_index(drop=True)
    hit_iters = hits_df['iter'].values
    hits_pcts = (hits_df['hits'].values / TOTAL_SEQS) * 100

    real_hits_df = pd.read_csv(f'../figures/fimo_hits_real_data/train/{component}/hits.txt', sep='\t', header=None)
    real_hits = int(real_hits_df[1][0])
    real_hits_pct = (real_hits / REAL_SEQS) * 100

    fig, ax1 = plt.subplots(figsize=(20, 15))

    ax1.plot(np.arange(ITERS),
             means,
             c='Gray',
             linewidth=5,
             linestyle=(0, (1, 10)),
             label='tuning loss')

    ax1.fill_between(np.arange(ITERS),
                     means - stds,
                     means + stds,
                     color='Gray',
                     alpha=0.1)
    ax1.tick_params(axis='y', labelcolor='Gray', labelsize=26)
    ax1.tick_params(axis='x', labelsize=26)
    ax1.set_ylabel('Tuning loss', fontsize=26, color='Gray')
    ax1.set_xlabel('Tuning iterations', fontsize=26)
    ax1.set_title(f'Sequence loss and FIMO data during tuning ({component_name})', fontsize=35)

    ax2 = ax1.twinx()

    # Plot against the iteration values recorded in hits.txt rather than
    # assuming there is exactly one row per iteration 0..ITERS-1.
    ax2.plot(hit_iters,
             hits_pcts,
             linewidth=6,
             c=col,
             label='FIMO hits, tuning')

    # Compare percentages with percentages (the raw counts only coincide with
    # percentages while TOTAL_SEQS happens to be 100).
    exceeding = np.flatnonzero(hits_pcts > real_hits_pct)

    if exceeding.size:
        cross_iter = hit_iters[exceeding[0]]

        # Drawn in data coordinates so the dashed line starts exactly at the
        # marker; axhline's xmin is an axes fraction and is offset by margins.
        ax2.plot([cross_iter, hit_iters[-1]],
                 [real_hits_pct, real_hits_pct],
                 linewidth=4,
                 c=col,
                 linestyle='--',
                 label='FIMO hits, real data')
        # Marker-only format string; the colour is given once via the keyword.
        ax2.plot(cross_iter, real_hits_pct, 'o', markersize=25, color='Black')
        y_len = hits_pcts.max() - hits_pcts.min()
        ax2.text(cross_iter + 10, real_hits_pct + (y_len / 20), '(pct training seqs with FIMO hit)', fontsize=30)
    else:
        # The tuned hit rate never exceeds the real-data rate: show the
        # reference level across the whole axis and skip the crossing marker.
        ax2.axhline(real_hits_pct,
                    linewidth=4,
                    c=col,
                    linestyle='--',
                    label='FIMO hits, real data')

    ax2.tick_params(axis='y', labelcolor=col, labelsize=26)
    ax2.set_ylabel('Percentage of tuning sequences with a FIMO "hit"', fontsize=26, color=col)

    fig.legend(loc=(0.7, 0.75), fontsize=20)

    fig.savefig(f'../figures/convergence_stats_{component_name}_PB20200624.png')
    # Close this specific figure so 16 large figures do not accumulate in memory.
    plt.close(fig)

In [None]:
# Gather, for each of the 16 components, the integer hit count stored in the
# first row / second column of the real-data hits.txt file.
hits = []
for component in range(16):
    real_hits_path = f'../figures/fimo_hits_real_data/train/{component}/hits.txt'
    real_hits_df = pd.read_csv(real_hits_path, sep='\t', header=None)
    # With header=None the columns and rows are labelled 0..n-1, so position
    # (row 0, column 1) is the same cell as real_hits_df[1][0].
    real_hits = int(real_hits_df.iloc[0, 1])
    hits.append(real_hits)

In [None]:
# Convert the collected hit counts to a NumPy array so elementwise arithmetic works on them.
hits = np.array(hits)

In [None]:
# Display the hit counts as a percentage of 10000
# (presumably the number of real training sequences scanned -- TODO confirm).
(hits / 10000) * 100


In [None]:
# Display the raw per-component hit counts.
hits