In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
tf.keras.backend.set_floatx('float64')
from misc import *
from collections import deque
from nets import Q1
import random

In [2]:
def greedy_action(q1, betas, ep=1):
    """Pick one candidate from ``betas`` with an epsilon-greedy rule.

    Args:
        q1: Callable value estimator. It is called with ``betas`` expanded to
            a column (``np.expand_dims(betas, axis=1)``) and must return an
            object exposing ``.numpy()`` that yields one estimate per
            candidate.
        betas: 1-D sequence/array of candidate actions.
        ep: Exploration probability. With probability ``ep`` a uniformly
            random candidate is returned; otherwise the candidate with the
            highest estimate is returned. ``ep=0`` is fully greedy (``q1`` is
            always evaluated), ``ep=1`` is fully random (``q1`` is never
            evaluated).

    Returns:
        Tuple ``(index, beta)``: the position of the chosen candidate in
        ``betas`` and the candidate itself.
    """
    if np.random.random() < ep:
        # Explore: uniform draw over candidate indices.
        l = np.random.choice(range(len(betas)), 1)[0]
        return l, betas[l]

    # Exploit. np.squeeze collapses a single-candidate output to a 0-d array,
    # which cannot be searched for a maximum position; atleast_1d restores the
    # candidate axis in that case.
    qs = np.atleast_1d(np.squeeze(q1(np.expand_dims(betas, axis=1)).numpy()))
    # argmax returns the first maximal index, i.e. ties resolve to the
    # earliest candidate.
    l = int(np.argmax(qs))
    return l, betas[l]

In [6]:
def real_training(run_id, number_betas=10, lr = 10**-2, ep=10**-2, T=1000):
    """Train a ``Q1`` network online on a Bernoulli-reward task and plot the run.

    Each episode an action ``beta`` is chosen epsilon-greedily from a fixed
    grid, a 0/1 reward is drawn with success probability ``ps(beta)``, the
    pair is stored in a replay buffer and, once more than 100 episodes have
    elapsed, the network is fitted to a sampled batch with an MSE loss.

    Args:
        run_id: Passed through to ``plot_evolution``, which uses it as the
            output directory prefix for the saved figure.
        number_betas: Sets the grid step ``1/number_betas`` for candidate
            actions on ``[-1, 0)``.
        lr: Learning rate of the Adam optimizer.
        ep: Currently unused. The exploration rate follows the fixed schedule
            ``max(exp(-episode/100), 0.1)``; the parameter is kept only for
            interface compatibility.
            NOTE(review): confirm whether ``ep`` was meant to be the floor.
        T: Number of episodes.

    Returns:
        None. The learning curves are written to disk by ``plot_evolution``.

    Notes:
        ``ps`` and ``ReplayBuffer`` are not defined in this notebook;
        presumably they come from ``from misc import *`` -- confirm.
    """
    q1 = Q1()
    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras releases reject.
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    pt = []   # success probability of the greedy action, one entry per episode
    rt = []   # raw rewards
    rts = []  # cumulative reward after each episode

    betas = np.arange(-1, 0, 1/number_betas)
    ntable = np.zeros(len(betas))  # visit count per candidate action

    optimal = max(ps(betas))
    buffer = ReplayBuffer(buffer_size=100)
    cumulative_reward = 0.0
    for episode in tqdm(range(T)):
        # Exponentially decaying exploration rate with a floor of 0.1.
        epsilon = max(np.exp(-episode/100), 0.1)
        label, beta = greedy_action(q1, betas, epsilon)
        ntable[label] += 1

        p_success = ps(beta)
        reward = np.random.choice([1., 0.], 1, p=[p_success, 1 - p_success])[0]
        rt.append(reward)
        # Running total instead of re-summing the whole history each episode.
        cumulative_reward += reward
        rts.append(cumulative_reward)
        buffer.add(beta, reward)

        if episode > 100:
            actions_did, rewards = buffer.sample(100)
            # Evaluate the current greedy policy before this episode's
            # gradient step, and outside the tape so the extra forward pass
            # is not recorded.
            pt.append(ps(greedy_action(q1, betas, 0)[1]))

            inputs = np.expand_dims(np.array(actions_did), axis=1)
            targets = np.expand_dims(np.array(rewards), axis=1)
            with tf.device("/cpu:0"):
                # Trainable variables are watched automatically by the tape.
                with tf.GradientTape() as tape:
                    predictions = q1(inputs)
                    loss_sum = tf.keras.losses.MSE(targets, predictions)
                    loss = tf.reduce_mean(loss_sum)
                # Differentiate and update after leaving the tape context so
                # these operations are not themselves recorded.
                grads = tape.gradient(loss, q1.trainable_variables)
                optimizer.apply_gradients(zip(grads, q1.trainable_variables))
        else:
            # Placeholder while the network is not being trained yet.
            pt.append(0.5)

    # Average reward per episode up to each time step.
    rtsum = np.array(rts)/np.arange(1, T+1)
    predictions = q1.prediction(betas)
    plot_evolution(rtsum, pt, optimal, betas, predictions, ntable, run_id)

In [10]:
def plot_evolution(rt, pt, optimal, betas, preds, ntable, run_id):
    """Save a three-panel summary figure of a training run.

    Panels:
        * top-left: ``rt`` against episode number, with the ``optimal``
          baseline;
        * bottom-left: ``pt`` against episode number, with the same baseline;
        * right: ``preds`` and ``ps(betas)`` scattered against ``betas``.

    Args:
        rt: Sequence of length ``T`` plotted as ``R_t``.
        pt: Sequence of length ``T`` plotted as ``P_t``.
        optimal: Scalar drawn as a horizontal reference line.
        betas: Candidate actions (x-axis of the right panel).
        preds: Model predictions, one per entry of ``betas``.
        ntable: Currently unused; kept for interface compatibility.
        run_id: Directory prefix; the figure is written to
            ``run_id + "/learning_curves.png"``.

    Returns:
        None.
    """
    fig = plt.figure(figsize=(20,10))
    T = len(rt)
    # Shared x-axis so the baseline lines up with the curves (episodes 1..T).
    episodes = np.arange(1, T+1)
    baseline = optimal*np.ones(T)

    ax1 = plt.subplot2grid((2,2),(0,0))
    ax2 = plt.subplot2grid((2,2),(1,0))
    ax3 = plt.subplot2grid((2,2),(0,1), rowspan=2)

    ax1.plot(episodes, rt, color="red", linewidth=7, alpha=0.8, label=r'$R_t$')
    ax1.plot(episodes, baseline, color="black", linewidth=7, alpha=0.5, label="optimal")
    ax2.plot(episodes, pt, color="red", linewidth=7, alpha=0.8, label=r'$P_t$')
    ax2.plot(episodes, baseline, color="black", linewidth=7, alpha=0.5, label="optimal")
    ax3.scatter(betas, preds, color="red", s=30, label="predictions", alpha=0.6)
    ax3.scatter(betas, ps(betas), color="blue", s=30, label="true values", alpha=0.6)

    ax1.set_xlabel("episode")
    ax2.set_xlabel("episode")
    ax3.set_xlabel(r'$\beta$')

    for ax in [ax1, ax2, ax3]:
        ax.legend(prop={"size":30})

    # Save and close through the figure handle rather than the implicit
    # "current figure", so no other open figure can be affected.
    fig.savefig(run_id+"/learning_curves.png")
    plt.close(fig)

In [11]:
# Launch one training run. `record` is not defined in this notebook
# (presumably provided by `from misc import *` -- confirm); its return value
# is used to build the "run_<id>" prefix under which the figure is saved.
run_id = record()
number_run = "run_%s" % (run_id,)
real_training(number_run, T=240)

100%|██████████| 240/240 [00:02<00:00, 100.19it/s]


In [9]:
# Debug aid: show the kernel's current working directory. Relative output
# paths (e.g. "run_<id>/learning_curves.png") resolve against it.
os.getcwd()

'/home/cooper-cooper/Desktop/deeper'