# 2048 Deep Reinforcement Learning

### Import Required Libraries

In [1]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
from copy import deepcopy
import random 
import math
import matplotlib.pyplot as plt

Instructions for updating:
non-resource variables are not supported in the long term


### Game Logic

In [2]:
#initialize a new game
def new_game(n):
    matrix = np.zeros([n,n])
    return matrix

#add 2 or 4 in the matrix
def add_two(mat):
    empty_cells = []
    for i in range(len(mat)):
        for j in range(len(mat[0])):
            if(mat[i][j]==0):
                empty_cells.append((i,j))
    if(len(empty_cells)==0):
        return mat
    
    index_pair = empty_cells[random.randint(0,len(empty_cells)-1)]
    
    prob = random.random()
    if(prob>=0.9):
        mat[index_pair[0]][index_pair[1]]=4
    else:
        mat[index_pair[0]][index_pair[1]]=2
    return mat

#to check state of the game
def game_state(mat):
    #if 2048 in mat:
    #    return 'win'
    
    for i in range(len(mat)-1): #intentionally reduced to check the row on the right and below
        for j in range(len(mat[0])-1): #more elegant to use exceptions but most likely this will be their solution
            if mat[i][j]==mat[i+1][j] or mat[i][j+1]==mat[i][j]:
                return 'not over'
            
    for i in range(len(mat)): #check for any zero entries
        for j in range(len(mat[0])):
            if mat[i][j]==0:
                return 'not over'
            
    for k in range(len(mat)-1): #to check the left/right entries on the last row
        if mat[len(mat)-1][k]==mat[len(mat)-1][k+1]:
            return 'not over'
        
    for j in range(len(mat)-1): #check up/down entries on last column
        if mat[j][len(mat)-1]==mat[j+1][len(mat)-1]:
            return 'not over'
        
    return 'lose'


def reverse(mat):
    # reverse member of rows in mat
    # testmat = [[1,2,3,4],[5,6,7,8]]
    # revmat =reverse(testmat) # [[4,3,2,1],[8,7,6,5]]
    new=[]
    for i in range(len(mat)):
        new.append([])
        for j in range(len(mat[0])):
            new[i].append(mat[i][len(mat[0])-j-1])
    return new

def transpose(mat):
    # transpose
    # testmat = [[1,2,3,4],[5,6,7,8]]
    # [[1 5]
    # [2 6]
    # [3 7]
    # [4 8]]  
    return np.transpose(mat)

def cover_up(mat):
    # retval
    # new: new matrix which has all 0 in each row shifted to left
    # done: true if new matrix different from mat
    
    new = [[0,0,0,0],[0,0,0,0],[0,0,0,0],[0,0,0,0]]
    done = False
    for i in range(len(mat)):
        count = 0
        for j in range(len(mat[i])):
            if mat[i][j]!=0:
                new[i][count] = mat[i][j]
                if j!=count:
                    done=True
                count+=1
    return (new,done)

def merge(mat):
    done=False
    score = 0
    for i in range(4):
        for j in range(3):
            if mat[i][j]==mat[i][j+1] and mat[i][j]!=0:
                mat[i][j]*=2
                score += mat[i][j]   
                mat[i][j+1]=0
                done=True
    return (mat,done,score)

#up move
def up(game):
        game = transpose(game)
        game,done = cover_up(game)
        temp = merge(game)
        game = temp[0]
        done = done or temp[1]
        game = cover_up(game)[0]
        game = transpose(game)
        return (game,done,temp[2])

#down move
def down(game):
        game=reverse(transpose(game))
        game,done=cover_up(game)
        temp=merge(game)
        game=temp[0]
        done=done or temp[1]
        game=cover_up(game)[0]
        game=transpose(reverse(game))
        return (game,done,temp[2])

#left move
def left(game):
        game,done=cover_up(game)
        temp=merge(game)
        game=temp[0]
        done=done or temp[1]
        game=cover_up(game)[0]
        return (game,done,temp[2])

#right move
def right(game):
        game=reverse(game)
        game,done=cover_up(game)
        temp=merge(game)
        game=temp[0]
        done=done or temp[1]
        game=cover_up(game)[0]
        game=reverse(game)
        return (game,done,temp[2])

### Controls

In [3]:
controls = {0:up,1:left,2:right,3:down}

### Important Functions
* Find Empty Cell Function (Used in Reward)
* Convert Input Values

In [4]:
#convert the input game matrix into corresponding power of 2 matrix.
def change_values(X):
    power_mat = np.zeros(shape=(1,4,4,16),dtype=np.float32)
    for i in range(4):
        for j in range(4):
            if(X[i][j]==0):
                power_mat[0][i][j][0] = 1.0
            else:
                power = int(math.log(X[i][j],2))
                power_mat[0][i][j][power] = 1.0
    return power_mat        

#find the number of empty cells in the game matrix.
def findemptyCell(mat):
    count = 0
    for i in range(len(mat)):
        for j in range(len(mat)):
            if(mat[i][j]==0):
                count+=1
    return count

### Hyper Parameters

In [5]:
#hyper parameters
start_learning_rate = 0.0005

#gamma for Q-learning
gamma = 0.9

#epsilon greedy approach
epsilon = 0.9

#to store states and lables of the game for training
#states of the game
replay_memory = list()

#labels of the states
replay_labels = list()

#capacity of memory
mem_capacity = 6000

### Network Architecture

![](https://github.com/navjindervirdee/2048-deep-reinforcement-learning/blob/master/Architecture/Architecture.JPG?raw=true)

In [6]:
#first convolution layer depth
depth1 = 256

#second convolution layer depth
depth2 = 128

#batch size for batch gradient descent
batch_size = 512

#input units
input_units = 16
input_depth = input_units

#fully connected layer neurons
hidden_units = 256

#output neurons = number of moves
output_units = 4

conv1_layer1_shape = [1,2,input_units,depth1]
conv1_layer2_shape = [1,2,depth1,depth2]
conv2_layer1_shape = [2,1,input_units,depth1]
conv2_layer2_shape = [2,1,depth1,depth2]

fc_layer1_w_shape = [3*4*depth1*2+ 4*2*depth2*2 + 3*3*depth2*2,hidden_units]
fc_layer1_b_shape = [hidden_units]
fc_layer2_w_shape = [hidden_units,output_units]
fc_layer2_b_shape = [output_units]

### Let's make the Tensorflow Graph
* Loss = mean ( square( Q(st,at) - (r + gamma x max(Q(st+1,a))) ) )
* Activation = RELU
* Optimizer = RMSProp

In [7]:
#input data
tf_batch_dataset = tf.placeholder(tf.float32,shape=(batch_size,4,4,16))
tf_batch_labels  = tf.placeholder(tf.float32,shape=(batch_size,output_units))

single_dataset   = tf.placeholder(tf.float32,shape=(1,4,4,16))

#CONV LAYERS
#conv layer1 weights
# conv1_layer1_weights = tf.Variable(tf.truncated_normal([1,2,input_units,depth1],mean=0,stddev=0.01), name='conv1_layer1_weights')
# conv2_layer1_weights = tf.Variable(tf.truncated_normal([2,1,input_units,depth1],mean=0,stddev=0.01), name='conv2_layer1_weights')

# conv layer2 weights
# conv1_layer2_weights = tf.Variable(tf.truncated_normal([1,2,depth1,depth2],mean=0,stddev=0.01),name='conv1_layer2_weights')
# conv2_layer2_weights = tf.Variable(tf.truncated_normal([2,1,depth1,depth2],mean=0,stddev=0.01),name='conv2_layer2_weights')
 
#FUllY CONNECTED LAYERS
expand_size = 2*4*depth2*2 + 3*3*depth2*2 + 4*3*depth1*2
# fc_layer1_weights = tf.Variable(tf.truncated_normal([expand_size,hidden_units],mean=0,stddev=0.01),name='fc_layer1_weights')
# fc_layer1_biases = tf.Variable(tf.truncated_normal([1,hidden_units],mean=0,stddev=0.01),name='fc_layer1_biases')
# fc_layer2_weights = tf.Variable(tf.truncated_normal([hidden_units,output_units],mean=0,stddev=0.01),name='fc_layer2_weights')
# fc_layer2_biases = tf.Variable(tf.truncated_normal([1,output_units],mean=0,stddev=0.01),name='fc_layer2_biases')

import os
import pandas as pd
THIS_FOLDER = os.path.abspath('')
PARENT_FOLDER = os.path.dirname(THIS_FOLDER)
WEIGHT_FOLDER = os.path.join(PARENT_FOLDER, 'ver2_result')
#conv layer1 weights

conv1_layer1_weights = tf.Variable(np.array(pd.read_csv(os.path.join(WEIGHT_FOLDER, 'conv1_layer1_weights.csv'))['Weight']).reshape(conv1_layer1_shape),dtype=tf.float32, name='conv1_layer1_weights')
conv2_layer1_weights = tf.Variable(np.array(pd.read_csv(os.path.join(WEIGHT_FOLDER, 'conv2_layer1_weights.csv'))['Weight']).reshape(conv2_layer1_shape),dtype=tf.float32, name='conv2_layer1_weights')

#conv layer2 weights
conv1_layer2_weights = tf.Variable(np.array(pd.read_csv(os.path.join(WEIGHT_FOLDER, 'conv1_layer2_weights.csv'))['Weight']).reshape(conv1_layer2_shape),dtype=tf.float32, name='conv1_layer2_weights')
conv2_layer2_weights = tf.Variable(np.array(pd.read_csv(os.path.join(WEIGHT_FOLDER, 'conv2_layer2_weights.csv'))['Weight']).reshape(conv2_layer2_shape),dtype=tf.float32, name='conv2_layer2_weights')

# FUllY CONNECTED LAYERS
fc_layer1_weights = tf.Variable(np.array(pd.read_csv(os.path.join(WEIGHT_FOLDER, 'fc_layer1_weights.csv'))['Weight']).reshape(fc_layer1_w_shape), dtype=tf.float32, name='fc_layer1_weights')
fc_layer1_biases = tf.Variable(np.array(pd.read_csv(os.path.join(WEIGHT_FOLDER, 'fc_layer1_biases.csv'))['Weight']).reshape(fc_layer1_b_shape), dtype=tf.float32, name='fc_layer1_biases.csv')
fc_layer2_weights = tf.Variable(np.array(pd.read_csv(os.path.join(WEIGHT_FOLDER, 'fc_layer2_weights.csv'))['Weight']).reshape(fc_layer2_w_shape), dtype=tf.float32, name='fc_layer2_weights')
fc_layer2_biases = tf.Variable(np.array(pd.read_csv(os.path.join(WEIGHT_FOLDER, 'fc_layer2_biases.csv'))['Weight']).reshape(fc_layer2_b_shape), dtype=tf.float32, name='fc_layer2_biases.csv')

def model(dataset):
    #layer1
    conv1 = tf.nn.conv2d(dataset,conv1_layer1_weights,[1,1,1,1],padding='VALID') 
    conv2 = tf.nn.conv2d(dataset,conv2_layer1_weights,[1,1,1,1],padding='VALID') 
    
    #layer1 relu activation
    relu1 = tf.nn.relu(conv1)
    relu2 = tf.nn.relu(conv2)
    
    #layer2
    conv11 = tf.nn.conv2d(relu1,conv1_layer2_weights,[1,1,1,1],padding='VALID') 
    conv12 = tf.nn.conv2d(relu1,conv2_layer2_weights,[1,1,1,1],padding='VALID') 

    conv21 = tf.nn.conv2d(relu2,conv1_layer2_weights,[1,1,1,1],padding='VALID') 
    conv22 = tf.nn.conv2d(relu2,conv2_layer2_weights,[1,1,1,1],padding='VALID') 

    #layer2 relu activation
    relu11 = tf.nn.relu(conv11)
    relu12 = tf.nn.relu(conv12)
    relu21 = tf.nn.relu(conv21)
    relu22 = tf.nn.relu(conv22)
    
    #get shapes of all activations
    shape1 = relu1.get_shape().as_list()
    shape2 = relu2.get_shape().as_list()
    
    shape11 = relu11.get_shape().as_list()
    shape12 = relu12.get_shape().as_list()
    shape21 = relu21.get_shape().as_list()
    shape22 = relu22.get_shape().as_list()

    #expansion
    hidden1 = tf.reshape(relu1,[shape1[0],shape1[1]*shape1[2]*shape1[3]])
    hidden2 = tf.reshape(relu2,[shape2[0],shape2[1]*shape2[2]*shape2[3]])
    
    hidden11 = tf.reshape(relu11,[shape11[0],shape11[1]*shape11[2]*shape11[3]])
    hidden12 = tf.reshape(relu12,[shape12[0],shape12[1]*shape12[2]*shape12[3]])
    hidden21 = tf.reshape(relu21,[shape21[0],shape21[1]*shape21[2]*shape21[3]])
    hidden22 = tf.reshape(relu22,[shape22[0],shape22[1]*shape22[2]*shape22[3]])

    #concatenation
    hidden = tf.concat([hidden1,hidden2,hidden11,hidden12,hidden21,hidden22],axis=1)

    #full connected layers
    hidden = tf.matmul(hidden,fc_layer1_weights) + fc_layer1_biases
    hidden = tf.nn.relu(hidden)

    #output layer
    output = tf.matmul(hidden,fc_layer2_weights) + fc_layer2_biases
    
    #return output
    return output

#for single example
single_output = model(single_dataset)

#for batch data
logits = model(tf_batch_dataset)

#loss
loss = tf.square(tf.subtract(tf_batch_labels,logits))
loss = tf.reduce_sum(loss,axis=1,keep_dims=True)
loss = tf.reduce_mean(loss)/2.0

#optimizer
global_step = tf.Variable(0,name='global_step')  # count the number of steps taken.
learning_rate = tf.train.exponential_decay(float(start_learning_rate), global_step, 1000, 0.90, staircase=True)
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss, global_step=global_step)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [8]:
#loss
J = []

#scores
scores = []

#to store final parameters
final_parameters = {}
doublecheck_parameters = {}
#number of episodes
M = 4000

### Create training dataset and Train Simultaneously
* Current Reward = number of merges + log(new max,2)

In [9]:
with tf.Session() as session:
    tf.global_variables_initializer().run()
    for v in tf.trainable_variables():
        print(v.name[:-2])
        doublecheck_parameters[v.name[:-2]] = session.run(v)

conv1_layer1_weights
conv2_layer1_weights
conv1_layer2_weights
conv2_layer2_weights
fc_layer1_weights
fc_layer1_biases.csv
fc_layer2_weights
fc_layer2_biases.csv
global_step


In [10]:
print(doublecheck_parameters)

{'conv1_layer1_weights': array([[[[-0.00430916, -0.04614374, -0.00970056, ..., -0.07678066,
           0.03294034,  0.05858142],
         [ 0.02102926, -0.03341596, -0.01457464, ...,  0.03906241,
           0.00900546, -0.05467157],
         [ 0.03276553, -0.0068016 , -0.00592402, ...,  0.029501  ,
           0.01680852, -0.06207766],
         ...,
         [-0.01282583, -0.016788  , -0.00695734, ...,  0.00812065,
           0.00513653,  0.01100361],
         [ 0.01048076,  0.00077067, -0.00134941, ...,  0.00591641,
          -0.00939574, -0.01835553],
         [ 0.01184947,  0.01166494, -0.01882423, ...,  0.00098781,
          -0.00577846, -0.00492018]],

        [[ 0.03523835,  0.01165464, -0.00716036, ...,  0.01146713,
           0.01490038, -0.06049449],
         [ 0.03146661, -0.04500999, -0.00247524, ..., -0.03939667,
           0.0297108 , -0.06022599],
         [-0.01836944,  0.03224109, -0.00490449, ..., -0.01639565,
           0.01434602, -0.04852731],
         ...,
         

In [None]:
with tf.Session() as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    
    global epsilon
    global replay_labels
    global replay_memory

    #for episode with max score
    maximum = -1
    episode = -1
    
    #total_iters 
    total_iters = 1
    
    #number of back props
    back=0
    
    for ep in range(M):
        global board
        board = new_game(4)
        add_two(board)
        add_two(board)
        
        #whether episode finished or not
        finish = 'not over'
        
        #total_score of this episode
        total_score = 0
        
        #iters per episode
        local_iters = 1
        
        while(finish=='not over'):
            prev_board = deepcopy(board)
            
            #get the required move for this state
            state = deepcopy(board)
            state = change_values(state)
            state = np.array(state,dtype = np.float32).reshape(1,4,4,16)
            feed_dict = {single_dataset:state}
            control_scores = session.run(single_output,feed_dict=feed_dict)
            
            #find the move with max Q value
            control_buttons = np.flip(np.argsort(control_scores),axis=1)
            
            #copy the Q-values as labels
            labels = deepcopy(control_scores[0]) #somehow control_scores has the shape of [[a,b,c,d]]
            
            #generate random number for epsilon greedy approach
            num = random.uniform(0,1)
            
            #store prev max
            prev_max = np.max(prev_board)
            
            #num is less epsilon generate random move
            if(num<epsilon):
                #find legal moves
                legal_moves = list()
                for i in range(4):
                    temp_board = deepcopy(prev_board)
                    temp_board,_,_ = controls[i](temp_board)
                    if(np.array_equal(temp_board,prev_board)):
                        continue
                    else:
                        legal_moves.append(i)
                if(len(legal_moves)==0):
                    finish = 'lose'
                    continue
                
                #generate random move.
                random_move = random.sample(legal_moves,1)[0]
                
                #apply the move
                temp_state = deepcopy(prev_board)
                temp_state,_,score = controls[random_move](temp_state)
                total_score += score
                finish = game_state(temp_state)
                
                #get number of merges
                empty1 = findemptyCell(prev_board)
                empty2 = findemptyCell(temp_state)
                
                if(finish=='not over'):
                    temp_state = add_two(temp_state)

                board = deepcopy(temp_state)

                #get next max after applying the move
                next_max = np.max(temp_state)
                
                #reward math.log(next_max,2)*0.1 if next_max is higher than prev max
                labels[random_move] = (math.log(next_max,2))*0.1
                
                if(next_max==prev_max):
                    labels[random_move] = 0
                
                #reward is also the number of merges
                labels[random_move] += (empty2-empty1)
                
                #get the next state max Q-value
                temp_state = change_values(temp_state)
                temp_state = np.array(temp_state,dtype = np.float32).reshape(1,4,4,16)
                feed_dict = {single_dataset:temp_state}
                temp_scores = session.run(single_output,feed_dict=feed_dict)
                    
                max_qvalue = np.max(temp_scores)
                
                #final labels add gamma*max_qvalue
                labels[random_move] = (labels[random_move] + gamma*max_qvalue)
            
            #generate the the max predicted move
            else:
                for con in control_buttons[0]:
                    prev_state = deepcopy(prev_board)
                    
                    #apply the LEGAl Move with max q_value
                    temp_state,_,score = controls[con](prev_state)
                    
                    #if illegal move label = 0
                    if(np.array_equal(prev_board,temp_state)):
                        labels[con] = 0
                        continue
                        
                    #get number of merges
                    empty1 = findemptyCell(prev_board)
                    empty2 = findemptyCell(temp_state)

                    
                    temp_state = add_two(temp_state)
                    board = deepcopy(temp_state)
                    total_score += score

                    next_max = np.max(temp_state)
                    
                    #reward
                    labels[con] = (math.log(next_max,2))*0.1
                    if(next_max==prev_max):
                        labels[con] = 0
                    
                    labels[con] += (empty2-empty1)

                    #get next max qvalue
                    temp_state = change_values(temp_state)
                    temp_state = np.array(temp_state,dtype = np.float32).reshape(1,4,4,16)
                    feed_dict = {single_dataset:temp_state}
                    temp_scores = session.run(single_output,feed_dict=feed_dict)

                    max_qvalue = np.max(temp_scores)

                    #final labels
                    labels[con] = (labels[con] + gamma*max_qvalue)
                    break
                    
                if(np.array_equal(prev_board,board)):
                    finish = 'lose'
            
            #decrease the epsilon value
            if((ep>10000) or (epsilon>0.1 and total_iters%2500==0)):
                epsilon = epsilon/1.005
                
           
            #change the matrix values and store them in memory
            prev_state = deepcopy(prev_board)
            prev_state = change_values(prev_state)
            prev_state = np.array(prev_state,dtype=np.float32).reshape(1,4,4,16)
            replay_labels.append(labels)
            replay_memory.append(prev_state)
            
            
            #back-propagation
            if(len(replay_memory)>=mem_capacity):
                back_loss = 0
                batch_num = 0
                z = list(zip(replay_memory,replay_labels))
                np.random.shuffle(z)
                np.random.shuffle(z)
                replay_memory,replay_labels = zip(*z)
                
                for i in range(0,len(replay_memory),batch_size):
                    if(i + batch_size>len(replay_memory)):
                        break
                        
                    batch_data = deepcopy(replay_memory[i:i+batch_size])
                    batch_labels = deepcopy(replay_labels[i:i+batch_size])
                    
                    batch_data = np.array(batch_data,dtype=np.float32).reshape(batch_size,4,4,16)
                    batch_labels = np.array(batch_labels,dtype=np.float32).reshape(batch_size,output_units)
                
                    feed_dict = {tf_batch_dataset: batch_data, tf_batch_labels: batch_labels}
                    _,l = session.run([optimizer,loss],feed_dict=feed_dict)
                    back_loss += l 
                    
                    print("Mini-Batch - {} Back-Prop : {}, Loss : {}".format(batch_num,back,l))
                    batch_num +=1
                back_loss /= batch_num
                J.append(back_loss)
                
                #store the parameters in a dictionary
                final_parameters['conv1_layer1_weights'] = session.run(conv1_layer1_weights)
                final_parameters['conv2_layer1_weights'] = session.run(conv2_layer1_weights)
                final_parameters['fc_layer1_weights'] = session.run(fc_layer1_weights)
                final_parameters['fc_layer1_biases'] = session.run(fc_layer1_biases)
                
                for v in tf.trainable_variables():
                    doublecheck_parameters[v.name[:-2]] = session.run(v)
                    
                #number of back-props
                back+=1
                
                #make new memory 
                replay_memory = list()
                replay_labels = list()
                
            
            if(local_iters%400==0):
                print("Episode : {}, Score : {}, Iters : {}, Finish : {}".format(ep,total_score,local_iters,finish))
            
            local_iters += 1
            total_iters += 1
            
        scores.append(total_score)
        print("Episode {} finished with score {}, result : {} board : {}, epsilon  : {}, learning rate : {} ".format(ep,total_score,finish,board,epsilon,session.run(learning_rate)))
        print()
        
        if((ep+1)%1000==0):
            print("Maximum Score : {} ,Episode : {}".format(maximum,episode))    
            print("Loss : {}".format(J[len(J)-1]))
            print()
            
        if(maximum<total_score):
            maximum = total_score
            episode = ep
    print("Maximum Score : {} ,Episode : {}".format(maximum,episode))    

Initialized
Episode 0 finished with score 1424.0, result : lose board : [[  2.   4.   8.   2.]
 [ 32. 128.  64.   4.]
 [ 16.  32.  16.   8.]
 [  4.   2.   4.   2.]], epsilon  : 0.9, learning rate : 0.0005000000237487257 

Episode 1 finished with score 1480.0, result : lose board : [[8.0, 32.0, 2.0, 4], [2.0, 64.0, 8.0, 2.0], [8.0, 32.0, 128.0, 4.0], [2, 4.0, 8.0, 32.0]], epsilon  : 0.9, learning rate : 0.0005000000237487257 

Episode 2 finished with score 1632.0, result : lose board : [[  2.   8.  16.   2.]
 [ 16.  32.  64.  16.]
 [  2.  64. 128.   2.]
 [  4.   2.   8.   4.]], epsilon  : 0.9, learning rate : 0.0005000000237487257 

Episode 3 finished with score 1416.0, result : lose board : [[  2.   4.  32.   2.]
 [  4.   8.  16.   4.]
 [  8.  32.   8.  64.]
 [  2. 128.   2.   8.]], epsilon  : 0.9, learning rate : 0.0005000000237487257 

Episode 4 finished with score 616.0, result : lose board : [[ 2.  8.  4.  2.]
 [ 8.  4. 64. 16.]
 [16. 32.  8.  4.]
 [ 2.  4.  2. 16.]], epsilon  : 0.

Episode 39 finished with score 628.0, result : lose board : [[ 2.  4. 16.  2.]
 [ 8. 32.  2.  4.]
 [ 4.  8. 16. 64.]
 [ 2. 16.  2.  8.]], epsilon  : 0.8910670527957231, learning rate : 0.0005000000237487257 

Episode 40 finished with score 568.0, result : lose board : [[2.0, 32.0, 64.0, 4.0], [4.0, 16.0, 8.0, 2.0], [2.0, 8.0, 2.0, 16.0], [4.0, 2.0, 4.0, 2]], epsilon  : 0.8910670527957231, learning rate : 0.0005000000237487257 

Episode 41 finished with score 1072.0, result : lose board : [[2, 4, 16.0, 2], [8.0, 32.0, 8.0, 4.0], [16.0, 4.0, 128.0, 16.0], [4.0, 8.0, 4.0, 2]], epsilon  : 0.8910670527957231, learning rate : 0.0005000000237487257 

Episode 42 finished with score 384.0, result : lose board : [[ 2.  4.  2.  8.]
 [32.  8. 32.  2.]
 [ 4. 16.  8.  4.]
 [ 2.  8.  2. 16.]], epsilon  : 0.8910670527957231, learning rate : 0.0005000000237487257 

Episode 43 finished with score 1036.0, result : lose board : [[ 2.  8.  4.  2.]
 [ 8. 32. 64.  4.]
 [16.  8. 32.  8.]
 [ 2. 64. 16.  2.]], 

Episode 74 finished with score 1052.0, result : lose board : [[  4.   2.   8.   2.]
 [ 16.   4. 128.   8.]
 [  4.  16.  32.  16.]
 [  2.   8.   2.   4.]], epsilon  : 0.886633883378829, learning rate : 0.0005000000237487257 

Episode 75 finished with score 728.0, result : lose board : [[ 4.  2.  8.  2.]
 [ 8.  4. 32.  4.]
 [64. 32. 16.  8.]
 [ 4. 16.  8.  4.]], epsilon  : 0.886633883378829, learning rate : 0.0005000000237487257 

Episode 76 finished with score 1244.0, result : lose board : [[  2.  16.   4.   2.]
 [  4. 128.   8.   4.]
 [ 16.  64.   2.   8.]
 [  4.   8.  16.   2.]], epsilon  : 0.886633883378829, learning rate : 0.0005000000237487257 

Episode 77 finished with score 1204.0, result : lose board : [[  4.   2.  16.   2.]
 [  8.  64.   2.   8.]
 [  4.   8. 128.   4.]
 [  2.  16.   8.   2.]], epsilon  : 0.886633883378829, learning rate : 0.0005000000237487257 

Episode 78 finished with score 1052.0, result : lose board : [[  4.   2.   8.   2.]
 [  8.   4.  32.   4.]
 [ 16.   8

Episode 109 finished with score 668.0, result : lose board : [[8.0, 2.0, 4.0, 2], [4.0, 16.0, 32.0, 8.0], [32.0, 64.0, 4.0, 2.0], [2, 4, 8.0, 4.0]], epsilon  : 0.8778336015235555, learning rate : 0.0005000000237487257 

Episode 110 finished with score 2812.0, result : lose board : [[  2.   4.  16.   8.]
 [  4.  32. 128.  16.]
 [  8. 256.   8.   4.]
 [  2.   4.  16.   2.]], epsilon  : 0.8778336015235555, learning rate : 0.0005000000237487257 

Episode 111 finished with score 904.0, result : lose board : [[  4.   8.   4.   8.]
 [  2.   4.  16.   4.]
 [  8. 128.   8.   2.]
 [  2.  16.   2.   4.]], epsilon  : 0.8778336015235555, learning rate : 0.0005000000237487257 

Episode 112 finished with score 1512.0, result : lose board : [[4, 2.0, 4.0, 16.0], [2.0, 64.0, 128.0, 2.0], [8.0, 4.0, 64.0, 4.0], [4.0, 8.0, 4.0, 16.0]], epsilon  : 0.8778336015235555, learning rate : 0.0005000000237487257 

Episode 113 finished with score 1176.0, result : lose board : [[8.0, 2.0, 16.0, 2.0], [16.0, 32.0, 2

Episode 144 finished with score 1796.0, result : lose board : [[  2.   4.  16.   2.]
 [  8.  32.   8.   4.]
 [128.   8. 128.  16.]
 [  4.   2.  16.   4.]], epsilon  : 0.8691206668384998, learning rate : 0.0005000000237487257 

Episode 145 finished with score 1352.0, result : lose board : [[2.0, 16.0, 4.0, 2.0], [8.0, 128.0, 8.0, 4.0], [4.0, 64.0, 32.0, 16.0], [2.0, 16.0, 4.0, 2]], epsilon  : 0.8691206668384998, learning rate : 0.0005000000237487257 

Episode 146 finished with score 500.0, result : lose board : [[ 2.  4.  8.  4.]
 [16.  2.  4. 16.]
 [ 2. 16. 64.  4.]
 [ 8.  4.  8.  2.]], epsilon  : 0.8691206668384998, learning rate : 0.0005000000237487257 

Episode 147 finished with score 2100.0, result : lose board : [[2.0, 16.0, 2.0, 8.0], [8.0, 32.0, 4.0, 2.0], [2.0, 4.0, 256.0, 8.0], [4, 32.0, 4.0, 2]], epsilon  : 0.8691206668384998, learning rate : 0.0005000000237487257 

Episode 148 finished with score 2420.0, result : lose board : [[2.0, 256.0, 2.0, 4.0], [4.0, 64.0, 8.0, 2.0], [

Episode 181 finished with score 1544.0, result : lose board : [[  2.   4.   2.   8.]
 [  4.   8.  32.   4.]
 [  8.  64.   4.  64.]
 [  2.   4. 128.   4.]], epsilon  : 0.860494212359595, learning rate : 0.0005000000237487257 

Episode 182 finished with score 1104.0, result : lose board : [[  2.   8.   4.   2.]
 [ 32.  16. 128.   4.]
 [  2.   8.  16.  32.]
 [  4.   2.   4.   2.]], epsilon  : 0.860494212359595, learning rate : 0.0005000000237487257 

Episode 183 finished with score 1652.0, result : lose board : [[  2.   4.  64.   4.]
 [  8.  16.   4.   8.]
 [ 16. 128.  32.   4.]
 [  4.   8.  64.   2.]], epsilon  : 0.860494212359595, learning rate : 0.0005000000237487257 

Episode 184 finished with score 1336.0, result : lose board : [[2.0, 16.0, 8.0, 2], [4.0, 2.0, 16.0, 64.0], [2.0, 32.0, 128.0, 4.0], [4.0, 16.0, 4.0, 2.0]], epsilon  : 0.860494212359595, learning rate : 0.0005000000237487257 

Episode 185 finished with score 456.0, result : lose board : [[4.0, 2.0, 64.0, 2.0], [8.0, 4.0,

Episode 216 finished with score 1440.0, result : lose board : [[  2. 128.   2.   4.]
 [ 16.  32.  16.   2.]
 [  8.  64.   4.   8.]
 [ 32.   2.   8.   4.]], epsilon  : 0.8562131466264628, learning rate : 0.0005000000237487257 

Episode 217 finished with score 1744.0, result : lose board : [[  8.   2.   4.   2.]
 [  2.  16. 128.   8.]
 [  8.  32.  16.   4.]
 [  2.   4. 128.   2.]], epsilon  : 0.8519533797278237, learning rate : 0.0005000000237487257 

Episode 218 finished with score 1404.0, result : lose board : [[2.0, 16.0, 4.0, 2.0], [32.0, 4.0, 128.0, 16.0], [8.0, 64.0, 8.0, 4.0], [2, 8.0, 32.0, 2]], epsilon  : 0.8519533797278237, learning rate : 0.0005000000237487257 

Episode 219 finished with score 2204.0, result : lose board : [[4.0, 8.0, 256.0, 4.0], [16.0, 64.0, 2.0, 8.0], [8.0, 16.0, 4.0, 2.0], [2, 4.0, 8.0, 4.0]], epsilon  : 0.8519533797278237, learning rate : 0.0005000000237487257 

Episode 220 finished with score 2040.0, result : lose board : [[  4.   2.   8.   2.]
 [ 16. 25

Episode 250 finished with score 1344.0, result : lose board : [[  2.   4.  16.   2.]
 [ 16. 128.  32.   8.]
 [ 64.   2.   8.   4.]
 [  4.  16.   4.   2.]], epsilon  : 0.8477148056993271, learning rate : 0.0005000000237487257 

Episode 251 finished with score 708.0, result : lose board : [[ 2.  4. 32.  2.]
 [ 4. 16.  8.  4.]
 [ 8.  4. 64.  8.]
 [ 4. 32. 16.  4.]], epsilon  : 0.8477148056993271, learning rate : 0.0005000000237487257 

Episode 252 finished with score 2224.0, result : lose board : [[  4.   2.   8.   4.]
 [  8.  16. 256.   8.]
 [  4.  64.   8.   4.]
 [  2.  16.   4.   2.]], epsilon  : 0.8477148056993271, learning rate : 0.0005000000237487257 

Episode 253 finished with score 708.0, result : lose board : [[4.0, 8.0, 4.0, 2], [2.0, 32.0, 8.0, 16.0], [16.0, 64.0, 32.0, 4.0], [4.0, 8.0, 4.0, 2.0]], epsilon  : 0.8477148056993271, learning rate : 0.0005000000237487257 

Episode 254 finished with score 816.0, result : lose board : [[2.0, 32.0, 2.0, 4.0], [4.0, 64.0, 8.0, 2], [2, 8

Episode 285 finished with score 656.0, result : lose board : [[ 2. 32.  2.  4.]
 [ 4.  8. 16.  2.]
 [ 8. 64.  4.  8.]
 [ 2.  4. 32.  4.]], epsilon  : 0.8393008150286649, learning rate : 0.0005000000237487257 

Episode 286 finished with score 436.0, result : lose board : [[2, 4.0, 8.0, 2.0], [4.0, 16.0, 32.0, 8.0], [2.0, 32.0, 4.0, 16.0], [8.0, 16.0, 8.0, 2]], epsilon  : 0.8393008150286649, learning rate : 0.0005000000237487257 

Episode 287 finished with score 692.0, result : lose board : [[ 2. 32.  8.  4.]
 [16. 64. 32.  2.]
 [ 4. 16.  8.  4.]
 [ 2.  8.  4.  2.]], epsilon  : 0.8393008150286649, learning rate : 0.0005000000237487257 

Episode 288 finished with score 2140.0, result : lose board : [[2.0, 32.0, 2.0, 4.0], [4.0, 128.0, 8.0, 16.0], [2, 8.0, 64.0, 4.0], [8.0, 2.0, 32.0, 128.0]], epsilon  : 0.8393008150286649, learning rate : 0.0005000000237487257 

Episode 289 finished with score 692.0, result : lose board : [[ 2.  8.  4. 16.]
 [ 4. 32. 64.  2.]
 [ 8.  4. 16.  4.]
 [ 4. 32. 

Mini-Batch - 0 Back-Prop : 6, Loss : 0.7097104787826538
Mini-Batch - 1 Back-Prop : 6, Loss : 0.7443735003471375
Mini-Batch - 2 Back-Prop : 6, Loss : 0.5701957941055298
Mini-Batch - 3 Back-Prop : 6, Loss : 0.5143861174583435
Mini-Batch - 4 Back-Prop : 6, Loss : 0.7941772937774658
Mini-Batch - 5 Back-Prop : 6, Loss : 0.5406284332275391
Mini-Batch - 6 Back-Prop : 6, Loss : 0.6459806561470032
Mini-Batch - 7 Back-Prop : 6, Loss : 0.7071697115898132
Mini-Batch - 8 Back-Prop : 6, Loss : 0.9081341028213501
Mini-Batch - 9 Back-Prop : 6, Loss : 0.721844494342804
Mini-Batch - 10 Back-Prop : 6, Loss : 0.5711809992790222
Episode 322 finished with score 2292.0, result : lose board : [[32.0, 2.0, 8.0, 4.0], [256.0, 64.0, 2.0, 8.0], [2.0, 4.0, 16.0, 4], [4, 8.0, 4.0, 2]], epsilon  : 0.8309703373962675, learning rate : 0.0005000000237487257 

Episode 323 finished with score 1252.0, result : lose board : [[4.0, 2.0, 8.0, 4.0], [16.0, 8.0, 64.0, 8.0], [4.0, 16.0, 128.0, 16.0], [2, 4, 8.0, 4]], epsilon  :

Episode 357 finished with score 1156.0, result : lose board : [[4.0, 2.0, 8.0, 4], [8.0, 4.0, 32.0, 2.0], [16.0, 32.0, 128.0, 16.0], [2.0, 8.0, 16.0, 4.0]], epsilon  : 0.822722543893733, learning rate : 0.0005000000237487257 

Episode 358 finished with score 944.0, result : lose board : [[4, 16.0, 4, 2], [8.0, 2.0, 32.0, 4.0], [4.0, 128.0, 4.0, 2.0], [2.0, 8.0, 2.0, 4.0]], epsilon  : 0.822722543893733, learning rate : 0.0005000000237487257 

Episode 359 finished with score 1056.0, result : lose board : [[2.0, 8.0, 4.0, 8.0], [4.0, 128.0, 16.0, 4.0], [8.0, 16.0, 2.0, 32.0], [2, 4.0, 16.0, 4.0]], epsilon  : 0.822722543893733, learning rate : 0.0005000000237487257 

Episode 360 finished with score 2260.0, result : lose board : [[4.0, 2.0, 4.0, 8.0], [2.0, 4.0, 32.0, 2.0], [4.0, 64.0, 16.0, 4.0], [2, 4.0, 8.0, 256.0]], epsilon  : 0.822722543893733, learning rate : 0.0005000000237487257 

Episode 361 finished with score 1668.0, result : lose board : [[  2.   4.   2.  64.]
 [ 32. 128.   8.  

Episode 392 finished with score 2220.0, result : lose board : [[ 32.   4.   2.   8.]
 [  8.  64. 128.  32.]
 [  4.   8.  32.   4.]
 [  2. 128.   8.   2.]], epsilon  : 0.8145566138399872, learning rate : 0.0005000000237487257 

Episode 393 finished with score 1336.0, result : lose board : [[8.0, 2.0, 4.0, 8.0], [2, 16.0, 128.0, 32.0], [8.0, 2.0, 64.0, 4.0], [2.0, 16.0, 4.0, 2.0]], epsilon  : 0.8145566138399872, learning rate : 0.0005000000237487257 

Episode 394 finished with score 1096.0, result : lose board : [[32.0, 8.0, 4.0, 2.0], [2.0, 4.0, 16.0, 4.0], [16.0, 8.0, 128.0, 16.0], [4, 16, 4, 2]], epsilon  : 0.8145566138399872, learning rate : 0.0005000000237487257 

Episode 395 finished with score 856.0, result : lose board : [[ 2.  8.  4.  2.]
 [ 4. 64. 16.  4.]
 [ 8. 32.  8.  2.]
 [ 4. 64.  4.  8.]], epsilon  : 0.8145566138399872, learning rate : 0.0005000000237487257 

Episode 396 finished with score 1360.0, result : lose board : [[  4.   8.   2.   4.]
 [  8.  16.  32.  16.]
 [  4.

Episode 427 finished with score 1128.0, result : lose board : [[2, 4.0, 64.0, 8.0], [4.0, 16.0, 32.0, 4.0], [32.0, 64.0, 16.0, 2.0], [4.0, 8.0, 32.0, 4.0]], epsilon  : 0.8064717346996236, learning rate : 0.0005000000237487257 

Episode 428 finished with score 644.0, result : lose board : [[2.0, 8.0, 4, 2], [4.0, 16.0, 64.0, 8.0], [16.0, 32.0, 16.0, 4.0], [4.0, 16.0, 4.0, 2.0]], epsilon  : 0.8064717346996236, learning rate : 0.0005000000237487257 

Episode 429 finished with score 1644.0, result : lose board : [[  2.   8.   4.   2.]
 [  4.   2.  32.  64.]
 [ 16. 128.   8.  16.]
 [  4.  64.   2.   8.]], epsilon  : 0.8064717346996236, learning rate : 0.0005000000237487257 

Episode 430 finished with score 2884.0, result : lose board : [[4.0, 8.0, 4.0, 2.0], [16.0, 128.0, 16.0, 256.0], [32.0, 2.0, 32.0, 4.0], [2.0, 16.0, 4.0, 2]], epsilon  : 0.8064717346996236, learning rate : 0.0005000000237487257 

Episode 431 finished with score 2684.0, result : lose board : [[2.0, 4.0, 16.0, 2.0], [4.0,

Episode 461 finished with score 628.0, result : lose board : [[ 4. 64.  8.  2.]
 [ 2.  8. 16.  4.]
 [ 8.  2. 32. 16.]
 [ 2. 16.  8.  4.]], epsilon  : 0.7984671020020532, learning rate : 0.0005000000237487257 

Episode 462 finished with score 1512.0, result : lose board : [[  4.  32.   2.   4.]
 [  8. 128.  32.   2.]
 [ 32.   8.  64.   4.]
 [  4.   2.  16.   2.]], epsilon  : 0.7984671020020532, learning rate : 0.0005000000237487257 

Episode 463 finished with score 2252.0, result : lose board : [[2.0, 8.0, 4, 2], [8.0, 256.0, 64.0, 16.0], [4.0, 32.0, 4.0, 2.0], [2.0, 4.0, 2.0, 4.0]], epsilon  : 0.7984671020020532, learning rate : 0.0005000000237487257 

Episode 464 finished with score 1536.0, result : lose board : [[2.0, 32.0, 8.0, 32.0], [16.0, 128.0, 4.0, 2.0], [2.0, 4.0, 64.0, 32.0], [4.0, 16.0, 4.0, 2]], epsilon  : 0.7984671020020532, learning rate : 0.0005000000237487257 

Episode 465 finished with score 724.0, result : lose board : [[4, 16.0, 4.0, 2], [8.0, 64.0, 32.0, 4.0], [2.0,

Episode 498 finished with score 1284.0, result : lose board : [[  2.  16.   2.   4.]
 [ 32.   2.   4. 128.]
 [  4.  64.  16.   8.]
 [  8.   2.   8.   2.]], epsilon  : 0.7905419192614572, learning rate : 0.0005000000237487257 

Episode 499 finished with score 592.0, result : lose board : [[ 2.  4. 16.  4.]
 [16.  2. 32.  8.]
 [ 2. 64. 16.  4.]
 [ 4.  2.  4.  2.]], epsilon  : 0.7905419192614572, learning rate : 0.0005000000237487257 

Episode 500 finished with score 752.0, result : lose board : [[ 4.  8.  4.  2.]
 [ 2. 64. 16.  4.]
 [ 8. 16. 64.  8.]
 [ 4.  2.  4.  2.]], epsilon  : 0.7905419192614572, learning rate : 0.0005000000237487257 

Episode 501 finished with score 588.0, result : lose board : [[2.0, 4.0, 2.0, 4], [4.0, 16.0, 32.0, 16.0], [8.0, 64.0, 2.0, 4.0], [2, 4.0, 16.0, 2.0]], epsilon  : 0.7905419192614572, learning rate : 0.0005000000237487257 

Episode 502 finished with score 2888.0, result : lose board : [[2, 4.0, 8.0, 4.0], [8.0, 32.0, 256.0, 8.0], [4.0, 16.0, 32.0, 128.

Episode 533 finished with score 1324.0, result : lose board : [[  4.   8.   4.   2.]
 [  2.  16.   8.  16.]
 [  8. 128.  64.   4.]
 [  2.  32.   4.   2.]], epsilon  : 0.7866088748870222, learning rate : 0.0005000000237487257 

Episode 534 finished with score 1384.0, result : lose board : [[2.0, 128.0, 8.0, 2], [8.0, 64.0, 16.0, 8.0], [4.0, 16.0, 32.0, 4.0], [8.0, 4.0, 16.0, 2.0]], epsilon  : 0.7866088748870222, learning rate : 0.0005000000237487257 

Episode 535 finished with score 5444.0, result : lose board : [[  2.   4.  16.   2.]
 [ 32.  64.  32.   4.]
 [  4.   2. 512.  16.]
 [  2. 128.   8.   4.]], epsilon  : 0.7866088748870222, learning rate : 0.0005000000237487257 

Episode 536 finished with score 1004.0, result : lose board : [[2.0, 4.0, 8.0, 2], [4.0, 64.0, 32.0, 4.0], [32.0, 16.0, 64.0, 2.0], [2, 8, 4.0, 16.0]], epsilon  : 0.7826953978975346, learning rate : 0.0005000000237487257 

Episode 537 finished with score 1296.0, result : lose board : [[2.0, 4.0, 16.0, 2.0], [4.0, 2.0

Episode 567 finished with score 1088.0, result : lose board : [[8.0, 4.0, 16.0, 2.0], [4.0, 32.0, 8.0, 16.0], [16.0, 128.0, 4.0, 2], [2.0, 4.0, 16.0, 4.0]], epsilon  : 0.7788013909428206, learning rate : 0.0005000000237487257 

Episode 568 finished with score 852.0, result : lose board : [[16.  4. 32.  2.]
 [ 8. 32. 64.  4.]
 [ 2. 16. 32. 16.]
 [ 4.  8.  2.  4.]], epsilon  : 0.7788013909428206, learning rate : 0.0005000000237487257 

Episode 569 finished with score 416.0, result : lose board : [[4.0, 16.0, 4.0, 2], [16.0, 32.0, 8.0, 16.0], [2.0, 4.0, 32.0, 4.0], [4.0, 2.0, 4.0, 2.0]], epsilon  : 0.7788013909428206, learning rate : 0.0005000000237487257 

Episode 570 finished with score 2012.0, result : lose board : [[2.0, 8.0, 16.0, 2], [8.0, 256.0, 32.0, 4.0], [4.0, 16.0, 8.0, 2.0], [2.0, 4.0, 2.0, 4.0]], epsilon  : 0.7788013909428206, learning rate : 0.0005000000237487257 

Episode 571 finished with score 2160.0, result : lose board : [[  4.   8.   4.   2.]
 [  8. 256.  16.  32.]
 [ 

Episode 601 finished with score 856.0, result : lose board : [[ 4.  2. 16.  4.]
 [64. 32.  4.  2.]
 [ 4. 16. 64.  8.]
 [ 2.  8.  4.  2.]], epsilon  : 0.7710714001562543, learning rate : 0.0005000000237487257 

Episode 602 finished with score 2552.0, result : lose board : [[4.0, 32.0, 8.0, 2.0], [2.0, 64.0, 256.0, 64.0], [8.0, 4.0, 16.0, 4.0], [4.0, 8.0, 4.0, 2]], epsilon  : 0.7710714001562543, learning rate : 0.0005000000237487257 

Episode 603 finished with score 748.0, result : lose board : [[8.0, 16.0, 4.0, 2.0], [32.0, 8.0, 32.0, 4.0], [4.0, 16.0, 64.0, 8.0], [2, 4.0, 16.0, 2.0]], epsilon  : 0.7710714001562543, learning rate : 0.0005000000237487257 

Episode 604 finished with score 1236.0, result : lose board : [[8.0, 4.0, 64.0, 4.0], [2.0, 16.0, 8.0, 2.0], [16.0, 128.0, 2.0, 4.0], [2, 8.0, 16.0, 2]], epsilon  : 0.7710714001562543, learning rate : 0.0005000000237487257 

Episode 605 finished with score 1252.0, result : lose board : [[2, 4.0, 8.0, 4.0], [8.0, 64.0, 32.0, 2.0], [2.0,

Episode 639 finished with score 1288.0, result : lose board : [[2.0, 8.0, 4.0, 2], [8.0, 128.0, 16.0, 4.0], [4.0, 8.0, 64.0, 2.0], [2.0, 4.0, 32.0, 4.0]], epsilon  : 0.7634181333692279, learning rate : 0.0005000000237487257 

Episode 640 finished with score 1628.0, result : lose board : [[  2.   4.  64.   2.]
 [  4.  32. 128.   4.]
 [  8.   4.  16.   2.]
 [ 16.   2.  64.   8.]], epsilon  : 0.7634181333692279, learning rate : 0.0005000000237487257 

Episode 641 finished with score 640.0, result : lose board : [[ 2.  4.  2.  4.]
 [32. 64.  8.  2.]
 [ 8.  4. 32.  4.]
 [ 4.  2. 16.  2.]], epsilon  : 0.7634181333692279, learning rate : 0.0005000000237487257 

Episode 642 finished with score 2384.0, result : lose board : [[8.0, 32.0, 8.0, 2.0], [2.0, 4.0, 64.0, 8.0], [32.0, 2.0, 8.0, 4.0], [8.0, 256.0, 4.0, 2]], epsilon  : 0.7634181333692279, learning rate : 0.0005000000237487257 

Mini-Batch - 0 Back-Prop : 13, Loss : 0.3807961344718933
Mini-Batch - 1 Back-Prop : 13, Loss : 0.81285125017166

Episode 673 finished with score 1304.0, result : lose board : [[4.0, 2.0, 8.0, 2], [2.0, 8.0, 16.0, 128.0], [4.0, 64.0, 8.0, 4.0], [16.0, 32.0, 4.0, 2.0]], epsilon  : 0.7558408290579224, learning rate : 0.0005000000237487257 

Episode 674 finished with score 704.0, result : lose board : [[ 2. 16.  2.  4.]
 [ 4. 64.  8. 32.]
 [16.  4. 32.  8.]
 [ 4.  8.  4.  2.]], epsilon  : 0.7558408290579224, learning rate : 0.0005000000237487257 

Episode 675 finished with score 1112.0, result : lose board : [[2.0, 8.0, 4.0, 2], [8.0, 2.0, 32.0, 4.0], [2.0, 32.0, 128.0, 8.0], [4.0, 8.0, 16.0, 4.0]], epsilon  : 0.7558408290579224, learning rate : 0.0005000000237487257 

Episode 676 finished with score 2392.0, result : lose board : [[  2.  16.   4.   2.]
 [ 16.   8.   2.   4.]
 [256.  16.  64.  16.]
 [  4.   8.  32.   4.]], epsilon  : 0.7558408290579224, learning rate : 0.0005000000237487257 

Episode 677 finished with score 1408.0, result : lose board : [[  2.  32.   4.   2.]
 [  8.   2.   8.   4.]
 [

Episode 707 finished with score 1172.0, result : lose board : [[4.0, 8.0, 16.0, 2], [2.0, 32.0, 128.0, 4.0], [4.0, 16.0, 4.0, 16.0], [8.0, 2.0, 32.0, 4.0]], epsilon  : 0.7483387332570208, learning rate : 0.0005000000237487257 

Episode 708 finished with score 900.0, result : lose board : [[ 2.  8.  2.  8.]
 [ 4. 64.  8. 16.]
 [ 2. 32.  4.  2.]
 [ 8.  2. 16. 64.]], epsilon  : 0.7483387332570208, learning rate : 0.0005000000237487257 

Episode 709 finished with score 608.0, result : lose board : [[2.0, 4.0, 16.0, 2.0], [16.0, 2.0, 64.0, 4.0], [4.0, 8.0, 32.0, 8.0], [2, 4.0, 16.0, 4]], epsilon  : 0.7483387332570208, learning rate : 0.0005000000237487257 

Episode 710 finished with score 1068.0, result : lose board : [[  2.   8.   4.   2.]
 [ 16.  32.   8.   4.]
 [  4.   8. 128.  16.]
 [  8.  16.   8.   2.]], epsilon  : 0.7483387332570208, learning rate : 0.0005000000237487257 

Episode 711 finished with score 1408.0, result : lose board : [[ 16.   2.   4.  16.]
 [  4.  16.  64.   8.]
 [  

Episode 741 finished with score 1148.0, result : lose board : [[ 16.   4.  16.   2.]
 [  2.   8. 128.  16.]
 [  4.  32.  16.   4.]
 [  2.  16.   8.   2.]], epsilon  : 0.740911099484687, learning rate : 0.0005000000237487257 

Episode 742 finished with score 1088.0, result : lose board : [[2.0, 32.0, 2.0, 4.0], [4.0, 16.0, 8.0, 2], [2.0, 32.0, 128.0, 8.0], [4.0, 8.0, 2.0, 4.0]], epsilon  : 0.740911099484687, learning rate : 0.0005000000237487257 

Episode 743 finished with score 1036.0, result : lose board : [[  2.  16.   4.   2.]
 [ 32.   2.  16.   4.]
 [  8.   4. 128.   8.]
 [  4.  16.   8.   4.]], epsilon  : 0.740911099484687, learning rate : 0.0005000000237487257 

Episode 744 finished with score 1144.0, result : lose board : [[  4.   2.  16.   4.]
 [  2.   8. 128.  32.]
 [  8.  32.   8.   4.]
 [  2.   4.  16.   2.]], epsilon  : 0.740911099484687, learning rate : 0.0005000000237487257 

Episode 745 finished with score 1000.0, result : lose board : [[  2.   4.  16.   2.]
 [  8. 128. 

Episode 775 finished with score 1372.0, result : lose board : [[2.0, 8.0, 4.0, 2.0], [4.0, 16.0, 64.0, 8.0], [16.0, 128.0, 16.0, 32.0], [2, 8.0, 4.0, 2]], epsilon  : 0.7335571886682876, learning rate : 0.0005000000237487257 

Episode 776 finished with score 1296.0, result : lose board : [[4.0, 2.0, 64.0, 2.0], [128.0, 8.0, 4.0, 32.0], [8.0, 2.0, 8.0, 16.0], [4, 8.0, 4, 2]], epsilon  : 0.7335571886682876, learning rate : 0.0005000000237487257 

Episode 777 finished with score 1808.0, result : lose board : [[  8.   4.   2.  16.]
 [128.  16.   8.   2.]
 [  8.  32. 128.   4.]
 [  2.   4.  16.   2.]], epsilon  : 0.7335571886682876, learning rate : 0.0005000000237487257 

Episode 778 finished with score 1128.0, result : lose board : [[2, 4.0, 8.0, 2.0], [32.0, 2.0, 64.0, 4.0], [4.0, 8.0, 16.0, 64.0], [2.0, 64.0, 2.0, 8.0]], epsilon  : 0.7335571886682876, learning rate : 0.0005000000237487257 

Episode 779 finished with score 1348.0, result : lose board : [[  2.   4.  16.   2.]
 [  8. 128.  3

Episode 809 finished with score 2040.0, result : lose board : [[2, 8.0, 2.0, 4.0], [8.0, 16.0, 256.0, 16.0], [4.0, 32.0, 16.0, 8.0], [2.0, 4.0, 2.0, 4.0]], epsilon  : 0.7262762690708524, learning rate : 0.0005000000237487257 

Episode 810 finished with score 1004.0, result : lose board : [[  2.   8.  16.   2.]
 [ 32. 128.   4.   8.]
 [  2.   4.  16.   4.]
 [  4.   2.   8.   2.]], epsilon  : 0.7262762690708524, learning rate : 0.0005000000237487257 



### Store the Trained Weights in a file

In [None]:
import os
THIS_FOLDER = os.path.abspath('')
PARENT_FOLDER = os.path.dirname(THIS_FOLDER)
WEIGHT_FOLDER = os.path.join(PARENT_FOLDER, 'ver2_result')

def SaveWeights():
    for name,weights in doublecheck_parameters.items():
        flatten = weights.reshape(-1,1)
        filename = name + '.csv'
        file = open(os.path.join(WEIGHT_FOLDER, filename), 'w')
        file.write('Sno,Weight\n')
        for i in range(flatten.shape[0]):
            file.write(str(i) +',' +str(flatten[i][0])+'\n')
        file.close()
        print(filename + " written!")
        
    
        
SaveWeights()

In [None]:
np.save(os.path.join(WEIGHT_FOLDER,'scores'), scores)


In [None]:
np.save(os.path.join(WEIGHT_FOLDER,'J'), J)

In [None]:
scores_avg = deepcopy(scores)
J_avg = deepcopy(J)
step = 10
for i in range(step - 1, len(scores), step):
    start = i + 1 - step
    end = i + step if (i + step < len(scores)) else len(scores)
    avg = np.average(scores[start:end])
    scores_avg[start:end] = [avg] * (end - start)

for i in range(step - 1, len(J), step):
    start = i + 1 - step
    end = i + step if (i + step < len(J)) else len(J)
    avg = np.average(J[start:end])
    J_avg[start:end] = [avg] * (end - start)



In [None]:
plt.plot(scores_avg)

In [None]:
plt.plot(J_avg)