Changes:
- The output state now consistes of (board, falling piece, next piece) where board CONTAINS the falling piece on it
    - The game code itself keeps board and falling piece as seperate entities for easy manipulation of making the falling piece drop. However, the input to the conv net requires the falling piece to be on the board. At the end of each call of one_step, the falling piece is placed on the board, and the modified squares are kept track. This modified board with the falling piece on it is outputted as state[0]. At the beginning of the next call of one_step, the falling piece is removed from the board. The removing can be done efficiently because we have kept track of the modified squares in a global variable.
- Removed color variable of the pieces. All pieces now have color value of 1, so that the board is a binary array.
- Removed global variables movingLeft, movingRight, movingDown. Realized they are not needed.

In [3]:
BOARD_WIDTH = 10
BOARD_HEIGHT = 20
BLANK = 0

TEMPLATE_WIDTH = 5
TEMPLATE_HEIGHT = 5

S_SHAPE_TEMPLATE = [['.....',
                     '.....',
                     '..OO.',
                     '.OO..',
                     '.....'],
                    ['.....',
                     '..O..',
                     '..OO.',
                     '...O.',
                     '.....']]

Z_SHAPE_TEMPLATE = [['.....',
                     '.....',
                     '.OO..',
                     '..OO.',
                     '.....'],
                    ['.....',
                     '..O..',
                     '.OO..',
                     '.O...',
                     '.....']]

I_SHAPE_TEMPLATE = [['..O..',
                     '..O..',
                     '..O..',
                     '..O..',
                     '.....'],
                    ['.....',
                     '.....',
                     'OOOO.',
                     '.....',
                     '.....']]

O_SHAPE_TEMPLATE = [['.....',
                     '.....',
                     '.OO..',
                     '.OO..',
                     '.....']]

J_SHAPE_TEMPLATE = [['.....',
                     '.O...',
                     '.OOO.',
                     '.....',
                     '.....'],
                    ['.....',
                     '..OO.',
                     '..O..',
                     '..O..',
                     '.....'],
                    ['.....',
                     '.....',
                     '.OOO.',
                     '...O.',
                     '.....'],
                    ['.....',
                     '..O..',
                     '..O..',
                     '.OO..',
                     '.....']]

L_SHAPE_TEMPLATE = [['.....',
                     '...O.',
                     '.OOO.',
                     '.....',
                     '.....'],
                    ['.....',
                     '..O..',
                     '..O..',
                     '..OO.',
                     '.....'],
                    ['.....',
                     '.....',
                     '.OOO.',
                     '.O...',
                     '.....'],
                    ['.....',
                     '.OO..',
                     '..O..',
                     '..O..',
                     '.....']]

T_SHAPE_TEMPLATE = [['.....',
                     '..O..',
                     '.OOO.',
                     '.....',
                     '.....'],
                    ['.....',
                     '..O..',
                     '..OO.',
                     '..O..',
                     '.....'],
                    ['.....',
                     '.....',
                     '.OOO.',
                     '..O..',
                     '.....'],
                    ['.....',
                     '..O..',
                     '.OO..',
                     '..O..',
                     '.....']]

PIECES = {'S': S_SHAPE_TEMPLATE,
          'Z': Z_SHAPE_TEMPLATE,
          'J': J_SHAPE_TEMPLATE,
          'L': L_SHAPE_TEMPLATE,
          'I': I_SHAPE_TEMPLATE,
          'O': O_SHAPE_TEMPLATE,
          'T': T_SHAPE_TEMPLATE}

"""
Helper functions.
"""
def get_new_piece():
    """Return a random new piece.

    Returns:
       Dictionary consisting of shape, rotation, x-coordinates and y-coordinates.
    """
    shape = random.choice(list(PIECES.keys()))
    new_piece = {
        'shape': shape,
        'rotation': random.randint(0, len(PIECES[shape]) - 1),
        'x': int(BOARD_WIDTH / 2) - int(TEMPLATE_WIDTH / 2),
        'y': -2 # start it above the board (i.e. less than 0)
    }
    return new_piece

def get_blank_board():
    """Return a new blank board.

    Returns:
        Array of (BOARD_WIDTH, BOARD_HEIGHT) shape, filled with BLANK as value.
    """
    return np.full((BOARD_WIDTH, BOARD_HEIGHT), BLANK)

def is_on_board(x, y):
    """Return whether the position (x, y) is on the board.
    
    Args:
        x (int): x-coordinate
        y (int): y-coordinate

    Returns:
        Boolean value. True if (x, y) is on the board. False otherwise.
    """
    return x >= 0 and x < BOARD_WIDTH and y < BOARD_HEIGHT

"""
Tetris Environment Wrapper Class.
"""
class Env:
    def __init__(self):
        self.modified_squares = []
        self.board = get_blank_board()
        self.falling_piece = get_new_piece()
        self.next_piece = get_new_piece()
        self.lines_cleared = 0
        self.holding_time = 0
        self.pieces = 1
        self.given = False
        self.bestx, self.besty, self.bestr = self.find_best(self.board, self.falling_piece)
    def reset(self):
        """ Resets game """
        self.modified_squares = []
        self.board = get_blank_board()
        self.falling_piece = get_new_piece()
        self.next_piece = get_new_piece()
        self.lines_cleared = 0
        self.holding_time = 0
        self.pieces = 1
        self.given = False
        self.bestx, self.besty, self.bestr = self.find_best(self.board, self.falling_piece)
        return (self.board, 0, False)
    
    
    def compute_metric(self, board, shape, px, py, rot):
            H = []
            PA = PIECES[shape][rot]
            for x in range(BOARD_WIDTH):
                for y in range(BOARD_HEIGHT):
                    if board[x][y] or (0<=y-py<5 and 0<=x-px<5 and PA[y-py][x-px]=='O'): break
                H.append(BOARD_HEIGHT-y)
            height = sum(H)
            bump = sum([abs(H[i]-H[i-1]) for i in range(1, len(H))])
            lines = 0
            for y in range(BOARD_HEIGHT):
                b = False
                for x in range(BOARD_WIDTH):
                    if not (board[x][y] or (0<=y-py<5 and 0<=x-px<5 and PA[y-py][x-px]=='O')):
                        b = True
                        break
                if not b: lines += 1
            holes = 0
            for x in range(BOARD_WIDTH):
                for y in range(BOARD_HEIGHT):
                    if board[x][y] or (0<=y-py<5 and 0<=x-px<5 and PA[y-py][x-px]=='O'):
                        for t in range(y, BOARD_HEIGHT):
                            if not (board[x][t] or (0<=t-py<5 and 0<=x-px<5 and PA[t-py][x-px]=='O')): holes += 1
                        break
            return -0.51*height+0.76*lines-0.35*holes-0.18*bump

    
    
    def find_best(self, board, piece):
        baseline_metric = -float('inf')
        bestx,besty,bestr = 0,0,0
        shape = piece['shape']
        for rot in range(len(PIECES[shape])):
            for x in range(-3, BOARD_WIDTH+3):
                if not self.is_valid_position(board, shape, x, 0, rot,0, 0): continue
                for y in range(BOARD_HEIGHT+3):
                    if not self.is_valid_position(board, shape, x, y, rot,0, 0):

                        y -= 1
                        m = self.compute_metric(board, shape, x,y,rot)
                        if m>=baseline_metric:
                            baseline_metric = m
                            bestx,besty,bestr = x,y,rot
                        break

        return bestx,besty,bestr
                
    def one_step(self, action):
        """Update environment after a step of action.
    
        Args:
            action (char): character that represents action. One of ['L', 'R', 'D', 'R', 'S'].

        Returns:
            Tuple containing (state, reward, end_of_game).
            
            state (tuple): tuple that contains state of the game environment. (board, falling_piece, and next_piece).
            reward (int): reward per action
            end_of_game (boolean): boolean value. True if the game has ended. False otherwise.
        """
        
        state = self.board
        ps = self.falling_piece['shape']
        # if there exists squares modified on previous step, undo the modification first before proceeding
        if self.modified_squares:
            for square in self.modified_squares:
                self.board[square[0]][square[1]] = 0
            self.modified_squares = []
        
        if not self.is_valid_position(self.board, self.falling_piece['shape'],self.falling_piece['x'],
                                                  self.falling_piece['y'],self.falling_piece['rotation'],0, 0):
            # game is finished. Return
            return (state, 0, True, self.lines_cleared, ps)
        reward = 0
        # move the piece with the left key
        if action == 0:
            reward += (self.falling_piece['x']-self.bestx)>0
            reward -= (self.falling_piece['x']-self.bestx)<0
            reward -= 2*(self.falling_piece['x']-self.bestx)==0
            reward -= (self.falling_piece['rotation']!=self.bestr)
            if self.is_valid_position(self.board, self.falling_piece['shape'],self.falling_piece['x'],
                                                  self.falling_piece['y'],self.falling_piece['rotation'],-1, 0):
            
                self.falling_piece['x'] -= 1
            
        # move the piece with the right key
        elif action == 1:
            reward += (self.falling_piece['x']-self.bestx)<0
            reward -= (self.falling_piece['x']-self.bestx)>0
            reward -= 2*(self.falling_piece['x']-self.bestx)==0
            reward -= (self.falling_piece['rotation']!=self.bestr)
            if self.is_valid_position(self.board, self.falling_piece['shape'],self.falling_piece['x'],
                                                  self.falling_piece['y'],self.falling_piece['rotation'],1, 0):
            
                self.falling_piece['x'] += 1
        # make the piece fall faster with the down key
        elif action == 2:
            reward -= (self.falling_piece['x']-self.bestx)!=0
            reward -= (self.falling_piece['rotation']-self.bestr)!=0
            if self.is_valid_position(self.board, self.falling_piece['shape'],self.falling_piece['x'],
                                                  self.falling_piece['y'],self.falling_piece['rotation'],0, 1):
                self.falling_piece['y'] += 1
        # rotating the piece if there is room to rotate
        elif action == 3:
            if self.falling_piece['rotation']-self.bestr!=0:
                if self.falling_piece['x'] != self.bestx: reward += 1
            else: reward -= len(PIECES[self.falling_piece['shape']])
            self.falling_piece['rotation'] = (self.falling_piece['rotation'] + 1) % len(PIECES[self.falling_piece['shape']])
            if not self.is_valid_position(self.board, self.falling_piece['shape'],self.falling_piece['x'],
                                                  self.falling_piece['y'],self.falling_piece['rotation'],0, 0):
                self.falling_piece['rotation'] = (self.falling_piece['rotation'] - 1) % len(PIECES[self.falling_piece['shape']])
        # move the current piece all the way down
        elif action == 4:
            if self.falling_piece['x']-self.bestx!=0: reward -= 1
            else: reward += 1
            if self.falling_piece['rotation']-self.bestr!=0: reward -= 1
            else: reward += 1
            for i in range(1, BOARD_HEIGHT):
                if not self.is_valid_position(self.board, self.falling_piece['shape'],self.falling_piece['x'],
                                                  self.falling_piece['y'],self.falling_piece['rotation'],0, i):
                    break
            self.falling_piece['y'] += i - 1
        # if falling piece has landed, set it on the board
        reward -= 0.2*self.holding_time/BOARD_HEIGHT * (self.falling_piece['x']==self.bestx) *  (self.falling_piece['rotation']==self.bestr)

        if not self.is_valid_position(self.board, self.falling_piece['shape'],self.falling_piece['x'],
                                                  self.falling_piece['y'],self.falling_piece['rotation'],0, 1):
            self.add_to_board(self.falling_piece)
            lines = self.remove_complete_lines()
            self.lines_cleared += lines
            reward += 2**lines-1
            self.falling_piece = self.next_piece
            self.next_piece = get_new_piece()
            self.bestx, self.besty, self.bestr = self.find_best(self.board, self.falling_piece)
            self.holding_time = 0
            self.pieces += 1
        # if falling piece has not landed, just move the piece down
        else:
            self.falling_piece['y'] += 1
            self.holding_time += 1
            # Add the falling piece to the board to output an image of the current state of board
            # Need to remove this falling piece from the board before the next iteration
            self.add_to_board_in_bound_modified_squares(self.falling_piece)
       
        return (deepcopy(self.board), reward, False, self.lines_cleared, ps)
    
    def max_height(self):
        for y in range(BOARD_HEIGHT):
            if self.board[self.falling_piece['x']][y]!=0: return y
        return y
    def add_to_board(self, piece):
        """Fill in the board based on piece's location, shape, and rotation.
    
        Args:
            piece (dict): data structure of a piece consisting of shape, rotation, x-coordinates and y-coordinates.

        Returns:
            None
        """
        for x in range(TEMPLATE_WIDTH):
            for y in range(TEMPLATE_HEIGHT):
                if PIECES[piece['shape']][piece['rotation']][y][x] != '.' and y + piece['y'] >= 0:
                    self.board[x + piece['x']][y + piece['y']] = 1
    
    def add_to_board_in_bound_modified_squares(self, piece):
        """Fill in the board based on piece's location, shape, and rotation.
        Also populates self.modified_squares with the squares on the board that were just modified
        when adding the piece to the board.
    
        Args:
            piece (dict): data structure of a piece consisting of shape, rotation, x-coordinates and y-coordinates.

        Returns:
            None
        """
        for x in range(TEMPLATE_WIDTH):
            for y in range(TEMPLATE_HEIGHT):
                if PIECES[piece['shape']][piece['rotation']][y][x] != '.' and y + piece['y'] >= 0:
                    self.board[x + piece['x']][y + piece['y']] = 1
                    self.modified_squares.append([x + piece['x'], y + piece['y']])

    def is_valid_position(self, board, shape,px,py,rot, adjX, adjY):
        """Return whether the falling piece is within the board and not colliding,
        after adding (adjX, adjY) to the current coordinates (x, y) of the falling piece.
    
        Args:
            adjX (int): move x-coordinate by adjX
            adjY (int): move y-coordinate by adjY

        Returns:
            Boolean value. True if the resulting coordinate is in valid position. False otherwise.
        """
        for x in range(TEMPLATE_WIDTH):
            for y in range(TEMPLATE_HEIGHT):
                is_above_board = y + py + adjY < 0
                if is_above_board or PIECES[shape][rot][y][x] == '.':
                    continue
                if not is_on_board(x + px + adjX, y + py + adjY):
                    return False
                if board[x + px + adjX][y + py + adjY] != BLANK:
                    return False
        return True

    def is_complete_line(self, y):
        """Return whether the line in y-coordinate is completely filled with boxes.
    
        Args:
            y (int): y-coordinate

        Returns:
            Boolean value. True if the line is filled with boxes. False otherwise.
        """
        return all ([self.board[x][y] != BLANK for x in range(BOARD_WIDTH)])

    def remove_complete_lines(self):
        """Return the number of complete lines after removing any completed lines
        on the board and moving everything above them down.
    
        Returns:
            Int value. Number of lines removed.
        """
        num_lines_removed = 0
        y = BOARD_HEIGHT - 1 # start y at the bottom of the board
        while y >= 0:
            if self.is_complete_line(y):
                # Remove the line and pull boxes down by one line.
                for pull_down_Y in range(y, 0, -1):
                    for x in range(BOARD_WIDTH):
                        self.board[x][pull_down_Y] = self.board[x][pull_down_Y-1]
                # Set very top line to blank.
                for x in range(BOARD_WIDTH):
                    self.board[x][0] = BLANK
                num_lines_removed += 1
                # Note on the next iteration of the loop, y is the same.
                # This is so that if the line that was pulled down is also
                # complete, it will be removed.
            else:
                y -= 1 # move on to check next row up
        return num_lines_removed

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import clone_model
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.keras.optimizers import RMSprop
from collections import deque
from copy import deepcopy
tf.keras.backend.clear_session()

In [2]:
# Deep Q-learning Agent
class DQNAgent:
    def __init__(self, state_size, action_size):
        tf.compat.v1.reset_default_graph()
        self.state_size = state_size
        self.action_size = action_size
        # Hyper parameters
        self.memory = deque(maxlen=256)
        self.gamma = 0.99    # discount rate
        self.epsilon = 1 # exploration rate
        self.epsilon_min = 0.1
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.Errs = deque(maxlen = 100)
        self.lag_model = self._build_model()
        self.StatePH = tf.compat.v1.placeholder(tf.float32, [None,10,20,1], name = 'StatePH')
        self.targetFPH = tf.compat.v1.placeholder(tf.float32, [None, 5], name = 'TargetFPH')
        self.x = tf.compat.v1.placeholder(tf.float32, [None,10,20,1], name = 'x')
        self.res = self.lag_predict(x)
        tf.compat.v1.Session().run(tf.compat.v1.global_variables_initializer())
        #self.model.load_weights('my_checkpoint_LargerNetwork_overnight')
        #self.lag_model.load_weights('my_checkpoint_LargerNetwork_overnight')
    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Conv2D(32, kernel_size=4, activation='relu', input_shape=(10,20,1)))
        model.add(Conv2D(64, kernel_size=3, activation='relu'))
        model.add(Conv2D(64, kernel_size=2, activation='relu'))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer = RMSprop(learning_rate=self.learning_rate, momentum = 0.95, rho = 0.95, epsilon = 0.01))
        return model
    
    def remember(self, state, action, reward, next_state, done):
        # Places the state in memory
        
        self.memory.append((state, action, reward, next_state, done))
        
    def act(self, state):
        # Selects an action based on epsilon-greedy algorithm
        if np.random.rand() <= self.epsilon:
            x = np.random.rand()
            if x < 3.0/7: return random.randrange(2)
            elif x < 5.0/7: return 3
            elif x < 6.0/7: return 2
            else: return 4
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
    
    def update(self):
        self.lag_model.set_weights(self.model.get_weights())
        
    def replay(self, batch_size):
        # Memory replay
        minibatch = random.sample(self.memory, min(batch_size, len(self.memory)))
        States,R,Ns,D,A = [],[],[], [],[]
        for state, action, reward, next_state, done in minibatch:
            R.append(reward)
            D.append(not done)
            Ns.append(next_state[0])
            A.append(action)
            States.append(state[0])
        R,D,Ns,A,States = np.array(R), np.array(D), np.array(Ns), np.array(A), np.array(States)
        PR = tf.compat.v1.Session().run(self.res,feed_dict={x: Ns})
        R = R + D*self.gamma*np.amax(PR, axis = 1)
        targetF = self.model.predict(States)

        for i in range(len(targetF)):
            targetF[i][A[i]] = targetF[i][A[i]]+max(-1, min(1, R[i]-targetF[i][A[i]]))
            #targetF[i][A[i]] = R[i]
        x = self.model.fit(States, targetF, epochs=1, verbose=0, batch_size=batch_size).history['loss']
        self.Errs.append(sum(x)/len(x))
        

In [7]:
episodes = 500000
updatefreq = 4*420
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import random
import time
import os
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import clone_model
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.keras.optimizers import RMSprop
from collections import deque
from copy import deepcopy
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
    # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
        print(e)

fig = plt.figure()
ax1 = fig.add_subplot(131)
ax2 = fig.add_subplot(132)
ax3 = fig.add_subplot(133)
plt.ion()
ax1.grid()
width = 10
height = 20
ax1.set_xlim(-0.5, width-0.5)
ax1.set_ylim(height-0.5, 0.5)
ax1.set_xticks(np.arange(-0.5, width-0.5, 1))
ax1.set_yticks(np.arange(0.5, height-0.5, 1))
# initialize gym environment and the agent
totalR = deque(maxlen = 100)
pieces = deque(maxlen = 50)
pieces.append(0)
totalR.append(0)
env = Env()


### AGENT #########################
def build_model():
    # Neural Net for Deep-Q learning Model
    model = Sequential()
    model.add(Conv2D(32, kernel_size=8, padding="same", activation=tf.keras.layers.LeakyReLU(alpha=0.001), input_shape=(10,20,1)))
    model.add(Conv2D(64, kernel_size=6, padding="same", activation=tf.keras.layers.LeakyReLU(alpha=0.001)))
    model.add(Conv2D(64, kernel_size=3, padding="same", activation=tf.keras.layers.LeakyReLU(alpha=0.001)))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    model.compile(loss='mse', optimizer = RMSprop(learning_rate=learning_rate, momentum = 0.95, rho = 0.95, epsilon = 0.01))
    return model

def remember(state, action, reward, next_state, done,piece):
    # Places the state in memory
    memory.append((state, action, reward, next_state, done, piece))
    
def act(state):
    # Selects an action based on epsilon-greedy algorithm
    if np.random.rand() <= epsilon:
        return random.randrange(action_size)
        #x = np.random.rand()
        #if x < 3.0/7: return random.randrange(2)
        #elif x < 5.0/7: return 3
        #elif x < 6.0/7: return 2
        #else: return 4
    return np.argmax(model.predict(state)[0])
    #print(act_values)
      

def update():
    lag_model.set_weights(model.get_weights())

def replay(batch_size):
    # Memory replay
    minibatch = random.sample(memory, min(batch_size, len(memory)))
    States,R,Ns,D,A = [],[],[], [],[]
    for state, action, reward, next_state, done, ps in minibatch:
        R.append(reward)
        D.append(not done)
        Ns.append(next_state[0])
        A.append(action)
        States.append(state[0])
    Ns, States = np.array(Ns), np.array(States)
    #print(np.shape(Ns[0]))
    R,D,A = np.array(R), np.array(D), np.array(A)
    #x = lag_model.predict(Ns)
    #x = model.predict(States)
    #del States, R, Ns, D, A
    R = R + D*gamma*np.amax(lag_model.predict(Ns), axis = 1)
    #model = tf.keras.models.load_model('mymodel')
    targetF = model.predict(States)
    OrgF = deepcopy(targetF)
    #temp1 = targetF
    for i in range(len(targetF)):
        targetF[i][A[i]] = targetF[i][A[i]]+max(-1.0, min(1.0, R[i]-targetF[i][A[i]]))
    #temp2 = model.predict(States)
    #print(np.sum(np.abs(temp1-targetF))) # should be <= 256
    #print(np.sum(np.abs(temp2-targetF))) # should be 0
    #States = deepcopy(States)
    #targetF = deepcopy(targetF)
    #loss = tf.keras.losses.mean_squared_error(targetF, OrgF)
    #y = np.sum(np.mean(np.square(targetF - OrgF), axis=-1))/256.0
    x = model.fit(States, targetF, epochs=1, verbose=0).history['loss']
    x = sum(x)/len(x)
    #print(x,y,loss)
    #x = model.train_on_batch(States, targetF)
    #model.save('mymodel')
    Errs.append(x)
    
state_size, action_size = (10,20), 5
memory = deque(maxlen=40000)
gamma = 0.99    # discount rate
epsilon = 0 # exploration rate
epsilon_min = 0.1
learning_rate = 0.00035
model = build_model()
#model.save('mymodel')
Errs = deque(maxlen = 100)
Errs.append(0)
lag_model = build_model()
model.load_weights('my_checkpoint_LargerNetwork_overnight2')
lag_model.load_weights('my_checkpoint_LargerNetwork_overnight_lag2')
# Iterate the game

#rolling average rewards and Errors for past 100 games
AVGRs, AVGEs = [],[]

#Frequency of actions for each episode
FREQS = []

#Number of pieces placed in a game before it ended
PIECESPERGAME = []

j = 0
it = 0
for e in range(episodes):
    freqs = [0,0,0,0,0]
    if e % 50 == 0 and e!=0:
        #model.save_weights('my_checkpoint_LargerNetwork_overnight2')
        #lag_model.save_weights('my_checkpoint_LargerNetwork_overnight_lag2')
        AvgR = sum(totalR)/len(totalR)
        AvgE = sum(Errs)/len(Errs)
        AVGRs.append(AvgR)
        AVGEs.append(AvgE)
        #np.savetxt('AVGRewards2.txt', AVGRs)
        #np.savetxt('AVGErrors2.txt', AVGEs)
        #np.savetxt('PieceFrequencies2.txt', FREQS)
        #np.savetxt('PiecesPerGame2.txt', PIECESPERGAME)
        ax2.plot(e,AvgR, marker = 'o', markersize = 2, color = 'k')
        ax3.plot(e,AvgE, marker = 'o', markersize = 2, color = 'k')
        fig.canvas.draw()
        print("episode: {}/{}, Avg score: {}, Errs: {}, epsilon: {}"
                  .format(e, episodes, sum(totalR)/len(totalR), sum(Errs)/len(Errs), epsilon))
    #nums = 0
    # reset state in the beginning of each game
    state, reward, done = env.reset()
    state = np.reshape(state, [-1,10, 20,1])
    # time_t represents each frame of the game
    # Our goal is to keep the pole upright as long as possible until score of 500
    # the more time_t the more score
    total_reward = reward

    while not done:
        it += 1
        # Decide action
        #bestx, besty, bestr = env.bestx, env.besty, env.bestr
        #if env.falling_piece['rotation']!=bestr: action = 3
        #elif env.falling_piece['x'] > bestx: action = 0
        #elif env.falling_piece['x']<bestx: action = 1
        #else: action = 4
        #action = 4
        action = act(state)
        freqs[action] += 1
        # Advance the game to the next frame based on the action.
        next_state, reward, done,lines, piece = env.one_step(action)
        next_state = np.reshape(next_state, [-1,10, 20, 1])
        print(reward)
        total_reward += reward
        # Remember the previous state, action, reward, and done
        remember(state, action, reward, next_state, done, piece)
        # make next_state the new current state for the next frame.
        state = next_state
        # done becomes True when the game ends

        #if it % 1000 == 0:
            #ax2.plot(it/1000.0,sum(agent.Errs)/len(agent.Errs), marker = 'o', markersize = 2, color = 'k')
            #fig.canvas.draw()
        if done:
            # print the score and break out of the loop
            #print("episode: {}/{}, score: {}, lines cleared: {}, Avg Time per cycle: {}, epsilon: {}"
                  #.format(e, episodes, total_reward, lines, totalt/nums, agent.epsilon))
            totalR.append(total_reward)
            pieces.append(env.pieces)
            FREQS.append(freqs)
            PIECESPERGAME.append(env.pieces)
            #ax2.plot(e,total_reward, marker = 'o', markersize = 2, color = 'k')
            #ax3.plot(e,agent.epsilon, marker = 'o', markersize = 2, color = 'k')
            #fig.canvas.draw()
            break
    # train the agent with the experience of the episode
        if it>2100:
            #t0 = time.time()
            #replay(32)
            if epsilon>epsilon_min: epsilon -= 0.00001
    
        #nums += 1

        j += 1
        if j >= updatefreq:
            #update()
            j = 0
        if it % 5 == 0:
            grid = state[0,:,:,0]
            ax1.imshow(np.transpose(grid), cmap=plt.cm.binary, interpolation='none')
            fig.canvas.draw()

        

1 Physical GPUs, 1 Logical GPUs


<IPython.core.display.Javascript object>

-2.0
1.0
1.0
1.0
0.96
-2.0
-2.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
-1.0
1.0
0.0
0.0
1.0
0.0
-1.0
1.0
1.0
1.0
-1.0
-1.0
-1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
-1.0
1.0
1.0
0.97
-1.0
-1.0
1.0
-1.0
-1.0
-1.0
-1.0
1.0
-1.0
-1.0
-4.0
1.0
1.0
1.0
1.0
-4.0
-2.0
1.0
0.0
-2.0
1.0
0
1.0
1.0
1.0
0.97
1.96
-1.0
0.99
-1.0
0.97
-1.0
0.95
-1.0
0.9299999999999999
1.92
0.0
1.0
1.0
1.0
0.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
-1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-2.0
0
-2.0
1.0
-1.0
-1.0
1.0
-1.0
1.0
1.0
1.0
-1.0
1.0
1.0
-1.0
1.0
-1.0
-1.0
-2.0
-2.0
-2.0
-2.0
1.0
1.0
1.0
1.0
-1.0
1.0
-1.0
-1.0
-1.0
0.0
1.0
-1.0
-1.0
-1.0
1.0
1.0
-1.0
1.0
-1.0
0.0
-2.0
1.0
0.0
-2.0
-2.0
-1.0
0.0
0.0
-2.0
0
1.0
1.0
1.0
0.97
1.96
1.0
-1.0
1.0
-1.0
1.0
1.0
1.0
0.0
-2.0
-2.0
-2.0
1.0
1.99
-2.0
0.0
0.0
-0.030000000000000006
-1.0
0.0
-1.0
1.0
1.0
0.97
-1.0
0.0
-2.0
0.0
1.0
0.0
1.0
1.0
0.0
1.0
0
-1.0
1.0
-1.0
1.0
1.0
1.0
0.94
1.93
1.0
1.0
1.0
0.97
1.96
0.0
-0.01
1.98
0.0
1.0

KeyboardInterrupt: 

In [None]:
def animate(env, actions):
    %matplotlib
    
    fig = plt.gcf()
    fig.show()
    fig.canvas.draw()
    plt.grid()
    
    for action in actions:
        time.sleep(0.2)
        
        state, reward, done = env.one_step(action)
        if done:
            break
        falling_piece_shape = state[1]['shape']
        next_piece_shape = state[2]['shape']
        
        plt.title('Action: ' + str(action) + ', Cur: ' + falling_piece_shape + ', Next: ' + next_piece_shape)
        
        board = state[0]
        print(np.shape(board))
        print(type(board))
        plt.imshow(np.transpose(board), cmap=plt.cm.binary, interpolation='none')
        width = len(board)
        height = len(board[0])
        plt.xlim(-0.5, width-0.5)
        plt.ylim(height-0.5, 0.5)
        ax = plt.gca()
        ax.set_xticks(np.arange(-0.5, width-0.5, 1))
        ax.set_yticks(np.arange(0.5, height-0.5, 1))
        
        fig.canvas.draw()

if __name__ == "__main__":
    random.seed(2)
    env = Env()
    action_length = 100
    actions = [random.randint(0, 5) for _ in range(action_length)]

    animate(env, actions)
    
    plt.show()

In [6]:
BOARD_WIDTH = 10
BOARD_HEIGHT = 20
board1 = np.random.randint(2, size = (BOARD_WIDTH, 10))
board2 = board = np.random.randint(1, size = (BOARD_WIDTH, 10))
board = np.concatenate((board2, board1), axis=1)
def is_valid_position(board, shape,px,py,rot, adjX, adjY):
        """Return whether the falling piece is within the board and not colliding,
        after adding (adjX, adjY) to the current coordinates (x, y) of the falling piece.
    
        Args:
            adjX (int): move x-coordinate by adjX
            adjY (int): move y-coordinate by adjY

        Returns:
            Boolean value. True if the resulting coordinate is in valid position. False otherwise.
        """
        for x in range(TEMPLATE_WIDTH):
            for y in range(TEMPLATE_HEIGHT):
                is_above_board = y + py + adjY < 0
                if is_above_board or PIECES[shape][rot][y][x] == '.':
                    continue
                if not is_on_board(x + px + adjX, y + py + adjY):
                    return False
                if board[x + px + adjX][y + py + adjY] != BLANK:
                    return False
        return True
def compute_metric(board, shape, px, py, rot):
        H = []
        PA = PIECES[shape][rot]
        for x in range(BOARD_WIDTH):
            for y in range(BOARD_HEIGHT):
                if board[x][y] or (0<=y-py<5 and 0<=x-px<5 and PA[y-py][x-px]=='O'): break
            H.append(BOARD_HEIGHT-y)
        height = sum(H)
        bump = sum([abs(H[i]-H[i-1]) for i in range(1, len(H))])
        lines = 0
        for y in range(BOARD_HEIGHT):
            b = False
            for x in range(BOARD_WIDTH):
                if not (board[x][y] or (0<=y-py<5 and 0<=x-px<5 and PA[y-py][x-px]=='O')):
                    b = True
                    break
            if not b: lines += 1
        holes = 0
        for x in range(BOARD_WIDTH):
            for y in range(BOARD_HEIGHT):
                if board[x][y] or (0<=y-py<5 and 0<=x-px<5 and PA[y-py][x-px]=='O'):
                    for t in range(y, BOARD_HEIGHT):
                        if not (board[x][t] or (0<=t-py<5 and 0<=x-px<5 and PA[t-py][x-px]=='O')): holes += 1
                    break
        return -0.51*height+0.76*lines-0.186*bump-0.35*holes


def find_best(board, shape):
    baseline_metric = -float('inf')
    bestx,besty,bestr = 0,0,0
    for rot in range(len(PIECES[shape])):
        for x in range(-3, BOARD_WIDTH+3):
            if not is_valid_position(board, shape, x, 0, rot,0, 0): continue
            for y in range(BOARD_HEIGHT+3):
                if not is_valid_position(board, shape, x, y, rot,0, 0):
            
                    y -= 1
                    m = compute_metric(board, shape, x,y,rot)
                    if m>=baseline_metric:
                        baseline_metric = m
                        bestx,besty,bestr = x,y,rot
                    break
                
    return bestx,besty,bestr
def is_valid_position(board, shape,px,py,rot, adjX, adjY):
    """Return whether the falling piece is within the board and not colliding,
    after adding (adjX, adjY) to the current coordinates (x, y) of the falling piece.

    Args:
        adjX (int): move x-coordinate by adjX
        adjY (int): move y-coordinate by adjY

    Returns:
        Boolean value. True if the resulting coordinate is in valid position. False otherwise.
    """
    for x in range(TEMPLATE_WIDTH):
        for y in range(TEMPLATE_HEIGHT):
            is_above_board = y + py + adjY < 0
            if is_above_board or PIECES[shape][rot][y][x] == '.':
                continue
            if not is_on_board(x + px + adjX, y + py + adjY):
                return False
            if board[x + px + adjX][y + py + adjY] != BLANK:
                return False
    return True
def is_on_board(x, y):
    """Return whether the position (x, y) is on the board.
    
    Args:
        x (int): x-coordinate
        y (int): y-coordinate

    Returns:
        Boolean value. True if (x, y) is on the board. False otherwise.
    """
    return x >= 0 and x < BOARD_WIDTH and 0<= y < BOARD_HEIGHT
t1 = time.time()
px,py,r = find_best(board, 'Z')
print('Best', px,py,r)
print('Increased Metric', compute_metric(board, 'Z', px, py, r))
for y in range(BOARD_HEIGHT):
    for x in range(BOARD_WIDTH):
        if 0<=y-py<5 and 0<=x-px<5 and PIECES['Z'][r][y-py][x-px]=='O':
            board[x][y]+=2
t2 = time.time()
print(t2-t1)
plt.figure()
plt.imshow(np.transpose(board),cmap=plt.cm.binary, interpolation='none')


# Try using the "right" policy
# penalize holding time more

Best 7 7 1
Increased Metric -66.444
0.004022836685180664


<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x1f720efce80>

In [47]:
model = Sequential()
model.add(Conv2D(32,3,activation='relu',padding='same', input_shape=(10,20, 1)))#120
model.add(Conv2D(64,1,activation = 'relu', padding = 'same'))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(5, activation='linear'))
model.compile(loss='mse', optimizer = RMSprop(learning_rate=0.01, momentum = 0.95, rho = 0.95, epsilon = 0.01))

In [14]:
fig = plt.figure()
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
plt.ion()
ax1.grid()
ax2.grid()
width = 10
height = 20
ax1.set_xlim(-0.5, width-0.5)
ax1.set_ylim(height-0.5, 0.5)
ax1.set_xticks(np.arange(-0.5, width-0.5, 1))
ax1.set_yticks(np.arange(0.5, height-0.5, 1))
ax2.set_xlim(-0.5, width-0.5)
ax2.set_ylim(height-0.5, 0.5)
ax2.set_xticks(np.arange(-0.5, width-0.5, 1))
ax2.set_yticks(np.arange(0.5, height-0.5, 1))

def compute_metric(board, shape, px, py, rot):
        H = []
        PA = PIECES[shape][rot]
        for x in range(BOARD_WIDTH):
            for y in range(BOARD_HEIGHT):
                if board[x][y] or (0<=y-py<5 and 0<=x-px<5 and PA[y-py][x-px]=='O'): break
            H.append(BOARD_HEIGHT-y)
        height = sum(H)
        bump = sum([abs(H[i]-H[i-1]) for i in range(1, len(H))])
        lines = 0
        for y in range(BOARD_HEIGHT):
            b = False
            for x in range(BOARD_WIDTH):
                if not (board[x][y] or (0<=y-py<5 and 0<=x-px<5 and PA[y-py][x-px]=='O')):
                    b = True
                    break
            if not b: lines += 1
        holes = 0
        for x in range(BOARD_WIDTH):
            for y in range(BOARD_HEIGHT):
                if board[x][y] or (0<=y-py<5 and 0<=x-px<5 and PA[y-py][x-px]=='O'):
                    for t in range(y, BOARD_HEIGHT):
                        if not (board[x][t] or (0<=t-py<5 and 0<=x-px<5 and PA[t-py][x-px]=='O')): holes += 1
                    break
        return -0.51*height+0.76*lines-0.35*holes-0.18*bump

def is_valid_position(board, shape,px,py,rot, adjX, adjY):
    for x in range(TEMPLATE_WIDTH):
        for y in range(TEMPLATE_HEIGHT):
            is_above_board = y + py + adjY < 0
            if is_above_board or PIECES[shape][rot][y][x] == '.':
                continue
            if not is_on_board(x + px + adjX, y + py + adjY):
                return False
            if board[x + px + adjX][y + py + adjY] != BLANK:
                return False
    return True

def find_best(board, shape):
    baseline_metric = -float('inf')
    bestx,besty,bestr = 0,0,0

    for rot in range(len(PIECES[shape])):
        for x in range(-3, BOARD_WIDTH+3):
            if not is_valid_position(board, shape, x, 0, rot,0, 0): continue
            for y in range(BOARD_HEIGHT+3):
                if not is_valid_position(board, shape, x, y, rot,0, 0):

                    y -= 1
                    m = compute_metric(board, shape, x,y,rot)
                    if m>=baseline_metric:
                        baseline_metric = m
                        bestx,besty,bestr = x,y,rot
                    break
    return bestx,besty,bestr

t = 30

randBoard = np.random.normal(size = (1,10,20,1))
print('Move: ', memory[t][1])
print('Reward: ', memory[t][2])
print('Piece: ', memory[t][-1])
p = memory[t][-1]
print(model.predict(memory[t][0]))
print(model.predict(memory[t][3]))
print(model.predict(randBoard))
grid = deepcopy(memory[t][0][0,:,:,0])
px,py,r = find_best(grid, p)
for y in range(BOARD_HEIGHT):
    for x in range(BOARD_WIDTH):
        if 0<=y-py<5 and 0<=x-px<5 and PIECES[p][r][y-py][x-px]=='O':
            grid[x][y]+=3
ax1.imshow(np.transpose(grid), cmap=plt.cm.binary, interpolation='none')
grid = deepcopy(memory[t][3][0,:,:,0])
for y in range(BOARD_HEIGHT):
    for x in range(BOARD_WIDTH):
        if 0<=y-py<5 and 0<=x-px<5 and PIECES[p][r][y-py][x-px]=='O':
            grid[x][y]+=3
ax2.imshow(np.transpose(grid), cmap=plt.cm.binary, interpolation='none')
fig.canvas.draw()

<IPython.core.display.Javascript object>

Move:  0
Reward:  0.0
Piece:  I
[[28.688921 28.6328   27.45998  22.80893  25.818954]]
[[26.470707 30.122082 26.77666  26.01302  16.82592 ]]
[[-32.48075  -33.422134 -31.957403 -28.558294 -26.943983]]


In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input,Conv2D, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import clone_model
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.keras.optimizers import RMSprop
from collections import deque
from copy import deepcopy
tf.keras.backend.clear_session()
import gc
matrixSide = 128 #define a big enough matrix to give memory issues
model = Sequential()
model.add(Conv2D(32,3,activation='relu',padding='same', input_shape=(matrixSide, matrixSide, 12)))#120
model.add(Conv2D(64,1,activation = 'relu', padding = 'same'))
model.add(Conv2D(64,3,activation = 'relu', padding = 'same'))
model.add(Conv2D(1,1,activation = 'relu', padding = 'same'))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.compile(loss='mse', optimizer = RMSprop(learning_rate=0.01, momentum = 0.95, rho = 0.95, epsilon = 0.01))



#run predictions

for i in range (30000):
    inImm = np.zeros((64,matrixSide,matrixSide,12))
    outImm = model.predict(inImm)

KeyboardInterrupt: 

In [1]:
# MEMORY LEAK TESTING

import matplotlib.pyplot as plt
import numpy as np
import random
import time
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import clone_model
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.keras.optimizers import RMSprop
from collections import deque
from copy import deepcopy
tf.keras.backend.clear_session()

model = Sequential()
model.add(Conv2D(32, kernel_size=4, input_shape=(10,20,1)))
model.add(Conv2D(64, kernel_size=3, activation='relu'))
model.add(Conv2D(64, kernel_size=3, activation='relu'))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(8, activation='linear'))
model.compile(loss='mse', optimizer = RMSprop(learning_rate=0.001, momentum = 0.95, rho = 0.95, epsilon = 0.01))

times = deque(maxlen = 500)
for i in range(200000):
    tempx = np.zeros((256,10,20,1))
    tempy = np.zeros((256,8))
    t0 = time.time()
    act_values = model.fit(tempx, tempy, epochs = 1, verbose=0, batch_size = 256)
    times.append(time.time()-t0)
    if i % 50 == 0: print(sum(times)/len(times))

2.2822349071502686
0.08135778763714958
0.060090674032079114
0.05339335132118882
0.049641691037078405
0.04768807954522243
0.04618224432302076
0.045042437365931325
0.04445797249563317
0.04379331669099051
0.03874838161468506
0.03901313066482544
0.03896823978424072
0.0389485182762146
0.038886003971099856
0.03872193670272827
0.03889572715759277
0.038875873565673826
0.038673686027526855
0.038816738605499265
0.038864844799041745
0.039043806552886966


KeyboardInterrupt: 

In [2]:
episodes = 30000
updatefreq = 420
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import random
import time
import os
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import clone_model
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.keras.optimizers import RMSprop
from collections import deque
from copy import deepcopy


fig = plt.figure()
ax1 = fig.add_subplot(131)
ax2 = fig.add_subplot(132)
ax3 = fig.add_subplot(133)
plt.ion()
ax1.grid()
width = 10
height = 20
ax1.set_xlim(-0.5, width-0.5)
ax1.set_ylim(height-0.5, 0.5)
ax1.set_xticks(np.arange(-0.5, width-0.5, 1))
ax1.set_yticks(np.arange(0.5, height-0.5, 1))
# initialize gym environment and the agent
totalR = deque(maxlen = 50)
pieces = deque(maxlen = 50)
pieces.append(0)
totalR.append(0)
env = Env()

### AGENT #########################
def build_model():
    # Neural Net for Deep-Q learning Model
    model = Sequential()
    model.add(Conv2D(32, kernel_size=4, activation='relu', input_shape=(10,20,1)))
    model.add(Conv2D(64, kernel_size=3, activation='relu'))
    model.add(Conv2D(64, kernel_size=3, activation='relu'))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    model.compile(loss='mse', optimizer = RMSprop(learning_rate=learning_rate, momentum = 0.95, rho = 0.95, epsilon = 0.01))
    return model

def remember(state, action, reward, next_state, done,piece):
    # Places the state in memory
    memory.append((state, action, reward, next_state, done, piece))
    
def act(state):
    # Selects an action based on epsilon-greedy algorithm
    if np.random.rand() <= epsilon:
        x = np.random.rand()
        if x < 3.0/7: return random.randrange(2)
        elif x < 5.0/7: return 3
        elif x < 6.0/7: return 2
        else: return 4
    act_values = model.predict(state)
    return np.argmax(act_values[0])  # returns action

def update():
    lag_model.set_weights(model.get_weights())

def replay(batch_size):
    # Memory replay
    minibatch = random.sample(memory, min(batch_size, len(memory)))
    States,R,Ns,D,A = [],[],[], [],[]
    for state, action, reward, next_state, done, ps in minibatch:
        R.append(reward)
        D.append(not done)
        Ns.append(next_state[0])
        A.append(action)
        States.append(state[0])
    Ns, States = np.array(Ns), np.array(States)
    #print(np.shape(Ns[0]))
    R,D,A = np.array(R), np.array(D), np.array(A)
    #x = lag_model.predict(Ns)
    #del States, R, Ns, D, A
    #R = R + D*gamma*np.amax(lag_model.predict(Ns), axis = 1)
    #model = tf.keras.models.load_model('mymodel')
    #targetF = model.predict(States)
    #for i in range(len(targetF)):
        #targetF[i][A[i]] = targetF[i][A[i]]+max(-1, min(1, R[i]-targetF[i][A[i]]))
    #States = deepcopy(States)
    #targetF = deepcopy(targetF)
    #x = model.fit(States, targetF, epochs=1, verbose=0, batch_size=batch_size).history['loss']
    #x = model.train_on_batch(States, targetF)
    #model.save('mymodel')
    #Errs.append(x)
    
        
state_size, action_size = (10,20), 5
memory = deque(maxlen=1000)
gamma = 0.99    # discount rate
epsilon = 0 # exploration rate
epsilon_min = 0.1
learning_rate = 0.001
model = build_model()
#model.save('mymodel')
Errs = deque(maxlen = 100)
Errs.append(0)
lag_model = build_model()
# Iterate the game

j = 0
it = 0
for e in range(episodes):
    if e % 10 == 0 and e!=0:
        #model.save_weights('my_checkpoint_LargerNetwork_overnight')
        ax2.plot(e,sum(totalR)/len(totalR), marker = 'o', markersize = 2, color = 'k')
        ax3.plot(e,sum(Errs)/len(Errs), marker = 'o', markersize = 2, color = 'k')
        fig.canvas.draw()
        print("episode: {}/{}, Avg score: {}, Errs: {}, epsilon: {}"
                  .format(e, episodes, sum(totalR)/len(totalR), sum(Errs)/len(Errs), epsilon))
    nums = 0
    # reset state in the beginning of each game
    state, reward, done = env.reset()
    state = np.reshape(state, [-1,10, 20,1])
    # time_t represents each frame of the game
    # Our goal is to keep the pole upright as long as possible until score of 500
    # the more time_t the more score
    total_reward = reward

    while not done:
        it += 1
        # Decide action
        #bestx, besty, bestr = env.bestx, env.besty, env.bestr
        #if env.falling_piece['rotation']!=bestr: action = 3
        #elif env.falling_piece['x'] > bestx: action = 0
        #elif env.falling_piece['x']<bestx: action = 1
        #else: action = 4
        #action = 4
        action = act(state)
        # Advance the game to the next frame based on the action.
        next_state, reward, done,lines, piece = env.one_step(action)
        next_state = np.reshape(next_state, [-1,10, 20, 1])
        total_reward += reward
        # Remember the previous state, action, reward, and done
        remember(state, action, reward, next_state, done, piece)
        # make next_state the new current state for the next frame.
        state = next_state
        # done becomes True when the game ends

        #if it % 1000 == 0:
            #ax2.plot(it/1000.0,sum(agent.Errs)/len(agent.Errs), marker = 'o', markersize = 2, color = 'k')
            #fig.canvas.draw()
        if done:
            # print the score and break out of the loop
            #print("episode: {}/{}, score: {}, lines cleared: {}, Avg Time per cycle: {}, epsilon: {}"
                  #.format(e, episodes, total_reward, lines, totalt/nums, agent.epsilon))
            totalR.append(total_reward)
            pieces.append(env.pieces)
            #ax2.plot(e,total_reward, marker = 'o', markersize = 2, color = 'k')
            #ax3.plot(e,agent.epsilon, marker = 'o', markersize = 2, color = 'k')
            #fig.canvas.draw()
            break
    # train the agent with the experience of the episode
        if it>2:
            t0 = time.time()
            replay(256)
            #if epsilon>epsilon_min: epsilon -= 0.00001
    
        nums += 1

        j += 1
        if j >= updatefreq:
            #update()
            j = 0
        #if it % 5 == 0:
            #grid = state[0,:,:,0]
            #ax1.imshow(np.transpose(grid), cmap=plt.cm.binary, interpolation='none')
            #fig.canvas.draw()

        

NameError: name 'Sequential' is not defined

In [3]:
model.save_weights('my_checkpoint_LargerNetwork_overnight_lag')