In [None]:
import time
import numpy as np
import pickle
import matplotlib.pyplot as plt
from IPython.display import clear_output
import random
import board

In [None]:
'''
Game info (game status and show status)

'''
def gamestatus(game, maxnum=12):
    """Return the fraction of board cells holding each tile code.

    Args:
        game: Object exposing ``tile``, a sequence of non-negative integer
            tile codes. Each code is used directly as a histogram index.
        maxnum: Minimum number of histogram bins.

    Returns:
        1-D ``np.ndarray`` whose entry ``i`` is the share of cells with code
        ``i``. Its length is ``maxnum``, or ``largest code + 1`` when the
        board holds a code that would not fit in ``maxnum`` bins (the
        original fixed-size counter raised ``IndexError`` in that case).
        An empty board yields ``maxnum`` zeros.
    """
    tiles = list(game.tile)
    if not tiles:
        # Nothing to count; avoid dividing by zero below.
        return np.zeros(maxnum)

    # Grow the histogram when a tile code exceeds the requested bin count.
    bins = max(maxnum, int(max(tiles)) + 1)
    counter = [0] * bins
    for code in tiles:
        counter[code] += 1
    return np.array(counter) / len(tiles)

def showstatus(game):
    """Format the tile distribution of ``game`` as a single line of text.

    Each entry reads ``value:[percent] ``, where ``percent`` is the share
    reported by ``gamestatus`` scaled to 0-100. ``value`` is ``2**index``
    for every index except 0, which is shown as 0 (``1 & -2`` clears the
    lowest bit).
    """
    entries = []
    for index, share in enumerate(gamestatus(game)):
        face_value = (1 << index) & -2
        entries.append("{:4d}:[{:3.1f}] ".format(face_value, share * 100.0))
    return "".join(entries)

def find_isomorphic_pattern(pattern):
    """Build the eight transformed index sets that correspond to ``pattern``.

    A reference board whose cell ``i`` holds the value ``i`` (16 cells) is
    put through every combination of "mirrored or not" and 0-3 ``rotate()``
    calls. Reading each transformed board at the positions in ``pattern``
    yields the original cell indices that ended up there.

    Args:
        pattern: List of cell indices, used to index a NumPy array.

    Returns:
        List of eight ``np.ndarray`` index arrays: the four un-mirrored
        rotations first, then the four mirrored ones.
    """
    # NOTE(review): `board` must be callable here, but the notebook's import
    # cell does `import board` (a module) - confirm the intended import.
    reference = board(list(range(16)))

    variants = []
    for mirrored in (False, True):
        for quarter_turns in range(4):
            source = reference.mirror() if mirrored else reference
            oriented = board(source.tile)
            for _ in range(quarter_turns):
                oriented = oriented.rotate()
            variants.append(np.array(oriented.tile)[pattern])

    return variants

In [None]:
'''
Tuple Net Implementation

'''
class TuplesNet():
    """Value table for one n-tuple pattern, shared by its isomorphic variants."""

    def __init__(self, pattern, maxnum):
        """Allocate a zeroed table with one axis of size ``maxnum`` per pattern cell."""
        table_shape = [maxnum for _ in pattern]
        self.V = np.zeros(table_shape)
        self.pattern = pattern
        self.isomorphic_pattern = find_isomorphic_pattern(self.pattern)

    def getState(self, tile):
        """Return one table key (a tuple of tile codes) per isomorphic variant."""
        cells = np.array(tile)
        return [tuple(cells[indices]) for indices in self.isomorphic_pattern]

    def getValue(self, tile):
        """Return the sum of the table entries addressed by every variant."""
        return sum(self.V[key] for key in self.getState(tile))

    def setValue(self, tile, v, reset=False):
        """Spread the increment ``v`` evenly over the variants' table entries.

        Args:
            tile: Board cells, indexable by the stored patterns.
            v: Total increment; each variant's entry receives
                ``v / len(self.isomorphic_pattern)``.
            reset: Accepted for call compatibility; not used.

        Returns:
            Sum of the updated entries, accumulated as each one is written.
        """
        share = v / len(self.isomorphic_pattern)
        total = 0.0
        for key in self.getState(tile):
            self.V[key] += share
            total += self.V[key]
        return total

In [None]:
'''

Reinforcement Learning Agent Implementation

'''

class Agent():
    """2048 player that scores moves with a set of n-tuple value tables.

    Attributes:
        Tuples: One ``TuplesNet`` per pattern.
        metrics: One ``(score, learned_moves, final_tiles)`` tuple per
            finished training episode.
    """

    # Tile codes at which training switches stage. A code ``c`` is displayed
    # as ``1 << c`` elsewhere in this class, so 10 is 1024 and 11 is 2048.
    STAGE_1024 = 10
    STAGE_2048 = 11

    def __init__(self, patterns, maxnum):
        """Create one value table per pattern, each axis ``maxnum`` wide."""
        self.Tuples = []
        for p in patterns:
            self.Tuples.append(TuplesNet(p, maxnum))
        self.metrics = []

    def changeStage(self):
        """Reset every value table to zeros.

        The tables are cleared in place. Replacing them with the scalar 0
        (as the previous code did) made every later lookup fail.
        """
        for tu in self.Tuples:
            tu.V.fill(0.0)

    def getValue(self, tile):
        """Return the summed value of ``tile`` over all tuple networks."""
        return sum(t.getValue(tile) for t in self.Tuples)

    def setValue(self, tile, v, reset=False):
        """Split the increment ``v`` evenly across the tuple networks.

        Returns:
            Sum of the values the networks report after the update.
        """
        v /= len(self.Tuples)
        V = 0.0
        for t in self.Tuples:
            V += t.setValue(tile, v, reset)
        return V

    def evaulate(self, next_games):
        """Score each ``(board, reward)`` candidate as reward + board value."""
        return [ng[1] + self.getValue(ng[0].tile) for ng in next_games]

    def learn(self, records, lr):
        """Run one backward pass of value updates over an episode.

        Args:
            records: Iterable of ``(state, action, reward, afterstate,
                next_state)`` tuples ordered newest first. Only the reward
                and the afterstate are read.
            lr: Learning rate applied to each error.
        """
        # The newest afterstate is followed by nothing, so its target is 0.
        exact = 0.0
        for s, a, r, s_, s__ in records:
            error = exact - self.getValue(s_)
            # The updated value plus this move's reward becomes the target
            # for the afterstate of the move played just before it.
            exact = r + self.setValue(s_, lr * error)

    # ---- Result analysis -------------------------------------------------

    def showStattistic(self, epoch, unit, show=True):
        """Summarise the ``unit`` episodes that end at index ``epoch``.

        Args:
            epoch: Exclusive end index into ``self.metrics``.
            unit: Number of episodes to include.
            show: Print the summary when true.

        Returns:
            ``(score_mean, score_max)`` when the metric tuples carry no final
            board, otherwise ``(score_mean, score_max, score_stat)``, where
            ``score_stat`` has one ``(tile, reached %, ended %)`` row per
            distinct largest tile, in ascending order.

        Raises:
            ValueError: If the requested window contains no episodes.
        """
        # Clamp the start so a short history does not wrap to a negative index.
        window = self.metrics[max(epoch - unit, 0):epoch]
        if not window:
            raise ValueError(
                "no metrics recorded for epochs {}..{}".format(epoch - unit, epoch))

        # Read columns explicitly: a metric tuple mixes scalars with a tile
        # list, which cannot be packed into one rectangular array.
        scores = np.array([entry[0] for entry in window], dtype=float)
        score_mean = np.mean(scores)
        score_max = np.max(scores)
        if show:
            print('{:<8d}mean = {:<8.0f} max = {:<8.0f}'.format(epoch, score_mean, score_max))
        if len(window[0]) < 3:
            return score_mean, score_max

        end_games = [entry[2] for entry in window]
        # Largest tile of each final board; `& -2` maps code 0 to 0, not 1.
        reach_nums = np.array([1 << int(max(end)) & -2 for end in end_games])

        if show:
            print('\n')
        score_stat = []

        # np.unique already returns its values sorted ascending.
        for num in np.unique(reach_nums):
            reachs = np.count_nonzero(reach_nums >= num)
            reachs = (reachs * 100) / len(window)
            ends = np.count_nonzero(reach_nums == num)
            ends = (ends * 100) / len(window)
            if show:
                print('{:<5d}  {:3.1f} % ({:3.1f} %)'.format(int(num), reachs, ends))
            score_stat.append((num, reachs, ends))
        score_stat = np.array(score_stat)
        return score_mean, score_max, score_stat

    def train(self, epoch_size, lr=0.1, showsize=1000):
        """Play and learn from episodes until ``epoch_size`` are recorded.

        Training resumes at ``len(self.metrics)``, so episodes already stored
        count toward ``epoch_size``.

        Moves are bucketed by the board they were played from:
        ``records`` while no tile has reached ``STAGE_1024``,
        ``records1024`` once one has, and additionally ``records2048`` once
        a tile has reached ``STAGE_2048``. After the episode exactly one
        bucket, chosen from the final board, is learned from.

        Args:
            epoch_size: Total number of episodes to have recorded.
            lr: Learning rate passed to ``learn``.
            showsize: Print statistics every ``showsize`` episodes.
        """
        start_epoch = len(self.metrics)
        for epoch in range(start_epoch, epoch_size):
            score = 0.0
            game = board().popup().popup()
            records = []
            records1024 = []
            records2048 = []
            while not game.end():
                next_games = [game.up(), game.down(), game.left(), game.right()]
                action = np.argmax(self.evaulate(next_games))
                next_game, reward = next_games[action]

                next_game_after = next_game.popup()
                score += reward
                step = (game.tile, action, reward, next_game.tile, next_game_after.tile)
                tiles = np.asarray(game.tile)
                if np.any(tiles >= self.STAGE_1024):
                    records1024.append(step)
                    # This bucket used to stay empty, so the 2048 stage
                    # below never had anything to learn from.
                    if np.any(tiles >= self.STAGE_2048):
                        records2048.append(step)
                else:
                    records.append(step)
                game = next_game_after

            # Learn from the bucket that matches the stage the game ended in.
            final_tiles = np.asarray(game.tile)
            if np.any(final_tiles == self.STAGE_1024):
                stage_records = records1024
            elif np.any(final_tiles >= self.STAGE_2048):
                stage_records = records2048
            else:
                stage_records = records
            # Moves were appended oldest first; learn() expects newest first.
            self.learn(stage_records[::-1], lr)
            self.metrics.append((score, len(stage_records), game.tile))

            if (epoch + 1) % showsize == 0:
                clear_output(wait=True)
                self.showStattistic(epoch + 1, showsize)

    # ---- Play the game ---------------------------------------------------

    def play(self, game):
        """Pick the best-scoring move for ``game``.

        Returns:
            ``(next_board, reward, name)``, where ``name`` is one of
            ``'up'``, ``'down'``, ``'left'``, ``'right'``.
        """
        next_games = [game.up(), game.down(), game.left(), game.right()]
        action = np.argmax(self.evaulate(next_games))

        next_game, reward = next_games[action]
        return next_game, reward, ['up', 'down', 'left', 'right'][action]

In [None]:
# Control Variables
MAX_NUM = 15     # passed to Agent as `maxnum`: width of each value-table axis
ACTION_NUM = 4   # presumably the four moves; not read by the visible cells

# Board cell indices covered by each n-tuple pattern.
PATTERNS = [
    [0, 1, 2, 3, 4, 5],
    [4, 5, 6, 7, 8, 9],
    [0, 1, 2, 4, 5, 6],
    [4, 5, 6, 8, 9, 10],
]

TUPLE_NUM = len(PATTERNS[0])   # cells per pattern (6)
PATTERN_NUM = len(PATTERNS)    # number of patterns (4)

In [None]:
# Seeding is disabled; uncomment to seed Python's `random` module first.
#random.seed(20211212)
agent = Agent(patterns=PATTERNS, maxnum=MAX_NUM)

In [None]:
%%time 
# 50000 epoches for three stages (change at 1024 and 2048)
agent.train(50000)

50000   mean = 31335    max = 82953   


64     100.0 % (0.3 %)
128    99.7 % (0.4 %)
256    99.3 % (1.6 %)
512    97.7 % (7.1 %)
1024   90.6 % (24.4 %)
2048   66.2 % (51.4 %)
4096   14.8 % (14.8 %)
Wall time: 9h 25min 28s


In [None]:
%%time 
# 10000 epoches for three stages (change at 1024 and 2048)
agent.train(10000)

10000   mean = 17091    max = 55309   


64     100.0 % (0.3 %)
128    99.7 % (0.5 %)
256    99.2 % (2.6 %)
512    96.6 % (18.3 %)
1024   78.3 % (57.3 %)
2048   21.0 % (20.7 %)
4096   0.3 % (0.3 %)
CPU times: user 2h 10min 21s, sys: 1min 22s, total: 2h 11min 43s
Wall time: 2h 9min 18s


In [None]:
%%time 
# 10000 epoches for two stages (change at 1024)
agent.train(10000)

10000   mean = 17612    max = 57089   


64     100.0 % (0.2 %)
128    99.8 % (0.7 %)
256    99.1 % (2.9 %)
512    96.2 % (18.2 %)
1024   78.0 % (54.6 %)
2048   23.4 % (22.9 %)
4096   0.5 % (0.5 %)
CPU times: user 2h 16min 33s, sys: 1min 26s, total: 2h 18min
Wall time: 2h 15min 41s


In [None]:
# 10000 epoches for one stages
%%time
agent.train(10000)

10000   mean = 19367    max = 60471   


32     100.0 % (0.1 %)
64     99.9 % (0.1 %)
128    99.8 % (0.9 %)
256    98.9 % (2.0 %)
512    96.9 % (12.6 %)
1024   84.3 % (53.0 %)
2048   31.3 % (31.0 %)
4096   0.3 % (0.3 %)
CPU times: user 2h 36min 40s, sys: 1min 18s, total: 2h 37min 58s
Wall time: 2h 35min 50s


In [None]:
%%time
agent.train(10000)



1000    mean = 4667     max = 17794   


64     100.0 % (2.7 %)
128    97.3 % (20.3 %)
256    77.0 % (36.1 %)
512    40.9 % (36.2 %)
1024   4.7 % (4.7 %)
2000    mean = 8200     max = 25754   


64     100.0 % (0.3 %)
128    99.7 % (2.5 %)
256    97.2 % (14.9 %)
512    82.3 % (61.8 %)
1024   20.5 % (20.1 %)
2048   0.4 % (0.4 %)
3000    mean = 10109    max = 30321   


64     100.0 % (0.2 %)
128    99.8 % (1.5 %)
256    98.3 % (8.7 %)
512    89.6 % (50.7 %)
1024   38.9 % (37.0 %)
2048   1.9 % (1.9 %)
4000    mean = 11733    max = 32707   


128    100.0 % (0.9 %)
256    99.1 % (6.1 %)
512    93.0 % (42.1 %)
1024   50.9 % (45.5 %)
2048   5.4 % (5.4 %)
5000    mean = 13151    max = 36834   


64     100.0 % (0.1 %)
128    99.9 % (1.1 %)
256    98.8 % (4.9 %)
512    93.9 % (31.7 %)
1024   62.2 % (54.1 %)
2048   8.1 % (8.1 %)
6000    mean = 14652    max = 42742   


64     100.0 % (0.2 %)
128    99.8 % (0.7 %)
256    99.1 % (3.3 %)
512    95.8 % (26.0 %)
1024   69.8 % (57.0 %)
2048   12.8 %