# Motif Mining / Combined Memory Module

### What is a Motif (in time series/finance context):
A motif is a short, distinctive, recurring pattern in time series data (e.g., “three rising candles”, “hammer candlestick”).

Motif mining is about discovering these patterns that frequently occur, possibly before some important event (like price spikes).

Motifs can be fixed length (e.g., always 3 bars) or variable length, but they're usually not tied to any particular "agent experience" or reward—they're just patterns that are statistically common or relevant to outcomes.

### How does this differ from current agent’s memory?
Motifs could become part of memory if they prove useful, but in RL, memory entries are scored and selected by usefulness to the agent’s task, not just frequency.

* **Motif:**
  * **Purely pattern-based:** “What sequence shapes show up often in the market?”
  * **Unsupervised:** Does not depend on what the agent did or the rewards/outcomes.
  * Often discovered with algorithms like matrix profile, SAX, or clustering over subsequences.

* **Strategic RL Memory:**
  * Stores sequences of observations, actions, rewards from actual episodes, tied to what the agent did and what outcome it got.
  * Is used for retrieval during decision-making, not just for pattern mining.
  * Memory can be trained to only keep those episodes/patterns that are useful for policy improvement, not just frequent.

* **Summary:**
  * **Motif:**  Statistically recurring pattern in the world
  * **Memory_** Agent’s own experienced or retained pattern, which it can choose to use, forget, or score for future use


### Goal:

* Get both kinds of retrieval in one single process.

### Summary:
* All memory retrieval (episodic and motif) is neural, attention-based, and trainable.

* Motif memory can be used for either unsupervised mining (offline DTW) or end-to-end learned patterns.

* Everything is differentiable and ready for RL + auxiliary losses.





In [1]:
import os
import sys
import torch
import torch.nn as nn
import numpy as np

sys.path.append('../')
from environments import MemoryTaskEnv
from memory import StrategicMemoryBuffer, BaseMemoryBuffer,StrategicMemoryTransformerPolicy
from agent import TraceRL

In [2]:
class MotifMemoryBank(BaseMemoryBuffer):
    """
    Motif memory: learnable bank of pattern embeddings, attention-retrieved.

    Features:
        - Stores K motif embeddings, trainable.
        - Neural encoder to encode subtrajectories as motifs.
        - Attention over motifs given current context trajectory.
    """
    def __init__(self, obs_dim, action_dim, mem_dim=32, n_motifs=32, motif_len=4, device='cpu'):
        super().__init__()
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.mem_dim = mem_dim
        self.n_motifs = n_motifs
        self.motif_len = motif_len
        self.device = device
        self.last_attn = None
        # Learnable motif memory bank
        self.motif_embeds = nn.Parameter(torch.randn(n_motifs, mem_dim))
        # Neural encoder for extracting motifs from subtrajectories
        self.embedding_proj = nn.Linear(obs_dim + action_dim + 1, mem_dim)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=mem_dim, nhead=2, batch_first=True),
            num_layers=1
        )

    def retrieve(self, context_traj):
        """
        Attends over motif bank using the latest motif_len steps of the context trajectory.
        """
        if len(context_traj) < self.motif_len:
            pad = [context_traj[0]] * (self.motif_len - len(context_traj))
            motif_traj = pad + context_traj
        else:
            motif_traj = context_traj[-self.motif_len:]

        motif_np = np.array([np.concatenate([obs, [a], [r]]) for obs, a, r in motif_traj], dtype=np.float32)
        motif_input = torch.from_numpy(motif_np).unsqueeze(0).to(self.device)
        motif_embed = self.encoder(self.embedding_proj(motif_input)).mean(dim=1).squeeze(0)  # [mem_dim]
        attn_logits = torch.matmul(self.motif_embeds, motif_embed)
        attn = torch.softmax(attn_logits, dim=0)
        motif_readout = (attn.unsqueeze(1) * self.motif_embeds).sum(dim=0)
        self.last_attn = attn.detach().cpu().numpy()
        return motif_readout, attn

    def motif_parameters(self):
        return [self.motif_embeds]

    def get_trainable_parameters(self):
        return list(self.parameters()) + list(self.motif_parameters())

    def get_last_attention(self):
        return self.last_attn

In [3]:
class CombinedMemoryModule(BaseMemoryBuffer):
    def __init__(self, episodic_buffer, motif_bank):
        super().__init__()
        self.episodic_buffer = episodic_buffer
        self.motif_bank = motif_bank
        self.last_attn = None


    def retrieve(self, context_trajectory):
        epi_readout, epi_attn = self.episodic_buffer.retrieve(context_trajectory)
        motif_readout, motif_attn = self.motif_bank.retrieve(context_trajectory)
        combined = torch.cat([epi_readout, motif_readout], dim=-1)
        self.last_attn = (epi_attn, motif_attn)
        return combined, epi_attn, motif_attn

    def add_entry(self, trajectory, outcome):
        self.episodic_buffer.add_entry(trajectory, outcome)
        # Motif bank may NOT need this, but later might optionally do motif mining here 
        # For now, only episodic buffer gets new entries
        # If you want motifs to be updated with experience, call self.motif_bank.add_entry(trajectory, outcome) if you define it

    def get_trainable_parameters(self):
        params = []
        if hasattr(self, "episodic_buffer"):
            params += self.episodic_buffer.get_trainable_parameters()
        if hasattr(self, "motif_bank"):
            params += self.motif_bank.get_trainable_parameters()
        return params

    def get_last_attention(self):
        return self.last_attn  # tuple: (episodic, motif)


In [4]:
class StrategicCombinedMemoryPolicy(nn.Module):
    def __init__(self, obs_dim, mem_dim=32, nhead=4, memory=None, aux_modules=None, **kwargs):
        super().__init__()
        self.mem_dim = mem_dim
        self.embed = nn.Linear(obs_dim, mem_dim)
        self.pos_embed = nn.Embedding(256, mem_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=mem_dim, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=1)
        self.policy_head = nn.Linear(mem_dim + 2 * mem_dim, 2)   # now +2mem_dim (episodic + motif)
        self.value_head = nn.Linear(mem_dim + 2 * mem_dim, 1)
        self.aux_modules = aux_modules if aux_modules is not None else []
        self.memory = memory

    def forward(self, trajectory, obs_t=None, actions=None, rewards=None):
        T = trajectory.shape[0]
        x = self.embed(trajectory)
        pos = torch.arange(T, device=trajectory.device)
        x = x + self.pos_embed(pos)
        x = x.unsqueeze(0)
        x = self.transformer(x)
        feat = x[0, -1]

        mem_feat = torch.zeros(2 * self.mem_dim, device=feat.device)
        epi_attn, motif_attn = None, None
        if self.memory is not None and actions is not None and rewards is not None:
            actions_list = actions.tolist()
            rewards_list = rewards.tolist()
            if len(actions_list) < T:
                actions_list = [0] * (T - len(actions_list)) + actions_list
            if len(rewards_list) < T:
                rewards_list = [0.0] * (T - len(rewards_list)) + rewards_list
            context_traj = [
                (trajectory[i].cpu().numpy(), actions_list[i], rewards_list[i]) for i in range(T)
            ]
            mem_feat, epi_attn, motif_attn = self.memory.retrieve(context_traj)
        final_feat = torch.cat([feat, mem_feat], dim=-1)
        logits = self.policy_head(final_feat)
        value = self.value_head(final_feat)
        aux_preds = {}
        for aux in self.aux_modules:
            aux_preds[aux.name] = aux.head(final_feat)
        return logits, value.squeeze(-1), aux_preds


In [5]:

# ──────────────────────────────────────────────────────────────
# Example training loop
# ──────────────────────────────────────────────────────────────

# SETUP ===================================
DELAY = 4
MEM_DIM = 32
N_EPISODES = 2500
N_MEMORIES = 32

AGENT_KWARGS = dict(
    device="cpu",
    verbose=0,
    lam=0.95, 
    gamma=0.99, 
    ent_coef=0.01,
    learning_rate=1e-3, 
    
)
MEMORY_AGENT_KWARGS=dict(
    her=False,
    reward_norm=False,
    aux_modules=None,
    
    intrinsic_expl=False,
    intrinsic_eta=0.01,
    
    use_rnd=False, 
    rnd_emb_dim=32, 
    rnd_lr=1e-3,
)

# HELPERS =================================
def total_timesteps(delay,n_episodes):
    return delay * n_episodes

# ENVIRONMENT =============================
env = MemoryTaskEnv(delay=DELAY, difficulty=0)

# MEMORY BUFFER ===========================
episodic_buffer = StrategicMemoryBuffer(
    obs_dim=env.observation_space.shape[0],
    action_dim=1,
    mem_dim=MEM_DIM,
    max_entries=N_MEMORIES,
    device="cpu"
)
motif_bank = MotifMemoryBank(
    obs_dim=env.observation_space.shape[0],
    action_dim=1,
    mem_dim=MEM_DIM,
    n_motifs=32,
    motif_len=4,
    device="cpu"
)
combined_memory = CombinedMemoryModule(episodic_buffer, motif_bank)


# POLICY NETWORK (use class) ==============
policy = StrategicCombinedMemoryPolicy


# AGENT SETUP =============================
agent = TraceRL(
    policy_class=policy,
    env=env,
    memory=combined_memory,
    memory_learn_retention=True,    
    memory_retention_coef=0.01,   
    # aux_modules=aux_modules,  
    device="cpu",
    verbose=1,
    lam=0.95, 
    gamma=0.99, 
    ent_coef=0.01,
    learning_rate=1e-3, 
    
    **MEMORY_AGENT_KWARGS
)

# TRAIN THE AGENT =========================
#agent.learn(
#    total_timesteps=total_timesteps(DELAY, 1000),
#    log_interval=50
#)

In [6]:

# ENVIRONMENT =============================
env = MemoryTaskEnv(delay=DELAY, difficulty=0)

# MEMORY BUFFER ===========================
memory = StrategicMemoryBuffer(
    obs_dim=env.observation_space.shape[0],
    action_dim=1,          # For Discrete(2)
    mem_dim=MEM_DIM,
    max_entries=N_MEMORIES,
    device="cpu"
)

# POLICY NETWORK (use class) ==============
policy = StrategicMemoryTransformerPolicy

# (optional) AUXILIARY MODULES ============
"""
aux_modules = [
    CueAuxModule(feat_dim=MEM_DIM*2, n_classes=2),
    ConfidenceAuxModule(feat_dim=MEM_DIM*2)
]
"""

# AGENT SETUP =============================
agent = TraceRL(
    policy_class=policy,
    env=env,
    memory=memory,
    memory_learn_retention=True,    
    memory_retention_coef=0.01,   
    # aux_modules=aux_modules,  
    device="cpu",
    verbose=0,
    lam=0.95, 
    gamma=0.99, 
    ent_coef=0.01,
    learning_rate=1e-3, 
    
    **MEMORY_AGENT_KWARGS
)

# TRAIN THE AGENT =========================
#agent.learn(
#    total_timesteps=total_timesteps(DELAY, 100),
#    log_interval=50
#)

In [7]:
import numpy as np


def predict_ep_duration(delay):
    delays = np.array([2, 4, 8, 16, 32, 64, 128,258])
    avg_durations = np.array([0.02, 0.03, 0.06, 0.12, 0.26, 0.61, 1.52,4.18])
    coeff = np.polyfit(delays, avg_durations, 1)
    # Use the fitted slope from your data
    return coeff[0] * delay

In [8]:
predict_ep_duration(54)

0.8615542774982028

In [10]:
from benchmark import AgentPerformanceBenchmark

env = MemoryTaskEnv(delay=4, difficulty=0)

# MEMORY BUFFER ===========================
memory = StrategicMemoryBuffer(
        obs_dim=env.observation_space.shape[0],
        action_dim=1,          # For Discrete(2)
        mem_dim=MEM_DIM,
        max_entries=N_MEMORIES,
        device="cpu"
    )
    
# POLICY NETWORK (use class) ==============
#policy = StrategicMemoryTransformerPolicy
policy = StrategicCombinedMemoryPolicy
agent = TraceRL(
        policy_class=policy,
        env=env,
        memory=memory,
        memory_learn_retention=True,    
        memory_retention_coef=0.01,   
        # aux_modules=aux_modules,  
        device="cpu",
        verbose=0,
        lam=0.95, 
        gamma=0.99, 
        ent_coef=0.01,
        learning_rate=1e-3, 
        
        **MEMORY_AGENT_KWARGS
    )
curriculum = [2, 4, 8, 16,32,64,128,256]
for delay in curriculum:
    agent.env.delay = delay
    #agent.get_episodic_buffer().reset()
    print(f"\n--- Training with delay={delay} ---")
    agent.learn(total_timesteps=total_timesteps(delay, 100000), log_interval=50)
    
    benchmark = AgentPerformanceBenchmark(dict(delay=delay, n_train_episodes=2000, total_timesteps=1_000_000, difficulty=0, mode_name="EASY", verbose=0, eval_base=True),)
    e_r, e_s = benchmark.evaluate(agent,'motif')
    table = [["Avg reward",e_r],["Std reward",e_s]]
    print(tabulate(table, headers=["Evaluation",""], tablefmt="rounded_outline"))


--- Training with delay=2 ---


ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
from benchmark import AgentPerformanceBenchmark

env = MemoryTaskEnv(delay=delay, difficulty=0)

# MEMORY BUFFER ===========================
memory = StrategicMemoryBuffer(
        obs_dim=env.observation_space.shape[0],
        action_dim=1,          # For Discrete(2)
        mem_dim=MEM_DIM,
        max_entries=N_MEMORIES,
        device="cpu"
    )
    
# POLICY NETWORK (use class) ==============
#policy = StrategicMemoryTransformerPolicy
policy = StrategicCombinedMemoryPolicy
agent = TraceRL(
        policy_class=policy,
        env=env,
        memory=memory,
        memory_learn_retention=True,    
        memory_retention_coef=0.01,   
        # aux_modules=aux_modules,  
        device="cpu",
        verbose=0,
        lam=0.95, 
        gamma=0.99, 
        ent_coef=0.01,
        learning_rate=1e-3, 
        
        **MEMORY_AGENT_KWARGS
    )
curriculum = [2, 4, 8, 16,32,64,128,256]
for delay in curriculum:
    agent.env.delay = delay
    #agent.get_episodic_buffer().reset()
    print(f"\n--- Training with delay={delay} ---")
    agent.learn(total_timesteps=total_timesteps(delay, 100000), log_interval=50)
    
    benchmark = AgentPerformanceBenchmark(dict(delay=delay, n_train_episodes=2000, total_timesteps=1_000_000, difficulty=0, mode_name="EASY", verbose=0, eval_base=True),)
    e_r, e_s = benchmark.evaluate(agent,'motif')
    table = [["Avg reward",e_r],["Std reward",e_s]]
    print(tabulate(table, headers=["Evaluation",""], tablefmt="rounded_outline"))