<a href="https://colab.research.google.com/github/nnniels/SDD-DeepLearning-project/blob/main/SLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Python imports

In [71]:
# generic libraries
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
# torch-related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
# hugging face dataset downloader
import datasets
# tokenizer (?)
import tiktoken
# show progress bar in loop
from tqdm.auto import tqdm
# colab download file
from google.colab import files

# Loading

In [72]:
# Load the "ahmad543/genre_stories" dataset through the Hugging Face `datasets` library.
# Later cells read the story texts from ds["train"]["story"].
ds = datasets.load_dataset("ahmad543/genre_stories")

# Tokenizer

In [73]:
# Use tiktoken's ready-made "gpt2" encoding; no tokenizer is trained in this notebook.
enc = tiktoken.get_encoding("gpt2")

# Parameters

In [74]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')
print(f'PyTorch version: {torch.__version__}')
print(f'PyTorch version: {torch.__version__}')

Using device: cuda
PyTorch version: 2.9.0+cu126


In [75]:

# Model hyperparameters
class Config:
    """Hyperparameters read by the model definition, the batching helpers and the training loop."""
    # Vocabulary
    vocab_size = enc.n_vocab  # vocabulary size reported by the tokenizer `enc`
    n_embd = 384  # width of the standalone wte/wpe embedding tables (same value as d_model)
    # Model architecture
    d_model = 384          # Width of the token/position embeddings and of every transformer layer
    n_heads = 6            # Number of attention heads (d_model must be divisible by it)
    n_layers = 6           # Number of transformer blocks ("o:4" in the original note: presumably the earlier value - confirm)
    d_ff = 1024             # Feed-forward dimension
    dropout = 0.1          # Dropout rate

    # Training
    block_size = 128        # Maximum context length
    batch_size = 32        # Batch size
    learning_rate = 3e-4   # Learning rate passed to the optimizer at construction; the training loop overwrites it every step with lr_iter
    max_iters = 4000       # Training iterations ("o:3000" in the original note: presumably the earlier value - confirm)
    eval_interval = 100    # Evaluate every N iterations
    eval_iters = 50       # Number of batches averaged per split in each evaluation

config = Config()

In [76]:
def auto_lr(iter, i1=1000, v1=2e-4, i2=20000, v2=5e-5):
  """
  Learning-rate schedule: cosine-shaped warm-up, cosine-shaped decay, then constant.

  Args:
      iter: current training iteration (0-based, expected to be >= 0).
      i1: iteration at which the warm-up ends and the peak rate `v1` is reached.
      v1: peak learning rate.
      i2: iteration at which the decay ends and the final rate `v2` is reached.
      v2: final learning rate (expected 0 <= v2 <= v1), kept for every iter >= i2.

  Returns:
      The learning rate to use at iteration `iter`.
  """
  if iter < i1:
    # Warm-up: quarter cosine wave rising from 0 (iter = 0) to v1 (iter = i1).
    # The phase step is a pure angle, so the shape does not depend on the
    # magnitude of v1 (arccos(v1) started at v1**2 and was NaN for v1 > 1).
    beta = (np.pi / 2) / i1
    return v1*np.cos(beta*(iter-i1))
  elif iter < i2:
    # Decay: follow the cosine curve from v1 (iter = i1) down to v2 (iter = i2).
    beta = np.arccos((v2/v1))/(i2-i1)
    return v1*np.cos(beta*(iter-i1))
  else:
    # Past the end of the schedule: stay at the final rate instead of jumping
    # back up to the peak.
    return v2

# Pre-compute the learning rate of every training iteration: i1 is a tenth of
# the run, i2 is its end, with v1 = 4e-4 and v2 = 1e-4.
lr_iter = [
    auto_lr(k, i1=config.max_iters//10, i2=config.max_iters, v1=4e-4, v2=1e-4)
    for k in range(config.max_iters)
]

# Show the resulting schedule
figure = px.line(
    x=range(config.max_iters),
    y=lr_iter,
    labels={'x': 'Iteration', 'y': 'Learning Rate'},
    title='Learning Rate',
)
figure.show()

# Embedding

In [77]:
# Standalone embedding tables: one vector per token id and one per position.
# NOTE(review): no other visible cell reads wte/wpe (GPTModel creates its own
# token_embedding / pos_embedding), so these look like an illustration only - confirm.
wte = nn.Embedding(config.vocab_size, config.n_embd)  # Token embeddings
wpe = nn.Embedding(config.block_size, config.n_embd)  # Position embeddings

# Model Components

## Multi-head Attention
Model components: multi-head attention (MHA)

In [78]:
class MultiHeadAttention(nn.Module):
    """
    Causal multi-head self-attention layer.

    Every position attends to itself and to earlier positions only: the
    scores of later positions are set to -inf before the softmax.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads  # width of a single head

        # Query / key / value projections, each computing all heads in one matmul
        # (bias switched to True, supposedly better)
        self.W_q = nn.Linear(d_model, d_model, bias=True)
        self.W_k = nn.Linear(d_model, d_model, bias=True)
        self.W_v = nn.Linear(d_model, d_model, bias=True)

        # Projection applied to the concatenated head outputs
        self.W_o = nn.Linear(d_model, d_model, bias=True)

        # Dropout applied to the attention weights
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, t):
        """Reshape [batch, seq, d_model] into [batch, n_heads, seq, d_k]."""
        n_batch, n_tokens, _ = t.shape
        return t.view(n_batch, n_tokens, self.n_heads, self.d_k).transpose(1, 2)

    def forward(self, x):
        """
        Args:
            x: Input tensor [batch_size, seq_len, d_model]

        Returns:
            Output tensor [batch_size, seq_len, d_model]
        """
        n_batch, n_tokens, width = x.shape

        # Project, then move the head axis in front of the sequence axis
        queries = self._split_heads(self.W_q(x))
        keys = self._split_heads(self.W_k(x))
        values = self._split_heads(self.W_v(x))

        # Scaled dot-product scores: [batch, n_heads, seq, seq]
        scores = (queries @ keys.transpose(-2, -1)) / math.sqrt(self.d_k)

        # True strictly above the diagonal, i.e. where a query would look ahead
        future = torch.ones(n_tokens, n_tokens, device=x.device).tril().bool().logical_not()
        scores = scores.masked_fill(future, float('-inf'))

        # Normalise over the key axis, then regularise the weights
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, heads merged back: [batch, seq, d_model]
        merged = (weights @ values).transpose(1, 2).contiguous().view(n_batch, n_tokens, width)

        return self.W_o(merged)

## Feed-Forward Network
Model components : Feed Forward

In [79]:
class FeedForward(nn.Module):
    """
    Per-position MLP: expand d_model -> d_ff, apply GELU, project back
    d_ff -> d_model, then apply dropout.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        layers = [
            nn.Linear(d_model, d_ff),   # expansion
            nn.GELU(),
            nn.Linear(d_ff, d_model),   # projection back to the model width
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        hidden = self.net(x)
        return hidden

## Transformer Block
Model components : transformer block

In [80]:
class TransformerBlock(nn.Module):
    """
    One pre-norm transformer layer.

    Each of the two sub-layers (causal self-attention, then feed-forward)
    reads a layer-normalised copy of its input and adds its result back
    onto that input.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        # Attention sub-layer and the norm feeding it
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(d_model, n_heads, dropout)
        # Feed-forward sub-layer and the norm feeding it
        self.ln2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, d_ff, dropout)

    def forward(self, x):
        # Residual around attention (normalisation happens before the sub-layer)
        attended = self.attn(self.ln1(x))
        x = x + attended
        # Residual around the feed-forward network
        transformed = self.ff(self.ln2(x))
        return x + transformed

# Complete transformer model

In [81]:
class GPTModel(nn.Module):
    """
    A minimal GPT-style transformer for next token prediction.

    Architecture:
    1. Token embeddings + learned positional embeddings
    2. Stack of transformer blocks
    3. Layer norm
    4. Linear head to predict the next token (weights shared with the token embeddings)
    """

    def __init__(self, vocab_size, config):
        """
        Args:
            vocab_size: number of distinct token ids.
            config: object providing d_model, n_heads, n_layers, d_ff, dropout
                and block_size.
        """
        super().__init__()
        self.config = config

        # Token embeddings
        self.token_embedding = nn.Embedding(vocab_size, config.d_model)

        # Positional embeddings (learned), one vector per position < block_size
        self.pos_embedding = nn.Embedding(config.block_size, config.d_model)

        # Transformer blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(config.d_model, config.n_heads, config.d_ff, config.dropout)
            for _ in range(config.n_layers)
        ])

        # Final layer norm
        self.ln_f = nn.LayerNorm(config.d_model)

        # Language modeling head
        self.lm_head = nn.Linear(config.d_model, vocab_size, bias=False)

        # Weight tying (share weights between token embeddings and lm_head)
        self.token_embedding.weight = self.lm_head.weight

        # Initialize weights
        self.apply(self._init_weights)

        # Report number of parameters (the tied matrix is counted once)
        n_params = sum(p.numel() for p in self.parameters())
        print(f"Number of parameters: {n_params/1e6:.2f}M")

    def _init_weights(self, module):
        """Normal(0, 0.02) for Linear/Embedding weights, zeros for Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """
        Args:
            idx: Input token indices [batch_size, seq_len], seq_len <= block_size
            targets: Target token indices [batch_size, seq_len] (optional)

        Returns:
            logits: Output logits [batch_size, seq_len, vocab_size]
            loss: Cross-entropy loss (if targets provided), else None

        Raises:
            ValueError: if seq_len exceeds config.block_size.
        """
        batch_size, seq_len = idx.shape

        # The positional table only has block_size rows; fail early with a clear
        # message instead of an opaque out-of-range error in the embedding lookup.
        if seq_len > self.config.block_size:
            raise ValueError(
                f"Sequence length {seq_len} exceeds block_size {self.config.block_size}"
            )

        # Token embeddings
        tok_emb = self.token_embedding(idx)  # [batch_size, seq_len, d_model]

        # Positional embeddings
        pos = torch.arange(0, seq_len, dtype=torch.long, device=idx.device)
        pos_emb = self.pos_embedding(pos)  # [seq_len, d_model]

        # Add embeddings (pos_emb is broadcast over the batch axis)
        x = tok_emb + pos_emb

        # Apply transformer blocks
        for block in self.blocks:
            x = block(x)

        # Final layer norm
        x = self.ln_f(x)

        # Language modeling head
        logits = self.lm_head(x)  # [batch_size, seq_len, vocab_size]

        # Compute loss if targets are provided
        loss = None
        if targets is not None:
            # Flatten batch and time axes for cross-entropy
            B, T, C = logits.shape
            logits_flat = logits.view(B * T, C)
            targets_flat = targets.view(B * T)
            loss = F.cross_entropy(logits_flat, targets_flat)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Generate new tokens autoregressively.

        The model is put in eval mode for the duration of the call (so dropout
        does not perturb sampling) and its previous mode is restored afterwards.

        Args:
            idx: Starting sequence [batch_size, seq_len]
            max_new_tokens: Number of tokens to generate
            temperature: Sampling temperature, must be > 0 (higher = more random)
            top_k: If set, only sample from top k tokens

        Returns:
            Generated sequence [batch_size, seq_len + max_new_tokens]

        Raises:
            ValueError: if temperature is not strictly positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")

        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Keep only the last block_size tokens as context
                idx_cond = idx[:, -self.config.block_size:]

                # Forward pass
                logits, _ = self(idx_cond)

                # Focus on last time step
                logits = logits[:, -1, :] / temperature  # [batch_size, vocab_size]

                # Optionally keep only the top k options
                if top_k is not None:
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    # v[:, [-1]] is the k-th largest logit of each row
                    logits = logits.masked_fill(logits < v[:, [-1]], float('-inf'))

                # Apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1)

                # Sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)

                # Append to sequence
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            # Give the module back in the mode the caller left it in
            self.train(was_training)

        return idx

# Instantiate the model

In [82]:
# Create the model (its constructor prints the parameter count) and move it to `device`
model = GPTModel(config.vocab_size, config).to(device)

# Print the module tree (layer types and sizes)
print(model)

Number of parameters: 27.63M
GPTModel(
  (token_embedding): Embedding(50257, 384)
  (pos_embedding): Embedding(128, 384)
  (blocks): ModuleList(
    (0-5): 6 x TransformerBlock(
      (ln1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=384, out_features=384, bias=True)
        (W_k): Linear(in_features=384, out_features=384, bias=True)
        (W_v): Linear(in_features=384, out_features=384, bias=True)
        (W_o): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (ff): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1024, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=1024, out_features=384, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (ln_f): LayerNorm((3

In [83]:
# Generate from untrained model
# The prompt is a single token with id 0 (batch of one sequence of length one)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Sample 200 further tokens, each drawn among the 10 highest-scoring candidates
generated = model.generate(context, max_new_tokens=200, temperature=1.0, top_k=10)
print('\n=== Untrained Model Generation ===')
# Decode the token ids of the only sequence in the batch back to text
print(enc.decode(generated[0].tolist()))
print('=' * 50)


=== Untrained Model Generation ===
! Thief esche ministry Display Display cabinet optimum audiences hesitate ThiefitcherPred hesitate CENT CENT0404 Bones updating477477 subp confidential subp Seek HAL cabinet casinos improvingPH cabinet bullyingPH negro Amy mul explore Hob Shameslaveaca Flask wandering bloodstreamslave reproduced zonesoux sculpturesanamo Sof Sof Ct spotlight wearer euro AnkaraCs sizedUTION Vader 6000 incorrectlyubric Shameubric teamsMel exploreServices spotlight FIGHT mismatchCorDark subp Venezuela Hor improvingServices Baghd afforded linebackers VaderDisplay illegal illegal Sof policymakers Yahoobour Emily parted linebackers updating Ankaraission Teachers Macro thrott impressions Jah Shame radicals radicals linebackersbour spotlightFA-+reditation linebackers nonsensicalamiyaSteam hitterVT Favorite larg Goldberg revenge rig impressions impressionsanut Cyprusategic Bold Bold doom Bold Bold FIGHTFAFAFA craz�minimumUCTreements cohesionFA cheek Second clash hated sclerosi

# Train setup

## Optimizer

In [84]:
# AdamW over all model parameters. The lr given here is only the initial value:
# the training loop overwrites param_groups[0]['lr'] from lr_iter before every step.
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)

## Split training data

In [85]:
# Story texts of the dataset's "train" split; one eighth of them is held out for testing
all_stories = ds["train"]["story"]
n_stories = len(all_stories)
n_test = n_stories //8
n_train = n_stories - n_test

# Seeded generator: the split is identical after every "Restart & Run All",
# so losses from different runs are measured on the same held-out stories.
SPLIT_SEED = 0
split_rng = np.random.default_rng(SPLIT_SEED)
shuffled_idx = split_rng.permutation(n_stories)

# dtype=object stores references to the Python strings instead of a
# fixed-width unicode array padded to the longest story.
stories_arr = np.array(list(all_stories), dtype=object)
train_stories = stories_arr[shuffled_idx[:n_train]]

# Held-out stories, de-duplicated and without any text that also occurs in the
# training set (a duplicated story must not leak into the test set).
train_texts = set(train_stories.tolist())
held_out = stories_arr[shuffled_idx[n_train:]].tolist()
test_stories = np.array(
    [story for story in dict.fromkeys(held_out) if story not in train_texts],
    dtype=object,
)

In [86]:
# Join the stories of each split with single spaces into one long string and
# tokenize it with `enc`; the batching helpers slice train_data / test_data by position.
train_data = enc.encode(" ".join(train_stories))
test_data = enc.encode(" ".join(test_stories))

In [87]:
# Print the size of each split: number of tokens and number of characters of the joined text.
# The character counts are computed first so the f-strings contain no nested quotes
# (portable to Python < 3.12) and are formatted as plain integers on both lines.
train_chars = len(" ".join(train_stories))
test_chars = len(" ".join(test_stories))
print(f"Train data length: {len(train_data)} tokens, corresponding to {train_chars} characters")
print(f"Test data length: {len(test_data)} tokens, corresponding to {test_chars} characters")

Train data length: 515117 tokens, corresponding to 2501190.000000 characters
Test data length: 62804 tokens, corresponding to 302983 characters


## Batching

In [88]:
def get_batch_r(split, config, iter=0):
  """
  Draw a random batch of (input, target) windows from one split.

  Args:
      split: 'train' or 'test'; any other value makes the function return None.
      config: object providing block_size and batch_size.
      iter: not used by this sampler.

  Returns:
      (x, y): two tensors of shape [batch_size, block_size] moved to `device`;
      y holds the same windows as x shifted one token to the right.
  """
  if split not in ('train', 'test'):
    return None
  data = train_data if split == 'train' else test_data

  width = config.block_size
  # one random start per row; the upper bound leaves room for the shifted target
  starts = torch.randint(len(data) - width, (config.batch_size,))

  # input windows and their one-token-ahead targets
  x = torch.stack([torch.tensor(data[s:s+width]) for s in starts])
  y = torch.stack([torch.tensor(data[s+1:s+width+1]) for s in starts])

  return x.to(device), y.to(device)

def get_batch_d(split, config, iter):
  """
  Build a deterministic batch of (input, target) windows from one split.

  Row j of the batch starts at token (iter * batch_size + j), wrapped around
  the number of valid start positions, so consecutive values of `iter` walk
  through the data and every row of a batch is a different window.

  Args:
      split: 'train' or 'test'; any other value makes the function return None.
      config: object providing block_size and batch_size.
      iter: index of the batch to build.

  Returns:
      (x, y): two tensors of shape [batch_size, block_size] moved to `device`;
      y holds the same windows as x shifted one token to the right.

  Raises:
      ValueError: if the split holds too few tokens for a single window.
  """
  if split == 'train':
    data = train_data
  elif split == 'test':
    data = test_data
  else : return None

  # A start is valid when the window AND its shifted target fit in the data
  n_starts = len(data) - config.block_size
  if n_starts <= 0:
    raise ValueError(f"split '{split}' has too few tokens for block_size {config.block_size}")

  # One distinct start per row (previously every row received the same offset,
  # and offsets near the end of the data produced truncated windows)
  ix = (iter * config.batch_size + torch.arange(config.batch_size, dtype=torch.int64)) % n_starts

  # in and target sequence
  x = torch.stack([torch.tensor(data[i:i+config.block_size]) for i in ix])
  y = torch.stack([torch.tensor(data[i+1:i+config.block_size+1]) for i in ix])
  # send to device
  x, y = x.to(device), y.to(device)
  return x, y

In [89]:
# Smoke-test both batch samplers: print the batch shapes and decode the first row
# of the inputs and of the targets.
for fetch_batch in (
    lambda: get_batch_r('train', config),
    lambda: get_batch_d('train', config, 0),
):
  print("="*40)
  xb, yb = fetch_batch()
  print('inputs:')
  print(xb.shape)
  print(enc.decode(xb[0].tolist()))
  print('targets:')
  print(yb.shape)
  print(enc.decode(yb[0].tolist()))

inputs:
torch.Size([32, 128])
 the jungle, they stumbled upon ancient ruins overrun by thick vines and guarded by hidden traps. Guided by faded maps and cryptic clues, they pressed on, their excitement mounting with each step closer to uncovering the idol's secrets.  But they were not alone in their quest. A rival group of treasure hunters, led by the notorious mercenary, Captain Rodriguez, lurked in the shadows, intent on claiming the idol for themselves. A race against time ensued, as both parties navigated perilous obstacles and battled unforgiving elements.  Finally, deep within a hidden chamber, Dr. Rivers and her team stumbled upon the legendary artifact, gleaming
targets:
torch.Size([32, 128])
 jungle, they stumbled upon ancient ruins overrun by thick vines and guarded by hidden traps. Guided by faded maps and cryptic clues, they pressed on, their excitement mounting with each step closer to uncovering the idol's secrets.  But they were not alone in their quest. A rival group of

# Training

In [90]:
@torch.no_grad()
def estimate_loss(model, config):
    """
    Estimate the mean loss on the 'train' and 'test' splits.

    For each split, config.eval_iters random batches are drawn with
    get_batch_r and their losses averaged. The model is evaluated in eval
    mode and is put back in the mode it had on entry, even if an error occurs.

    Args:
        model: module called as model(X, Y) and returning (logits, loss).
        config: object providing eval_iters, block_size and batch_size.

    Returns:
        dict mapping 'train' and 'test' to the mean loss (0-d tensor).
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'test']:
            losses = torch.zeros(config.eval_iters)
            for k in range(config.eval_iters):
                X, Y = get_batch_r(split, config)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the caller's mode instead of forcing train mode
        model.train(was_training)
    return out

In [91]:
# Save a text description of the architecture under a random 4-digit run id;
# the same id tags the checkpoints written during training.
random_n_model = np.random.randint(1000, 10000)
with open(f'model_{random_n_model}.txt', 'w') as f:
    f.write(str(model))
print("Saved model description")
files.download(f'model_{random_n_model}.txt')
print("Downloaded model description")


# Training loop
train_losses = []
val_losses = []   # losses measured on the 'test' split
iterations = []

print('Starting training...')
print(f'Training for {config.max_iters} iterations')
print(f'Evaluating every {config.eval_interval} iterations\n')

# `step` (not `iter`) so the builtin iter() is not shadowed at notebook scope
for step in tqdm(range(config.max_iters)):
    # Evaluate loss periodically, and once more on the very last iteration
    if step % config.eval_interval == 0 or step == config.max_iters - 1:
        losses = estimate_loss(model, config)
        print(f"Step {step}: train loss {losses['train']:.4f}, test loss {losses['test']:.4f}")
        # Store plain floats so the history can be plotted or serialised directly
        train_losses.append(float(losses['train']))
        val_losses.append(float(losses['test']))
        iterations.append(step)

    # Sample a batch
    xb, yb = get_batch_r('train', config)

    # Forward pass
    logits, loss = model(xb, yb)

    # Backward pass
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    # Apply the scheduled learning rate for this step before updating the weights
    optimizer.param_groups[0]['lr'] = lr_iter[step]
    optimizer.step()
    # Checkpoint every 1000 steps (not at step 0)
    if step % 1000 == 0 and step != 0:
      torch.save(model.state_dict(), f'model_{random_n_model}_i{step}.pt')
      print("Model saved !")
      # Uncomment to also download the checkpoint from Colab:
      #files.download(f'model_{random_n_model}_i{step}.pt')

# Keep the final weights too: the periodic checkpoints stop before the last step
torch.save(model.state_dict(), f'model_{random_n_model}_final.pt')
print("Model saved !")

print('\nTraining complete!')

Saved model description


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded model description
Starting training...
Training for 4000 iterations
Evaluating every 100 iterations



  0%|          | 0/4000 [00:00<?, ?it/s]

Step 0: train loss 10.8177, test loss 10.8156
Step 100: train loss 7.1132, test loss 7.1576
Step 200: train loss 5.7289, test loss 5.8825
Step 300: train loss 4.7456, test loss 5.0490
Step 400: train loss 3.9804, test loss 4.4677
Step 500: train loss 3.4373, test loss 4.0443
Step 600: train loss 3.0371, test loss 3.7284
Step 700: train loss 2.6510, test loss 3.5429
Step 800: train loss 2.4149, test loss 3.4376
Step 900: train loss 2.2522, test loss 3.3494
Step 1000: train loss 2.0430, test loss 3.2937
Model saved !


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model downloaded !
Step 1100: train loss 1.8529, test loss 3.3025
Step 1200: train loss 1.6948, test loss 3.2951
Step 1300: train loss 1.5933, test loss 3.2682
Step 1400: train loss 1.4399, test loss 3.2357
Step 1500: train loss 1.3522, test loss 3.1960
Step 1600: train loss 1.2555, test loss 3.2805
Step 1700: train loss 1.1381, test loss 3.3049
Step 1800: train loss 1.0475, test loss 3.3459
Step 1900: train loss 0.9809, test loss 3.3676
Step 2000: train loss 0.9077, test loss 3.4951
Model saved !


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model downloaded !
Step 2100: train loss 0.8045, test loss 3.4831
Step 2200: train loss 0.7475, test loss 3.4643
Step 2300: train loss 0.6850, test loss 3.5377
Step 2400: train loss 0.6505, test loss 3.4941
Step 2500: train loss 0.5766, test loss 3.5359
Step 2600: train loss 0.5258, test loss 3.6184
Step 2700: train loss 0.4860, test loss 3.6569
Step 2800: train loss 0.4488, test loss 3.6589
Step 2900: train loss 0.4111, test loss 3.6586
Step 3000: train loss 0.3720, test loss 3.6914
Model saved !


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model downloaded !
Step 3100: train loss 0.3568, test loss 3.7366
Step 3200: train loss 0.3190, test loss 3.7770
Step 3300: train loss 0.3067, test loss 3.7030
Step 3400: train loss 0.2872, test loss 3.7639
Step 3500: train loss 0.2688, test loss 3.8140
Step 3600: train loss 0.2533, test loss 3.9067
Step 3700: train loss 0.2384, test loss 3.8282
Step 3800: train loss 0.2265, test loss 3.9006
Step 3900: train loss 0.2196, test loss 3.8882
Step 3999: train loss 0.2069, test loss 3.8820

Training complete!


In [92]:
# (suggested by Gemini) save the files live to Google Drive; currently disabled
# from google.colab import drive
# drive.mount('/content/drive')
# import os

# drive_path = '/content/drive/MyDrive/colab_model_checkpoints'
# os.makedirs(drive_path, exist_ok=True)
# print(f"Created directory: {drive_path}")

## Results

## Basic

In [93]:
# Plot both recorded loss curves against the iteration at which they were measured.
# NOTE: val_losses holds the losses measured on the 'test' split.
figure = px.line(x=iterations, y=[train_losses, val_losses], labels={'x': 'Iteration', 'y': 'Loss'}, title='Training and Validation Loss')
figure.show()

In [94]:
# Generate from trained model
# Eval mode disables dropout for sampling
model.eval()
# The prompt is a single token with id 0 (batch of one sequence of length one)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Sample 500 further tokens, each drawn among the 10 highest-scoring candidates
generated = model.generate(context, max_new_tokens=500, temperature=1.0, top_k=10)
print('\n=== Trained Model Generation ===')
print(enc.decode(generated[0].tolist()))
print('=' * 50)


=== Trained Model Generation ===
! And as they prepared for another day of arranging flowers, Lily and Larry stumbled upon an old gardening book hidden beneath a pile of wilted petals. With a twinkle in their eyes and a sense of mischief in their hearts, they decided to challenge each other to a floral design competition and see who could create the most breathtaking bouquet.  Armed with their trusty shears and a rainbow of blooms, Lily and Larry set out to showcase their floral talents to the people of Chuckleburg, transforming their shops into blooming wonderlands filled with the scent of roses and laughter. From elegant centerpieces to whimsical bouquets, they spared no expense in their quest for floral glory.  But just as they were about to unveil their masterpiece arrangements, disaster struck in the form of a mischievous squirrel who darted into the shop and nibbled on their flowers. With determination in their hearts and a dash of creativity, Lily and Larry improvised with a se

## Advanced

In [96]:
# Prompted generation: continue each text prompt with 500 sampled tokens
prompts = [
    "One day, looking over the horizon",
    "In the deep forest of Mirkwood",
    "The sea",
]
answers = []

for prompt in prompts :
  # Tokenize the prompt and wrap it as a batch holding one sequence
  p_enc = enc.encode(prompt)
  context = torch.tensor([p_enc], dtype=torch.long, device=device)
  generated = model.generate(context, max_new_tokens=500, temperature=1.0, top_k=10)
  answers.append(generated)
  # Show the prompt followed by its continuation
  print('\n=== Trained Model Generation ===')
  print(enc.decode(generated[0].tolist()))
  print('=' * 50)


=== Trained Model Generation ===
One day, looking over the horizon - the victim of an ancient magic. With her newfound clarity, Elara emerged from the portal, a beacon of hope amidst the darkness before she knew that her, she would forever be haunted by the tragic fate of all humanity. In the quaint village of Willowvale, the ticking of the clock tower echoed through the cobblestone streets, filling the air with joy and amusement. When the renowned clockmaker, Mr. Thompson found dead in his workshop, suspicion fell upon the eccentric residents harboring their own secrets.  Enter Detective Harper, a newcomer to Willowbrook with a keen eye for detail and a knack for unraveling mysteries. As she delved into the enigmatic world of clockwork, she uncovered a web of family secrets and betrayal lurking beneath the village's picturesque landscapes.  With each clue uncovered, Detective Sterling pieced together the pattern that pointed towards a sinister organization known world. Driven by an i

##