# Supervised Dictionary Learning for Sentence Decomposition

We have had success with linear probes. We can now try to decompose the embeddings further into atoms, which would reveal broader structure.

In order to align these atoms with interpretable properties, we train the dictionary with a classification task added to the reconstruction loss.


The goal is to check whether we can linearly decompose the embedding back into words, together with their part-of-speech tags and dependency relations.

In [1]:
!pip install stanza -q
!pip install nltk -q
!pip install transformers datasets


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:

import nltk
nltk.download('brown')
import stanza
from nltk.corpus import brown
stanza.download('en')


def reconstruct_sentence(tokens):
    """Rebuild a plain-text sentence from a list of corpus tokens.

    The tokens are joined with single spaces, the two-character quote marks
    (double backtick and double apostrophe) are removed, and the space that
    the join puts in front of ``,``, ``.``, ``?`` and ``!`` is dropped.

    Args:
        tokens: Iterable of token strings for one sentence.

    Returns:
        The detokenized sentence as a single string, without leading,
        trailing or repeated whitespace.
    """
    sentence = " ".join(tokens)
    sentence = sentence.replace("``", "").replace("''", "")
    # Removing a quote token leaves the spaces around it behind: a leading or
    # trailing space, or a double space that stops " ." from matching below.
    # Collapse whitespace before fixing the punctuation spacing.
    sentence = " ".join(sentence.split())
    for mark in (",", ".", "?", "!"):
        sentence = sentence.replace(" " + mark, mark)
    return sentence

from itertools import islice

# Only the first 20,000 sentences are used downstream, so detokenize just
# those instead of rebuilding the whole corpus and discarding the rest.
brown_sentences = [
    reconstruct_sentence(tokens) for tokens in islice(brown.sents(), 20000)
]



[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-03-29 03:02:53 INFO: Downloaded file to /root/stanza_resources/resources.json
2025-03-29 03:02:53 INFO: Downloading default packages for language: en (English) ...
2025-03-29 03:02:54 INFO: File exists: /root/stanza_resources/en/default.zip
2025-03-29 03:02:57 INFO: Finished downloading models and saved to /root/stanza_resources


In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import stanza
import pandas as pd

# Load tokenizer and model
# The same checkpoint name is used for both so the tokenizer's vocabulary
# matches the encoder's embedding table.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Switch to evaluation mode (the embeddings are only read, never trained).
# As the last expression of the cell this also displays the module structure.
model.eval()

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

In [None]:


# Load stanza for offset alignment
# NOTE(review): stanza.download('en') is also called in the NLTK/Stanza setup
# cell; presumably repeated so this cell can be re-run on its own — confirm.
stanza.download('en')
# `nlp` is the module-level pipeline used by get_word_embeddings_aligned to
# obtain word text, character offsets, UPOS tags and dependency relations.
nlp = stanza.Pipeline('en', processors='tokenize,pos,depparse,lemma')

def get_word_embeddings_aligned(sentence: str):
    """
    Align MiniLM subword embeddings to Stanza words using character offsets.

    The sentence is parsed with the module-level Stanza pipeline ``nlp`` and
    encoded with the module-level ``tokenizer`` / ``model`` (no special
    tokens). Each Stanza word receives the mean of the final-layer embeddings
    of every subword whose character span overlaps the word's span.

    Args:
        sentence: Raw sentence text.

    Returns:
        A list with one dict per aligned word, with keys ``"word"`` (surface
        text), ``"embedding"`` (mean of the matching subword vectors),
        ``"pos"`` (UPOS tag), ``"dep"`` (dependency relation) and
        ``"position"`` (index of the word among all words Stanza produced for
        the input). Words that have no overlapping subword, or no character
        offsets, are omitted. An input that yields no subword tokens returns
        an empty list.
    """
    doc = nlp(sentence)
    word_spans = [
        (word.text, word.start_char, word.end_char, word.upos, word.deprel)
        for sent in doc.sentences
        for word in sent.words
    ]

    # Tokenize with offset mapping, no special tokens
    encoding = tokenizer(
        sentence,
        return_offsets_mapping=True,
        return_tensors="pt",
        add_special_tokens=False,
    )
    offsets = encoding["offset_mapping"][0].tolist()
    if not offsets:
        # Nothing to embed (e.g. empty or whitespace-only input).
        return []

    # Get subword embeddings; offset_mapping is bookkeeping, not a model input.
    model_inputs = {k: v for k, v in encoding.items() if k != "offset_mapping"}
    with torch.no_grad():
        output = model(**model_inputs)
    subword_embeddings = output.last_hidden_state.squeeze(0)  # [seq_len, dim]

    # Align subwords to words
    aligned_data = []
    for i, (word, w_start, w_end, upos, deprel) in enumerate(word_spans):
        if w_start is None or w_end is None:
            # NOTE(review): Stanza may leave start_char/end_char unset for
            # words expanded from a multi-word token — confirm for the English
            # models. Skipping the word keeps the rest of the sentence instead
            # of failing on a None comparison below.
            continue

        # A subword matches when its non-empty span overlaps the word's span.
        matching_sub_idxs = [
            j
            for j, (s, e) in enumerate(offsets)
            if s < w_end and e > w_start and s != e
        ]
        if not matching_sub_idxs:
            continue

        aligned_data.append({
            "word": word,
            "embedding": subword_embeddings[matching_sub_idxs].mean(dim=0),
            "pos": upos,
            "dep": deprel,
            "position": i,
        })

    return aligned_data

from tqdm import tqdm

all_rows = []
# Sentence ids whose alignment raised; kept so dropped data is visible.
failed_sentence_ids = []
for i, sent in tqdm(enumerate(brown_sentences), total=len(brown_sentences), desc="Processing sentences"):
    try:
        aligned = get_word_embeddings_aligned(sent)
    except Exception:
        # Best-effort: skip sentences that fail to parse/encode. Catching
        # Exception (not a bare except) keeps Ctrl-C able to stop the loop.
        failed_sentence_ids.append(i)
        continue
    for row in aligned:
        row["sentence_id"] = i
        row["sentence"] = sent
        all_rows.append(row)

if failed_sentence_ids:
    print(f"Skipped {len(failed_sentence_ids)} of {len(brown_sentences)} sentences that failed to process")

# Convert to DataFrame
df = pd.DataFrame(all_rows)



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-03-29 03:02:09 INFO: Downloaded file to /root/stanza_resources/resources.json
2025-03-29 03:02:09 INFO: Downloading default packages for language: en (English) ...
2025-03-29 03:02:10 INFO: File exists: /root/stanza_resources/en/default.zip
2025-03-29 03:02:13 INFO: Finished downloading models and saved to /root/stanza_resources
2025-03-29 03:02:13 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-03-29 03:02:13 INFO: Downloaded file to /root/stanza_resources/resources.json
2025-03-29 03:02:14 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2025-03-29 03:02:14 INFO: Using device: cuda
2025-03-29 03:02:14 INFO: Loading: tokenize
  return self.fget.__get__(instance, owner)()
2025-03-29 03:02:14 INFO: Loading: mwt
2025-03-29 03:02:14 INFO: Loading: pos
2025-03-29 03:02:16 INFO: Loading: lemma
2025-03-29 03:02:16 INFO: Loading: depparse
2025-03-29 03:02:16 INFO: Done loading processors!
Processing sentences:   1%|          | 195/20000 [00:18<27:34, 11.97it/s]

In [4]:
# Cache the word-level table (word, embedding, pos, dep, position,
# sentence_id, sentence) on disk so the embedding pass need not be repeated.
df.to_pickle("./dataset.pkl")

In [3]:
%%capture
!pip install scikit-learn

In [None]:
import pandas as pd

# Load the DataFrame from a pickle file
# NOTE: unpickling can execute arbitrary code; only load a dataset.pkl that
# was produced locally by the df.to_pickle call in this notebook.
df = pd.read_pickle("./dataset.pkl")

# NEW PROBES 

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# ==== Probes ====
class LinearProbe(nn.Module):
    """Single affine layer that maps an embedding to per-class logits."""

    def __init__(self, input_dim, output_dim):
        """Create the ``input_dim -> output_dim`` linear layer."""
        super(LinearProbe, self).__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the raw (unnormalised) logits for ``x``."""
        logits = self.linear(x)
        return logits

class NonlinearProbe(nn.Module):
    """One-hidden-layer ReLU MLP that maps an embedding to per-class logits."""

    def __init__(self, input_dim, output_dim, hidden_dim=128):
        """Stack Linear -> ReLU -> Linear with ``hidden_dim`` hidden units."""
        super(NonlinearProbe, self).__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (unnormalised) logits for ``x``."""
        logits = self.network(x)
        return logits

class AdaptiveSoftmaxProbe(nn.Module):
    """Large-vocabulary probe built on ``nn.AdaptiveLogSoftmaxWithLoss``.

    Cluster cutoffs are derived from ``n_classes``:

    * more than 10000 classes: ``[1000, min(10000, n_classes - 2)]``
    * 1001 to 10000 classes: ``[1000]``
    * 2 to 1000 classes: ``[max(1, n_classes // 2)]``, because PyTorch
      rejects any cutoff larger than ``n_classes - 1``.

    NOTE(review): the PyTorch documentation asks for label ids ordered from
    most to least frequent; this class does not reorder labels, so that is
    left to the caller — confirm the label encoding used upstream.
    """

    def __init__(self, input_dim, n_classes):
        """Build the adaptive softmax head.

        Args:
            input_dim: Size of each input embedding.
            n_classes: Number of target classes (must be at least 2).

        Raises:
            ValueError: If ``n_classes`` is smaller than 2, for which no
                valid cutoff exists.
        """
        super().__init__()
        if n_classes < 2:
            raise ValueError(
                f"AdaptiveSoftmaxProbe needs at least 2 classes, got {n_classes}"
            )
        if n_classes > 10000:
            cutoffs = [1000, min(10000, n_classes - 2)]
        elif n_classes > 1000:
            cutoffs = [1000]
        else:
            # Small vocabularies: a fixed cutoff of 1000 would be out of range.
            cutoffs = [max(1, n_classes // 2)]
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            in_features=input_dim,
            n_classes=n_classes,
            cutoffs=cutoffs,
            div_value=4.0
        )

    def forward(self, x, target=None):
        """Return the training output or the full log-probabilities.

        Args:
            x: Batch of input embeddings.
            target: Optional batch of class ids.

        Returns:
            With ``target``: the result of the adaptive softmax forward pass,
            whose ``.loss`` attribute is the training loss. Without
            ``target``: log-probabilities over all classes for each row.
        """
        if target is not None:
            return self.adaptive_softmax(x, target)
        else:
            return self.adaptive_softmax.log_prob(x)

class RandomPredictionProbe(nn.Module):
    """Chance baseline: ignores the features and guesses a class id at random."""

    def __init__(self, n_classes):
        """Remember how many classes the guesses are drawn from."""
        super(RandomPredictionProbe, self).__init__()
        self.n_classes = n_classes

    def forward(self, x):
        """Return one random class id in ``[0, n_classes)`` per row of ``x``."""
        n_rows = x.size(0)
        guesses = torch.randint(
            low=0, high=self.n_classes, size=(n_rows,), device=x.device
        )
        return guesses

# ==== Training ====
def train_probe(model, X, y, num_classes, task_name="TASK", epochs=10):
    """Fit ``model`` on ``(X, y)`` with cross-entropy and Adam (lr 1e-3).

    Data is served in shuffled mini-batches of 64. After every epoch one line
    is printed with the sum of the per-batch losses for that epoch.

    Args:
        model: Module mapping a batch of ``X`` rows to class logits.
        X: Feature tensor, one row per example.
        y: Class-id tensor aligned with ``X``.
        num_classes: Accepted for signature compatibility; not used here.
        task_name: Prefix for the progress lines.
        epochs: Number of passes over the data.

    Returns:
        The same ``model`` object, trained in place and left in train mode.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=1e-3)
    batches = DataLoader(TensorDataset(X, y), batch_size=64, shuffle=True)

    for epoch in range(epochs):
        total_loss = 0
        for features, labels in batches:
            adam.zero_grad()
            batch_loss = criterion(model(features), labels)
            batch_loss.backward()
            adam.step()
            total_loss += batch_loss.item()
        print(f"{task_name} - Epoch {epoch+1}, Loss: {total_loss:.4f}")
    return model

def train_linear_probe(X, y, num_classes, task_name="TASK", epochs=10):
    """Build a fresh ``LinearProbe`` sized for ``X`` and fit it via ``train_probe``.

    Returns the trained probe.
    """
    probe = LinearProbe(X.shape[1], num_classes)
    return train_probe(probe, X, y, num_classes, task_name, epochs)

def train_adaptive_probe(X, y, num_classes, task_name="TASK", epochs=10):
    """Create and fit an ``AdaptiveSoftmaxProbe`` on ``(X, y)``.

    Uses Adam (lr 1e-3) over shuffled mini-batches of 64 and minimises the
    loss reported by the probe's own forward pass. After every epoch one line
    is printed with the sum of the per-batch losses for that epoch.

    Args:
        X: Feature tensor, one row per example.
        y: Class-id tensor aligned with ``X``.
        num_classes: Number of target classes for the probe.
        task_name: Prefix for the progress lines.
        epochs: Number of passes over the data.

    Returns:
        The trained probe.
    """
    probe = AdaptiveSoftmaxProbe(X.shape[1], num_classes)
    probe.train()
    adam = optim.Adam(probe.parameters(), lr=1e-3)
    batches = DataLoader(TensorDataset(X, y), batch_size=64, shuffle=True)

    for epoch in range(epochs):
        total_loss = 0
        for features, targets in batches:
            adam.zero_grad()
            # The probe returns an object carrying the batch loss in `.loss`.
            loss = probe(features, targets).loss
            loss.backward()
            adam.step()
            total_loss += loss.item()
        print(f"{task_name} - Epoch {epoch+1}, Loss: {total_loss:.4f}")
    return probe

# ==== Evaluation ====
def evaluate_probe(model, X, y):
    """Return the fraction of rows in ``X`` whose prediction equals ``y``.

    The model is switched to eval mode and run on all of ``X`` in one call
    with gradients disabled.
    """
    model.eval()
    with torch.no_grad():
        outputs = model(X)
        # The random baseline already emits class ids; every other probe
        # emits per-class scores that still need an argmax.
        if isinstance(model, RandomPredictionProbe):
            preds = outputs
        else:
            preds = outputs.argmax(dim=1)
        hits = (preds == y).float()
        accuracy = hits.mean().item()
    return accuracy

def evaluate_adaptive_probe(model, X, y, batch_size=1024):
    """Return the accuracy of a log-probability model on ``(X, y)``.

    ``model(X_batch)`` must return one row of per-class scores per input row.
    Inputs are scored in slices of ``batch_size`` rows so the full
    ``[len(X), n_classes]`` score matrix is never held in memory at once,
    which matters for word-level vocabularies.

    Args:
        model: Module returning per-class log-probabilities for a batch.
        X: Feature tensor, one row per example.
        y: Class-id tensor aligned with ``X``.
        batch_size: Rows scored per forward pass.

    Returns:
        Fraction of rows whose argmax prediction equals ``y``; ``nan`` when
        ``y`` is empty (matching the mean of an empty tensor).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    model.eval()
    n_total = len(y)
    if n_total == 0:
        return float("nan")

    n_correct = 0
    with torch.no_grad():
        for start in range(0, n_total, batch_size):
            stop = start + batch_size
            log_probs = model(X[start:stop]).cpu()
            preds = torch.argmax(log_probs, dim=1)
            n_correct += (preds == y[start:stop].cpu()).sum().item()
    return n_correct / n_total

# ==== Master Runner ====
def run_all_probes_and_controls(X, y_pos, y_dep, y_position, y_word, le_pos, le_dep, le_word):
    """Train every probe and control condition, then print and return accuracies.

    For POS, DEP and POSITION this trains a linear probe, a nonlinear probe,
    a linear probe on permuted labels and a linear probe on Gaussian-noise
    features, and builds a random-guess baseline. For WORD the linear probe
    is replaced by the adaptive-softmax probe and there is no nonlinear probe.

    All accuracies are measured on the same examples that were used for
    training (there is no held-out split). Probes trained on permuted labels
    are scored against the true labels; probes trained on noise features are
    scored on those same noise features.

    Args:
        X: Feature tensor, one row per word.
        y_pos, y_dep, y_position, y_word: Class-id tensors aligned with ``X``.
        le_pos, le_dep, le_word: Fitted encoders; only ``classes_`` is read,
            to obtain the number of classes.

    Returns:
        Dict mapping a condition name (e.g. ``"POS (Linear)"``) to accuracy.
    """
    results = {}

    input_dim = X.shape[1]
    n_pos = len(le_pos.classes_)
    n_dep = len(le_dep.classes_)
    n_word = len(le_word.classes_)
    # Positions are used directly as class ids, so the class count is max + 1.
    n_position = y_position.max().item() + 1

    # === Main Probes ===
    pos_model = train_linear_probe(X, y_pos, n_pos, task_name="POS")
    dep_model = train_linear_probe(X, y_dep, n_dep, task_name="DEP")
    position_model = train_linear_probe(X, y_position, n_position, task_name="POSITION")
    word_model = train_adaptive_probe(X, y_word, n_word, task_name="WORD")

    # === Nonlinear ===
    pos_nonlinear = train_probe(NonlinearProbe(input_dim, n_pos), X, y_pos, n_pos, task_name="POS_Nonlinear")
    dep_nonlinear = train_probe(NonlinearProbe(input_dim, n_dep), X, y_dep, n_dep, task_name="DEP_Nonlinear")
    position_nonlinear = train_probe(NonlinearProbe(input_dim, n_position), X, y_position, n_position, task_name="POSITION_Nonlinear")

    # === Random Baselines ===
    pos_random = RandomPredictionProbe(n_pos)
    dep_random = RandomPredictionProbe(n_dep)
    position_random = RandomPredictionProbe(n_position)
    word_random = RandomPredictionProbe(n_word)

    # === Shuffled Labels ===
    # Each label vector gets its own permutation, breaking the link to X.
    y_pos_shuffled = y_pos[torch.randperm(len(y_pos))]
    y_dep_shuffled = y_dep[torch.randperm(len(y_dep))]
    y_position_shuffled = y_position[torch.randperm(len(y_position))]
    y_word_shuffled = y_word[torch.randperm(len(y_word))]

    pos_shuffled = train_linear_probe(X, y_pos_shuffled, n_pos, task_name="POS_Shuffled")
    dep_shuffled = train_linear_probe(X, y_dep_shuffled, n_dep, task_name="DEP_Shuffled")
    position_shuffled = train_linear_probe(X, y_position_shuffled, n_position, task_name="POSITION_Shuffled")
    word_shuffled_model = train_adaptive_probe(X, y_word_shuffled, n_word, task_name="WORD_Shuffled")

    # === Random Representations ===
    X_random = torch.randn_like(X)
    pos_randrep = train_linear_probe(X_random, y_pos, n_pos, task_name="POS_RandomRep")
    dep_randrep = train_linear_probe(X_random, y_dep, n_dep, task_name="DEP_RandomRep")
    position_randrep = train_linear_probe(X_random, y_position, n_position, task_name="POSITION_RandomRep")
    word_randrep_model = train_adaptive_probe(X_random, y_word, n_word, task_name="WORD_RandomRep")

    # === Dummy Task ===
    # The token-length control is disabled: it needs the token strings, which
    # are not passed to this function.

    # === Evaluation ===
    print("\n--- Evaluation ---")
    results.update({
        "POS (Linear)": evaluate_probe(pos_model, X, y_pos),
        "POS (Nonlinear)": evaluate_probe(pos_nonlinear, X, y_pos),
        "POS (Random)": evaluate_probe(pos_random, X, y_pos),
        "POS (Shuffled)": evaluate_probe(pos_shuffled, X, y_pos),
        "POS (RandomRep)": evaluate_probe(pos_randrep, X_random, y_pos),

        "DEP (Linear)": evaluate_probe(dep_model, X, y_dep),
        "DEP (Nonlinear)": evaluate_probe(dep_nonlinear, X, y_dep),
        "DEP (Random)": evaluate_probe(dep_random, X, y_dep),
        "DEP (Shuffled)": evaluate_probe(dep_shuffled, X, y_dep),
        "DEP (RandomRep)": evaluate_probe(dep_randrep, X_random, y_dep),

        "POSITION (Linear)": evaluate_probe(position_model, X, y_position),
        "POSITION (Nonlinear)": evaluate_probe(position_nonlinear, X, y_position),
        "POSITION (Random)": evaluate_probe(position_random, X, y_position),
        "POSITION (Shuffled)": evaluate_probe(position_shuffled, X, y_position),
        "POSITION (RandomRep)": evaluate_probe(position_randrep, X_random, y_position),

        "WORD": evaluate_adaptive_probe(word_model, X, y_word),
        "WORD (Shuffled)": evaluate_adaptive_probe(word_shuffled_model, X, y_word),
        # Score on the same noise features the probe was trained on, as the
        # other RandomRep rows do, rather than on a fresh random draw.
        "WORD (RandomRep)": evaluate_adaptive_probe(word_randrep_model, X_random, y_word),
        "WORD (Random)": evaluate_probe(word_random, X, y_word),
    })

    for name, acc in results.items():
        print(f"{name:30s}: {acc:.2%}")

    return results

# X, the y_* label tensors and the le_* encoders are built in the cell that
# starts with `X = torch.stack(...)` further down; that cell must have been
# executed before this call.
run_all_probes_and_controls(X, y_pos, y_dep, y_position, y_word, le_pos, le_dep, le_word)



tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

AttributeError: Can't get attribute 'LinearProbe' on <module '__main__'>

# Adaptive Softmax

In [None]:
# Feature matrix: the per-word embedding tensors stacked into one tensor.
X = torch.stack(list(df["embedding"]))

# One fitted encoder per categorical target.
# NOTE(review): LabelEncoder is not imported in any cell shown here —
# presumably sklearn.preprocessing.LabelEncoder; confirm.
le_pos, le_dep, le_word = (
    LabelEncoder().fit(df[column]) for column in ("pos", "dep", "word")
)

# Integer class ids aligned row-for-row with X.
y_pos = torch.tensor(le_pos.transform(df["pos"].values))
y_dep = torch.tensor(le_dep.transform(df["dep"].values))
y_word = torch.tensor(le_word.transform(df["word"].values))
# Word positions are already integers and are used as class ids directly.
y_position = torch.tensor(df["position"].values)

In [None]:
import torch.nn as nn

class LinearProbe(nn.Module):
    """Linear classifier head: one ``nn.Linear`` from features to logits."""

    def __init__(self, input_dim, output_dim):
        """Set up the affine map from ``input_dim`` features to ``output_dim`` classes."""
        super(LinearProbe, self).__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Apply the affine map and return the logits."""
        scores = self.linear(x)
        return scores

class NonlinearProbe(nn.Module):
    """Two-layer perceptron probe (Linear -> ReLU -> Linear) producing logits."""

    def __init__(self, input_dim, output_dim, hidden_dim=128):
        """Assemble the network with ``hidden_dim`` units in the hidden layer."""
        super(NonlinearProbe, self).__init__()
        to_hidden = nn.Linear(input_dim, hidden_dim)
        to_logits = nn.Linear(hidden_dim, output_dim)
        self.network = nn.Sequential(to_hidden, nn.ReLU(), to_logits)

    def forward(self, x):
        """Run ``x`` through the network and return the logits."""
        scores = self.network(x)
        return scores

class AdaptiveSoftmaxProbe(nn.Module):
    """Large-vocabulary probe built on ``nn.AdaptiveLogSoftmaxWithLoss``.

    Cluster cutoffs are derived from ``n_classes``:

    * more than 10000 classes: ``[1000, min(10000, n_classes - 2)]``
    * 1001 to 10000 classes: ``[1000]``
    * 2 to 1000 classes: ``[max(1, n_classes // 2)]``, because PyTorch
      rejects any cutoff larger than ``n_classes - 1``.

    NOTE(review): the PyTorch documentation asks for label ids ordered from
    most to least frequent; this class does not reorder labels, so that is
    left to the caller — confirm the label encoding used upstream.
    """

    def __init__(self, input_dim, n_classes):
        """Build the adaptive softmax head.

        Args:
            input_dim: Size of each input embedding.
            n_classes: Number of target classes (must be at least 2).

        Raises:
            ValueError: If ``n_classes`` is smaller than 2, for which no
                valid cutoff exists.
        """
        super().__init__()
        if n_classes < 2:
            raise ValueError(
                f"AdaptiveSoftmaxProbe needs at least 2 classes, got {n_classes}"
            )
        if n_classes > 10000:
            cutoffs = [1000, min(10000, n_classes - 2)]
        elif n_classes > 1000:
            cutoffs = [1000]
        else:
            # Small vocabularies: a fixed cutoff of 1000 would be out of range.
            cutoffs = [max(1, n_classes // 2)]
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            in_features=input_dim,
            n_classes=n_classes,
            cutoffs=cutoffs,
            div_value=4.0
        )

    def forward(self, x, target=None):
        """Return the training output or the full log-probabilities.

        Args:
            x: Batch of input embeddings.
            target: Optional batch of class ids.

        Returns:
            With ``target``: the result of the adaptive softmax forward pass,
            whose ``.loss`` attribute is the training loss. Without
            ``target``: log-probabilities over all classes for each row.
        """
        if target is not None:
            return self.adaptive_softmax(x, target)
        else:
            return self.adaptive_softmax.log_prob(x)

class RandomPredictionProbe(nn.Module):
    """Chance baseline: ignores the features and guesses a class id at random."""

    def __init__(self, n_classes):
        """Remember how many classes the guesses are drawn from."""
        super().__init__()
        self.n_classes = n_classes

    def forward(self, x):
        """Return one random class id in ``[0, n_classes)`` per row of ``x``.

        The guesses are created on ``x``'s device so they can be compared
        with labels stored alongside the features.
        """
        batch_size = x.size(0)
        return torch.randint(0, self.n_classes, (batch_size,), device=x.device)

def train_linear_probe(X, y, num_classes, task_name="TASK", epochs=10):
    """Build a fresh ``LinearProbe`` sized for ``X`` and train it.

    Training is delegated to ``train_probe`` so linear and nonlinear probes
    share one optimisation loop (cross-entropy, Adam at lr 1e-3, shuffled
    batches of 64, one progress line per epoch) instead of two copies that
    can drift apart.

    Args:
        X: Feature tensor, one row per example.
        y: Class-id tensor aligned with ``X``.
        num_classes: Number of output classes of the probe.
        task_name: Prefix for the progress lines.
        epochs: Number of passes over the data.

    Returns:
        The trained ``LinearProbe``.
    """
    return train_probe(LinearProbe(X.shape[1], num_classes), X, y, num_classes, task_name, epochs)

# ---- Training Functions ----
def train_probe(model, X, y, num_classes, task_name="TASK", epochs=10):
    """Train ``model`` as a classifier on ``(X, y)`` and return it.

    Optimises cross-entropy with Adam (lr 1e-3) over shuffled mini-batches
    of 64 and prints, once per epoch, the sum of that epoch's batch losses.
    ``num_classes`` is part of the signature but is not used in the body.
    """
    model.train()
    objective = nn.CrossEntropyLoss()
    stepper = optim.Adam(model.parameters(), lr=1e-3)
    batch_iter = DataLoader(TensorDataset(X, y), batch_size=64, shuffle=True)

    for epoch in range(epochs):
        total_loss = 0
        for inputs, targets in batch_iter:
            stepper.zero_grad()
            predictions = model(inputs)
            step_loss = objective(predictions, targets)
            step_loss.backward()
            stepper.step()
            total_loss += step_loss.item()
        print(f"{task_name} - Epoch {epoch+1}, Loss: {total_loss:.4f}")

    return model

def evaluate_probe(model, X, y):
    """Compute the accuracy of ``model`` on ``(X, y)`` as a Python float.

    Puts the model in eval mode and scores all of ``X`` in a single call
    without tracking gradients.
    """
    model.eval()
    with torch.no_grad():
        raw = model(X)
        # RandomPredictionProbe returns class ids directly; other probes
        # return per-class scores, so take the highest-scoring class.
        emits_class_ids = isinstance(model, RandomPredictionProbe)
        preds = raw if emits_class_ids else raw.argmax(dim=1)
        accuracy = (preds == y).float().mean().item()
    return accuracy


# ---- Train Probes ----
# train_adaptive_probe and evaluate_adaptive_probe are not defined in this
# cell; they come from the probe-definition cell under "NEW PROBES".
# The training calls run in this order and each one shuffles its batches, so
# reordering them changes the random stream each probe sees.
pos_nonlinear_model = train_probe(NonlinearProbe(X.shape[1], len(le_pos.classes_)), X, y_pos, len(le_pos.classes_), task_name="POS_Nonlinear")
dep_nonlinear_model = train_probe(NonlinearProbe(X.shape[1], len(le_dep.classes_)), X, y_dep, len(le_dep.classes_), task_name="DEP_Nonlinear")
# Random baselines need no training.
pos_random_model = RandomPredictionProbe(len(le_pos.classes_))
dep_random_model = RandomPredictionProbe(len(le_dep.classes_))
pos_model = train_linear_probe(X, y_pos, len(le_pos.classes_), task_name="POS")
dep_model = train_linear_probe(X, y_dep, len(le_dep.classes_), task_name="DEP")
# Positions are used directly as class ids, hence max + 1 classes.
# position_model is trained but not included in the evaluation printout below.
position_model = train_linear_probe(X, y_position, y_position.max().item() + 1, task_name="POSITION")
word_model = train_adaptive_probe(X, y_word, len(le_word.classes_), task_name="WORD")


# ---- Evaluate All Probes ----
# NOTE(review): accuracy is measured on the same X the probes were trained on;
# there is no held-out split in this cell.
print("\n--- Evaluation ---")
print(f"POS (Linear) Accuracy:        {evaluate_probe(pos_model, X, y_pos):.2%}")
print(f"POS (Nonlinear) Accuracy:     {evaluate_probe(pos_nonlinear_model, X, y_pos):.2%}")
print(f"POS (Random) Accuracy:        {evaluate_probe(pos_random_model, X, y_pos):.2%}")
print(f"DEP (Linear) Accuracy:        {evaluate_probe(dep_model, X, y_dep):.2%}")
print(f"DEP (Nonlinear) Accuracy:     {evaluate_probe(dep_nonlinear_model, X, y_dep):.2%}")
print(f"DEP (Random) Accuracy:        {evaluate_probe(dep_random_model, X, y_dep):.2%}")
print(f"WORD Accuracy:                {evaluate_adaptive_probe(word_model, X, y_word):.2%}")

POS_Nonlinear - Epoch 1, Loss: 2888.2757
POS_Nonlinear - Epoch 2, Loss: 1976.6686
POS_Nonlinear - Epoch 3, Loss: 1779.3955
POS_Nonlinear - Epoch 4, Loss: 1658.4548
POS_Nonlinear - Epoch 5, Loss: 1574.7528
POS_Nonlinear - Epoch 6, Loss: 1510.8217
POS_Nonlinear - Epoch 7, Loss: 1459.2356
POS_Nonlinear - Epoch 8, Loss: 1418.0469
POS_Nonlinear - Epoch 9, Loss: 1380.2332
POS_Nonlinear - Epoch 10, Loss: 1348.4042
DEP_Nonlinear - Epoch 1, Loss: 7642.3478
DEP_Nonlinear - Epoch 6, Loss: 5592.7187
DEP_Nonlinear - Epoch 7, Loss: 5524.3890
DEP_Nonlinear - Epoch 8, Loss: 5473.1048
DEP_Nonlinear - Epoch 9, Loss: 5426.2508
DEP_Nonlinear - Epoch 10, Loss: 5384.5528
POS - Epoch 1, Loss: 4717.1947
POS - Epoch 2, Loss: 3245.3301
POS - Epoch 3, Loss: 3068.7674
POS - Epoch 4, Loss: 2992.4550
POS - Epoch 5, Loss: 2947.4905
POS - Epoch 6, Loss: 2920.5260
POS - Epoch 7, Loss: 2901.4198
POS - Epoch 8, Loss: 2887.2228
POS - Epoch 9, Loss: 2877.0748
POS - Epoch 10, Loss: 2868.1642
DEP - Epoch 1, Loss: 9735.5192
