In [38]:

from sqlalchemy import create_engine
import pandas as pd

# --- Configuration ---
# Define the database URI directly
# !! In real projects, manage credentials securely (e.g., env variables, secrets manager) !!
DB_URI = "postgresql://sy91dhb:g5t49ao@178.156.142.230:5432/hd64m1ki"

engine = create_engine(DB_URI)
# --- Optional: Set up logging ---
import logging
logging.basicConfig(level=logging.INFO)


# Example: Show tables (PostgreSQL metadata)
res = pd.read_sql("""
    SELECT
        time
        , title
        , url
        , score  
    FROM "hacker_news"."items" a
    WHERE a.type = 'story'
        AND a.time >= '2023-01-01 00:00:00'
        AND a.dead IS NOT TRUE
        AND LENGTH(a.title) > 0
        AND url IS NOT NULL
""", engine)

res


Unnamed: 0,time,title,url,score
0,2023-01-01 00:07:29,The physics of entropy and the origin of life ...,https://www.youtube.com/watch?v=Sz1n0RHwLqA,5
1,2023-01-01 00:08:15,"Xfinity Stream on Linux: A Tale of Widevine, C...",https://thebrokenrail.com/2022/12/31/xfinity-s...,144
2,2023-01-01 00:12:29,Let’s try ChatGPT. Is it any good? (Bisqwit),https://www.youtube.com/watch?v=q2A-MkGjvmI,4
3,2023-01-01 00:14:17,Because Internet: Understanding the New Rules ...,https://www.amazon.com/Because-Internet-Unders...,2
4,2023-01-01 00:14:24,Solar thermal storage using lunar regolith,https://www.esa.int/Enabling_Support/Preparing...,2
...,...,...,...,...
514994,2024-02-17 12:43:45,Show HN: Simple 1-Rep Max Calculator for Stren...,https://calcolomassimale.it/,1
514995,2024-02-17 12:44:34,Visualize Latent Spaces,https://github.com/enjalot/latent-scope,116
514996,2024-02-17 12:44:52,You don't need LPM Tries (2023),https://cookie.engineer/weblog/articles/you-do...,2
514997,2024-02-17 12:45:49,Show HN: ANXRacers – A 2D top-down time-attack...,https://studios.aeonax.com/racers/,2


In [39]:
data = res.copy()
# convert time into hour of day
data['hour'] = data['time'].dt.hour
data.drop(columns=['time'], inplace=True)
data.head(50)

Unnamed: 0,title,url,score,hour
0,The physics of entropy and the origin of life ...,https://www.youtube.com/watch?v=Sz1n0RHwLqA,5,0
1,"Xfinity Stream on Linux: A Tale of Widevine, C...",https://thebrokenrail.com/2022/12/31/xfinity-s...,144,0
2,Let’s try ChatGPT. Is it any good? (Bisqwit),https://www.youtube.com/watch?v=q2A-MkGjvmI,4,0
3,Because Internet: Understanding the New Rules ...,https://www.amazon.com/Because-Internet-Unders...,2,0
4,Solar thermal storage using lunar regolith,https://www.esa.int/Enabling_Support/Preparing...,2,0
5,The craft of SwiftUI API design: Progressive d...,https://developer.apple.com/videos/play/wwdc20...,4,0
6,Worst interview questions for software developers,https://fibery.io/blog/worst-interview-questio...,154,0
7,Running Advent of Code on a $2 microcontroller,https://medium.com/@erik_68861/running-advent-...,90,7
8,OpenBSD KDE Status Report 2022,https://www.sizeofvoid.org/posts/2022-26-12-op...,5,7
9,Australia mandates Covid tests for Chinese tou...,https://www.smh.com.au/national/covid-test-to-...,6,7


In [18]:
import sys
import os

# Add the project root directory to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

from utils import logger  # Import the `logger` module from `utils`

In [34]:
import torch
import torch.nn as nn
from word2vec.vocabulary import Vocabulary as vocab
from urllib.parse import urlparse

class TextUrlRegressionModel(nn.Module):
    def __init__(self, vocab_path, cbow_model_path, input_dim, hidden_dims=[128, 64, 32], dropout=0.2, domain_embedding_dim=32):
        """
        Combines vocabulary, CBOW embeddings, and MLP regression model.

        Args:
            vocab_path (str): Path to the saved vocabulary JSON.
            cbow_model_path (str): Path to the saved CBOW model state.
            input_dim (int): Dimension of the input embeddings.
            hidden_dims (List[int]): List of hidden layer dimensions.
            dropout (float): Dropout probability.
        """
        super().__init__()
        # Load vocabulary
        self.vocab = vocab.load_vocab(vocab_path)
        
        # Load CBOW model and extract embedding layer
        cbow_state = torch.load(cbow_model_path, map_location=torch.device('cpu'))
        self.embedding = nn.Embedding.from_pretrained(cbow_state['embeddings.weight'])

        # domain handling
        self.domain_embedding_dim = domain_embedding_dim
        self.domain_dict = {} # will map domains to indices
        self.unknown_domain_idx = 0 # index for unknown domain
        self.next_domain_idx = 1 # index for next domain to be added        

        # Initialize domain embedding layer
        self.domain_embedding = nn.Embedding(1, domain_embedding_dim)
        
        # Initialize MLP layers
        layers = []
        combined_dim = input_dim + domain_embedding_dim  # Combined dimension for text + domain
        prev_dim = combined_dim
        
        # Add hidden layers
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_dim),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim
        
        # Add final output layer
        layers.append(nn.Linear(prev_dim, 1))
        # Combine all layers
        self.regression_model = nn.Sequential(*layers)


    def extract_domain(self, url):
        """Extract base domain from URL using urlparse."""
        if not url or not isinstance(url, str):
            return None
        
        try:
            # Parse the URL
            parsed_url = urlparse(url)
            
            # Get the netloc part (e.g., 'www.example.com:80')
            netloc = parsed_url.netloc
            
            # If netloc is empty (might happen with malformed URLs), try path
            if not netloc and parsed_url.path:
                netloc = parsed_url.path.split('/')[0]
                
            # Remove port number if present
            netloc = netloc.split(':')[0]
            return netloc
        except:
            # Handle any parsing errors
            logger.error(f"Error parsing URL: {url}")
            return None


    def get_domain_index(self, url, update=False):
        """Convert URL to domain index."""
        domain = self.extract_domain(url)
        
        if not domain:
            return self.unknown_domain_idx
            
        if domain not in self.domain_dict:
            if update:
                self.domain_dict[domain] = self.next_domain_idx
                self.next_domain_idx += 1
                # Expand embedding layer if needed
                if self.next_domain_idx > self.domain_embedding.num_embeddings:
                    old_embedding = self.domain_embedding
                    new_size = max(self.next_domain_idx * 2, 100)  # Double size or at least 100
                    self.domain_embedding = nn.Embedding(new_size, self.domain_embedding_dim)
                    self.domain_embedding = self.domain_embedding.to(next(self.parameters()).device)
                    # Copy existing embeddings
                    with torch.no_grad():
                        self.domain_embedding.weight[:old_embedding.num_embeddings] = old_embedding.weight
            else:
                return self.unknown_domain_idx
                
        return self.domain_dict[domain]


    def initialize_domain_dict(self, urls):
        """Initialize domain dictionary from a list of URLs."""
        for url in urls:
            self.get_domain_index(url, update=True)



    def forward(self, text_embeddings, urls):
        """
        Forward pass for the model.
        Args:
            text_embeddings (Tensor): Averaged embeddings of shape (batch_size, input_dim).
            urls (List[str]): List of URLs for the batch.
        Returns:
            Tensor: Predicted regression values.
        """
        batch_size = text_embeddings.shape[0]
        device = text_embeddings.device
        
        # Get domain indices for each URL
        domain_indices = [self.get_domain_index(url) for url in urls]
        domain_indices_tensor = torch.tensor(domain_indices, dtype=torch.long).to(device)
        
        # Get domain embeddings
        domain_embeddings = self.domain_embedding(domain_indices_tensor)
        
        # Combine text and domain embeddings
        combined_embeddings = torch.cat([text_embeddings, domain_embeddings], dim=1)
        
        # Pass through regression model
        return self.regression_model(combined_embeddings)

In [29]:
from torch.utils.data import Dataset, DataLoader
import re


class TextDataset(Dataset):
    def __init__(self, texts, urls, targets, vocab):
        """
        Custom Dataset for text regression.
        
        Args:
            texts (List[str]): List of input texts.
            targets (List[float]): List of target regression values.
            vocab (Vocabulary): Vocabulary object for tokenization.
        """
        self.texts = texts
        self.urls = urls
        self.targets = torch.tensor(targets, dtype=torch.float32)
        self.vocab = vocab

    @staticmethod
    def preprocess_text(text):
        import re
        # Replace any non-alphanumeric character with a space
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)
        # Convert to lowercase and split into tokens
        tokens = text.lower().split()
        return tokens

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        url = self.urls[idx] if self.urls is not None else None
        target = self.targets[idx]
        # Preprocess the text
        tokens = self.preprocess_text(text)
        # Convert tokens to indices using the vocabulary
        indices = [self.vocab.get_index(token) for token in tokens]
        return torch.tensor(indices, dtype=torch.long), url, target


In [30]:

def make_collate_fn(model, device):
    def collate_fn(batch):
        sequences, urls, targets = zip(*batch)
        targets = torch.stack(targets).to(device)

        embedded_sequences = []
        for seq in sequences:
            if len(seq) == 0:  # Handle empty sequences
                embedded_sequences.append(torch.zeros(model.embedding.embedding_dim).to(device))
            else:
                embeddings = model.embedding(seq.to(device))
                avg_embedding = embeddings.mean(dim=0)
                embedded_sequences.append(avg_embedding)

        embedded_batch = torch.stack(embedded_sequences).to(device)
        return embedded_batch, urls, targets
    return collate_fn

In [44]:
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for text_inputs, urls, targets in dataloader:
        text_inputs, targets = text_inputs.to(device), targets.to(device)
        
        # Forward pass with both text and URLs
        outputs = model(text_inputs, urls)
        loss = criterion(outputs.squeeze(), targets)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(dataloader)


def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    predictions = []
    actuals = []

    with torch.no_grad():
        for text_inputs, urls, targets in dataloader:
            text_inputs, targets = text_inputs.to(device), targets.to(device)

            outputs = model(text_inputs, urls)
            loss = criterion(outputs.squeeze(), targets)
            total_loss += loss.item()

            # Reverse log1p transform
            predictions.extend(np.expm1(outputs.squeeze().cpu().numpy()))
            actuals.extend(np.expm1(targets.cpu().numpy()))

    avg_loss = total_loss / len(dataloader)
    return avg_loss, predictions, actuals


In [40]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Split the data
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)


In [41]:
import numpy as np

# Detect the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize model
model = TextUrlRegressionModel(
    vocab_path="../models/word2vec/text8_vocab_NWAll_MF5.json",
    cbow_model_path="../models/word2vec/CBOW_D128_W5_NWAll_MF5_E15_LR0.001_BS512/model_state.pth",
    input_dim=128,
    hidden_dims=[96, 64, 32, 16],
    domain_embedding_dim=32
)
model = model.to(device)

# initialize domain dictionary with training URLs
model.initialize_domain_dict(train_df['url'].tolist())

# Apply log scaling to the scores in the training and testing datasets
train_df['score'] = train_df['score'].apply(lambda x: np.log1p(x))  # log1p ensures log(0) is handled
test_df['score'] = test_df['score'].apply(lambda x: np.log1p(x))


# Create datasets including URLs
train_dataset = TextDataset(
    texts=train_df['title'].tolist(),
    urls=train_df['url'].tolist(),
    targets=train_df['score'].tolist(),
    vocab=model.vocab
)

test_dataset = TextDataset(
    texts=test_df['title'].tolist(),
    urls=test_df['url'].tolist(),
    targets=test_df['score'].tolist(),
    vocab=model.vocab
)

# Create dataloaders with the custom collate function
batch_size = 2048

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=make_collate_fn(model, device)
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=make_collate_fn(model, device)
)


# Simplified training loop using train_model function
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = nn.L1Loss()

num_epochs = 1
for epoch in range(num_epochs):
    avg_loss = train_model(model, train_loader, optimizer, criterion, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

Using device: cuda
2025-04-18 14:53:44 | DropoutDisco | INFO     | [vocabulary.py:110] | Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


2025-04-18 14:53:44 | DropoutDisco | INFO     | [vocabulary.py:123] | 📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


Epoch 1/1, Loss: 0.9365


In [335]:
torch.save(model, 'regression_model.pth')

In [22]:
# find the 10 entries with the highest scores
top_10_entries = test_df.nlargest(10, 'score')
# Print the top 10 entries
print("Top 10 entries with the highest scores:")
print(top_10_entries[['title', 'score']])


Top 10 entries with the highest scores:
                                                    title     score
281486                OpenAI's board has fired Sam Altman  8.649974
140506                Apollo will close down on June 30th  8.137688
427865                                             GPT-4o  8.051341
363454                                            Airfoil  7.841493
109675  Google “We have no moat, and neither does OpenAI”  7.805882
285829                   Reflecting on 18 Years at Google  7.702104
307641            Figma and Adobe abandon proposed merger  7.697121
513519       Bypassing airport security via SQL injection  7.602900
285557  We have reached an agreement in principle for ...  7.590852
315983                  In 2024, please switch to Firefox  7.482119


In [43]:
num_epochs = 2

for epoch in range(num_epochs):
    avg_loss = train_model(model, train_loader, optimizer, criterion, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

Epoch 1/2, Loss: 0.7659
Epoch 2/2, Loss: 0.7484


In [85]:
# Evaluate on test set
test_loss, predictions, actuals = evaluate_model(model, test_loader, criterion, device)

# Calculate error metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

mse = mean_squared_error(actuals, predictions)
mae = mean_absolute_error(actuals, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(actuals, predictions)

print(f'Test Loss: {test_loss:.4f}')
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'R2 Score: {r2:.4f}')


Test Loss: 0.7859
MSE: 6249.0697
RMSE: 79.0511
MAE: 19.1072
R2 Score: -0.0493


In [None]:
def predict_score(model, text, url, device):
    model.eval()
    model = model.to(device)

    tokens = TextDataset.preprocess_text(text)
    indices = [model.vocab.get_index(token) for token in tokens]
    token_tensor = torch.tensor(indices, dtype=torch.long).to(device)

    # Get the embedding for each token
    with torch.no_grad():
        embeddings = model.embedding(token_tensor)  # shape: (seq_len, embedding_dim)
        averaged_embedding = embeddings.mean(dim=0).unsqueeze(0)  # shape: (1, embedding_dim)

        prediction = model(averaged_embedding, [url])  # Note: [url] to match batch size 1
        return np.expm1(prediction.item())


print("Manual input and prediction: \n")
# fix the above issue
print("Text: test sentence with a random website")
print("URL: https://example.com")
print("Score: " + str(predict_score(model, "test sentence with a random website", "https://google.com", device)))

print("\nIterating through a few samples from the test df: \n")

# iterate through a few samples with urls, showing samples, urls, and predictions
for i in range(10,15):
    text = test_df.iloc[i]['title']
    url = test_df.iloc[i]['url']
    prediction = predict_score(model, text, url, device)
    print(f"Text: {text}\nURL: {url}\nPredicted Score: {prediction:.4f}\n")


Manual input and prediction: 

Text: test sentence with a random website
URL: https://example.com
Score: 4.766228419170504

Iterating through a few samples from the test df: 

Text: Three class society
URL: https://geohot.github.io//blog/jekyll/update/2023/10/03/three-class-society.html
Predicted Score: 1.1060

Text: Google Slashes Most Jobs at Area 120 Incubator as Part of Cuts
URL: https://www.bloomberg.com/news/articles/2023-01-20/google-slashes-most-jobs-at-incubator-area-120-as-part-of-cuts
Predicted Score: 3.2320

Text: Show HN: Supabase Admin Dashboard
URL: https://uibakery.io/templates/supabase-admin
Predicted Score: 14.6956

Text: Artificial intelligence in drug discovery: what is realistic, what are illusions
URL: https://www.sciencedirect.com/science/article/pii/S1359644620305274
Predicted Score: 2.0266

Text: The settlers brought the lottery to America. It's had a long, uneven history
URL: https://text.npr.org/1192893936
Predicted Score: 2.5163



In [97]:
data

Unnamed: 0,title,url,score,hour
0,The physics of entropy and the origin of life ...,https://www.youtube.com/watch?v=Sz1n0RHwLqA,5,0
1,"Xfinity Stream on Linux: A Tale of Widevine, C...",https://thebrokenrail.com/2022/12/31/xfinity-s...,144,0
2,Let’s try ChatGPT. Is it any good? (Bisqwit),https://www.youtube.com/watch?v=q2A-MkGjvmI,4,0
3,Because Internet: Understanding the New Rules ...,https://www.amazon.com/Because-Internet-Unders...,2,0
4,Solar thermal storage using lunar regolith,https://www.esa.int/Enabling_Support/Preparing...,2,0
...,...,...,...,...
514994,Show HN: Simple 1-Rep Max Calculator for Stren...,https://calcolomassimale.it/,1,12
514995,Visualize Latent Spaces,https://github.com/enjalot/latent-scope,116,12
514996,You don't need LPM Tries (2023),https://cookie.engineer/weblog/articles/you-do...,2,12
514997,Show HN: ANXRacers – A 2D top-down time-attack...,https://studios.aeonax.com/racers/,2,12


In [26]:
# Iterate over all words in the vocabulary and predict scores
word_scores = []

for word in model.vocab.idx2word:
    score = predict_score(model, word, device)
    word_scores.append((word, score))

# Sort the words by their predicted scores in descending order
word_scores = sorted(word_scores, key=lambda x: x[1], reverse=True)

# Get the top 5 words with the highest scores
top_5_words = word_scores[:5]

# Print the results
print("Top 5 words with the highest scores:")
for word, score in top_5_words:
    print(f"{word}: {score}")



Top 5 words with the highest scores:
besieges: 12.070055859253058
fbi: 9.683543925330543
humiliated: 9.279914603186548
infiltrated: 7.236810527204565
tugs: 7.150932142170766


In [30]:
words = ['centuries', 'populous', 'kwh', 'nebuchadrezzar', 'zionists', 'palestine', 'cormen', 'ungovernable', 'majuscule', 'gurion', 'bethlehem', 'amicable', 'dioceses', 'nehemiah', 'hellenized', 'babylonian', 'gaza', 'plotarea', 'damasus', 'nicea', 'waismann', 'sephardic', 'century', 'conclave', 'golan', 'antioch', 'teutonic', 'szil', 'elector', 'emesa', 'manasseh', 'phoenicians', 'burkert', 'germain', 'levant', 'anastasius', 'syriac', 'suffrage', 'hierarchs', 'suzerainty', 'geographer', 'codepoint', 'knesset', 'landtag', 'jamnia', 'hyrcanus', 'martyred', 'francia', 'berbers', 'koresh']

n = 20
print(" ".join(words[:n]))

centuries populous kwh nebuchadrezzar zionists palestine cormen ungovernable majuscule gurion bethlehem amicable dioceses nehemiah hellenized babylonian gaza plotarea damasus nicea


In [None]:
train_df.head(5)

# Create datasets
train_dataset = TextDataset(
    texts=train_df['title'].tolist(),
    targets=train_df['score'].tolist(),
    vocab=model.vocab  # Use the model's vocabulary
)

# Create dataloaders with the custom collate function
batch_size = 10

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=make_collate_fn(model, device)
)

item = 2

# Inspect raw text inputs
sample_text = train_df['title'].iloc[item]
print(f"Raw text: {sample_text}")

# Tokenize the text
#tokens = sample_text.lower().split()
tokens = preprocess_text(sample_text)
print(f"Tokens: {tokens}")

# Convert tokens to indices using the vocabulary
vocab = vocab.load_vocab("../models/word2vec/text8_vocab_NWAll_MF5.json")
indices = [vocab.get_index(token) for token in tokens]
print(f"Token indices: {indices}")

# Get embeddings for the token indices
token_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)
embeddings = model.embedding(token_tensor)
print(f"Embeddings: {embeddings[0]}")

# Average the embeddings
avg_embedding = embeddings.mean(dim=1)
print(f"Averaged embedding: {avg_embedding}")

Raw text: Show HN: Buyidentities.com
Tokens: ['show', 'hn', 'buyidentities', 'com']
2025-04-17 10:38:09 | DropoutDisco | INFO     | [vocabulary.py:110] | Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


2025-04-17 10:38:09 | DropoutDisco | INFO     | [vocabulary.py:123] | 📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


Token indices: [2194, 38072, 0, 2806]
Embeddings: tensor([[ 1.9466e+00,  4.3013e-01, -2.0402e+00,  4.4432e-01, -2.1084e+00,
          2.7667e-01,  9.9176e-01,  8.9653e-01,  2.2319e-01, -1.3121e+00,
          4.9519e-02, -1.8727e-01,  7.8188e-01, -2.2465e+00, -1.2707e-01,
         -3.1723e-03,  1.3502e+00,  1.2984e+00, -1.1449e-01, -1.8985e-01,
         -9.1769e-01, -1.3087e-01,  2.9257e-01, -8.9975e-01, -5.0819e-01,
          8.5919e-01, -3.4376e+00,  7.6520e-01,  1.3828e+00,  2.9100e-01,
          1.2134e+00,  7.9240e-01,  4.1146e-01, -1.7667e+00, -2.8801e+00,
          2.5893e-01, -1.6380e-01,  3.6928e-01, -5.3449e-01, -1.8259e-01,
          2.1736e+00, -2.1261e+00,  1.3076e+00, -3.2017e+00,  3.3742e-01,
         -1.2346e+00,  1.3087e+00,  2.1186e+00, -1.7250e+00,  4.6178e-01,
         -5.8093e-01, -1.4234e-01,  1.6505e+00, -1.9864e-02,  1.4873e-01,
         -6.8776e-01,  1.3533e+00, -8.1330e-01,  9.0072e-01,  7.5148e-01,
          7.8481e-01, -8.7399e-01, -4.5775e-02, -1.1776e-01, -

In [332]:
# Load the vocabulary
vocab = vocab.load_vocab("../models/word2vec/text8_vocab_NWAll_MF5.json")

# Load the CBOW model state
cbow_state = torch.load("../models/word2vec/CBOW_D128_W5_NWAll_MF5_E15_LR0.001_BS512/model_state.pth", map_location=torch.device('cpu'))

# Extract the embeddings
embeddings = cbow_state['embeddings.weight']

# Define a function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    return torch.dot(vec1, vec2) / (torch.norm(vec1) * torch.norm(vec2))

# Check similarity between words
def check_word_similarity(word1, word2, vocab, embeddings):
    idx1 = vocab.get_index(word1)
    idx2 = vocab.get_index(word2)
    vec1 = embeddings[idx1]
    vec2 = embeddings[idx2]
    similarity = cosine_similarity(vec1, vec2)
    return similarity.item()

# Example: Check similarity between two words
word1 = "prince"
word2 = "queen"
similarity = check_word_similarity(word1, word2, vocab, embeddings)
print(f"Cosine similarity between '{word1}' and '{word2}': {similarity:.4f}")

2025-04-17 15:55:04 | DropoutDisco | INFO     | [vocabulary.py:110] | Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


2025-04-17 15:55:04 | DropoutDisco | INFO     | [vocabulary.py:123] | 📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


Cosine similarity between 'prince' and 'queen': 0.3710


In [209]:


# Compare the mean embeddings of two user-input sentences
sentence1 = "prince"
sentence2 = "queen"

# Preprocess and tokenize the sentences
tokens1 = preprocess_text(sentence1)
tokens2 = preprocess_text(sentence2)

# Convert tokens to indices using the vocabulary
indices1 = [vocab.get_index(token) for token in tokens1]
indices2 = [vocab.get_index(token) for token in tokens2]

# Get embeddings for the token indices
token_tensor1 = torch.tensor(indices1, dtype=torch.long).unsqueeze(0).to(device)
token_tensor2 = torch.tensor(indices2, dtype=torch.long).unsqueeze(0).to(device)

embeddings1 = model.embedding(token_tensor1)
embeddings2 = model.embedding(token_tensor2)

# Average the embeddings
avg_embedding1 = embeddings1.mean(dim=1)
avg_embedding2 = embeddings2.mean(dim=1)

# Print the averaged embeddings
#print(f"Averaged embedding for sentence 1: {avg_embedding1}")
#print(f"Averaged embedding for sentence 2: {avg_embedding2}")

# Compute cosine similarity between the two averaged embeddings
cos_sim = cosine_similarity(avg_embedding1.squeeze(), avg_embedding2.squeeze())
print(f"Cosine similarity between the two sentences: {cos_sim:.4f}")

Cosine similarity between the two sentences: 0.3710


In [None]:
# save the model state pth
model_path = "../models/text_regression_model.pth"
