In [2]:

from sqlalchemy import create_engine
import pandas as pd

# --- Configuration ---
# Define the database URI directly
# !! In real projects, manage credentials securely (e.g., env variables, secrets manager) !!
DB_URI = "postgresql://sy91dhb:g5t49ao@178.156.142.230:5432/hd64m1ki"

engine = create_engine(DB_URI)
# --- Optional: Set up logging ---
import logging
logging.basicConfig(level=logging.INFO)


# Example: Show tables (PostgreSQL metadata)
res = pd.read_sql("""
    SELECT
        time
        , title
        , url
        , score  
    FROM "hacker_news"."items" a
    WHERE a.type = 'story'
        AND a.time >= '2023-01-01 00:00:00'
        AND a.dead IS NOT TRUE
        AND LENGTH(a.title) > 0
""", engine)

res


Unnamed: 0,time,title,url,score
0,2023-01-01 00:07:29,The physics of entropy and the origin of life ...,https://www.youtube.com/watch?v=Sz1n0RHwLqA,5
1,2023-01-01 00:08:15,"Xfinity Stream on Linux: A Tale of Widevine, C...",https://thebrokenrail.com/2022/12/31/xfinity-s...,144
2,2023-01-01 00:09:17,"Ask HN: Examples of successful, small companie...",,4
3,2023-01-01 00:12:29,Let’s try ChatGPT. Is it any good? (Bisqwit),https://www.youtube.com/watch?v=q2A-MkGjvmI,4
4,2023-01-01 00:14:17,Because Internet: Understanding the New Rules ...,https://www.amazon.com/Because-Internet-Unders...,2
...,...,...,...,...
549327,2024-06-22 05:31:18,Solving Maxwell's Equations with Non-Trainable...,https://arxiv.org/abs/2405.00814,2
549328,2024-06-22 05:37:07,Khronos: glTF Interactivity Specification Rele...,https://www.khronos.org/blog/gltf-interactivit...,1
549329,2024-06-22 05:39:21,Family whose roof was damaged by space debris ...,https://arstechnica.com/space/2024/06/family-w...,6
549330,2024-06-22 05:41:12,YouTube confirms crackdown on VPN users access...,https://techcrunch.com/2024/06/20/youtube-conf...,2


In [42]:
titles_and_scores = res.loc[:, ['title', 'score']].copy()
titles_and_scores.head(50)

Unnamed: 0,title,score
0,The physics of entropy and the origin of life ...,5
1,"Xfinity Stream on Linux: A Tale of Widevine, C...",144
2,"Ask HN: Examples of successful, small companie...",4
3,Let’s try ChatGPT. Is it any good? (Bisqwit),4
4,Because Internet: Understanding the New Rules ...,2
5,Solar thermal storage using lunar regolith,2
6,The craft of SwiftUI API design: Progressive d...,4
7,Worst interview questions for software developers,154
8,Running Advent of Code on a $2 microcontroller,90
9,OpenBSD KDE Status Report 2022,5


In [4]:
import sys
import os

# Add the project root directory to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

from utils import logger  # Import the `logger` module from `utils`

In [None]:
import torch
import torch.nn as nn
from word2vec.vocabulary import Vocabulary as vocab

class TextToRegressionModel(nn.Module):
    def __init__(self, vocab_path, cbow_model_path, input_dim, hidden_dims=[128, 64, 32], dropout=0.2):
        """
        Combines vocabulary, CBOW embeddings, and MLP regression model.

        Args:
            vocab_path (str): Path to the saved vocabulary JSON.
            cbow_model_path (str): Path to the saved CBOW model state.
            input_dim (int): Dimension of the input embeddings.
            hidden_dims (List[int]): List of hidden layer dimensions.
            dropout (float): Dropout probability.
        """
        super().__init__()
        # Load vocabulary
        self.vocab = vocab.load_vocab(vocab_path)
        
        # Load CBOW model and extract embedding layer
        cbow_state = torch.load(cbow_model_path, map_location=torch.device('cpu'))
        self.embedding = nn.Embedding.from_pretrained(cbow_state['embeddings.weight'])
        
        # Initialize MLP layers
        layers = []
        prev_dim = input_dim
        
        # Add hidden layers
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_dim),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim
        
        # Add final output layer
        layers.append(nn.Linear(prev_dim, 1))
        # Combine all layers
        self.regression_model = nn.Sequential(*layers)

    def forward(self, x):
        """
        Forward pass for the model.
        Args:
            x (Tensor): Averaged embeddings of shape (batch_size, input_dim).
        Returns:
            Tensor: Predicted regression values.
        """
        return self.regression_model(x)

In [52]:
from torch.utils.data import Dataset, DataLoader
import re


class TextDataset(Dataset):
    def __init__(self, texts, targets, vocab):
        """
        Custom Dataset for text regression.
        
        Args:
            texts (List[str]): List of input texts.
            targets (List[float]): List of target regression values.
            vocab (Vocabulary): Vocabulary object for tokenization.
        """
        self.texts = texts
        self.targets = torch.tensor(targets, dtype=torch.float32)
        self.vocab = vocab

    @staticmethod
    def preprocess_text(text):
        import re
        # Replace any non-alphanumeric character with a space
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)
        # Convert to lowercase and split into tokens
        tokens = text.lower().split()
        return tokens

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.targets[idx]
        # Preprocess the text
        tokens = self.preprocess_text(text)
        # Convert tokens to indices using the vocabulary
        indices = [self.vocab.get_index(token) for token in tokens]
        return torch.tensor(indices, dtype=torch.long), target


In [8]:

def make_collate_fn(model, device):
    def collate_fn(batch):
        sequences, targets = zip(*batch)
        targets = torch.stack(targets).to(device)

        embedded_sequences = []
        for seq in sequences:
            if len(seq) == 0:  # Handle empty sequences
                embedded_sequences.append(torch.zeros(model.embedding.embedding_dim).to(device))
            else:
                embeddings = model.embedding(seq.to(device))
                avg_embedding = embeddings.mean(dim=0)
                embedded_sequences.append(avg_embedding)

        embedded_batch = torch.stack(embedded_sequences).to(device)
        return embedded_batch, targets
    return collate_fn

In [9]:
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            total_loss += loss.item()
    return total_loss / len(dataloader)

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Split the data
train_df, test_df = train_test_split(titles_and_scores, test_size=0.2, random_state=42)


In [56]:
import numpy as np

# Detect the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize model
model = TextToRegressionModel(
    vocab_path="../models/word2vec/text8_vocab_NWAll_MF5.json",  # Replace with your actual path
    cbow_model_path="../models/word2vec/CBOW_D128_W5_NWAll_MF5_E15_LR0.001_BS512/model_state.pth",  # Replace with your actual path
    input_dim=128,  # Match your CBOW embedding dimension
    hidden_dims=[96, 64, 32, 16],  # Example hidden layer dimensions
)
model = model.to(device)


# Apply log scaling to the scores in the training and testing datasets
train_df['score'] = train_df['score'].apply(lambda x: np.log(x))  # log1p ensures log(0) is handled
test_df['score'] = test_df['score'].apply(lambda x: np.log(x))


# Create datasets
train_dataset = TextDataset(
    texts=train_df['title'].tolist(),
    targets=train_df['score'].tolist(),
    vocab=model.vocab  # Use the model's vocabulary
)

test_dataset = TextDataset(
    texts=test_df['title'].tolist(),
    targets=test_df['score'].tolist(),
    vocab=model.vocab
)

# Create dataloaders with the custom collate function
batch_size = 2048

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=make_collate_fn(model, device)
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=make_collate_fn(model, device)
)


# Simplified training loop using train_model function
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = nn.L1Loss()

num_epochs = 1
for epoch in range(num_epochs):
    avg_loss = train_model(model, train_loader, optimizer, criterion, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

Using device: cuda
2025-04-18 12:11:43 | DropoutDisco | INFO     | [vocabulary.py:110] | Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


2025-04-18 12:11:43 | DropoutDisco | INFO     | [vocabulary.py:123] | 📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json
  train_df['score'] = train_df['score'].apply(lambda x: np.log(x))  # log1p ensures log(0) is handled
  test_df['score'] = test_df['score'].apply(lambda x: np.log(x))


Epoch 1/1, Loss: inf


In [335]:
torch.save(model, 'regression_model.pth')

In [35]:
num_epochs = 3
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

Epoch 1/3, Loss: 0.9888
Epoch 2/3, Loss: 0.9829
Epoch 3/3, Loss: 0.9794


In [36]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Evaluate model on test set
model.eval()
test_loss = 0
predictions = []
actuals = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)

        # Compute the loss for this batch
        loss = criterion(outputs.squeeze(), targets)
        test_loss += loss.item()  # Accumulate the loss

        # Reverse the log transformation
        predictions.extend(np.exp(outputs.squeeze().cpu().numpy()))  # Convert predictions back to original scale
        actuals.extend(np.exp(targets.cpu().numpy()))  # Convert targets back to original scale

# Calculate errors in the original scale
mse = mean_squared_error(actuals, predictions)
mae = mean_absolute_error(actuals, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(actuals, predictions)

print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'R2 Score: {r2:.4f}')

MSE: 5976.3472
RMSE: 77.3068
MAE: 19.1020
R2 Score: -0.0568


In [53]:
def predict_score(model, text, device):
    """
    Predict score for a single text input.
    
    Args:
        model (TextToRegressionModel): Trained model
        text (str): Input text to predict score for
        device (str): Device to run prediction on
        
    Returns:
        float: Predicted score
    """
    # Use the preprocess_text function for text preprocessing
    tokens = TextDataset.preprocess_text(text)
    #tokens = text.lower().split() # old preprocessing that just splits and lowercases
    #print(tokens)
    model.eval()
    model = model.to(device)
    with torch.no_grad():
        # Preprocess the text
        indices = [model.vocab.get_index(token) for token in tokens]
        #print(indices)
        token_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)
        
        # Get embeddings and average
        embeddings = model.embedding(token_tensor)

        avg_embedding = embeddings.mean(dim=1)
        
        # Get prediction
        prediction = model(avg_embedding)
        return np.exp(prediction.item()).item()


print(predict_score(model, "monarch", device))

2.0299300119627595


# BELOW ARE EXPLORATORY ITEMS

In [3]:
# find the 10 entries with the highest scores
top_10_entries = test_df.nlargest(10, 'score')
# Print the top 10 entries
print("Top 10 entries with the highest scores:")
print(top_10_entries[['title', 'score']])


NameError: name 'test_df' is not defined

In [26]:
# Iterate over all words in the vocabulary and predict scores
word_scores = []

for word in model.vocab.idx2word:
    score = predict_score(model, word, device)
    word_scores.append((word, score))

# Sort the words by their predicted scores in descending order
word_scores = sorted(word_scores, key=lambda x: x[1], reverse=True)

# Get the top 5 words with the highest scores
top_5_words = word_scores[:5]

# Print the results
print("Top 5 words with the highest scores:")
for word, score in top_5_words:
    print(f"{word}: {score}")



Top 5 words with the highest scores:
besieges: 12.070055859253058
fbi: 9.683543925330543
humiliated: 9.279914603186548
infiltrated: 7.236810527204565
tugs: 7.150932142170766


In [30]:
words = ['centuries', 'populous', 'kwh', 'nebuchadrezzar', 'zionists', 'palestine', 'cormen', 'ungovernable', 'majuscule', 'gurion', 'bethlehem', 'amicable', 'dioceses', 'nehemiah', 'hellenized', 'babylonian', 'gaza', 'plotarea', 'damasus', 'nicea', 'waismann', 'sephardic', 'century', 'conclave', 'golan', 'antioch', 'teutonic', 'szil', 'elector', 'emesa', 'manasseh', 'phoenicians', 'burkert', 'germain', 'levant', 'anastasius', 'syriac', 'suffrage', 'hierarchs', 'suzerainty', 'geographer', 'codepoint', 'knesset', 'landtag', 'jamnia', 'hyrcanus', 'martyred', 'francia', 'berbers', 'koresh']

n = 20
print(" ".join(words[:n]))

centuries populous kwh nebuchadrezzar zionists palestine cormen ungovernable majuscule gurion bethlehem amicable dioceses nehemiah hellenized babylonian gaza plotarea damasus nicea


In [None]:
train_df.head(5)

# Create datasets
train_dataset = TextDataset(
    texts=train_df['title'].tolist(),
    targets=train_df['score'].tolist(),
    vocab=model.vocab  # Use the model's vocabulary
)

# Create dataloaders with the custom collate function
batch_size = 10

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=make_collate_fn(model, device)
)

item = 2

# Inspect raw text inputs
sample_text = train_df['title'].iloc[item]
print(f"Raw text: {sample_text}")

# Tokenize the text
#tokens = sample_text.lower().split()
tokens = preprocess_text(sample_text)
print(f"Tokens: {tokens}")

# Convert tokens to indices using the vocabulary
vocab = vocab.load_vocab("../models/word2vec/text8_vocab_NWAll_MF5.json")
indices = [vocab.get_index(token) for token in tokens]
print(f"Token indices: {indices}")

# Get embeddings for the token indices
token_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)
embeddings = model.embedding(token_tensor)
print(f"Embeddings: {embeddings[0]}")

# Average the embeddings
avg_embedding = embeddings.mean(dim=1)
print(f"Averaged embedding: {avg_embedding}")

Raw text: Show HN: Buyidentities.com
Tokens: ['show', 'hn', 'buyidentities', 'com']
2025-04-17 10:38:09 | DropoutDisco | INFO     | [vocabulary.py:110] | Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


2025-04-17 10:38:09 | DropoutDisco | INFO     | [vocabulary.py:123] | 📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


Token indices: [2194, 38072, 0, 2806]
Embeddings: tensor([[ 1.9466e+00,  4.3013e-01, -2.0402e+00,  4.4432e-01, -2.1084e+00,
          2.7667e-01,  9.9176e-01,  8.9653e-01,  2.2319e-01, -1.3121e+00,
          4.9519e-02, -1.8727e-01,  7.8188e-01, -2.2465e+00, -1.2707e-01,
         -3.1723e-03,  1.3502e+00,  1.2984e+00, -1.1449e-01, -1.8985e-01,
         -9.1769e-01, -1.3087e-01,  2.9257e-01, -8.9975e-01, -5.0819e-01,
          8.5919e-01, -3.4376e+00,  7.6520e-01,  1.3828e+00,  2.9100e-01,
          1.2134e+00,  7.9240e-01,  4.1146e-01, -1.7667e+00, -2.8801e+00,
          2.5893e-01, -1.6380e-01,  3.6928e-01, -5.3449e-01, -1.8259e-01,
          2.1736e+00, -2.1261e+00,  1.3076e+00, -3.2017e+00,  3.3742e-01,
         -1.2346e+00,  1.3087e+00,  2.1186e+00, -1.7250e+00,  4.6178e-01,
         -5.8093e-01, -1.4234e-01,  1.6505e+00, -1.9864e-02,  1.4873e-01,
         -6.8776e-01,  1.3533e+00, -8.1330e-01,  9.0072e-01,  7.5148e-01,
          7.8481e-01, -8.7399e-01, -4.5775e-02, -1.1776e-01, -

In [332]:
# Load the vocabulary
vocab = vocab.load_vocab("../models/word2vec/text8_vocab_NWAll_MF5.json")

# Load the CBOW model state
cbow_state = torch.load("../models/word2vec/CBOW_D128_W5_NWAll_MF5_E15_LR0.001_BS512/model_state.pth", map_location=torch.device('cpu'))

# Extract the embeddings
embeddings = cbow_state['embeddings.weight']

# Define a function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    return torch.dot(vec1, vec2) / (torch.norm(vec1) * torch.norm(vec2))

# Check similarity between words
def check_word_similarity(word1, word2, vocab, embeddings):
    idx1 = vocab.get_index(word1)
    idx2 = vocab.get_index(word2)
    vec1 = embeddings[idx1]
    vec2 = embeddings[idx2]
    similarity = cosine_similarity(vec1, vec2)
    return similarity.item()

# Example: Check similarity between two words
word1 = "prince"
word2 = "queen"
similarity = check_word_similarity(word1, word2, vocab, embeddings)
print(f"Cosine similarity between '{word1}' and '{word2}': {similarity:.4f}")

2025-04-17 15:55:04 | DropoutDisco | INFO     | [vocabulary.py:110] | Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


2025-04-17 15:55:04 | DropoutDisco | INFO     | [vocabulary.py:123] | 📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


Cosine similarity between 'prince' and 'queen': 0.3710


In [209]:


# Compare the mean embeddings of two user-input sentences
sentence1 = "prince"
sentence2 = "queen"

# Preprocess and tokenize the sentences
tokens1 = preprocess_text(sentence1)
tokens2 = preprocess_text(sentence2)

# Convert tokens to indices using the vocabulary
indices1 = [vocab.get_index(token) for token in tokens1]
indices2 = [vocab.get_index(token) for token in tokens2]

# Get embeddings for the token indices
token_tensor1 = torch.tensor(indices1, dtype=torch.long).unsqueeze(0).to(device)
token_tensor2 = torch.tensor(indices2, dtype=torch.long).unsqueeze(0).to(device)

embeddings1 = model.embedding(token_tensor1)
embeddings2 = model.embedding(token_tensor2)

# Average the embeddings
avg_embedding1 = embeddings1.mean(dim=1)
avg_embedding2 = embeddings2.mean(dim=1)

# Print the averaged embeddings
#print(f"Averaged embedding for sentence 1: {avg_embedding1}")
#print(f"Averaged embedding for sentence 2: {avg_embedding2}")

# Compute cosine similarity between the two averaged embeddings
cos_sim = cosine_similarity(avg_embedding1.squeeze(), avg_embedding2.squeeze())
print(f"Cosine similarity between the two sentences: {cos_sim:.4f}")

Cosine similarity between the two sentences: 0.3710


In [None]:
# save the model state pth
model_path = "../models/text_regression_model.pth"
