In [34]:

from sqlalchemy import create_engine
import pandas as pd

# --- Configuration ---
# The connection string is read from the environment so that credentials are
# never stored in the notebook (both the source and its outputs get shared).
# Expected form: postgresql://<user>:<password>@<host>:<port>/<database>
import os

DB_URI = os.environ.get("HN_DB_URI")
if not DB_URI:
    raise RuntimeError(
        "Set the HN_DB_URI environment variable to the PostgreSQL connection "
        "string before running this notebook."
    )

engine = create_engine(DB_URI)

# --- Optional: Set up logging ---
import logging
logging.basicConfig(level=logging.INFO)


# Load every Hacker News item that is a story, was posted on or after
# 2023-01-01, is not flagged dead, and has a non-empty title.
res = pd.read_sql("""
    SELECT *
    FROM "hacker_news"."items" a
    WHERE a.type = 'story'
        AND a.time >= '2023-01-01 00:00:00'
        AND a.dead IS NOT TRUE
        AND LENGTH(a.title) > 0
        --LIMIT 10
""", engine)

res


Unnamed: 0,id,dead,type,by,time,text,parent,kids,url,score,title,descendants
0,34202102,,story,viewtransform,2023-01-01 00:07:29,,,[34202608],https://www.youtube.com/watch?v=Sz1n0RHwLqA,5,The physics of entropy and the origin of life ...,1
1,34202107,,story,TheBrokenRail,2023-01-01 00:08:15,,,"[34203377, 34205953, 34206965, 34204095, 34203...",https://thebrokenrail.com/2022/12/31/xfinity-s...,144,"Xfinity Stream on Linux: A Tale of Widevine, C...",71
2,34202114,,story,forte124,2023-01-01 00:09:17,What types of businesses most likely fall in t...,,[34203714],,4,"Ask HN: Examples of successful, small companie...",1
3,34202138,,story,todsacerdoti,2023-01-01 00:12:29,,,,https://www.youtube.com/watch?v=q2A-MkGjvmI,4,Let’s try ChatGPT. Is it any good? (Bisqwit),0
4,34202154,,story,lisper,2023-01-01 00:14:17,,,,https://www.amazon.com/Because-Internet-Unders...,2,Because Internet: Understanding the New Rules ...,0
...,...,...,...,...,...,...,...,...,...,...,...,...
549327,40826340,,story,bino47,2024-06-29 00:13:29,,,[40826341],https://www.langui.io,1,New AI Language Learning App Looking for Beta ...,0
549328,40826360,,story,thunderbong,2024-06-29 00:16:22,,,,https://blog.brownplt.org/2024/06/27/different...,2,Differential Analysis: A Summary,0
549329,40826370,,story,thunderbong,2024-06-29 00:18:24,,,[40826580],https://www.relational-algebra.dev/ra-primer/i...,13,Relational Algebra Primer,1
549330,40826421,,story,zuhayeer,2024-06-29 00:26:26,,,"[40827602, 40837042, 40827522, 40828446, 40827...",https://www.msn.com/en-us/money/companies/empl...,22,Nvidia Employees Are Now Multi-Millionaires in...,11


In [35]:
# Keep only the model input (title) and the regression target (score).
# The explicit copy detaches the working frame from the full query result.
titles_and_scores = res[['title', 'score']].copy()
titles_and_scores.head(50)

Unnamed: 0,title,score
0,The physics of entropy and the origin of life ...,5
1,"Xfinity Stream on Linux: A Tale of Widevine, C...",144
2,"Ask HN: Examples of successful, small companie...",4
3,Let’s try ChatGPT. Is it any good? (Bisqwit),4
4,Because Internet: Understanding the New Rules ...,2
5,Solar thermal storage using lunar regolith,2
6,The craft of SwiftUI API design: Progressive d...,4
7,Worst interview questions for software developers,154
8,Running Advent of Code on a $2 microcontroller,90
9,OpenBSD KDE Status Report 2022,5


In [36]:
import sys
import os

# Add the project root directory (the parent of the current working
# directory) to the Python path so project packages can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append: re-running this cell must not keep growing sys.path
# with duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# NOTE(review): `logger` is not referenced elsewhere in the visible cells;
# presumably it is imported for its logging-setup side effects — confirm.
from utils import logger  # Import the `logger` module from `utils`

In [37]:
# Quick check of what `vocab` is bound to (an alias for the Vocabulary class).
# The import is repeated here so this cell also runs on a fresh kernel;
# otherwise the name only exists after the model-definition cell has run.
from word2vec.vocabulary import Vocabulary as vocab
vocab

word2vec.vocabulary.Vocabulary

In [38]:
import torch
import torch.nn as nn
from word2vec.vocabulary import Vocabulary as vocab
class TextToRegressionModel(nn.Module):
    """MLP regressor on top of pre-trained CBOW word embeddings.

    Bundles the vocabulary loaded from disk, an embedding table initialised
    from a saved CBOW state dict, and a feed-forward head that maps one pooled
    embedding vector per sample to a single scalar prediction.
    """

    def __init__(self, vocab_path, cbow_model_path, input_dim, hidden_dims=(128, 64, 32), dropout=0.2):
        """
        Combines vocabulary, CBOW embeddings, and MLP regression model.

        Args:
            vocab_path (str): Path to the saved vocabulary JSON.
            cbow_model_path (str): Path to the saved CBOW model state.
            input_dim (int): Dimension of the input embeddings. Must equal the
                width of the loaded embedding matrix.
            hidden_dims (Sequence[int]): Hidden layer dimensions, in order.
            dropout (float): Dropout probability.

        Raises:
            ValueError: If ``input_dim`` differs from the embedding width.
        """
        super().__init__()
        # Load vocabulary
        self.vocab = vocab.load_vocab(vocab_path)

        # Load CBOW model and extract embedding layer.
        # NOTE(security): depending on the torch version / `weights_only`
        # setting, torch.load can unpickle arbitrary objects — only load
        # checkpoints from a trusted source.
        cbow_state = torch.load(cbow_model_path, map_location=torch.device('cpu'))
        # from_pretrained defaults to freeze=True, so these weights are not
        # updated during training.
        self.embedding = nn.Embedding.from_pretrained(cbow_state['embeddings.weight'])

        # The head consumes pooled embeddings, so its input width has to agree
        # with the embedding table; fail here instead of at the first forward.
        embedding_dim = self.embedding.embedding_dim
        if embedding_dim != input_dim:
            raise ValueError(
                f"input_dim ({input_dim}) does not match the CBOW embedding "
                f"dimension ({embedding_dim})"
            )

        # Build the MLP: Linear -> ReLU -> BatchNorm -> Dropout per hidden layer
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_dim),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim

        # Final output layer producing one value per sample
        layers.append(nn.Linear(prev_dim, 1))

        # Combine all layers
        self.regression_model = nn.Sequential(*layers)

    def forward(self, x):
        """Run the regression head on already-pooled embeddings.

        Args:
            x (torch.Tensor): Embedded-and-averaged inputs, as produced by the
                collate function; the last dimension must be ``input_dim``.

        Returns:
            torch.Tensor: Output of the final ``Linear(prev_dim, 1)`` layer.
        """
        # x is already embedded and averaged from the collate function
        return self.regression_model(x)

In [39]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Dataset of (token-index tensor, target) pairs for text regression."""

    def __init__(self, texts, targets, vocab):
        """
        Custom Dataset for text regression.

        Args:
            texts (Iterable[str]): Input texts; materialised into a list so
                that items are always looked up by position.
            targets (List[float]): Target regression values, one per text.
            vocab (Vocabulary): Vocabulary object for tokenization; must
                provide ``get_index(token)``.

        Raises:
            ValueError: If ``texts`` and ``targets`` differ in length.
        """
        self.texts = list(texts)
        self.targets = torch.tensor(targets, dtype=torch.float32)
        # A length mismatch would otherwise only surface later as an
        # IndexError (or silently ignored targets) during iteration.
        if len(self.texts) != len(self.targets):
            raise ValueError(
                f"texts and targets must have the same length "
                f"(got {len(self.texts)} and {len(self.targets)})"
            )
        self.vocab = vocab

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(indices, target)`` for the sample at position ``idx``.

        ``indices`` is a 1-D ``torch.long`` tensor with one entry per
        whitespace-separated, lower-cased token; it is empty when the text
        contains no tokens.
        """
        text = self.texts[idx]
        target = self.targets[idx]
        # Convert text to lowercase and split into tokens
        tokens = text.lower().split()
        # Map every token to its vocabulary index via the vocabulary object
        indices = [self.vocab.get_index(token) for token in tokens]
        return torch.tensor(indices, dtype=torch.long), target


In [40]:
def make_collate_fn(model, device):
    """Build a DataLoader ``collate_fn`` that mean-pools token embeddings.

    Args:
        model: Object exposing an ``embedding`` callable that maps a tensor of
            token indices to one embedding row per token.
        device: Device the index tensors, pooled embeddings and targets are
            moved to.

    Returns:
        callable: ``collate_fn(batch)`` taking a list of
        ``(index_tensor, target)`` pairs and returning
        ``(embedded_batch, targets)`` with one pooled vector per sample.
    """
    # NOTE(review): the collate function calls the model's embedding layer
    # directly; if the model lives on a GPU this presumably requires the
    # DataLoader default of num_workers=0 — confirm before adding workers.
    def collate_fn(batch):
        # Separate the sequences and targets
        sequences, targets = zip(*batch)

        # Convert targets to tensor and move to the correct device
        targets = torch.stack(targets).to(device)

        # Embed each sequence and pool it into a single vector
        pooled = []
        for seq in sequences:
            embeddings = model.embedding(seq.to(device))
            if embeddings.shape[0] == 0:
                # A text without tokens has no rows to average; the mean of
                # zero rows is NaN and would turn the whole batch loss into
                # NaN, so fall back to an all-zero vector instead.
                pooled.append(embeddings.new_zeros(embeddings.shape[-1]))
            else:
                pooled.append(embeddings.mean(dim=0))

        # Stack the averaged embeddings
        embedded_batch = torch.stack(pooled).to(device)

        return embedded_batch, targets
    return collate_fn

In [41]:
def train_model(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module to train; switched to training mode.
        dataloader: Sized iterable of ``(inputs, targets)`` batches.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss taking ``(predictions, targets)``.
        device: Device the batches are moved to.

    Returns:
        float: Mean of the per-batch losses.
    """
    model.train()
    total_loss = 0
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        outputs = model(inputs)
        # Drop only the trailing output axis. A bare squeeze() would also
        # remove the batch axis when a batch holds a single sample, leaving a
        # 0-d prediction that is broadcast against the 1-D targets.
        loss = criterion(outputs.squeeze(-1), targets)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion, device):
    """Compute the mean per-batch loss of ``model`` without updating it.

    Args:
        model: Module to evaluate; switched to evaluation mode.
        dataloader: Sized iterable of ``(inputs, targets)`` batches.
        criterion: Loss taking ``(predictions, targets)``.
        device: Device the batches are moved to.

    Returns:
        float: Mean of the per-batch losses.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            outputs = model(inputs)
            # Drop only the trailing output axis so a single-sample batch
            # keeps its batch dimension and matches the shape of `targets`.
            loss = criterion(outputs.squeeze(-1), targets)
            total_loss += loss.item()
    return total_loss / len(dataloader)

In [42]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Hold out 20% of the (title, score) rows for evaluation; the fixed
# random_state keeps the split reproducible across runs.
train_df, test_df = train_test_split(titles_and_scores, test_size=0.2, random_state=42)

# TextDataset calls `get_index` on the object it is given, so it needs a
# loaded vocabulary instance. `vocab` is only an alias for the Vocabulary
# class and carries no word -> index mapping, so load the saved vocabulary.
VOCAB_PATH = "../models/word2vec/text8_vocab_NWAll_MF5.json"
loaded_vocab = vocab.load_vocab(VOCAB_PATH)

# Create datasets
train_dataset = TextDataset(
    texts=train_df['title'].tolist(),
    targets=train_df['score'].tolist(),
    vocab=loaded_vocab
)

test_dataset = TextDataset(
    texts=test_df['title'].tolist(),
    targets=test_df['score'].tolist(),
    vocab=loaded_vocab
)

In [43]:
# Smoke check that the dataset object exists; this prints its default repr.
print(train_dataset)

<__main__.TextDataset object at 0x762d80aa87f0>


In [None]:
# Detect the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Initialize model: regression head on top of the saved CBOW embeddings
model = TextToRegressionModel(
    vocab_path="../models/word2vec/text8_vocab_NWAll_MF5.json",  # Replace with your actual path
    cbow_model_path="../models/word2vec/CBOW_D128_W5_NWAll_MF5_E15_LR0.001_BS512/model_state.pth",  # Replace with your actual path
    input_dim=128,  # Match your CBOW embedding dimension
)
model = model.to(device)

# Create datasets that tokenise with the vocabulary the model loaded, so the
# indices line up with the rows of the model's embedding table.
train_dataset = TextDataset(
    texts=train_df['title'].tolist(),
    targets=train_df['score'].tolist(),
    vocab=model.vocab  # Use the model's vocabulary
)

test_dataset = TextDataset(
    texts=test_df['title'].tolist(),
    targets=test_df['score'].tolist(),
    vocab=model.vocab
)

# Create dataloaders with the custom collate function. One collate function is
# enough: it only closes over the model and device, which both loaders share.
collate_fn = make_collate_fn(model, device)

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn
)

# Training loop: L1 loss, i.e. mean absolute error on the raw score
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.L1Loss()

num_epochs = 10
for epoch in range(num_epochs):
    # Reuse the shared epoch helper rather than repeating its loop inline.
    avg_loss = train_model(model, train_loader, optimizer, criterion, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

Using device: cuda
2025-04-16 14:49:06 | DropoutDisco | INFO     | [vocabulary.py:110] | Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


2025-04-16 14:49:06 | DropoutDisco | INFO     | [vocabulary.py:123] | 📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


Epoch 1/10, Loss: 18.7210
Epoch 2/10, Loss: 18.6745
Epoch 3/10, Loss: 18.6659
Epoch 4/10, Loss: 18.6647
Epoch 5/10, Loss: 18.6631
Epoch 6/10, Loss: 18.6559


In [None]:
# Continue training the same model with a 10x larger learning rate.
# A new Adam optimizer is created, so its moment estimates start from scratch.
num_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(num_epochs):
    # Reuse the shared epoch helper rather than repeating its loop inline.
    avg_loss = train_model(model, train_loader, optimizer, criterion, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Evaluate model on test set
model.eval()
test_loss = 0
predictions = []
actuals = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        # Drop only the trailing output axis. With a bare squeeze() a final
        # batch of one sample becomes a 0-d tensor, and extending the list
        # with its 0-d numpy array raises a TypeError.
        batch_predictions = outputs.squeeze(-1)
        test_loss += criterion(batch_predictions, targets).item()

        predictions.extend(batch_predictions.cpu().numpy())
        actuals.extend(targets.cpu().numpy())

# Mean of the per-batch losses
avg_test_loss = test_loss / len(test_loader)
print(f'Test Loss: {avg_test_loss:.4f}')

# Calculate additional metrics over all collected test predictions
mse = mean_squared_error(actuals, predictions)
mae = mean_absolute_error(actuals, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(actuals, predictions)

print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'R2 Score: {r2:.4f}')


In [None]:
def predict_score(model, text, device):
    """
    Predict score for a single text input.

    The text is lower-cased and split on whitespace, each token is mapped to
    an index through ``model.vocab``, the token embeddings are averaged, and
    the result is passed through ``model.regression_model``.

    Args:
        model (TextToRegressionModel): Trained model; it is switched to
            evaluation mode and moved to ``device``.
        text (str): Input text to predict score for
        device (str or torch.device): Device to run prediction on

    Returns:
        float: Predicted score. For text without any tokens (empty or
        whitespace only) this is the model's output for an all-zero
        embedding rather than NaN.
    """
    model.eval()
    model = model.to(device)
    with torch.no_grad():
        # Preprocess the text
        tokens = text.lower().split()
        indices = [model.vocab.get_index(token) for token in tokens]
        token_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)

        # Get embeddings and average over the token axis
        embeddings = model.embedding(token_tensor)
        if embeddings.shape[1] == 0:
            # No tokens: averaging zero rows would give NaN, so use a zero
            # vector of the embedding width instead.
            avg_embedding = embeddings.new_zeros((1, embeddings.shape[-1]))
        else:
            avg_embedding = embeddings.mean(dim=1)

        # Get prediction
        prediction = model.regression_model(avg_embedding)
        return prediction.item()
    
# Example: predict the score for a one-word title using the model from above.
predict_score(model, "technology", device)

In [None]:
# Print the module structure (embedding table and regression head layers).
print(model)

In [None]:
# Preview the first 20 rows of the full query result.
res.head(20)

# Scratch (disabled): look up the rows of `res` whose title contains a given
# phrase, e.g. "the best way to learn...". The search pattern below is still
# empty and needs to be filled in before enabling these lines.
#matching_entries = res[res['title'].str.contains("", case=False, na=False)]
#print(matching_entries)
