In [1]:

from sqlalchemy import create_engine
import pandas as pd

# --- Configuration ---
# Define the database URI directly
# !! In real projects, manage credentials securely (e.g., env variables, secrets manager) !!
DB_URI = "postgresql://sy91dhb:g5t49ao@178.156.142.230:5432/hd64m1ki"

engine = create_engine(DB_URI)
# --- Optional: Set up logging ---
import logging
logging.basicConfig(level=logging.INFO)


# Example: Show tables (PostgreSQL metadata)
res = pd.read_sql("""
    SELECT *
    FROM "hacker_news"."items" a
    WHERE a.type = 'story'
        AND a.time >= '2023-01-01 00:00:00'
        AND a.dead IS NOT TRUE
        AND LENGTH(a.title) > 0
        --LIMIT 10
""", engine)

res


Unnamed: 0,id,dead,type,by,time,text,parent,kids,url,score,title,descendants
0,34202102,,story,viewtransform,2023-01-01 00:07:29,,,[34202608],https://www.youtube.com/watch?v=Sz1n0RHwLqA,5,The physics of entropy and the origin of life ...,1
1,34202107,,story,TheBrokenRail,2023-01-01 00:08:15,,,"[34203377, 34205953, 34206965, 34204095, 34203...",https://thebrokenrail.com/2022/12/31/xfinity-s...,144,"Xfinity Stream on Linux: A Tale of Widevine, C...",71
2,34202114,,story,forte124,2023-01-01 00:09:17,What types of businesses most likely fall in t...,,[34203714],,4,"Ask HN: Examples of successful, small companie...",1
3,34202138,,story,todsacerdoti,2023-01-01 00:12:29,,,,https://www.youtube.com/watch?v=q2A-MkGjvmI,4,Let’s try ChatGPT. Is it any good? (Bisqwit),0
4,34202154,,story,lisper,2023-01-01 00:14:17,,,,https://www.amazon.com/Because-Internet-Unders...,2,Because Internet: Understanding the New Rules ...,0
...,...,...,...,...,...,...,...,...,...,...,...,...
549327,41830822,,story,popcalc,2024-10-13 20:30:13,,,,https://www.youtube.com/watch?v=g-OT4XDqY-o,1,A special packet that can wake-up your PC,0
549328,41830856,,story,todsacerdoti,2024-10-13 20:35:17,,,,https://nolanlawson.com/2024/10/13/the-greatne...,1,The greatness and limitations of the JavaScrip...,0
549329,41830876,,story,thunderbong,2024-10-13 20:37:46,,,,https://slate.com/life/2024/10/tinder-bumble-d...,2,Dating apps destroyed in-person romance. Now t...,0
549330,41830882,,story,synthc,2024-10-13 20:38:22,,,"[41832306, 41830883, 41831437]",https://fireinabottle.net/introducing-the-croi...,5,The Croissant Diet,3


In [2]:
titles_and_scores = res.loc[:, ['title', 'score']].copy()
titles_and_scores.head(50)

Unnamed: 0,title,score
0,The physics of entropy and the origin of life ...,5
1,"Xfinity Stream on Linux: A Tale of Widevine, C...",144
2,"Ask HN: Examples of successful, small companie...",4
3,Let’s try ChatGPT. Is it any good? (Bisqwit),4
4,Because Internet: Understanding the New Rules ...,2
5,Solar thermal storage using lunar regolith,2
6,The craft of SwiftUI API design: Progressive d...,4
7,Worst interview questions for software developers,154
8,Running Advent of Code on a $2 microcontroller,90
9,OpenBSD KDE Status Report 2022,5


In [3]:
import sys
import os

# Add the project root directory to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

from utils import logger  # Import the `logger` module from `utils`

⚙️  Configuring DropoutDisco logging...
  Logger 'DropoutDisco' level set to: INFO
  Clearing existing logging handlers...
  ✅ Console logging handler added.
  ✅ File logging handler added: logs/dropout_disco.log
2025-04-16 16:52:39 | DropoutDisco | INFO     | [logging.py:102] | 🎉 Logging system initialized successfully!


INFO:DropoutDisco:🎉 Logging system initialized successfully!


In [4]:
import torch
import torch.nn as nn
from word2vec.vocabulary import Vocabulary as vocab
class TextToRegressionModel(nn.Module):
    def __init__(self, vocab_path, cbow_model_path, input_dim, hidden_dims=[128, 64, 32], dropout=0.2):
        """
        Combines vocabulary, CBOW embeddings, and MLP regression model.
        
        Args:
            vocab_path (str): Path to the saved vocabulary JSON.
            cbow_model_path (str): Path to the saved CBOW model state.
            input_dim (int): Dimension of the input embeddings.
            hidden_dims (List[int]): List of hidden layer dimensions.
            dropout (float): Dropout probability.
        """
        super().__init__()
        # Load vocabulary
        self.vocab = vocab.load_vocab(vocab_path)
        
        # Load CBOW model and extract embedding layer
        cbow_state = torch.load(cbow_model_path, map_location=torch.device('cpu'))
        self.embedding = nn.Embedding.from_pretrained(cbow_state['embeddings.weight'])
        
        # Initialize MLP layers
        layers = []
        prev_dim = input_dim
        
        # Add hidden layers
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_dim),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim
        
        # Add final output layer
        layers.append(nn.Linear(prev_dim, 1))
        
        # Combine all layers
        self.regression_model = nn.Sequential(*layers)

    def forward(self, x):
        # x is already embedded and averaged from the collate function
        return self.regression_model(x)

In [26]:
from torch.utils.data import Dataset, DataLoader
import re



def preprocess_text(text):
    # Replace any non-alphanumeric character with a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # Convert to lowercase and split into tokens
    tokens = text.lower().split()
    return tokens


class TextDataset(Dataset):
    def __init__(self, texts, targets, vocab):
        """
        Custom Dataset for text regression.
        
        Args:
            texts (List[str]): List of input texts.
            targets (List[float]): List of target regression values.
            vocab (Vocabulary): Vocabulary object for tokenization.
        """
        self.texts = texts
        self.targets = torch.tensor(targets, dtype=torch.float32)
        self.vocab = vocab

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.targets[idx]
        # Preprocess the text
        tokens = preprocess_text(text)
        # Convert tokens to indices using the vocabulary
        indices = [self.vocab.get_index(token) for token in tokens]
        return torch.tensor(indices, dtype=torch.long), target


In [27]:
def make_collate_fn(model, device):
    def collate_fn(batch):
        # Separate the sequences and targets
        sequences, targets = zip(*batch)
        
        # Convert targets to tensor and move to the correct device
        targets = torch.stack(targets).to(device)
        
        # Process each sequence through the model's embedding layer
        embedded_sequences = []
        for seq in sequences:
            # Get embeddings for the sequence
            embeddings = model.embedding(seq.to(device))  # Move seq to the correct device
            # Average the embeddings
            avg_embedding = embeddings.mean(dim=0)
            embedded_sequences.append(avg_embedding)
        
        # Stack the averaged embeddings
        embedded_batch = torch.stack(embedded_sequences).to(device)
        
        return embedded_batch, targets
    return collate_fn


In [7]:
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            total_loss += loss.item()
    return total_loss / len(dataloader)

In [8]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Assuming titles_and_scores is your DataFrame
# Split the data
train_df, test_df = train_test_split(titles_and_scores, test_size=0.2, random_state=42)

# Create datasets
train_dataset = TextDataset(
    texts=train_df['title'].tolist(),
    targets=train_df['score'].tolist(),
    vocab=vocab  # Your existing vocabulary object
)

test_dataset = TextDataset(
    texts=test_df['title'].tolist(),
    targets=test_df['score'].tolist(),
    vocab=vocab
)

In [9]:
print(train_dataset)

<__main__.TextDataset object at 0x7c81478de2c0>


In [30]:
# Detect the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Initialize model
model = TextToRegressionModel(
    vocab_path="../models/word2vec/text8_vocab_NWAll_MF5.json",  # Replace with your actual path
    cbow_model_path="../models/word2vec/CBOW_D128_W5_NWAll_MF5_E15_LR0.001_BS512/model_state.pth",  # Replace with your actual path
    input_dim=128,  # Match your CBOW embedding dimension
    hidden_dims=[128, 96, 64, 32, 16],  # Example hidden layer dimensions
)
model = model.to(device)


# Create datasets
train_dataset = TextDataset(
    texts=train_df['title'].tolist(),
    targets=train_df['score'].tolist(),
    vocab=model.vocab  # Use the model's vocabulary
)

test_dataset = TextDataset(
    texts=test_df['title'].tolist(),
    targets=test_df['score'].tolist(),
    vocab=model.vocab
)

# Create dataloaders with the custom collate function
batch_size = 1024

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=make_collate_fn(model, device)
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=make_collate_fn(model, device)
)

# Training loop
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.L1Loss()

num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

Using device: cuda
2025-04-16 17:17:36 | DropoutDisco | INFO     | [vocabulary.py:110] | Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


2025-04-16 17:17:36 | DropoutDisco | INFO     | [vocabulary.py:123] | 📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


Epoch 1/1, Loss: nan


In [32]:
for inputs, targets in train_loader:
    if torch.isnan(inputs).any() or torch.isinf(inputs).any():
        print("Invalid values in inputs!")
    if torch.isnan(targets).any() or torch.isinf(targets).any():
        print("Invalid values in targets!")

Invalid values in inputs!
Invalid values in inputs!
Invalid values in inputs!
Invalid values in inputs!
Invalid values in inputs!
Invalid values in inputs!
Invalid values in inputs!
Invalid values in inputs!
Invalid values in inputs!
Invalid values in inputs!
Invalid values in inputs!
Invalid values in inputs!


In [11]:
num_epochs = 1
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

Epoch 1/1, Loss: 18.6140


In [12]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Evaluate model on test set
model.eval()
test_loss = 0
predictions = []
actuals = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        test_loss += criterion(outputs.squeeze(), targets).item()
        
        predictions.extend(outputs.squeeze().cpu().numpy())
        actuals.extend(targets.cpu().numpy())

avg_test_loss = test_loss / len(test_loader)
print(f'Test Loss: {avg_test_loss:.4f}')

# Calculate additional metrics
mse = mean_squared_error(actuals, predictions)
mae = mean_absolute_error(actuals, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(actuals, predictions)

print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'R2 Score: {r2:.4f}')


Test Loss: 18.9806
MSE: 6412.3902
RMSE: 80.0774
MAE: 18.9286
R2 Score: -0.0519


In [13]:
def predict_score(model, text, device):
    """
    Predict score for a single text input.
    
    Args:
        model (TextToRegressionModel): Trained model
        text (str): Input text to predict score for
        device (str): Device to run prediction on
        
    Returns:
        float: Predicted score
    """
    model.eval()
    model = model.to(device)
    with torch.no_grad():
        # Preprocess the text
        tokens = text.lower().split()
        indices = [model.vocab.get_index(token) for token in tokens]
        token_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)
        
        # Get embeddings and average
        embeddings = model.embedding(token_tensor)
        avg_embedding = embeddings.mean(dim=1)
        
        # Get prediction
        prediction = model.regression_model(avg_embedding)
        return prediction.item()
    
predict_score(model, "test text", device)

2.1064224243164062

In [14]:
print(model)

TextToRegressionModel(
  (embedding): Embedding(71291, 128)
  (regression_model): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=128, out_features=96, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=96, out_features=64, bias=True)
    (9): ReLU()
    (10): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=64, out_features=32, bias=True)
    (13): ReLU()
    (14): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (15): Dropout(p=0.2, inplace=False)
    (16): Linear(in_features=32, out_features=16, bias=True)
    (17): ReLU()
  

In [29]:
train_df.head(5)

# Create datasets
train_dataset = TextDataset(
    texts=train_df['title'].tolist(),
    targets=train_df['score'].tolist(),
    vocab=model.vocab  # Use the model's vocabulary
)

train_dataset

# Create dataloaders with the custom collate function
batch_size = 10

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=make_collate_fn(model, device)
)

# Inspect raw text inputs
sample_text = train_df['title'].iloc[0]
print(f"Raw text: {sample_text}")

# Tokenize the text
#tokens = sample_text.lower().split()
tokens = preprocess_text(sample_text)
print(f"Tokens: {tokens}")

# Convert tokens to indices using the vocabulary
vocab = vocab.load_vocab("../models/word2vec/text8_vocab_NWAll_MF5.json")
indices = [vocab.get_index(token) for token in tokens]
print(f"Token indices: {indices}")

# Get embeddings for the token indices
token_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)
embeddings = model.embedding(token_tensor)
print(f"Embeddings: {embeddings[0]}")

# Average the embeddings
avg_embedding = embeddings.mean(dim=1)
print(f"Averaged embedding: {avg_embedding}")

Raw text: Math's 'Game of Life' Reveals Long-Sought Repeating Patterns
Tokens: ['math', 's', 'game', 'of', 'life', 'reveals', 'long', 'sought', 'repeating', 'patterns']
2025-04-16 17:16:33 | DropoutDisco | INFO     | [vocabulary.py:110] | Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:Attempting to load vocabulary from: ../models/word2vec/text8_vocab_NWAll_MF5.json


2025-04-16 17:16:33 | DropoutDisco | INFO     | [vocabulary.py:123] | 📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


INFO:DropoutDisco:📚 Vocab loaded (71,291 words) from ../models/word2vec/text8_vocab_NWAll_MF5.json


Token indices: [26907, 342, 6576, 6, 863, 8133, 425, 786, 8075, 1850]
Embeddings: tensor([[ 0.5263,  0.9586, -1.8299,  ..., -2.0175, -1.5044,  1.4731],
        [ 0.2049,  1.3344, -0.9665,  ..., -1.1682,  0.2235,  0.1650],
        [-1.6386, -2.0149, -1.0736,  ...,  2.6468,  0.5863,  0.9648],
        ...,
        [ 1.3525,  3.0522,  1.5153,  ..., -0.1363, -2.0352,  2.7430],
        [ 0.1271, -1.0758, -1.9703,  ...,  1.5935, -0.6970,  0.1737],
        [-0.1854,  0.2421, -2.2333,  ..., -1.8195,  1.0001,  0.8238]],
       device='cuda:0')
Averaged embedding: tensor([[ 0.1076,  0.2204, -0.8686,  0.2745, -1.1864, -0.1390,  1.0319,  0.6062,
          0.2069,  0.1624,  0.0597,  0.1354,  0.8648,  0.4364,  0.0431,  0.7406,
          0.0552,  0.2273,  0.3179,  0.5687,  0.1716, -0.7294, -0.0464, -0.7934,
         -0.0365,  0.2184,  0.0496, -0.3312,  0.1448,  0.0438,  0.1571,  0.7432,
          0.3607,  0.9029,  0.6433, -0.4903, -0.4284,  1.1146, -0.0296, -0.3438,
          1.2740, -0.9281,  0.2797,