In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import gensim.downloader as api
from sqlalchemy import create_engine
import pandas as pd
from tqdm import tqdm

In [2]:
class HackerNewsDataset(Dataset):
    def __init__(self, text_features, numerical_features, author_features, scores):
        self.text_features = torch.FloatTensor(text_features)
        self.numerical_features = torch.FloatTensor(numerical_features)
        self.author_features = torch.FloatTensor(author_features)
        self.scores = torch.FloatTensor(scores)
        
    def __len__(self):
        return len(self.scores)
    
    def __getitem__(self, idx):
        return {
            'text': self.text_features[idx],
            'numerical': self.numerical_features[idx],
            'author': self.author_features[idx],
            'score': self.scores[idx]
        }


In [5]:
class LateFusionModel(nn.Module):
    def __init__(self, text_dim, numerical_dim, author_dim):
        super().__init__()
        
        # Text processing branch
        self.text_network = nn.Sequential(
            nn.Linear(text_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64)
        )
        
        # Numerical processing branch
        self.numerical_network = nn.Sequential(
            nn.Linear(numerical_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16)
        )
        
        # Author processing branch
        self.author_network = nn.Sequential(
            nn.Linear(author_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32)
        )
        
        # Fusion layer
        self.fusion = nn.Sequential(
            nn.Linear(64 + 16 + 32, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )
        
    def forward(self, text_features, numerical_features, author_features):
        text_out = self.text_network(text_features)
        numerical_out = self.numerical_network(numerical_features)
        author_out = self.author_network(author_features)
        
        # Concatenate all features
        combined = torch.cat([text_out, numerical_out, author_out], dim=1)
        
        # Final prediction
        return self.fusion(combined)


In [6]:
def text_to_vector(text, model):
    """
    Convert text to a vector using word2vec model
    Returns average of word vectors
    """
    words = str(text).lower().split()
    word_vecs = [model[word] for word in words if word in model]
    if not word_vecs:
        return np.zeros(model.vector_size)
    return np.mean(word_vecs, axis=0)

def load_data():
    """
    Load data from PostgreSQL database
    """
    DB_URL = "postgres://sy91dhb:g5t49ao@178.156.142.230:5432/hd64m1ki"
    engine = create_engine(DB_URL)
    
    query = """
    SELECT title, score, "by" as author, descendants as num_comments
    FROM stories
    WHERE score > 0
    """
    
    df = pd.read_sql(query, engine)
    print(f"Loaded {len(df)} records from database")
    return df

In [7]:
def prepare_features(df, word2vec_model):
    """
    Prepare features for the model
    """
    # Text features
    print("Creating text feature vectors...")
    title_vectors = np.array([text_to_vector(title, word2vec_model) for title in tqdm(df['title'])])
    
    # Author features (one-hot encoding)
    author_dummies = pd.get_dummies(df['author'], prefix='author')
    
    # Numerical features
    numerical_features = df[['num_comments']].values
    scaler = StandardScaler()
    numerical_features_scaled = scaler.fit_transform(numerical_features)
    
    return title_vectors, numerical_features_scaled, author_dummies.values, df['score'].values

In [8]:
def train_model(model, train_loader, val_loader, num_epochs=10, learning_rate=0.001):
    """
    Train the late fusion model
    """
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            
            predictions = model(
                batch['text'],
                batch['numerical'],
                batch['author']
            )
            
            loss = criterion(predictions.squeeze(), batch['score'])
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                predictions = model(
                    batch['text'],
                    batch['numerical'],
                    batch['author']
                )
                val_loss += criterion(predictions.squeeze(), batch['score']).item()
        
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Training Loss: {train_loss/len(train_loader):.4f}")
        print(f"Validation Loss: {val_loss/len(val_loader):.4f}")
        print("--------------------")

In [9]:
import torch
from torch.utils.data import random_split
import gensim.downloader as api
from model import (
    load_data,
    prepare_features,
    HackerNewsDataset,
    LateFusionModel,
    train_model
)


ModuleNotFoundError: No module named 'model'