In [11]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
import wikipediaapi
import re
from typing import List, Dict
import logging

In [5]:
import torch
import torch.nn as nn
from typing import Dict, List, Optional
import numpy as np
from sklearn.preprocessing import StandardScaler
import logging

In [6]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
import wikipediaapi
import re
from typing import List, Dict
import logging

In [7]:
# Logger shared by the data-collection code in this cell; the root logger is
# configured at INFO level, the level the collectors log their progress at.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

class HackerNewsDataCollector:
    """Loads Hacker News stories from the ``stories`` table via a SQLAlchemy engine."""

    def __init__(self, db_url: str):
        """Remember the database URL and build the engine used for all queries.

        Args:
            db_url: Connection URL handed to ``create_engine``.
        """
        self.db_url = db_url
        self.engine = create_engine(db_url)

    def fetch_hn_data(self) -> pd.DataFrame:
        """Read every story with a positive score into a DataFrame.

        Returns:
            DataFrame with the columns ``title``, ``score``, ``author``,
            ``num_comments`` and ``comment_text`` (the last three are aliases
            of ``by``, ``descendants`` and ``text``).

        Raises:
            Exception: whatever the query raised; it is logged and re-raised.
        """
        stories_sql = """
        SELECT 
            title,
            score,
            by as author,
            descendants as num_comments,
            text as comment_text
        FROM stories
        WHERE score > 0
        """
        try:
            stories = pd.read_sql(stories_sql, self.engine)
            logger.info(f"Successfully fetched {len(stories)} records from HackerNews")
            return stories
        except Exception as exc:
            # Log for the notebook reader, then let the caller decide what to do.
            logger.error(f"Error fetching HackerNews data: {str(exc)}")
            raise

In [9]:
class WikipediaDataCollector:
    """Collects cleaned article texts from Wikipedia categories."""

    def __init__(self, language: str = 'en'):
        """Create the ``wikipediaapi`` client for the given language edition."""
        self.wiki = wikipediaapi.Wikipedia(language)

    def get_articles_by_category(self, category: str, max_articles: int = 1000) -> List[str]:
        """Gather cleaned article texts from a category and its direct subcategories.

        The walk is depth-first in member order and covers two category
        levels: the starting category and the subcategories listed in it.

        Args:
            category: Category name without the ``Category:`` prefix.
            max_articles: Stop collecting once this many texts were gathered.

        Returns:
            The cleaned, non-empty article texts in the order they were visited.
        """
        collected: List[str] = []
        depth_limit = 2

        def is_full() -> bool:
            return len(collected) >= max_articles

        def walk(page, level):
            # Stop before touching the page's members when nothing more is wanted.
            if level >= depth_limit or is_full():
                return

            for entry in page.categorymembers.values():
                if is_full():
                    break

                namespace = entry.ns
                if namespace == wikipediaapi.Namespace.MAIN:
                    cleaned = self._clean_text(entry.text)
                    if cleaned:
                        collected.append(cleaned)
                elif namespace == wikipediaapi.Namespace.CATEGORY:
                    # level < depth_limit is guaranteed by the guard above.
                    walk(entry, level + 1)

        walk(self.wiki.page(f"Category:{category}"), 0)
        logger.info(f"Collected {len(collected)} articles from Wikipedia category: {category}")
        return collected

    @staticmethod
    def _clean_text(text: str) -> str:
        """Strip numeric reference markers and URLs, then collapse whitespace."""
        substitutions = (
            (r'\[\d+\]', ''),   # reference numbers such as [12]
            (r'http\S+', ''),   # URLs
            (r'\s+', ' '),      # runs of whitespace -> single space
        )
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return text.strip()


In [10]:
def get_training_data(db_url: str, wiki_categories: List[str]) -> Dict[str, pd.DataFrame]:
    """Assemble the two training corpora: Hacker News stories and Wikipedia text.

    Args:
        db_url: Database URL passed to ``HackerNewsDataCollector``.
        wiki_categories: Wikipedia category names to harvest articles from.

    Returns:
        Dict with ``'hacker_news'`` (the fetched stories frame) and
        ``'wikipedia'`` (a frame with a single ``text`` column, one row per
        collected article, categories concatenated in the given order).
    """
    # Hacker News is fetched first, exactly one query.
    hn_frame = HackerNewsDataCollector(db_url).fetch_hn_data()

    # One Wikipedia client is reused for every category.
    wiki_source = WikipediaDataCollector()
    article_texts = [
        article
        for category in wiki_categories
        for article in wiki_source.get_articles_by_category(category)
    ]

    return {
        'hacker_news': hn_frame,
        'wikipedia': pd.DataFrame({'text': article_texts}),
    }

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from typing import List, Tuple
import logging
from collections import Counter
import re

# Logger used by the Word2Vec code in this cell; the root logger is configured
# at INFO level, the level the training loop reports its per-epoch loss at.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

class Word2VecDataset(Dataset):
    """Dataset of (target, context) word-index pairs built from raw documents."""

    def __init__(self, texts: List[str], window_size: int = 5, min_count: int = 5):
        """
        Tokenize the corpus, build the vocabulary and materialize all pairs.

        Args:
            texts: List of text documents
            window_size: Number of neighbours taken on each side of a target word
            min_count: Words seen fewer times than this map to ``<UNK>`` (index 0)
        """
        self.window_size = window_size

        # Tokenize every document once, then count tokens corpus-wide.
        self.processed_texts = [self._preprocess_text(document) for document in texts]
        self.word_counts = Counter()
        for tokens in self.processed_texts:
            self.word_counts.update(tokens)

        # Frequent words get indices 1..N in first-seen order; 0 is reserved for <UNK>.
        frequent_words = [word for word, freq in self.word_counts.items() if freq >= min_count]
        self.vocab = {word: position for position, word in enumerate(frequent_words, start=1)}
        self.vocab['<UNK>'] = 0

        self.idx_to_word = {position: word for word, position in self.vocab.items()}
        self.vocab_size = len(self.vocab)

        self.pairs = self._create_pairs()

    def _preprocess_text(self, text: str) -> List[str]:
        """Lowercase, drop everything except letters and whitespace, split on whitespace."""
        letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        return letters_only.split()

    def _create_pairs(self) -> List[Tuple[int, int]]:
        """Enumerate every (target, context) index pair inside the sliding window."""
        radius = self.window_size
        lookup = self.vocab
        collected = []

        for tokens in self.processed_texts:
            ids = [lookup.get(token, 0) for token in tokens]
            length = len(ids)

            for center, target_id in enumerate(ids):
                first = max(0, center - radius)
                stop = min(length, center + radius + 1)
                # Every neighbour in the window except the target position itself.
                collected.extend(
                    (target_id, ids[neighbour])
                    for neighbour in range(first, stop)
                    if neighbour != center
                )

        return collected

    def __len__(self) -> int:
        """Number of training pairs."""
        return len(self.pairs)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the idx-th pair as two scalar index tensors."""
        target_id, context_id = self.pairs[idx]
        return torch.tensor(target_id), torch.tensor(context_id)

class Word2Vec(nn.Module):
    """Two embedding tables whose dot product scores a (target, context) pair."""

    def __init__(self, vocab_size: int, embedding_dim: int):
        """
        Build the target and context embedding tables.

        Args:
            vocab_size: Size of vocabulary
            embedding_dim: Dimension of word embeddings
        """
        super().__init__()

        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Overwrite the default initialisation with small uniform values.
        for table in (self.target_embeddings, self.context_embeddings):
            nn.init.uniform_(table.weight, -0.1, 0.1)

    def forward(self, target: torch.Tensor, context: torch.Tensor) -> torch.Tensor:
        """Return one logit per pair: the dot product of target and context vectors."""
        pairwise = self.target_embeddings(target) * self.context_embeddings(context)
        return pairwise.sum(dim=1)

    def get_word_vector(self, word_idx: int) -> torch.Tensor:
        """Return the target-table embedding of ``word_idx``, detached from autograd."""
        return self.target_embeddings.weight.detach()[word_idx]

def train_word2vec(texts: List[str], 
                  embedding_dim: int = 100,
                  window_size: int = 5,
                  min_count: int = 5,
                  batch_size: int = 1024,
                  num_epochs: int = 5,
                  learning_rate: float = 0.025,
                  num_negatives: int = 5) -> Tuple[Word2Vec, Word2VecDataset]:
    """
    Train a skip-gram Word2Vec model with negative sampling.

    Every observed (target, context) pair is a positive example (label 1).
    For each of them ``num_negatives`` context words are drawn from a
    unigram^0.75 noise distribution and used as negative examples (label 0).
    Without negatives the loss can be driven to zero simply by making all
    dot products large, which yields degenerate embeddings.

    Args:
        texts: List of input texts
        embedding_dim: Dimension of word embeddings
        window_size: Context window size
        min_count: Minimum word frequency
        batch_size: Training batch size
        num_epochs: Number of training epochs
        learning_rate: Learning rate
        num_negatives: Negative samples drawn per positive pair; ``0`` trains
            on positive pairs only.

    Returns:
        Trained model and dataset

    Raises:
        ValueError: if ``texts`` produces no training pairs.
    """
    # Create dataset
    dataset = Word2VecDataset(texts, window_size, min_count)
    if len(dataset) == 0:
        raise ValueError(
            "No (target, context) training pairs could be built from `texts`; "
            "provide documents containing at least two words."
        )
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Initialize model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Word2Vec(dataset.vocab_size, embedding_dim).to(device)

    # Noise distribution is fixed for the whole run, so build it once.
    noise_dist = None
    if num_negatives > 0:
        noise_dist = _build_noise_distribution(dataset).to(device)

    # Initialize optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    # Training loop
    logger.info("Starting Word2Vec training...")
    for epoch in range(num_epochs):
        total_loss = 0.0
        for target, context in dataloader:
            target = target.to(device)
            context = context.to(device)

            # Positive pairs: observed co-occurrences, label 1
            logits = model(target, context)
            labels = torch.ones_like(logits)

            if noise_dist is not None:
                # Negative pairs: each target repeated num_negatives times and
                # paired with randomly drawn context words, label 0. A draw may
                # occasionally coincide with the true context; that noise is
                # accepted rather than filtered.
                negative_context = torch.multinomial(
                    noise_dist, target.numel() * num_negatives, replacement=True
                )
                negative_logits = model(
                    target.repeat_interleave(num_negatives), negative_context
                )
                logits = torch.cat([logits, negative_logits])
                labels = torch.cat([labels, torch.zeros_like(negative_logits)])

            loss = criterion(logits, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        logger.info(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    return model, dataset


def _build_noise_distribution(dataset: "Word2VecDataset", power: float = 0.75) -> torch.Tensor:
    """Return the unigram**power sampling distribution over vocabulary indices."""
    frequencies = [0.0] * dataset.vocab_size
    for word, count in dataset.word_counts.items():
        # Words below min_count are not in the vocab and pool into <UNK> (index 0).
        frequencies[dataset.vocab.get(word, 0)] += count
    weights = torch.tensor(frequencies, dtype=torch.float).pow(power)
    return weights / weights.sum()

In [13]:
import torch
import torch.nn as nn
from typing import Dict, List, Optional
import numpy as np
from sklearn.preprocessing import StandardScaler
import logging

In [14]:
# Logger for this part of the notebook; train_late_fusion reports its
# per-epoch loss through it.
logger = logging.getLogger(__name__)

class FeatureExtractor:
    """Builds model inputs from raw fields: averaged word vectors for titles
    and standardized values for numerical columns."""

    def __init__(self, word2vec_model, word2vec_dataset):
        """Initialize feature extractor with trained word2vec model.

        Args:
            word2vec_model: Model exposing ``target_embeddings`` (an embedding
                table indexed by vocabulary index).
            word2vec_dataset: Dataset exposing ``_preprocess_text`` and ``vocab``
                so titles are tokenized exactly like the training corpus.
        """
        self.word2vec_model = word2vec_model
        self.word2vec_dataset = word2vec_dataset
        self.numerical_scaler = StandardScaler()

    def extract_title_features(self, titles: List[str]) -> torch.Tensor:
        """Embed each title as the mean of its word vectors.

        Words missing from the vocabulary use the ``<UNK>`` vector (index 0).
        Titles with no usable words, including ``None``/NaN entries, become
        zero vectors.

        Args:
            titles: Iterable of title strings.

        Returns:
            Tensor of shape ``(len(titles), embedding_dim)`` on the same device
            and with the same dtype as the embedding table, detached from
            autograd. An empty input yields a ``(0, embedding_dim)`` tensor.
        """
        # Detached view of the embedding table: features must not carry
        # gradients back into the word2vec model.
        embeddings = self.word2vec_model.target_embeddings.weight.detach()
        embedding_dim = embeddings.shape[1]
        vocab = self.word2vec_dataset.vocab

        title_vectors = []
        for title in titles:
            # Missing titles (None / NaN) are treated like empty strings.
            words = self.word2vec_dataset._preprocess_text(title) if isinstance(title, str) else []

            if not words:
                # new_zeros keeps the device/dtype of the embeddings, so the
                # final stack also works when the model lives on the GPU.
                title_vectors.append(embeddings.new_zeros(embedding_dim))
                continue

            word_indices = torch.tensor(
                [vocab.get(word, 0) for word in words],
                dtype=torch.long,
                device=embeddings.device,
            )
            # One gather + mean instead of stacking per-word lookups.
            title_vectors.append(embeddings[word_indices].mean(dim=0))

        if not title_vectors:
            return embeddings.new_zeros((0, embedding_dim))
        return torch.stack(title_vectors)

    def extract_numerical_features(self, data: Dict[str, List]) -> np.ndarray:
        """Extract and normalize numerical features.

        The scaler is fitted on the first call (detected through its ``mean_``
        attribute) and only applied on later calls, so features extracted
        afterwards reuse the statistics of the first data set.

        Args:
            data: Mapping that provides a ``'num_comments'`` sequence.

        Returns:
            Array of shape ``(n_samples, 1)`` with the scaled comment counts.
        """
        numerical_features = np.column_stack([
            data['num_comments']
        ])

        # Fit scaler if not already fit
        if not hasattr(self.numerical_scaler, 'mean_'):
            self.numerical_scaler.fit(numerical_features)

        return self.numerical_scaler.transform(numerical_features)

In [15]:
class LateFusionModel(nn.Module):
    """Regressor that encodes text and numerical inputs in separate branches
    and merges the two encodings in a final fusion network."""

    def __init__(self, text_embedding_dim: int, num_numerical_features: int):
        """
        Build the text branch, the numerical branch and the fusion head.

        Args:
            text_embedding_dim: Dimension of text embeddings from word2vec
            num_numerical_features: Number of numerical features
        """
        super().__init__()

        # Output widths of the two branches; their sum feeds the fusion head.
        text_out_dim, numerical_out_dim = 32, 8

        # Text branch: embedding -> 64 -> 32
        self.text_network = nn.Sequential(
            nn.Linear(text_embedding_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, text_out_dim),
        )

        # Numerical branch: features -> 16 -> 8
        self.numerical_network = nn.Sequential(
            nn.Linear(num_numerical_features, 16),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(16, numerical_out_dim),
        )

        # Fusion head: concatenated encodings -> 32 -> 16 -> 1
        self.fusion_network = nn.Sequential(
            nn.Linear(text_out_dim + numerical_out_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
        )

    def forward(self, text_features: torch.Tensor, numerical_features: torch.Tensor) -> torch.Tensor:
        """
        Encode both inputs, concatenate along dim 1 and predict one value per row.

        Args:
            text_features: Tensor of text features from word2vec
            numerical_features: Tensor of numerical features

        Returns:
            Tensor with one column holding the prediction for each row.
        """
        encoded_branches = (
            self.text_network(text_features),
            self.numerical_network(numerical_features),
        )
        return self.fusion_network(torch.cat(encoded_branches, dim=1))

In [16]:
def train_late_fusion(model: LateFusionModel,
                     feature_extractor: FeatureExtractor,
                     train_data: Dict,
                     num_epochs: int = 10,
                     batch_size: int = 32,
                     learning_rate: float = 0.001,
                     shuffle: bool = True) -> LateFusionModel:
    """
    Train the Late Fusion model with mini-batch Adam on an MSE objective.

    Features are extracted once up front; each epoch then iterates over the
    samples in mini-batches.

    Args:
        model: Late Fusion model instance
        feature_extractor: Feature extractor instance
        train_data: Dictionary containing training data; must provide
            ``'title'`` and ``'score'`` plus whatever the feature extractor
            reads for the numerical features
        num_epochs: Number of training epochs
        batch_size: Training batch size
        learning_rate: Learning rate
        shuffle: Visit the samples in a fresh random order every epoch.
            ``False`` keeps the order of ``train_data``.

    Returns:
        The trained model (moved to the training device, left in train mode).

    Raises:
        ValueError: if ``train_data`` contains no samples.
    """
    n_samples = len(train_data['title'])
    if n_samples == 0:
        raise ValueError("train_data contains no samples to train on")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Extract features
    title_features = feature_extractor.extract_title_features(train_data['title'])
    numerical_features = feature_extractor.extract_numerical_features(train_data)

    # Convert to float32 tensors on the training device. np.asarray accepts
    # lists, arrays and pandas Series alike.
    title_features = title_features.to(device)
    numerical_features = torch.as_tensor(
        np.asarray(numerical_features), dtype=torch.float32
    ).to(device)
    targets = torch.as_tensor(
        np.asarray(train_data['score']), dtype=torch.float32
    ).to(device)

    # Initialize optimizer and loss
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    # Make sure dropout is active even if the caller passed a model in eval mode.
    model.train()

    logger.info("Starting Late Fusion model training...")
    for epoch in range(num_epochs):
        if shuffle:
            # New permutation per epoch so batches differ between epochs.
            order = torch.randperm(n_samples, device=device)
        else:
            order = torch.arange(n_samples, device=device)

        total_loss = 0.0
        for start_idx in range(0, n_samples, batch_size):
            batch_indices = order[start_idx:start_idx + batch_size]

            # Get batch
            batch_title_features = title_features[batch_indices]
            batch_numerical_features = numerical_features[batch_indices]
            batch_targets = targets[batch_indices]

            # Forward pass. squeeze(-1) drops only the output column, so a
            # final batch of one sample keeps shape (1,) like its target.
            predictions = model(batch_title_features, batch_numerical_features)
            loss = criterion(predictions.squeeze(-1), batch_targets)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by batch size so a short last batch does not skew the mean.
            total_loss += loss.item() * batch_indices.numel()

        avg_loss = total_loss / n_samples
        logger.info(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    return model

In [20]:
import logging
import torch
from typing import List
import pandas as pd
from sklearn.model_selection import train_test_split

from data_collection import get_training_data
from word2vec_model import train_word2vec
from late_fusion_model import LateFusionModel, FeatureExtractor, train_late_fusion


In [None]:
"""
Hacker News Predictor package.
This package contains modules for predicting Hacker News article scores using a Late Fusion model.
"""