In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Load the CSV
data = pd.read_csv('/Users/nigelkiernan/Documents/GitProjects/MLXProjects/data-1737988940684.csv')

In [3]:
# Inspect the data
print(data.head(10))

                                               title  score
0                                       Y Combinator     57
1                      A Student's Guide to Startups     16
2             Woz Interview: the early days of Apple      7
3                              NYC Developer Dilemma      5
4  Google, YouTube acquisition announcement could...      7
5  Business Intelligence the Inkling Way: cool pr...      4
6                         Sevin Rosen Unfunds - why?      5
7                         LikeBetter featured by BBC     10
8           weekendr: social network for the weekend      4
9            PhotoShow: Broadcast Photos to Cable TV      3


In [4]:
# Preprocessing function
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())  # Remove punctuation and lowercase
    tokens = word_tokenize(text)
    return tokens

In [5]:
data.head(10)

Unnamed: 0,title,score
0,Y Combinator,57
1,A Student's Guide to Startups,16
2,Woz Interview: the early days of Apple,7
3,NYC Developer Dilemma,5
4,"Google, YouTube acquisition announcement could come tonight",7
5,Business Intelligence the Inkling Way: cool prediction markets software,4
6,Sevin Rosen Unfunds - why?,5
7,LikeBetter featured by BBC,10
8,weekendr: social network for the weekend,4
9,PhotoShow: Broadcast Photos to Cable TV,3


In [6]:
# Apply preprocessing
data['tokens'] = data['title'].apply(preprocess_text)

In [None]:
# Build the Skip-Gram Word2Vec Model

In [7]:
# Flatten tokenized text to build vocabulary
all_words = [word for tokens in data['tokens'] for word in tokens]
word_counts = Counter(all_words)
vocab = {word: i for i, word in enumerate(word_counts.keys())}


In [8]:
# Vocabulary size
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 10934


In [None]:
# Create Skip-Gram Pairs

In [9]:
# Generate skip-gram pairs
def generate_skip_gram_pairs(tokens, window_size):
    pairs = []
    for sentence in tokens:
        for center_idx in range(len(sentence)):
            for offset in range(-window_size, window_size + 1):
                context_idx = center_idx + offset
                if context_idx < 0 or context_idx >= len(sentence) or center_idx == context_idx:
                    continue
                pairs.append((vocab[sentence[center_idx]], vocab[sentence[context_idx]]))
    return pairs

In [10]:
# Create pairs
window_size = 2
skip_gram_pairs = generate_skip_gram_pairs(data['tokens'], window_size)
print(f"Total skip-gram pairs: {len(skip_gram_pairs)}")

Total skip-gram pairs: 250232


In [None]:
# PyTorch Dataset and Dataloader

In [11]:
class SkipGramDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return torch.tensor(self.pairs[idx][0]), torch.tensor(self.pairs[idx][1])



In [12]:
# Create Dataset and DataLoader
dataset = SkipGramDataset(skip_gram_pairs)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)


In [13]:
# Define the Word2Vec Model

In [14]:
import torch.nn as nn


In [15]:
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.center_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center, context):
        center_embeds = self.center_embeddings(center)
        context_embeds = self.context_embeddings(context)
        scores = torch.sum(center_embeds * context_embeds, dim=1)
        return scores


In [16]:
# Train the Word2Vec Model

In [17]:
embedding_dim = 100
word2vec_model = Word2Vec(vocab_size, embedding_dim)
optimizer = torch.optim.Adam(word2vec_model.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()



In [18]:
# Training loop
for epoch in range(10):
    total_loss = 0
    for center, context in dataloader:
        labels = torch.ones(center.shape[0])  # Positive samples
        optimizer.zero_grad()
        scores = word2vec_model(center, context)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 3790.7425
Epoch 2, Loss: 73.7711
Epoch 3, Loss: 1.5394
Epoch 4, Loss: 0.3664
Epoch 5, Loss: 0.1537
Epoch 6, Loss: 0.0635
Epoch 7, Loss: 0.0253
Epoch 8, Loss: 0.0099
Epoch 9, Loss: 0.0039
Epoch 10, Loss: 0.0015


In [None]:
# Generate Text Embeddings

In [19]:
# Create embedding matrix
def get_sentence_embedding(tokens, model, vocab):
    vectors = []
    for word in tokens:
        if word in vocab:
            vectors.append(model.center_embeddings.weight[vocab[word]].detach().numpy())
    return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)



In [20]:
# Generate embeddings for each title
data['embedding'] = data['tokens'].apply(lambda x: get_sentence_embedding(x, word2vec_model, vocab))


In [None]:
# Late Fusion Integration

In [22]:
# Example numerical features
data['length'] = data['title'].str.len()



In [24]:
# Combine embeddings with numerical features


X_text = np.vstack(data['embedding'].values)
X_numeric = data[['length']].values
X = np.hstack([X_text, X_numeric])
y = data['score'].values


In [None]:
# Train-Test Split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Train Regression Model

In [26]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)

# Predict and evaluate
y_pred = regressor.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))


MSE: 50.68267415569801
R²: -0.09938579139549253
