In [84]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [85]:
# Load the CSV
data = pd.read_csv('/Users/nigelkiernan/Documents/GitProjects/MLXProjects/data-1737988940684.csv')

In [86]:
# Inspect the data
print(data.head(10))

                                               title  score
0                                       Y Combinator     57
1                      A Student's Guide to Startups     16
2             Woz Interview: the early days of Apple      7
3                              NYC Developer Dilemma      5
4  Google, YouTube acquisition announcement could...      7
5  Business Intelligence the Inkling Way: cool pr...      4
6                         Sevin Rosen Unfunds - why?      5
7                         LikeBetter featured by BBC     10
8           weekendr: social network for the weekend      4
9            PhotoShow: Broadcast Photos to Cable TV      3


In [87]:
# Preprocessing function
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())  # Remove punctuation and lowercase
    tokens = word_tokenize(text)
    return tokens

In [88]:
data.head(10)

Unnamed: 0,title,score
0,Y Combinator,57
1,A Student's Guide to Startups,16
2,Woz Interview: the early days of Apple,7
3,NYC Developer Dilemma,5
4,"Google, YouTube acquisition announcement could...",7
5,Business Intelligence the Inkling Way: cool pr...,4
6,Sevin Rosen Unfunds - why?,5
7,LikeBetter featured by BBC,10
8,weekendr: social network for the weekend,4
9,PhotoShow: Broadcast Photos to Cable TV,3


In [89]:
# Apply preprocessing
data['tokens'] = data['title'].apply(preprocess_text)

In [90]:
# Build the Skip-Gram Word2Vec Model

In [91]:
# Flatten tokenized text to build vocabulary
all_words = [word for tokens in data['tokens'] for word in tokens]
word_counts = Counter(all_words)
vocab = {word: i for i, word in enumerate(word_counts.keys())}


In [92]:
# Vocabulary size
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 10934


In [93]:
# Create Skip-Gram Pairs

In [94]:
# Generate skip-gram pairs
def generate_skip_gram_pairs(tokens, window_size):
    pairs = []
    for sentence in tokens:
        for center_idx in range(len(sentence)):
            for offset in range(-window_size, window_size + 1):
                context_idx = center_idx + offset
                if context_idx < 0 or context_idx >= len(sentence) or center_idx == context_idx:
                    continue
                pairs.append((vocab[sentence[center_idx]], vocab[sentence[context_idx]]))
    return pairs

In [95]:
# Create pairs
window_size = 2
skip_gram_pairs = generate_skip_gram_pairs(data['tokens'], window_size)
print(f"Total skip-gram pairs: {len(skip_gram_pairs)}")

Total skip-gram pairs: 250232


In [96]:
# PyTorch Dataset and Dataloader

In [97]:
class SkipGramDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return torch.tensor(self.pairs[idx][0]), torch.tensor(self.pairs[idx][1])



In [98]:
# Create Dataset and DataLoader
dataset = SkipGramDataset(skip_gram_pairs)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)


In [99]:
# Define the Word2Vec Model

In [100]:
import torch.nn as nn


In [101]:
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.center_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center, context):
        center_embeds = self.center_embeddings(center)
        context_embeds = self.context_embeddings(context)
        scores = torch.sum(center_embeds * context_embeds, dim=1)
        return scores


In [102]:
# Train the Word2Vec Model

In [103]:
embedding_dim = 200
word2vec_model = Word2Vec(vocab_size, embedding_dim)
optimizer = torch.optim.Adam(word2vec_model.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()



In [104]:
# Training loop
for epoch in range(10):
    total_loss = 0
    for center, context in dataloader:
        labels = torch.ones(center.shape[0])  # Positive samples
        optimizer.zero_grad()
        scores = word2vec_model(center, context)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 5494.0114
Epoch 2, Loss: 38.5411
Epoch 3, Loss: 0.3759
Epoch 4, Loss: 0.0850
Epoch 5, Loss: 0.0424
Epoch 6, Loss: 0.0199
Epoch 7, Loss: 0.0089
Epoch 8, Loss: 0.0038
Epoch 9, Loss: 0.0016
Epoch 10, Loss: 0.0007


In [105]:
# Generate Text Embeddings

In [106]:
# Create embedding matrix
def get_sentence_embedding(tokens, model, vocab):
    vectors = []
    for word in tokens:
        if word in vocab:
            vectors.append(model.center_embeddings.weight[vocab[word]].detach().numpy())
    return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)



In [107]:
# Generate embeddings for each title
data['embedding'] = data['tokens'].apply(lambda x: get_sentence_embedding(x, word2vec_model, vocab))


In [108]:
# Late Fusion Integration

In [109]:
# Example numerical features
data['length'] = data['title'].str.len()



In [110]:
# Combine embeddings with numerical features


X_text = np.vstack(data['embedding'].values)
X_numeric = data[['length']].values
X = np.hstack([X_text, X_numeric])
y = data['score'].values


In [111]:
# Train-Test Split

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [113]:
# Train Regression Model

In [114]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)

# Predict and evaluate
y_pred = regressor.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))


MSE: 50.75570540790115
R²: -0.10096995249862006


In [115]:
# Enhanced numerical features
print("Generating numerical features...")
data['length'] = data['title'].str.len()
data['word_count'] = data['title'].str.split().str.len()
data['avg_word_length'] = data['title'].apply(lambda x: np.mean([len(word) for word in x.split()]) if len(x.split()) > 0 else 0)
data['contains_number'] = data['title'].str.contains('\d').astype(int)
data['starts_with_number'] = data['title'].str.match('^\d').astype(int)
data['capital_letters'] = data['title'].apply(lambda x: sum(1 for c in x if c.isupper()))
data['word_density'] = data['length'] / (data['word_count'] + 1)
data['punctuation_count'] = data['title'].str.count('[^\w\s]')

Generating numerical features...


In [116]:
# Combine embeddings with numerical features
X_text = np.vstack(data['embedding'].values)
X_numeric = data[[
    'length',
    'word_count',
    'avg_word_length',
    'contains_number',
    'starts_with_number',
    'capital_letters',
    'word_density',
    'punctuation_count'
]].values

In [117]:
# Late fusion
X = np.hstack([X_text, X_numeric])
y = data['score'].values


In [118]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [119]:
# Train Random Forest Model
print("Training Random Forest model...")
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train)


Training Random Forest model...


In [120]:
# Predict and evaluate
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [121]:
print("\nModel Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")


Model Performance:
Mean Squared Error: 49.7469
R-squared Score: -0.0791


In [122]:
# Feature importance analysis
feature_names = [f'embedding_{i}' for i in range(embedding_dim)] + [
    'length',
    'word_count',
    'avg_word_length',
    'contains_number',
    'starts_with_number',
    'capital_letters',
    'word_density',
    'punctuation_count'
]

In [123]:
# Get feature importances
importances = regressor.feature_importances_
numeric_feature_importances = importances[embedding_dim:]
numeric_feature_names = feature_names[embedding_dim:]

# Print importance of numerical features
print("\nNumerical Feature Importances:")
for name, importance in zip(numeric_feature_names, numeric_feature_importances):
    print(f"{name}: {importance:.4f}")


Numerical Feature Importances:
length: 0.0143
word_count: 0.0210
avg_word_length: 0.0115
contains_number: 0.0002
starts_with_number: 0.0002
capital_letters: 0.0103
word_density: 0.0104
punctuation_count: 0.0035


In [124]:
# Model Evaluation
print("\n=== Model Performance Evaluation ===")

# Basic Metrics
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = np.mean(np.abs(y_test - y_pred))

print(f"\nPerformance Metrics:")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")

# Feature Importance Analysis
print("\nFeature Importance Analysis:")
print("\nTop 10 Most Important Features:")
feature_importance = list(zip(feature_names, regressor.feature_importances_))
feature_importance.sort(key=lambda x: x[1], reverse=True)
for name, importance in feature_importance[:10]:
    print(f"{name}: {importance:.4f}")

# Numerical Features Summary
print("\nNumerical Features Summary:")
numeric_features = [
    'length', 'word_count', 'avg_word_length', 'contains_number',
    'starts_with_number', 'capital_letters', 'word_density', 'punctuation_count'
]
print("\nFeature Statistics:")
print(data[numeric_features].describe())

# Correlation with Target
print("\nCorrelation with Target (score):")
correlations = data[numeric_features + ['score']].corr()['score'].sort_values(ascending=False)
print(correlations)


=== Model Performance Evaluation ===

Performance Metrics:
Mean Squared Error (MSE): 49.7469
Root Mean Squared Error (RMSE): 7.0531
Mean Absolute Error (MAE): 4.2717
R² Score: -0.0791

Feature Importance Analysis:

Top 10 Most Important Features:
embedding_183: 0.0675
word_count: 0.0210
embedding_111: 0.0202
embedding_82: 0.0174
length: 0.0143
avg_word_length: 0.0115
embedding_38: 0.0113
embedding_151: 0.0110
word_density: 0.0104
capital_letters: 0.0103

Numerical Features Summary:

Feature Statistics:
             length    word_count  avg_word_length  contains_number  \
count  10000.000000  10000.000000     10000.000000     10000.000000   
mean      48.870700      8.053300         5.372001         0.179400   
std       22.599191      3.890369         1.289503         0.383706   
min        1.000000      0.000000         0.000000         0.000000   
25%       33.000000      5.000000         4.500000         0.000000   
50%       45.000000      7.000000         5.200000         0.0000