In [2]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


ModuleNotFoundError: No module named 'pandas'

In [2]:
# Load the CSV
# NOTE(review): this is an absolute, machine-specific location; the cell only
# runs where that exact file exists.
csv_path = '/Users/nigelkiernan/Documents/GitProjects/MLXProjects/data-1737988940684.csv'
data = pd.read_csv(csv_path)

In [None]:
# Inspect the data: plain-text dump of the first ten rows
print(data.iloc[:10])

In [4]:
# Preprocessing function
def preprocess_text(text):
    """Lowercase a title, strip non-letter characters, and tokenize it.

    Args:
        text: The raw title. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing CSV cell, or ``None``) is
            treated as an empty title.

    Returns:
        list: The tokens produced by ``word_tokenize`` on the cleaned text, or
        an empty list when ``text`` is not a string.
    """
    # Missing values would otherwise fail on ``.lower()``; an empty token list
    # lets downstream cells treat the row as a title with no words.
    if not isinstance(text, str):
        return []
    # Keep only ASCII letters and whitespace; digits and punctuation are dropped.
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return word_tokenize(cleaned)

In [None]:
# Preview the first ten rows (displayed as the cell's result).
data.head(n=10)

In [6]:
# Apply preprocessing: run every title through preprocess_text and keep the
# resulting token lists in a new 'tokens' column.
data['tokens'] = data['title'].apply(func=preprocess_text)

In [None]:
# Build the Skip-Gram Word2Vec Model

In [7]:
# Flatten tokenized text to build vocabulary
all_words = [token for title_tokens in data['tokens'] for token in title_tokens]
# Counter keeps keys in first-seen order, so ids follow order of first appearance.
word_counts = Counter(all_words)
vocab = dict(zip(word_counts, range(len(word_counts))))


In [None]:
# Vocabulary size: number of distinct tokens that received an id
vocab_size = len(vocab.keys())
print(f"Vocabulary size: {vocab_size}")


In [None]:
# Create Skip-Gram Pairs

In [9]:
# Generate skip-gram pairs
def generate_skip_gram_pairs(tokens, window_size, word_to_idx=None):
    """Build (center_id, context_id) training pairs for a skip-gram model.

    Args:
        tokens: Iterable of tokenized sentences (each a sequence of words).
        window_size: Maximum distance, in words, between a center word and a
            context word on either side.
        word_to_idx: Optional mapping from word to integer id. When omitted,
            the notebook-level ``vocab`` mapping is used, which matches the
            original behaviour.

    Returns:
        list[tuple]: One ``(center_id, context_id)`` tuple per center/context
        combination, ordered by sentence, then center position, then context
        position.

    Raises:
        KeyError: If a word that takes part in a pair is missing from the mapping.
    """
    if word_to_idx is None:
        # Backward-compatible default: fall back to the global vocabulary.
        word_to_idx = vocab

    pairs = []
    for sentence in tokens:
        sentence_len = len(sentence)
        for center_idx in range(sentence_len):
            # Clamp the window to the sentence bounds instead of generating
            # out-of-range offsets and skipping them one by one.
            first = max(0, center_idx - window_size)
            last = min(sentence_len, center_idx + window_size + 1)
            for context_idx in range(first, last):
                if context_idx == center_idx:
                    continue  # a word is never its own context
                pairs.append(
                    (word_to_idx[sentence[center_idx]], word_to_idx[sentence[context_idx]])
                )
    return pairs

In [None]:
# Create pairs
window_size = 2  # how many words on each side of the center word count as context
skip_gram_pairs = generate_skip_gram_pairs(
    tokens=data['tokens'],
    window_size=window_size,
)
print(f"Total skip-gram pairs: {len(skip_gram_pairs)}")

In [None]:
# PyTorch Dataset and Dataloader

In [11]:
class SkipGramDataset(Dataset):
    """Dataset wrapper around a sequence of (center, context) index pairs."""

    def __init__(self, pairs):
        # The sequence is stored as given; nothing is copied or converted here.
        self.pairs = pairs

    def __len__(self):
        """Return how many pairs are stored."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as two separate tensors: (center, context)."""
        center_id = self.pairs[idx][0]
        context_id = self.pairs[idx][1]
        return torch.tensor(center_id), torch.tensor(context_id)



In [12]:
# Create Dataset and DataLoader
dataset = SkipGramDataset(pairs=skip_gram_pairs)
# Batches of 128 pairs, with shuffling enabled.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=128)


In [13]:
# Define the Word2Vec Model

In [14]:
import torch.nn as nn


In [15]:
class Word2Vec(nn.Module):
    """Skip-gram scorer with separate embedding tables for center and context words.

    Args:
        vocab_size: Number of rows in each embedding table.
        embedding_dim: Length of every embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Two independent tables: one looked up for the center word, one for
        # the context word.
        self.center_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center, context):
        """Return one score per pair: the dot product of the two embeddings.

        ``center`` and ``context`` are index tensors; the product is summed
        over dimension 1 of the looked-up embeddings.
        """
        center_vecs = self.center_embeddings(center)
        context_vecs = self.context_embeddings(context)
        return (center_vecs * context_vecs).sum(dim=1)


In [16]:
# Train the Word2Vec Model

In [17]:
# Seed torch's global RNG before the model is built so the random embedding
# initialisation is repeatable across runs. The later train/test split already
# pins random_state=42; the same value is used here for consistency.
# NOTE(review): batch shuffling should also become repeatable, since the
# DataLoader is created without its own generator — confirm on the installed
# torch version.
torch.manual_seed(42)

embedding_dim = 100  # length of each word vector
word2vec_model = Word2Vec(vocab_size, embedding_dim)
optimizer = torch.optim.Adam(word2vec_model.parameters(), lr=0.01)
# The model outputs raw scores (logits); this loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()



In [None]:
# Training loop
# Skip-gram with negative sampling. Training on observed pairs alone (all
# labels 1) is minimised by pushing every score upward, so nothing forces
# related words to end up closer than unrelated ones. Each observed pair is
# therefore accompanied by randomly drawn "noise" contexts labelled 0.
num_negatives = 5  # noise contexts drawn per observed pair
for epoch in range(10):
    total_loss = 0
    for center, context in dataloader:
        batch_size = center.shape[0]

        # Noise contexts are sampled uniformly over the vocabulary ids; a draw
        # may occasionally coincide with a true context, which is tolerated.
        noise_context = torch.randint(0, vocab_size, (batch_size * num_negatives,))
        noise_center = center.repeat(num_negatives)

        batch_center = torch.cat([center, noise_center])
        batch_context = torch.cat([context, noise_context])
        labels = torch.cat([
            torch.ones(batch_size),                   # observed pairs
            torch.zeros(batch_size * num_negatives),  # sampled noise pairs
        ])

        optimizer.zero_grad()
        scores = word2vec_model(batch_center, batch_context)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Reports the sum of per-batch losses for the epoch (not an average).
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


In [None]:
# Generate Text Embeddings

In [19]:
# Create embedding matrix
def get_sentence_embedding(tokens, model, vocab):
    """Average the center-word embeddings of the known words in ``tokens``.

    Args:
        tokens: Sequence of words for one title.
        model: Object exposing ``center_embeddings`` (an ``nn.Embedding``).
        vocab: Mapping from word to row index in the embedding table. Words
            that are not in the mapping are ignored.

    Returns:
        numpy.ndarray: 1-D vector whose length is the model's embedding
        dimension. An all-zero vector is returned when no token is in ``vocab``.
    """
    indices = [vocab[word] for word in tokens if word in vocab]
    if not indices:
        # Size the fallback from the model itself rather than from a notebook
        # global, so the function also works with other embedding sizes.
        return np.zeros(model.center_embeddings.embedding_dim)
    # Convert the table once instead of once per word; .cpu() keeps this
    # working if the model has been moved off the CPU.
    weights = model.center_embeddings.weight.detach().cpu().numpy()
    return weights[indices].mean(axis=0)



In [20]:
# Generate embeddings for each title: one averaged vector per row of tokens
data['embedding'] = data['tokens'].apply(
    lambda title_tokens: get_sentence_embedding(title_tokens, word2vec_model, vocab)
)


In [None]:
# Late Fusion Integration

In [22]:
# Example numerical features
title_lengths = data['title'].str.len()  # number of characters in each title
data['length'] = title_lengths



In [24]:
# Combine embeddings with numerical features


# Stack the per-title embedding vectors into one 2-D array (one row per title).
X_text = np.vstack(list(data['embedding']))
# Double brackets keep this a 2-D column block rather than a 1-D vector.
X_numeric = data.loc[:, ['length']].values
# Late fusion: embedding columns first, numeric columns after.
X = np.concatenate([X_text, X_numeric], axis=1)
y = data['score'].values


In [None]:
# Train-Test Split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,  # fixed seed for the shuffle that precedes the split
    test_size=0.2,    # hold out 20% of the rows for evaluation
)


In [None]:
# Train Regression Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic; without a seed the metrics printed below
# change from run to run. random_state=42 matches the seed used for the
# train/test split and for the later RandomForestRegressor cell.
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Predict and evaluate
y_pred = regressor.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))


In [None]:
# Enhanced numerical features
# Regex patterns are written as raw strings: in a normal string literal '\d',
# '\w' and '\s' are invalid escape sequences, which newer Python versions warn
# about.
print("Generating numerical features...")
data['length'] = data['title'].str.len()
data['word_count'] = data['title'].str.split().str.len()
# Mean word length per title; titles with no words get 0. Splitting once
# avoids calling x.split() twice per row.
data['avg_word_length'] = data['title'].str.split().apply(
    lambda words: np.mean([len(word) for word in words]) if words else 0
)
data['contains_number'] = data['title'].str.contains(r'\d').astype(int)
data['starts_with_number'] = data['title'].str.match(r'^\d').astype(int)
data['capital_letters'] = data['title'].apply(lambda x: sum(1 for c in x if c.isupper()))
# The +1 in the denominator keeps a zero word count from dividing by zero.
data['word_density'] = data['length'] / (data['word_count'] + 1)
# Counts characters that are neither word characters nor whitespace.
data['punctuation_count'] = data['title'].str.count(r'[^\w\s]')

In [None]:
# Combine embeddings with numerical features
# One row per title: stacked embedding vectors, and the hand-built columns as
# a 2-D array in the order listed.
X_text = np.vstack(list(data['embedding']))
X_numeric = data.loc[:, [
    'length', 'word_count', 'avg_word_length', 'contains_number',
    'starts_with_number', 'capital_letters', 'word_density', 'punctuation_count',
]].values

In [None]:
# Late fusion: embedding columns first, numeric feature columns appended after
X = np.concatenate([X_text, X_numeric], axis=1)
y = data['score'].values


In [None]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,  # fixed seed for the shuffle that precedes the split
    test_size=0.2,    # hold out 20% of the rows for evaluation
)


In [None]:
# Train Random Forest Model
print("Training Random Forest model...")
# Seeded so repeated runs build the same forest.
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X=X_train, y=y_train)


In [None]:
# Predict and evaluate
y_pred = regressor.predict(X_test)
# Both metrics compare the held-out targets with the predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the held-out metrics computed in the previous cell.
# (The final print call was missing its closing parenthesis, which made the
# whole cell a SyntaxError.)
print("\nModel Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

In [None]:
# Feature importance analysis
# Names are listed in the same order as the columns of X: the embedding
# dimensions first, then the numeric features.
feature_names = [
    *(f'embedding_{i}' for i in range(embedding_dim)),
    'length', 'word_count', 'avg_word_length', 'contains_number',
    'starts_with_number', 'capital_letters', 'word_density', 'punctuation_count',
]

In [None]:
# Get feature importances
importances = regressor.feature_importances_
# Entries from position embedding_dim onward are the non-embedding features.
numeric_feature_names = feature_names[embedding_dim:]
numeric_feature_importances = importances[embedding_dim:]

# Print importance of numerical features
print("\nNumerical Feature Importances:")
for name, importance in zip(numeric_feature_names, numeric_feature_importances):
    print(f"{name}: {importance:.4f}")

In [None]:
# Model Evaluation
print("\n=== Model Performance Evaluation ===")

# Basic Metrics: recomputed here so the cell does not rely on earlier
# metric variables.
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = np.abs(y_test - y_pred).mean()

print(f"\nPerformance Metrics:")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")

# Feature Importance Analysis
print("\nFeature Importance Analysis:")
print("\nTop 10 Most Important Features:")
# Pair each name with its importance and order from most to least important.
feature_importance = sorted(
    zip(feature_names, regressor.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)
for name, importance in feature_importance[:10]:
    print(f"{name}: {importance:.4f}")

# Numerical Features Summary
print("\nNumerical Features Summary:")
numeric_features = [
    'length',
    'word_count',
    'avg_word_length',
    'contains_number',
    'starts_with_number',
    'capital_letters',
    'word_density',
    'punctuation_count',
]
print("\nFeature Statistics:")
print(data[numeric_features].describe())

# Correlation with Target
print("\nCorrelation with Target (score):")
# Correlate every numeric feature with 'score', listed from highest to lowest.
correlations = (
    data[numeric_features + ['score']]
    .corr()['score']
    .sort_values(ascending=False)
)
print(correlations)

In [None]:
# Sanity check: confirm the cell runs and show the frame's current shape and columns.
print("Testing if code is running...")
print("Current shape of data:", data.shape)
print("Available columns:", list(data.columns))

In [1]:
# Smoke test: prints a fixed message so a visible output confirms the cell executed.
print("Testing if notebook is running...")

Testing if notebook is running...
