In [76]:
import datasets
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

from deep_sentences.autoencoder import Autoencoder

In [2]:
# Load the `default` configuration of the `mteb/stsbenchmark-sts` dataset.
ds = datasets.load_dataset(path="mteb/stsbenchmark-sts", name="default")

In [3]:
# Merge the train and validation splits into a single development set.
dev_set = datasets.concatenate_datasets(
    [ds[split_name] for split_name in ('train', 'validation')]
)

In [4]:
# Materialise the development set as a pandas DataFrame for column-wise access.
dev_df = pd.DataFrame(data=dev_set)

In [5]:
# Stack both sentence columns into one Series and keep each distinct sentence once.
all_sentences = (
    pd.concat(
        [dev_df[column] for column in ('sentence1', 'sentence2')],
        ignore_index=True,
    )
    .drop_duplicates()
)

In [6]:
# TF-IDF features: lower-cased text, English stop words removed, vocabulary
# capped at 2000 terms, rows scaled to unit L2 norm.
vectorizer = TfidfVectorizer(
    norm='l2',
    max_features=2000,
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary/IDF weights and vectorise every unique sentence in one pass.
tfidf = vectorizer.fit_transform(all_sentences)

In [7]:
# Check TF-IDF matrix statistics (numpy is already imported as `np` in the first cell)
print(f"TF-IDF matrix shape: {tfidf.shape}")
print(f"TF-IDF matrix density: {tfidf.nnz / (tfidf.shape[0] * tfidf.shape[1]) * 100:.2f}%")
print(f"Number of non-zero elements: {tfidf.nnz}")

# Convert to dense for inspection.
# NOTE: this materialises the full (rows x features) matrix in memory.
tfidf_dense = tfidf.toarray()
print("\nTF-IDF statistics:")
print(f"  Min: {tfidf_dense.min():.6f}")
print(f"  Max: {tfidf_dense.max():.6f}")
print(f"  Mean: {tfidf_dense.mean():.6f}")
print(f"  Std: {tfidf_dense.std():.6f}")

# Check row norms. L2 normalization gives every non-empty row a norm of 1.0;
# a row with no non-zero TF-IDF entries stays all-zero and therefore has norm 0.0,
# so a minimum of 0 here means some sentences have an empty feature vector.
row_norms = np.linalg.norm(tfidf_dense, axis=1)
print("\nRow L2 norms:")
print(f"  Mean: {row_norms.mean():.6f}")
print(f"  Std: {row_norms.std():.6f}")
print(f"  Min: {row_norms.min():.6f}")
print(f"  Max: {row_norms.max():.6f}")

TF-IDF matrix shape: (13227, 2000)
TF-IDF matrix density: 0.22%
Number of non-zero elements: 57105

TF-IDF statistics:
  Min: 0.000000
  Max: 1.000000
  Mean: 0.000986
  Std: 0.022212

Row L2 norms:
  Mean: 0.988660
  Std: 0.105886
  Min: 0.000000
  Max: 1.000000


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping

In [9]:
class TfidfDataset(Dataset):
    """Map-style dataset that serves TF-IDF rows as float32 tensors.

    Args:
        tfidf_matrix: Feature matrix with one row per sample. Either a sparse
            matrix exposing ``.toarray()`` (e.g. the output of
            ``TfidfVectorizer``) or an already-dense array-like.
    """

    def __init__(self, tfidf_matrix):
        # Densify sparse input; dense input is used as-is.
        if hasattr(tfidf_matrix, "toarray"):
            dense = tfidf_matrix.toarray()
        else:
            dense = tfidf_matrix
        # torch.tensor with an explicit dtype replaces the legacy
        # torch.FloatTensor constructor and always copies the data.
        self.features = torch.tensor(dense, dtype=torch.float32)

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features, 0)``; the 0 is a dummy target (autoencoder needs no labels)."""
        return self.features[idx], 0

In [10]:
# Wrap the TF-IDF matrix in a torch Dataset
dataset = TfidfDataset(tfidf)

# 80% of the rows go to training (rounded down); the remainder is validation
n_samples = len(dataset)
train_size = int(0.8 * n_samples)
val_size = n_samples - train_size

# A dedicated, fixed-seed generator drives the random split
split_rng = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_rng)

print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}")

Train size: 10581, Validation size: 2646


In [11]:
# Create data loaders
batch_size = 64

# Only the training loader shuffles. num_workers=0 is used for CPU training
# to avoid multiprocessing overhead.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=0)

In [12]:
# Pull one training batch and print summary statistics to spot data problems early
sample_batch, _ = next(iter(train_loader))

print(f"Sample batch shape: {sample_batch.shape}")
print(f"Sample batch dtype: {sample_batch.dtype}")
for stat_name in ("min", "max", "mean", "std"):
    # Calls sample_batch.min(), .max(), .mean(), .std() in turn
    print(f"Sample batch {stat_name}: {getattr(sample_batch, stat_name)():.6f}")

zero_percentage = (sample_batch == 0).float().mean() * 100
print(f"Percentage of zeros: {zero_percentage:.2f}%")
print(f"\nFirst sample (first 20 features): {sample_batch[0, :20]}")

Sample batch shape: torch.Size([64, 2000])
Sample batch dtype: torch.float32
Sample batch min: 0.000000
Sample batch max: 1.000000
Sample batch mean: 0.000970
Sample batch std: 0.022340
Percentage of zeros: 99.80%

First sample (first 20 features): tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


In [13]:
# Seed the random number generators before the model is created so that
# weight initialisation and the shuffled training batches are reproducible
# when the notebook is re-run from a fresh kernel.
L.seed_everything(42, workers=True)

# Initialize the Autoencoder model
model = Autoencoder(
    # Input -> Hidden -> Latent. The input width is taken from the TF-IDF
    # matrix so it always matches the fitted vocabulary size.
    dimensions=[tfidf.shape[1], 300, 100],
    activation="relu",
    latent_activation="linear",
    l1_alpha=0.001,  # L1 regularization on latent space
    learning_rate=3e-4,
    weight_decay=0.01
)

print("Model architecture:")
print(f"Encoder: {model.encoder}")
print(f"\nDecoder: {model.decoder}")

Model architecture:
Encoder: Sequential(
  (0): Linear(in_features=2000, out_features=300, bias=True)
  (1): ReLU()
  (2): Linear(in_features=300, out_features=100, bias=True)
)

Decoder: Sequential(
  (0): Linear(in_features=100, out_features=300, bias=True)
  (1): ReLU()
  (2): Linear(in_features=300, out_features=2000, bias=True)
)


In [14]:
# Sanity-check one forward pass and the loss terms before any training happens
test_batch, _ = next(iter(train_loader))

# No gradients are needed for this check
with torch.no_grad():
    test_output = model(test_batch)
    test_loss, test_recon, test_l1 = model.compute_loss(test_batch, test_output)

print(f"Test batch shape: {test_batch.shape}")
print(f"Test output shape: {test_output.shape}")
print(f"Test output min: {test_output.min():.6f}, max: {test_output.max():.6f}")
print(f"Test output mean: {test_output.mean():.6f}, std: {test_output.std():.6f}")

print("\nInitial losses (before training):")
for loss_label, loss_value in (
    ("Total loss", test_loss),
    ("Reconstruction loss", test_recon),
    ("L1 loss", test_l1),
):
    print(f"  {loss_label}: {loss_value:.6f}")

Test batch shape: torch.Size([64, 2000])
Test output shape: torch.Size([64, 2000])
Test output min: -0.120033, max: 0.123690
Test output mean: 0.000968, std: 0.041581

Initial losses (before training):
  Total loss: 0.002234
  Reconstruction loss: 0.002208
  L1 loss: 0.000027


In [15]:
# Checkpointing: keep the three checkpoints with the lowest val_loss, plus the last one
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=3,
    save_last=True,
    dirpath='../checkpoints/autoencoder',
    filename='autoencoder-{epoch:02d}-{val_loss:.4f}',
)

# Early stopping: monitor val_loss (lower is better) with a patience of 10
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    verbose=True,
)

In [16]:
# Build the Lightning trainer: up to 100 epochs on a single CPU device,
# with the checkpoint and early-stopping callbacks attached.
trainer = L.Trainer(
    accelerator='cpu',  # Force CPU usage
    devices=1,
    max_epochs=100,
    callbacks=[checkpoint_callback, early_stopping],
    log_every_n_steps=10,
    enable_model_summary=True,
    enable_progress_bar=True,
)

print(f"Trainer initialized. Using device: {trainer.strategy.root_device}")

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores


Trainer initialized. Using device: cpu


/home/felipefg/Dropbox/DSc/src/CPE727/CPE727-2025-03/TrabalhoFinal/FelipeGrael/.venv/lib/python3.13/site-packages/lightning/pytorch/trainer/setup.py:175: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.


In [17]:
# Fit the autoencoder on the training loader, validating on the validation loader
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

/home/felipefg/Dropbox/DSc/src/CPE727/CPE727-2025-03/TrabalhoFinal/FelipeGrael/.venv/lib/python3.13/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:881: Checkpoint directory /home/felipefg/Dropbox/DSc/src/CPE727/CPE727-2025-03/TrabalhoFinal/FelipeGrael/checkpoints/autoencoder exists and is not empty.


Output()

Metric val_loss improved. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.000
Metric val_loss improved by 0.000 >= min

In [18]:
# Path of the best checkpoint recorded by the checkpoint callback
best_model_path = checkpoint_callback.best_model_path
print(f"Best model saved at: {best_model_path}")

# Restore the model from that checkpoint and switch it to evaluation mode
best_model = Autoencoder.load_from_checkpoint(best_model_path).eval()

print("\nBest model metrics:")
print(f"Validation loss: {checkpoint_callback.best_model_score:.6f}")

Best model saved at: /home/felipefg/Dropbox/DSc/src/CPE727/CPE727-2025-03/TrabalhoFinal/FelipeGrael/checkpoints/autoencoder/autoencoder-epoch=99-val_loss=0.0004.ckpt

Best model metrics:
Validation loss: 0.000354


In [19]:
# Example: encode one validation batch into the 100-dimensional latent space
sample_batch, _ = next(iter(val_loader))

with torch.no_grad():
    # Latent codes and full reconstructions for the same batch
    latent_representations = best_model.encode(sample_batch)
    reconstructions = best_model(sample_batch)

# Per-sample mean squared reconstruction error
squared_diff = (sample_batch - reconstructions) ** 2
recon_error = squared_diff.mean(dim=1)

print(f"Sample batch shape: {sample_batch.shape}")
print(f"Latent representation shape: {latent_representations.shape}")
print(f"Mean reconstruction error: {recon_error.mean():.6f}")
print(f"Std reconstruction error: {recon_error.std():.6f}")

Sample batch shape: torch.Size([64, 2000])
Latent representation shape: torch.Size([64, 100])
Mean reconstruction error: 0.000366
Std reconstruction error: 0.000128


In [33]:
# Quick check of the latent tensor's dtype (displayed as the cell output)
latent_representations.dtype

torch.float32

In [83]:
def autoencoder_similarity(row, max_score=5):
    """Score a sentence pair by the cosine similarity of their latent codes.

    Both sentences are vectorised with the fitted ``vectorizer``, encoded with
    ``best_model.encode`` and compared with cosine similarity.

    Args:
        row: Record with ``'sentence1'`` and ``'sentence2'`` text fields
            (e.g. one row of ``dev_df``).
        max_score: Factor the cosine similarity is multiplied by. Defaults to 5,
            presumably to match the scale of the gold ``score`` column -- confirm.

    Returns:
        ``max_score`` times the cosine similarity of the two latent vectors.
    """
    features = vectorizer.transform([row['sentence1'], row['sentence2']])

    # Run on whatever device the model lives on instead of assuming CPU.
    device = next(best_model.parameters()).device
    batch = torch.as_tensor(features.toarray(), dtype=torch.float32).to(device)

    with torch.no_grad():
        latent = best_model.encode(batch)

    # Bring the result back to host memory before handing it to scikit-learn.
    latent = latent.cpu().numpy()
    # Slices keep the 2-D (1, dim) shape that cosine_similarity expects.
    return max_score * cosine_similarity(latent[:1], latent[1:2])[0][0]

In [84]:
# Score every sentence pair row by row and store the result as a new `ae_sim` column.
dev_df['ae_sim'] = dev_df.apply(func=autoencoder_similarity, axis=1)

In [85]:
# Pearson correlation between the gold scores and the autoencoder similarity.
# NOTE(review): dev_df holds the same train+validation sentences that the
# vectorizer and autoencoder were fitted on, so this is an in-sample figure.
pearsonr(x=dev_df['score'], y=dev_df['ae_sim'])

PearsonRResult(statistic=np.float64(0.49031845585933187), pvalue=np.float64(0.0))