In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive. The dataset,
# feature and model paths used by the later cells all live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

# Read the bz2-compressed, tab-separated training file from Drive.
# header=None makes pandas treat the first line as data, so the columns come
# back with integer labels.
train_path = "/content/drive/MyDrive/cs3244/dataset/train-balanced.csv.bz2"
df = pd.read_csv(train_path, sep="\t", header=None, compression="bz2")


In [4]:
# Replace the integer column labels with readable names (ten columns, in file
# order). Later cells rely on "label" and "text".
column_names = [
    "label",
    "text",
    "author",
    "subreddit",
    "upvotes",
    "downvotes",
    "score",
    "date",
    "timestamp",
    "comment",
]
df.columns = column_names

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm

def embed_text(df, text_col="text", model_name="all-MiniLM-L6-v2", batch_size=64, device=None):
    """
    Embed one text column of a dataframe with a pretrained SentenceTransformer.

    A new model instance is loaded on every call, so embed all rows in one call
    rather than calling this repeatedly.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing the text column.
    text_col : str
        Name of the column containing text to embed.
    model_name : str
        Pretrained model name from Hugging Face (default = 'all-MiniLM-L6-v2').
    batch_size : int
        Batch size for encoding (higher = faster but more memory).
    device : str or None
        'cuda' for GPU, 'cpu' for CPU, None = let the library choose.

    Returns
    -------
    np.ndarray
        L2-normalised embedding matrix of shape (n_samples, embedding_dim).
        An empty dataframe yields an array of shape (0, embedding_dim).

    Raises
    ------
    KeyError
        If ``text_col`` is not a column of ``df``.
    """
    # Load model
    model = SentenceTransformer(model_name, device=device)

    # Extract text. Missing values are mapped to "" first: astype(str) alone
    # would turn NaN/None into the literal words "nan"/"None", which would then
    # be embedded as if they were real comment text.
    texts = df[text_col].fillna("").astype(str).tolist()

    # Keep the documented 2-D shape even when there is nothing to encode.
    if not texts:
        return np.empty((0, model.get_sentence_embedding_dimension()), dtype=np.float32)

    # Encode in batches
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True  # L2 normalize vectors
    )

    return embeddings


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm

import os

# Use the GPU when Colab has one attached, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-mpnet-base-v2", device=device)

batch_size = 128  # rows handed to model.encode() per loop iteration
save_path = "/content/drive/MyDrive/cs3244/features/embeddings_memmap.npy"

n_samples = len(df)
embedding_dim = model.get_sentence_embedding_dimension()

# np.memmap(mode='w+') creates the file but not its parent directory.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Create memory-mapped file (doesn't load all in RAM).
# NOTE: despite the .npy extension this is a raw float32 buffer with no header.
# It must be reopened with np.memmap using the same dtype and shape, not np.load.
embeddings = np.memmap(save_path, dtype='float32', mode='w+', shape=(n_samples, embedding_dim))

for start_idx in tqdm(range(0, n_samples, batch_size)):
    end_idx = min(start_idx + batch_size, n_samples)
    # Map missing comments to "" first; astype(str) alone would embed the
    # literal words "nan"/"None" for those rows.
    batch_texts = df["text"].iloc[start_idx:end_idx].fillna("").astype(str).tolist()

    batch_embeddings = model.encode(
        batch_texts,
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    # Row i of the memmap corresponds to row i of df (positional order).
    embeddings[start_idx:end_idx] = batch_embeddings

# Flush to disk
embeddings.flush()
print("Embeddings saved incrementally to:", save_path)


100%|██████████| 8795/8795 [34:14<00:00,  4.28it/s]

Embeddings saved incrementally to: /content/drive/MyDrive/cs3244/features/embeddings_memmap.npy





In [None]:
import numpy as np

# Re-open the embedding file as a read-only memory map. The shape has to be
# passed explicitly, so it is rebuilt from the dataframe length and the
# embedding width.
memmap_path = "/content/drive/MyDrive/cs3244/features/embeddings_memmap.npy"
n_samples = len(df)   # one embedding row per dataframe row
embedding_dim = 768   # embedding width (all-mpnet-base-v2)

embeddings = np.memmap(memmap_path, dtype='float32', mode='r', shape=(n_samples, embedding_dim))

print("Embeddings shape:", embeddings.shape)


Embeddings shape: (1125678, 768)


In [5]:
import os

import numpy as np
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.utils import gen_batches

# ----- PARAMETERS -----
n_samples = len(df)
embedding_dim = 768
pca_components = 100    # reduce to 100 dimensions
batch_size_pca = 1024   # batch size for incremental PCA
cv_folds = 5            # cross-validation folds

# ----- LOAD EMBEDDINGS -----
embeddings_path = "/content/drive/MyDrive/cs3244/features/embeddings_memmap.npy"

# The file is a raw float32 buffer with no header, so np.memmap trusts whatever
# shape it is given. Check the byte count first: if df has a different number
# of rows than when the embeddings were written, or the width is wrong, the
# rows would otherwise be silently misaligned with the labels.
expected_bytes = n_samples * embedding_dim * np.dtype('float32').itemsize
actual_bytes = os.path.getsize(embeddings_path)
if actual_bytes != expected_bytes:
    raise ValueError(
        f"{embeddings_path} holds {actual_bytes} bytes but shape "
        f"({n_samples}, {embedding_dim}) float32 needs {expected_bytes}; "
        "the embeddings do not match the current dataframe."
    )

embeddings = np.memmap(
    embeddings_path,
    dtype='float32',
    mode='r',
    shape=(n_samples, embedding_dim)
)
print("Embeddings shape:", embeddings.shape)

# ----- LOAD LABELS -----
y = df["label"].values  # 0/1 sarcasm labels

# ----- INCREMENTAL PCA (batch-safe) -----
# gen_batches folds a tail shorter than pca_components into the previous batch.
# partial_fit can reject a batch with fewer rows than n_components, so a plain
# range() split would fail for some dataset sizes.
pca_batches = list(gen_batches(n_samples, batch_size_pca, min_batch_size=pca_components))

# NOTE: the PCA is fitted on all rows before cross-validation, so each CV fold's
# held-out rows have already influenced the projection (labels are not used).
ipca = IncrementalPCA(n_components=pca_components, batch_size=batch_size_pca)
for batch in pca_batches:
    ipca.partial_fit(embeddings[batch])

# Transform embeddings in batches and store in smaller array
X_reduced = np.zeros((n_samples, pca_components), dtype=np.float32)
for batch in pca_batches:
    X_reduced[batch] = ipca.transform(embeddings[batch])

print("Reduced embeddings shape:", X_reduced.shape)

# ----- LOGISTIC REGRESSION + STRATIFIED CV -----
clf = LogisticRegression(
    max_iter=200,
    solver='liblinear',
    class_weight='balanced'
)

# cross_val_score fits a fresh clone of clf per fold; clf itself is still
# unfitted after this call and must be fitted separately before it is used.
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
scores = cross_val_score(clf, X_reduced, y, cv=cv, scoring='accuracy')

print("Cross-Validation Accuracies:", scores)
print("Mean CV Accuracy: {:.4f} ± {:.4f}".format(np.mean(scores), np.std(scores)))


Embeddings shape: (1125678, 768)
Reduced embeddings shape: (1125678, 100)
Cross-Validation Accuracies: [0.65295199 0.65245896 0.6538892  0.65363449 0.65209319]
Mean CV Accuracy: 0.6530 ± 0.0007


In [9]:
import joblib

# cross_val_score only fitted internal clones of clf, so the estimator is still
# unfitted here. Fit it on the full PCA-reduced matrix so that the pickle
# really contains a trained model.
clf.fit(X_reduced, y)

model_file = "/content/drive/MyDrive/cs3244/models/sarcasm_logreg_model_embeds.pkl"
joblib.dump(clf, model_file)
print(f"Saved trained model to {model_file}")

# The classifier expects PCA-reduced inputs, so persist the fitted projection
# next to it; without it the model cannot be applied to new embeddings.
pca_file = "/content/drive/MyDrive/cs3244/models/sarcasm_ipca_embeds.pkl"
joblib.dump(ipca, pca_file)
print(f"Saved fitted PCA to {pca_file}")

Saved trained model to /content/drive/MyDrive/cs3244/models/sarcasm_logreg_model_embeds.pkl


BEEP