In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression

# Example texts and labels
examples = [
    "I love this product! It's fantastic.",
    "Terrible experience — won't buy again."
]
labels = [1, 0]  # 1: positive, 0: negative

# Text cleaning function
def clean_text(text, stop_words):
    text = text.lower()
    tokens = text.split()
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

# Clean the examples
cleaned_examples = [clean_text(text, ENGLISH_STOP_WORDS) for text in examples]

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_examples)

# Train Logistic Regression model (set solver and max_iter to be safe)
model = LogisticRegression(solver='liblinear', random_state=0, max_iter=1000)
model.fit(X, labels)

# Example prediction
new_texts = ["I really like this!", "I hate this product"]
cleaned_new = [clean_text(t, ENGLISH_STOP_WORDS) for t in new_texts]
X_new = vectorizer.transform(cleaned_new)
print(model.predict(X_new))

[0 0]


TF-IDF (Term Frequency-Inverse Document Frequency) is often preferred over a simple Bag of Words (Count Vectorizer) for text classification because it not only considers the frequency of words in a document but also accounts for how common or rare those words are across all documents. While Bag of Words simply counts word occurrences, TF-IDF reduces the weight of frequently occurring words that may not carry significant meaning (such as "the", "is", "and") and increases the importance of rarer, more informative words. This helps models focus on the most relevant terms for distinguishing between classes, leading to improved classification performance and reduced impact from common, less informative words.

In [32]:
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
# Example: replace with your data
reviews = [
    "I love this product! Works great and arrived fast.",
    "Terrible quality, broke after a week. Do not recommend.",
    "Amazing value for money, highly recommended.",
    "Not what I expected. Very disappointed."
]
labels = [1, 0, 1, 0]
# Tokenize
tokenized_reviews = [simple_preprocess(r, deacc=True) for r in reviews]
# Train Word2Vec
vector_size = 100
w2v_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=vector_size,
    window=5,
    min_count=1,    # adjust for your corpus size
    workers=4,
    seed=42,
    epochs=10
)

def avg_w2v_vector(tokens, model, vector_size):
    vecs = [model.wv[t] for t in tokens if t in model.wv.key_to_index]
    if len(vecs) > 0:
        return np.mean(vecs, axis=0)
    else:
        return np.zeros(vector_size, dtype=float)

X_w2v = np.vstack([avg_w2v_vector(toks, w2v_model, vector_size) for toks in tokenized_reviews])

# Train/test split so evaluation is meaningful
X_train, X_test, y_train, y_test = train_test_split(X_w2v, labels, test_size=2, random_state=42, stratify=labels)

rf_clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Test accuracy: 0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


One key advantage of using Word2Vec embeddings over TF-IDF is that Word2Vec captures semantic relationships between words by representing them in a continuous vector space, allowing similar words to have similar vector representations. This enables models to understand context and word meaning beyond simple frequency counts, whereas TF-IDF only reflects how often words appear and does not capture semantic similarity or context.

In [33]:
# Full example: LSTM-based sentiment classifier
# Replace the sample `reviews` and `labels` with your own dataset as needed.

import random
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Reproducibility (note: full determinism depends on hardware / TF config)
random.seed(42)
np.random.seed(42)
import tensorflow as tf
tf.random.set_seed(42)

# -----------------------
# Example data (replace)
# -----------------------
reviews = [
    "I love this product! Works great and arrived fast.",
    "Terrible quality, broke after a week. Do not recommend.",
    "Amazing value for money, highly recommended.",
    "Not what I expected. Very disappointed."
]
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative

# Optional new texts to predict after training
new_texts = [
    "This is the best purchase I've made.",
    "Completely useless and poor quality."
]

# -----------------------
# Parameters
# -----------------------
max_words = 1000       # maximum vocabulary size to keep (top words)
max_len = 20           # max tokens per review (longer gets truncated)
embedding_dim = 64     # embedding vector size
batch_size = 32
epochs = 10
test_prop = 0.25
random_state = 42

# -----------------------
# Tokenize & pad
# -----------------------
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
X_seq = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post').astype('int32')
y = np.array(labels, dtype='int32')

# Safe vocab size for Embedding layer
vocab_size = min(max_words, len(tokenizer.word_index) + 1)

# -----------------------
# Adaptive train/test split (handles very small datasets)
# -----------------------
n_samples = len(y)
n_classes = len(np.unique(y))

if n_samples * test_prop < n_classes:
    # Choose integer test size at least equal to n_classes, but leave at least 1 sample for training
    test_size_int = min(max(n_classes, 1), max(1, n_samples - 1))
    if test_size_int >= n_samples:
        # As a last resort, fall back to a non-stratified proportional split
        X_train, X_test, y_train, y_test = train_test_split(
            X_seq, y, test_size=test_prop, random_state=random_state, stratify=None
        )
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X_seq, y, test_size=test_size_int, random_state=random_state, stratify=y
        )
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X_seq, y, test_size=test_prop, random_state=random_state, stratify=y
    )

# -----------------------
# Build the LSTM model
# -----------------------
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# -----------------------
# Training (use validation split only if training set is large enough)
# -----------------------
use_validation = (len(X_train) >= 10)
callbacks = []
if use_validation:
    callbacks = [EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)]
    val_split = 0.1
else:
    val_split = 0.0  # no validation split for very small training sets

model.fit(
    X_train, y_train,
    validation_split=val_split,
    epochs=epochs,
    batch_size=batch_size if len(X_train) >= batch_size else max(1, len(X_train)),
    callbacks=callbacks,
    verbose=1
)

# -----------------------
# Evaluate on test set
# -----------------------
if len(X_test) > 0:
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test accuracy: {test_acc:.4f}")
    y_pred_prob = model.predict(X_test)
    y_pred = (y_pred_prob > 0.5).astype(int).flatten()
    print("Classification report on test set:")
    print(classification_report(y_test, y_pred, digits=4))
else:
    print("No test set available (too small dataset). Skipping evaluation.")

# -----------------------
# Predict on new_texts (if provided)
# -----------------------
if 'new_texts' in globals() and new_texts:
    new_seq = tokenizer.texts_to_sequences(new_texts)
    X_new_seq = pad_sequences(new_seq, maxlen=max_len, padding='post', truncating='post').astype('int32')
    preds_prob = model.predict(X_new_seq)
    preds = (preds_prob > 0.5).astype(int).flatten()
    for text, p in zip(new_texts, preds):
        label = 'positive' if p == 1 else 'negative'
        print(f"Text: {text!r} => Predicted: {label}")
else:
    print("No new_texts variable found or it is empty; skipping prediction.")

Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 895ms/step - accuracy: 0.5000 - loss: 0.6935
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.5000 - loss: 0.6930
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.5000 - loss: 0.6925
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.5000 - loss: 0.6919
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.5000 - loss: 0.6914
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5000 - loss: 0.6907
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.5000 - loss: 0.6900
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.5000 - loss: 0.6892
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
model.summary()

LSTMs are often preferred over simple RNNs for text classification because they handle long-range dependencies and the vanishing-gradient problem much better. During backpropagation through time, standard RNNs tend to suffer from vanishing (or exploding) gradients, which prevents them from learning relationships across many time steps. LSTM units introduce a cell state plus gating mechanisms (input, forget, and output gates) that regulate information flow and create a more stable path for gradients (the “constant error carousel”). As a result, LSTMs can preserve and learn useful contextual information over longer sequences, making them more reliable for tasks where distant words influence the label. The trade-off is that LSTMs have more parameters and cost more compute, but their improved ability to capture long-term context usually yields better accuracy on real-world text problems.

In [35]:
 %pip install tabulate

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\LAB-USER-01\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [36]:
# Evaluate three text-classification workflows on a common test set:
# 1) TF-IDF + Logistic Regression
# 2) Word2Vec (averaged) + Random Forest
# 3) LSTM (Embedding + LSTM)
#
# The script:
# - uses `reviews` and `labels` from the notebook environment if present,
#   otherwise falls back to a small example dataset.
# - creates a single train/test split (adaptive to tiny datasets)
# - trains each model on the TRAIN set only
# - evaluates on the TEST set using Accuracy, F1-score, and ROC-AUC
# - builds a pandas DataFrame comparing the three models
#
# NOTE: If TensorFlow or gensim are not installed in the current kernel, the
# corresponding model training will be skipped and results set to NaN.
# Paste this cell into your notebook and run it (it is self-contained).

import warnings
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

# Optional libs that may not be installed in every environment
try:
    from gensim.models import Word2Vec
    from gensim.utils import simple_preprocess
    gensim_available = True
except Exception:
    gensim_available = False

try:
    import tensorflow as tf
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Embedding, LSTM, Dense
    from tensorflow.keras.callbacks import EarlyStopping
    tf_available = True
except Exception:
    tf_available = False

# ---------------------------
# Load reviews & labels
# ---------------------------
if 'reviews' in globals() and 'labels' in globals():
    X_all = reviews
    y_all = np.array(labels)
else:
    # Fallback toy dataset (replace this by your dataset in the notebook)
    X_all = [
        "I love this product! Works great and arrived fast.",
        "Terrible quality, broke after a week. Do not recommend.",
        "Amazing value for money, highly recommended.",
        "Not what I expected. Very disappointed.",
        "Exceeded my expectations, very happy with the purchase.",
        "Poor build. Stopped working after two days."
    ]
    y_all = np.array([1, 0, 1, 0, 1, 0])

# Basic checks
if len(X_all) != len(y_all):
    raise ValueError("reviews and labels must have the same length")

print(f"Total samples: {len(X_all)}; class distribution: {Counter(y_all)}")

# ---------------------------
# Adaptive train/test split
# ---------------------------
def adaptive_train_test_split(X, y, test_prop=0.25, random_state=42):
    n_samples = len(y)
    classes = np.unique(y)
    n_classes = len(classes)
    # if proportion would produce fewer test samples than classes, choose integer test size
    if n_samples * test_prop < n_classes:
        test_size_int = min(max(n_classes, 1), max(1, n_samples - 1))
        if test_size_int >= n_samples:
            # fallback to non-stratified fractional split
            return train_test_split(X, y, test_size=test_prop, random_state=random_state, stratify=None)
        else:
            return train_test_split(X, y, test_size=test_size_int, random_state=random_state, stratify=y)
    else:
        return train_test_split(X, y, test_size=test_prop, random_state=random_state, stratify=y)

X_train_text, X_test_text, y_train, y_test = adaptive_train_test_split(X_all, y_all, test_prop=0.25, random_state=42)
print(f"Train samples: {len(X_train_text)}; Test samples: {len(X_test_text)}; test class distribution: {Counter(y_test)}")

# Prepare a results dict
results = {
    'model': [],
    'accuracy': [],
    'f1': [],
    'roc_auc': []
}

# ---------------------------
# 1) TF-IDF + Logistic Regression
# ---------------------------
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

lr = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)
lr.fit(X_train_tfidf, y_train)

y_pred_lr = lr.predict(X_test_tfidf)
y_proba_lr = lr.predict_proba(X_test_tfidf)[:, 1]

acc_lr = accuracy_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr, zero_division=0)
# ROC-AUC requires both classes present in y_test
try:
    roc_lr = roc_auc_score(y_test, y_proba_lr)
except Exception:
    roc_lr = float('nan')

results['model'].append('TF-IDF + LogisticRegression')
results['accuracy'].append(acc_lr)
results['f1'].append(f1_lr)
results['roc_auc'].append(roc_lr)

print("\nTF-IDF + LogisticRegression evaluation:")
print("Accuracy:", acc_lr, "F1:", f1_lr, "ROC-AUC:", roc_lr)

# ---------------------------
# 2) Word2Vec (averaged) + RandomForest
# ---------------------------
if not gensim_available:
    warnings.warn("gensim not available: skipping Word2Vec model and filling NaNs")
    results['model'].append('Word2Vec(avg) + RandomForest')
    results['accuracy'].append(float('nan'))
    results['f1'].append(float('nan'))
    results['roc_auc'].append(float('nan'))
else:
    # Tokenize using gensim.simple_preprocess (lowercase, deacc)
    tokenized_train = [simple_preprocess(t, deacc=True) for t in X_train_text]
    tokenized_test = [simple_preprocess(t, deacc=True) for t in X_test_text]

    # Train Word2Vec on training tokens only
    w2v_size = 100
    w2v = Word2Vec(sentences=tokenized_train, vector_size=w2v_size, window=5, min_count=1, workers=4, seed=42, epochs=10)

    # Function to average vectors
    def avg_w2v_vec(tokens, model, vector_size):
        vecs = [model.wv[t] for t in tokens if t in model.wv.key_to_index]
        if len(vecs) > 0:
            return np.mean(vecs, axis=0)
        else:
            return np.zeros(vector_size, dtype=float)

    X_train_w2v = np.vstack([avg_w2v_vec(toks, w2v, w2v_size) for toks in tokenized_train])
    X_test_w2v = np.vstack([avg_w2v_vec(toks, w2v, w2v_size) for toks in tokenized_test])

    # Train Random Forest
    rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    rf.fit(X_train_w2v, y_train)

    y_pred_rf = rf.predict(X_test_w2v)
    y_proba_rf = rf.predict_proba(X_test_w2v)[:, 1]

    acc_rf = accuracy_score(y_test, y_pred_rf)
    f1_rf = f1_score(y_test, y_pred_rf, zero_division=0)
    try:
        roc_rf = roc_auc_score(y_test, y_proba_rf)
    except Exception:
        roc_rf = float('nan')

    results['model'].append('Word2Vec(avg) + RandomForest')
    results['accuracy'].append(acc_rf)
    results['f1'].append(f1_rf)
    results['roc_auc'].append(roc_rf)

    print("\nWord2Vec(avg) + RandomForest evaluation:")
    print("Accuracy:", acc_rf, "F1:", f1_rf, "ROC-AUC:", roc_rf)

# ---------------------------
# 3) LSTM (Embedding + LSTM)
# ---------------------------
if not tf_available:
    warnings.warn("TensorFlow / Keras not available: skipping LSTM model and filling NaNs")
    results['model'].append('LSTM (Embedding + LSTM)')
    results['accuracy'].append(float('nan'))
    results['f1'].append(float('nan'))
    results['roc_auc'].append(float('nan'))
else:
    # Reproducibility (best-effort)
    np.random.seed(42)
    import random
    random.seed(42)
    tf.random.set_seed(42)

    # Tokenizer fit on training text only
    max_words = 5000
    max_len = 50
    embedding_dim = 100

    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train_text)

    # sequences
    seq_train = tokenizer.texts_to_sequences(X_train_text)
    seq_test = tokenizer.texts_to_sequences(X_test_text)
    X_train_seq = pad_sequences(seq_train, maxlen=max_len, padding='post', truncating='post').astype('int32')
    X_test_seq = pad_sequences(seq_test, maxlen=max_len, padding='post', truncating='post').astype('int32')

    # compute vocab size safely
    vocab_size = min(max_words, len(tokenizer.word_index) + 1)

    # Build model
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
        LSTM(64),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Training parameters: keep small to avoid long runs in notebooks
    batch_size = 32 if len(X_train_seq) >= 32 else max(1, len(X_train_seq))
    epochs = 10
    callbacks = []
    val_split = 0.1 if len(X_train_seq) >= 10 else 0.0
    if val_split > 0:
        callbacks = [EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=0)]

    # Fit model (suppress verbose to keep output readable)
    model.fit(X_train_seq, y_train, validation_split=val_split, epochs=epochs, batch_size=batch_size, callbacks=callbacks, verbose=1)

    # Predictions and probabilities
    y_proba_lstm = model.predict(X_test_seq).reshape(-1)
    y_pred_lstm = (y_proba_lstm > 0.5).astype(int)

    acc_lstm = accuracy_score(y_test, y_pred_lstm)
    f1_lstm = f1_score(y_test, y_pred_lstm, zero_division=0)
    try:
        roc_lstm = roc_auc_score(y_test, y_proba_lstm)
    except Exception:
        roc_lstm = float('nan')

    results['model'].append('LSTM (Embedding + LSTM)')
    results['accuracy'].append(acc_lstm)
    results['f1'].append(f1_lstm)
    results['roc_auc'].append(roc_lstm)

    print("\nLSTM (Embedding + LSTM) evaluation:")
    print("Accuracy:", acc_lstm, "F1:", f1_lstm, "ROC-AUC:", roc_lstm)

# ---------------------------
# Summary comparison table
# ---------------------------
df_results = pd.DataFrame(results)
# Format numeric columns to 4 decimal places where possible
for col in ['accuracy', 'f1', 'roc_auc']:
    df_results[col] = df_results[col].apply(lambda v: (float(v) if (v is not None and not (isinstance(v, float) and np.isnan(v))) else np.nan))
pd.set_option('display.precision', 4)

print("\nModel comparison table:")
display(df_results)    # in notebook, this renders nicely


Total samples: 4; class distribution: Counter({1: 2, 0: 2})
Train samples: 2; Test samples: 2; test class distribution: Counter({1: 1, 0: 1})

TF-IDF + LogisticRegression evaluation:
Accuracy: 0.5 F1: 0.6666666666666666 ROC-AUC: 0.5

Word2Vec(avg) + RandomForest evaluation:
Accuracy: 1.0 F1: 1.0 ROC-AUC: 1.0
Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 858ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.5000 - loss: 0.6932
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.5000 - loss: 0.6932
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.5000 - loss: 0.6932
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4

Unnamed: 0,model,accuracy,f1,roc_auc
0,TF-IDF + LogisticRegression,0.5,0.6667,0.5
1,Word2Vec(avg) + RandomForest,1.0,1.0,1.0
2,LSTM (Embedding + LSTM),0.5,0.6667,0.0


I recommend deploying the Word2Vec (averaged) + Random Forest model because it achieved the best metrics and offers fast inference (cheap averaging + efficient tree inference) and moderate training cost compared with an LSTM, but only after you validate the result with stratified k‑fold CV and checks for overfitting or data leakage; if the RF’s perfect scores don’t hold up under robust validation, use TF‑IDF + Logistic Regression instead as a safer, more interpretable and cheaper-to-deploy baseline (and in either case calibrate probabilities and add monitoring).

In [42]:
#!/usr/bin/env python3
"""
Save trained gensim Word2Vec and scikit-learn RandomForest objects as pickle files
in the project root. This script mirrors the behavior of the original joblib/gensim
save flow but uses Python's pickle format.

Paste & run this cell in your notebook (after training the models) or run as a script
in the same environment where the trained objects exist in memory.
"""

import os
import pickle
import warnings
import numpy as np

# Filenames saved in project root (no "models/" subfolder)
W2V_FILENAME = "w2v.pkl"
RF_FILENAME = "rf_w2v.pkl"

# Candidate names for in-memory objects (adjust if your variables use different names)
w2v_candidates = ['w2v', 'w2v_model', 'word2vec', 'w2vmodel', 'w2v_trained']
rf_candidates = ['rf', 'rf_clf', 'rf_model', 'random_forest', 'rf_w2v']

w2v_obj = None
rf_obj = None

for n in w2v_candidates:
    if n in globals():
        w2v_obj = globals()[n]
        break

for n in rf_candidates:
    if n in globals():
        rf_obj = globals()[n]
        break

if w2v_obj is None or rf_obj is None:
    raise RuntimeError(
        "Could not find Word2Vec and/or RandomForest objects in this notebook under common names.\n"
        "Expected Word2Vec in one of: {}\nExpected RF in one of: {}\n"
        "If your variables use different names, either rename them or set variables `w2v` and `rf` and re-run.".format(w2v_candidates, rf_candidates)
)

# Save Word2Vec (pickle) and RF (pickle) to project root
try:
    # Ensure we write to the current working directory (project root)
    w2v_path = os.path.join(os.getcwd(), W2V_FILENAME)
    rf_path = os.path.join(os.getcwd(), RF_FILENAME)

    # Save gensim Word2Vec object as pickle
    with open(w2v_path, "wb") as f:
        pickle.dump(w2v_obj, f, protocol=pickle.HIGHEST_PROTOCOL)

    # Save RF along with vector_size metadata (if available)
    vector_size = getattr(w2v_obj, "vector_size", None)
    if vector_size is None:
        # gensim <=4 stores size at wv.vector_size
        vector_size = getattr(getattr(w2v_obj, "wv", None), "vector_size", None)

    rf_data = {"clf": rf_obj, "vector_size": vector_size}
    with open(rf_path, "wb") as f:
        pickle.dump(rf_data, f, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Saved Word2Vec -> ./{W2V_FILENAME}")
    print(f"Saved RandomForest -> ./{RF_FILENAME}")
except Exception as e:
    raise RuntimeError("Failed to save models: " + str(e))

# Quick verification: load back and run a sample prediction (requires gensim for preprocessing)
try:
    from gensim.utils import simple_preprocess
except Exception:
    raise RuntimeError("gensim required for verification. Install gensim in this kernel and re-run.")

try:
    # Load pickle files
    with open(w2v_path, "rb") as f:
        w2v_loaded = pickle.load(f)

    with open(rf_path, "rb") as f:
        rf_loaded_data = pickle.load(f)

    rf_loaded = rf_loaded_data.get("clf") if isinstance(rf_loaded_data, dict) else rf_loaded_data
    vector_size = rf_loaded_data.get("vector_size", getattr(w2v_loaded, "vector_size", getattr(getattr(w2v_loaded, "wv", None), "vector_size", None)))

    # choose a sample text (use one from your test set if available)
    sample = "This product exceeded my expectations and works perfectly."
    tokens = simple_preprocess(sample, deacc=True)
    # Access key_to_index for gensim >=4, fallback to vocabulary or similar for older versions
    try:
        key_index = getattr(w2v_loaded.wv, "key_to_index", None)
        vecs = [w2v_loaded.wv[t] for t in tokens if key_index is None or t in key_index]
    except Exception:
        # If model was saved as older gensim object, try direct membership test
        vecs = [w2v_loaded.wv[t] for t in tokens if t in w2v_loaded.wv]

    if len(vecs) == 0:
        warnings.warn("No in-vocabulary tokens found for sample text; verification skipped (OOV).")
    else:
        avg_vec = np.mean(vecs, axis=0).reshape(1, -1)
        pred = rf_loaded.predict(avg_vec)[0]
        proba = rf_loaded.predict_proba(avg_vec)[0, 1] if hasattr(rf_loaded, "predict_proba") else None
        print("Verification prediction:", "Positive" if int(pred) == 1 else "Negative", f"(prob={proba:.3f})" if proba is not None else "")

    print("Models saved successfully in project root. You can now run the Streamlit app which will load './w2v.pkl' and './rf_w2v.pkl'.")
except Exception as e:
    raise RuntimeError("Verification failed after saving models: " + str(e))

Saved Word2Vec -> ./w2v.pkl
Saved RandomForest -> ./rf_w2v.pkl
Verification prediction: Positive (prob=0.610)
Models saved successfully in project root. You can now run the Streamlit app which will load './w2v.pkl' and './rf_w2v.pkl'.
