
# Classify Reviews

To install packages that are not available by default,
uncomment the last line of the code cell below and replace its package list with the packages you need.

This will ensure the notebook has all the dependencies and works everywhere.


In [None]:

import sys
# Install the notebook's third-party dependencies with pip, using the same
# Python interpreter that runs this kernel (sys.executable).
# Comment the next line out if the packages are already installed.
!{sys.executable} -m pip install pandas numpy torch scikit-learn matplotlib seaborn


In [None]:

# Libraries
import pandas as pd
import numpy as np
import re, random, time, os
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix

pd.set_option("display.max_columns", 101)
pd.set_option("display.max_colwidth", 200)

def set_seed(seed: int = 42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs and pin cuDNN flags.

    Args:
        seed: Value passed to every random number generator.
    """
    # cuDNN: disable the autotuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Each library keeps its own generator, so all of them get the same seed.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

# Seed all RNGs before anything stochastic (split, sampler, model init) runs.
set_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('Device:', device)



## Data Description

Column | Description
---|---
`title` | Title of the review
`review` | Review text
`recommendation` | Whether the user would recommend the product (0 = No, 1 = Yes)

Load the training and test datasets (place `train.csv` and `test.csv` in the same folder as this notebook).


In [None]:

train_path = "train.csv"
test_path  = "test.csv"

# Fail early, with a readable message, if either CSV is missing.
for csv_path in (train_path, test_path):
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"{csv_path} not found. Please upload it to the current directory.")

train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

print("Train shape:", train_df.shape)
print("Test  shape:", test_df.shape)

# Preview the first rows of each frame.
for frame in (train_df, test_df):
    display(frame.head(3))



## Preprocessing & Vocabulary

- Combine `title` and `review` into one text field.
- Clean (lowercase, remove non-alphanumerics) and tokenize using whitespace.
- Build a vocabulary with minimum frequency filtering.
- Encode sequences and pad to fixed length.


In [None]:

def build_text(df: pd.DataFrame) -> pd.Series:
    """Combine the `title` and `review` columns into one text Series.

    Missing values are treated as empty strings and a missing column is
    treated as a column of empty strings. The result always carries
    ``df.index``, so it stays row-aligned with ``df`` even when the frame has a
    non-default index (for example after ``train_test_split``).

    Args:
        df: Frame that may contain `title` and/or `review` columns.

    Returns:
        Series of ``"<title> <review>"`` strings with surrounding whitespace
        stripped, indexed like ``df``.
    """
    def _text_column(name: str) -> pd.Series:
        if name in df.columns:
            # astype(str) keeps string concatenation valid for non-string cells.
            return df[name].fillna("").astype(str)
        # The fallback must share df's index; a fresh RangeIndex would misalign
        # with the other column and yield NaN / extra rows on concatenation.
        return pd.Series("", index=df.index)

    text = (_text_column("title") + " " + _text_column("review")).str.strip()
    return text

# Runs of characters that are neither lowercase ASCII letters, digits nor whitespace.
_non_alnum_re = re.compile(r"[^a-z0-9\s]+")
# Runs of whitespace, collapsed to a single space during cleaning.
_multi_space_re = re.compile(r"\s+")

def clean(text: str) -> str:
    """Normalize raw text for tokenization.

    Lowercases the input, replaces every run of characters outside
    ``[a-z0-9]`` and whitespace with a space, collapses whitespace runs and
    trims both ends. Any non-string input yields an empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        alnum_only = _non_alnum_re.sub(" ", lowered)
        return _multi_space_re.sub(" ", alnum_only).strip()
    return ""

def tokenize(text: str):
    """Return the whitespace-separated tokens of the cleaned text as a list."""
    normalized = clean(text)
    return normalized.split()

train_texts = build_text(train_df)
test_texts  = build_text(test_df)

# Tokenize the training texts once; the tokens feed both the vocabulary counts
# and the sequence-length statistics below.
train_tokens = [tokenize(t) for t in train_texts]

# Token frequencies come from the training texts only. Counting test-set tokens
# would let held-out data influence a fitted artefact (the vocabulary); unseen
# test tokens are mapped to <unk> at encoding time instead.
min_freq = 2
counter = Counter()
for toks in train_tokens:
    counter.update(toks)

# Ids 0 and 1 are reserved for padding and out-of-vocabulary tokens.
PAD, UNK = "<pad>", "<unk>"
vocab = {PAD: 0, UNK: 1}
for tok, freq in counter.items():
    if freq >= min_freq and tok not in vocab:
        vocab[tok] = len(vocab)
id2tok = {i: t for t, i in vocab.items()}
vocab_size = len(vocab)
print("Vocab size:", vocab_size)

# Fixed sequence length: 95th percentile of training lengths, clamped to [8, 300]
# (50 is used as the percentile when there are no training texts).
train_lengths = np.array([len(toks) for toks in train_tokens])
p95 = int(np.percentile(train_lengths, 95)) if len(train_lengths) else 50
max_len = int(max(8, min(300, p95)))
print("Max sequence length:", max_len)

def encode(text: str, max_len: int = max_len):
    """Convert text to a fixed-length list of vocabulary ids.

    Tokens missing from the vocabulary map to the <unk> id. The sequence is
    truncated to ``max_len`` tokens and right-padded with the <pad> id.

    Returns:
        ``(ids, length)`` where ``ids`` is the encoded list and ``length`` is
        the number of tokens, capped at ``max_len``.
    """
    tokens = tokenize(text)
    unk_id, pad_id = vocab[UNK], vocab[PAD]
    ids = [vocab.get(tok, unk_id) for tok in tokens[:max_len]]
    # A non-positive repeat count produces an empty list, so no guard is needed.
    ids.extend([pad_id] * (max_len - len(ids)))
    return ids, min(len(tokens), max_len)


## Dataset class and DataLoaders

In [None]:

class ReviewDataset(Dataset):
    """Torch dataset that encodes review text on access.

    Each item is a dict with `input_ids` (encoded ids, long tensor) and
    `lengths` (token count returned by `encode`, long tensor). When
    `has_labels` is true it also contains `labels`, the `recommendation` value
    as a float32 tensor.
    """

    def __init__(self, df: pd.DataFrame, max_len: int, has_labels: bool = True):
        self.texts = build_text(df).tolist()
        self.max_len = max_len
        self.has_labels = has_labels
        # Labels are only read from the frame when the caller says they exist.
        self.labels = df['recommendation'].astype(int).tolist() if has_labels else None

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        token_ids, n_tokens = encode(self.texts[idx], self.max_len)
        sample = {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'lengths': torch.tensor(n_tokens, dtype=torch.long),
        }
        if not self.has_labels:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

# Hold out 20% of the labelled data for validation, stratified on the label.
train_df_, val_df_ = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['recommendation'],
    random_state=42,
)

train_ds, val_ds = (
    ReviewDataset(part, max_len=max_len, has_labels=True) for part in (train_df_, val_df_)
)
test_ds = ReviewDataset(test_df, max_len=max_len, has_labels=False)

batch_size = 12
# Training batches are drawn in random order; validation and test keep row order.
train_loader = DataLoader(train_ds, batch_size=batch_size, sampler=RandomSampler(train_ds))
val_loader, test_loader = (
    DataLoader(ds, batch_size=batch_size, sampler=SequentialSampler(ds)) for ds in (val_ds, test_ds)
)

print('Dataset sizes -> train:', len(train_ds), 'val:', len(val_ds), 'test:', len(test_ds))


## BiLSTM Model

In [None]:

class BiLSTMClassifier(nn.Module):
    """Single-layer bidirectional LSTM binary classifier over token ids.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimension.
        hidden_dim: LSTM hidden size per direction.
        dropout: Dropout probability applied to the pooled representation.
    """

    def __init__(self, vocab_size: int, embed_dim: int = 128, hidden_dim: int = 128, dropout: float = 0.3):
        super().__init__()
        # Index 0 is the padding id; its embedding stays zero and gets no gradient.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, input_ids, lengths):
        """Compute one logit per sequence.

        Args:
            input_ids: Long tensor of shape (batch, seq_len), right-padded.
            lengths: Integer tensor of shape (batch,) holding the number of
                real (non-padding) tokens in each row.

        Returns:
            Float tensor of shape (batch,) with unnormalized logits.
        """
        x = self.embedding(input_ids)

        # Pack the batch so the LSTM stops at each sequence's true end. Without
        # packing, the forward state keeps updating on padding steps and the
        # backward pass starts from padding, so the output would depend on how
        # much padding follows the text.
        # pack_padded_sequence needs CPU lengths >= 1; empty texts are clamped
        # to a single (padding) step rather than raising.
        seq_lens = lengths.detach().reshape(-1).to(device="cpu", dtype=torch.long)
        seq_lens = seq_lens.clamp(min=1, max=input_ids.size(1))
        packed = nn.utils.rnn.pack_padded_sequence(
            x, seq_lens, batch_first=True, enforce_sorted=False
        )

        _, (h_n, _) = self.lstm(packed)
        # For a single bidirectional layer, h_n[-2] is the forward direction's
        # final state and h_n[-1] the backward direction's.
        h = torch.cat([h_n[-2, :, :], h_n[-1, :, :]], dim=1)
        h = self.dropout(h)
        logits = self.fc(h).squeeze(1)
        return logits

# Build the classifier, then move its parameters to the selected device.
model = BiLSTMClassifier(vocab_size=vocab_size, embed_dim=128, hidden_dim=128, dropout=0.3)
model = model.to(device)
print(model)


## Training loop and Evaluation

In [None]:

def evaluate(model, data_loader, criterion):
    """Run the model over a labelled loader without gradient tracking.

    Args:
        model: Module called as ``model(input_ids, lengths)`` returning logits.
        data_loader: Loader yielding dicts with `input_ids`, `lengths`, `labels`.
        criterion: Loss applied to ``(logits, labels)``.

    Returns:
        ``(avg_loss, f1, targets, preds)``: the sample-weighted mean loss, the
        F1 score, and NumPy arrays of true and predicted labels.
    """
    model.eval()
    loss_sum = 0.0
    pred_list, target_list = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids, lengths, labels = (
                batch[key].to(device) for key in ('input_ids', 'lengths', 'labels')
            )

            logits = model(input_ids, lengths)
            batch_loss = criterion(logits, labels).item()
            # Weight by batch size so a smaller final batch is averaged correctly.
            loss_sum += batch_loss * input_ids.size(0)

            # Threshold the sigmoid probability at 0.5 for the hard label.
            hard_preds = (torch.sigmoid(logits) >= 0.5).long()
            pred_list.extend(hard_preds.cpu().numpy().tolist())
            target_list.extend(labels.long().cpu().numpy().tolist())

    avg_loss = loss_sum / len(data_loader.dataset)
    f1 = f1_score(target_list, pred_list)
    return avg_loss, f1, np.array(target_list), np.array(pred_list)

def train_model(model, train_loader, val_loader, epochs=8, lr=1e-3, weight_decay=0.0, patience=2):
    """Train with Adam and BCE-with-logits loss, early-stopping on validation F1.

    After every epoch the model is evaluated on ``val_loader``. The weights of
    the epoch with the best validation F1 are kept and restored before
    returning.

    Args:
        model: Module called as ``model(input_ids, lengths)`` returning logits.
        train_loader: Loader yielding dicts with `input_ids`, `lengths`, `labels`.
        val_loader: Labelled loader used for model selection.
        epochs: Maximum number of training epochs.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        patience: Number of consecutive epochs without a validation-F1
            improvement after which training stops.

    Returns:
        The same ``model`` instance, loaded with the best weights seen.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.BCEWithLogitsLoss()

    best_val_f1 = -1.0
    # Explicit sentinel instead of probing locals(): stays None if no epoch ran.
    best_state_dict = None
    epochs_no_improve = 0

    for epoch in range(1, epochs+1):
        model.train()
        running_loss = 0.0
        t0 = time.time()

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            lengths   = batch['lengths'].to(device)
            labels    = batch['labels'].to(device)

            optimizer.zero_grad()
            logits = model(input_ids, lengths)
            loss = criterion(logits, labels)
            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer step.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            # Weight by batch size so the epoch loss is a per-sample mean.
            running_loss += loss.item() * input_ids.size(0)

        train_loss = running_loss / len(train_loader.dataset)
        val_loss, val_f1, _, _ = evaluate(model, val_loader, criterion)
        elapsed = time.time() - t0
        print(f"Epoch {epoch}/{epochs} | {elapsed:.1f}s  train_loss={train_loss:.4f}  val_loss={val_loss:.4f}  val_f1={val_f1:.4f}")

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            epochs_no_improve = 0
            # Snapshot on CPU so the copy is independent of further training.
            best_state_dict = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        else:
            epochs_no_improve += 1
            # `>=` stops after exactly `patience` stale epochs; `>` would run
            # one extra epoch.
            if epochs_no_improve >= patience:
                print('Early stopping.')
                break

    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)
    print('Best val F1:', best_val_f1)
    return model

model = train_model(model, train_loader, val_loader, epochs=10, lr=1e-3, patience=3)


## Validation report

In [None]:

# Re-score the restored best model on the validation split.
criterion = nn.BCEWithLogitsLoss()
val_loss, val_f1, y_true, y_pred = evaluate(model, val_loader, criterion)

val_report = classification_report(y_true, y_pred, digits=4)
val_confusion = confusion_matrix(y_true, y_pred)

print('Validation F1:', round(val_f1,4))
print('\nClassification report:\n', val_report)
print('\nConfusion matrix:\n', val_confusion)


## Inference on test set & create submissions.csv

In [None]:
# Inference and submission (minimal)
model.eval()
all_test_preds = []
with torch.no_grad():
    for batch in test_loader:
        batch_logits = model(batch['input_ids'].to(device), batch['lengths'].to(device))
        # Same 0.5 probability threshold as used for validation.
        batch_preds = (torch.sigmoid(batch_logits) >= 0.5).long().cpu().numpy()
        all_test_preds.extend(batch_preds.tolist())

# Write the text columns plus the predicted label, in the original row order.
submission_df = test_df[['title', 'review']].copy()
submission_df['recommendation'] = np.clip(np.array(all_test_preds, dtype=int), 0, 1)
submission_df.to_csv('submissions.csv', index=False)