# Module 9 — NLP Basics & Embeddings (Expanded)

This notebook covers practical, classroom-friendly NLP workflows:

- text preprocessing and TF-IDF baseline
- LSTM-based classifier (Keras) on IMDB (small subset)
- Hugging Face tokenization and feature extraction (DistilBERT)
- fine-tuning DistilBERT for text classification (short demo)

Notes: run in Colab with a GPU for transformer fine-tuning. Training steps are intentionally short for demos.

## 1 — Setup (install packages and imports)

In [None]:
# Install libraries (transformers and datasets are optional but useful for HF experiments)
!pip -q install -U transformers datasets sentencepiece --quiet
!pip -q install -U tensorflow scikit-learn --quiet

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

print('TF version:', tf.__version__)
print('Transformers available')


## 2 — TF-IDF Baseline (small synthetic dataset)

In [None]:
# Toy sentiment corpus for a fast TF-IDF demo: (review text, label) pairs,
# where label 1 = positive and 0 = negative.
labeled_reviews = [
    ('I love this movie, it is fantastic and exciting', 1),
    ('This film was terrible and boring', 0),
    ('What a great and thrilling performance', 1),
    ('I did not like the movie, it was dull', 0),
    ('Amazing acting and wonderful score', 1),
    ('Bad plot and poor acting', 0),
]
docs = [text for text, _ in labeled_reviews]
labels = [label for _, label in labeled_reviews]  # 1=positive, 0=negative

# Unigram + bigram TF-IDF features, capped at the 50 most frequent terms.
v = TfidfVectorizer(max_features=50, ngram_range=(1, 2))
X = v.fit_transform(docs)
print('TF-IDF shape:', X.shape)

# Logistic-regression baseline fitted on the full toy corpus.
clf = LogisticRegression()
clf.fit(X, labels)

# Score two unseen phrases with the fitted vectorizer + classifier.
probe_texts = ['fantastic movie', 'poor acting']
probe_preds = clf.predict(v.transform(probe_texts))
print('Baseline training done. Example preds:', probe_preds)


## 3 — LSTM classifier on IMDB (Keras) — quick demo

In [None]:
# Train a small bidirectional-LSTM sentiment classifier on a subset of the
# Keras IMDB dataset (reviews arrive already integer-encoded).
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models

# Seed Python, NumPy and TensorFlow so weight init and batch shuffling are
# repeatable across Restart & Run All (GPU kernels may still add small noise).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Limit vocab, sequence length and sample counts for speed.
VOCAB_SIZE = 10000   # keep only the most frequent words
MAXLEN = 200         # pad/truncate every review to this many tokens
TRAIN_SUBSET = 8000  # number of training reviews used in the demo
TEST_SUBSET = 2000   # number of test reviews used in the demo

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)
x_train, y_train = x_train[:TRAIN_SUBSET], y_train[:TRAIN_SUBSET]
x_test, y_test = x_test[:TEST_SUBSET], y_test[:TEST_SUBSET]

x_train = pad_sequences(x_train, maxlen=MAXLEN)
x_test = pad_sequences(x_test, maxlen=MAXLEN)

model = models.Sequential([
    # Explicit Input layer replaces Embedding's deprecated `input_length`
    # argument and lets model.summary() report real output shapes/params.
    layers.Input(shape=(MAXLEN,)),
    layers.Embedding(VOCAB_SIZE, 64),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(x_train, y_train, validation_split=0.1, epochs=3, batch_size=64)

# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train_acc')
ax.plot(history.history['val_accuracy'], label='val_acc')
ax.set(title='LSTM accuracy', xlabel='epoch', ylabel='accuracy')
ax.legend()
plt.show()

# Evaluate on the held-out test subset; prints [loss, accuracy].
print('Eval:', model.evaluate(x_test, y_test, verbose=1))


## 4 — Hugging Face tokenization and feature extraction (DistilBERT)

In [None]:
# Tokenize a couple of sentences with DistilBERT and turn the token-level
# hidden states into one fixed-size embedding per sentence.
from transformers import AutoTokenizer, TFAutoModel

model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoder = TFAutoModel.from_pretrained(model_name)

# sample texts
texts = ['I love this movie', 'This movie is awful']
enc = tokenizer(texts, padding=True, truncation=True, return_tensors='tf')
outputs = encoder(enc)

# Sentence embedding = mean of the last hidden states over *real* tokens only.
# padding=True pads shorter texts up to the longest one in the batch; a plain
# reduce_mean over axis 1 would average those padding positions in as well,
# so weight each position by the attention mask (1 = real token, 0 = padding).
hidden = outputs.last_hidden_state
token_mask = tf.cast(enc['attention_mask'], hidden.dtype)[..., tf.newaxis]
summed = tf.reduce_sum(hidden * token_mask, axis=1)
# Guard against division by zero for a (hypothetical) fully masked row.
token_counts = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)
embeds = summed / token_counts
print('Embeddings shape:', embeds.shape)


## 5 — Fine-tune DistilBERT for text classification (demo)

In [None]:
# Fine-tune DistilBERT with a 2-class classification head on the toy corpus.
# This only demonstrates the API; six sentences and one epoch will not produce
# a useful classifier.
from transformers import TFAutoModelForSequenceClassification

# The classification head is freshly (randomly) initialised, so seed Python,
# NumPy and TensorFlow to make repeated runs comparable (GPU kernels may still
# introduce small nondeterminism).
tf.keras.utils.set_random_seed(42)

# Use a very small dataset for demo (reuse TF-IDF docs above)
texts = docs
labels = np.array(labels)
enc = tokenizer(texts, padding=True, truncation=True, return_tensors='tf')

model_tf = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# compile with TF optimizer; the model returns raw logits, hence from_logits=True
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model_tf.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Create a tf.data dataset of (tokenizer outputs, label) pairs, 2 examples per batch
dataset = tf.data.Dataset.from_tensor_slices((dict(enc), labels)).batch(2)
# train briefly (this is a tiny toy example)
model_tf.fit(dataset, epochs=1)

# Predict on the same sentences (no held-out data in this toy example)
pred = model_tf.predict(dict(enc))
print('Logits shape:', pred.logits.shape)
# argmax over the class axis turns logits into class ids (1=positive, 0=negative)
print('Predicted classes:', np.argmax(pred.logits, axis=-1))


## 6 — Save/export models and tips

- Save Keras models using `model.save('path/to/model.keras')` (recent Keras versions require a `.keras` or `.h5` file extension).
- For Hugging Face TF models, use `model_tf.save_pretrained('dir')` and `tokenizer.save_pretrained('dir')`.
- Exercises: try training on a small subset of the AG News or Yelp datasets from the `datasets` library, and compare the TF-IDF, LSTM, and Transformer approaches.