# Home Exercise 2 on Text Generation

Implement a **sequence2sequence** model to **summarize the text**.

- **Data**: [CNN-DailyMail News Text Summarization](https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail)

**Note**: Submit only a **single Jupyter Notebook file** that can handle all tasks, including data downloading, preprocessing, model training, and model evaluation. *(Submissions that do not follow the guidelines will receive a score of 0.)*

## Grading Criteria

For valid submissions, scores will be assigned based on the **leaderboard ranking** (**strictly greater**):

- **Top 25%** → **10.0 points**
- **25% - 50%** → **9.0 points**
- **50% - 75%** → **8.0 points**
- **75% - 100%** → **7.0 points**


In [None]:
%pip install tensorflow numpy pandas matplotlib tensorflow-datasets


In [None]:
# Import necessary libraries
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# =====================================
# 📥 STEP 1: Load CNN/DailyMail Dataset
# =====================================
# Load dataset using TensorFlow Datasets
dataset_name = "cnn_dailymail"
# NOTE(review): `config` is never passed to tfds.load() below, so as written it
# does not select a dataset version/config — confirm whether it should.
config = "3.0.0"
# as_supervised=True yields each example as a 2-tuple, which extract_data()
# unpacks as (text, summary).
dataset = tfds.load(name=dataset_name, split=["train", "validation", "test"], as_supervised=True)

# Unpack the three splits in the order they were requested above
train_data, val_data, test_data = dataset

def extract_data(dataset):
    """Decode every (text, summary) byte pair of *dataset* into Python strings.

    Args:
        dataset: A dataset whose elements, once passed through
            ``tfds.as_numpy``, are 2-tuples of UTF-8 encoded bytes.

    Returns:
        A tuple ``(inputs, summaries)`` of two equally long lists of ``str``,
        in dataset iteration order.
    """
    decoded_pairs = [
        (raw_text.decode('utf-8'), raw_summary.decode('utf-8'))
        for raw_text, raw_summary in tfds.as_numpy(dataset)
    ]
    inputs = [article for article, _ in decoded_pairs]
    summaries = [highlight for _, highlight in decoded_pairs]
    return inputs, summaries

# Extract text and summaries
# Run the same decoding helper over every split, then name the results.
decoded_splits = [extract_data(split) for split in (train_data, val_data, test_data)]
(train_texts, train_summaries), (val_texts, val_summaries), (test_texts, test_summaries) = decoded_splits

# Display a sample
print(f"Example Text: {train_texts[0]}")
print(f"Example Summary: {train_summaries[0]}")

# =====================================
# 🔢 STEP 2: Tokenization & Preprocessing
# =====================================
# Define hyperparameters
# Sequence lengths handed to pad_sequences as `maxlen`: articles use
# MAX_LEN_TEXT, summaries use MAX_LEN_SUMMARY.
MAX_LEN_TEXT = 400
MAX_LEN_SUMMARY = 100
# Passed as `num_words` to both tokenizers and used when constructing the
# encoder and decoder.
VOCAB_SIZE = 30000

# Tokenize input texts
def tokenize_sentences(sentences, vocab_size, max_length, tokenizer=None):
    """Convert raw strings into fixed-length integer id sequences.

    Args:
        sentences: Iterable of raw text strings.
        vocab_size: Passed as ``num_words`` when a new tokenizer is created.
        max_length: Length every sequence is padded or truncated to.
        tokenizer: Optional already-fitted ``Tokenizer``. When given it is
            reused without re-fitting, so held-out text can be encoded with
            the vocabulary learned on the training text. When omitted a new
            tokenizer is fitted on ``sentences``.

    Returns:
        A tuple ``(tokenizer, sequences)`` where ``sequences`` is the padded
        2-D array produced by ``pad_sequences``.
    """
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
        tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    # pad_sequences truncates from the front by default, which would throw
    # away the opening of long articles and summaries. Padding is appended at
    # the end, so overlong sequences are cut at the end as well.
    sequences = pad_sequences(
        sequences, maxlen=max_length, padding='post', truncating='post'
    )
    return tokenizer, sequences

# Tokenize articles and summaries
# Two independent vocabularies: one for the articles, one for the summaries.
text_tokenizer, text_sequences = tokenize_sentences(train_texts, VOCAB_SIZE, MAX_LEN_TEXT)
summary_tokenizer, summary_sequences = tokenize_sentences(train_summaries, VOCAB_SIZE, MAX_LEN_SUMMARY)

# Split into train and validation sets
# NOTE(review): this holds out 10% of the TFDS *train* split; the val_texts /
# test_texts extracted earlier are not tokenized or used here.
X_train, X_val, y_train, y_val = train_test_split(text_sequences, summary_sequences, test_size=0.1, random_state=42)

# =====================================
# 🏗️ STEP 3: Build Seq2Seq Model with Attention
# =====================================
# Define model parameters
# Embedding width passed to both the Encoder and the Decoder.
EMBEDDING_DIM = 256
# LSTM width for both models; the Decoder also passes it to BahdanauAttention
# as the size of the attention projections.
UNITS = 512

# Encoder Model
class Encoder(tf.keras.Model):
    """Embeds token ids and runs them through a single LSTM layer."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        # Stored for reference only; no method of this class reads them.
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences gives the per-timestep outputs; return_state adds
        # the final hidden and cell states.
        self.lstm = tf.keras.layers.LSTM(
            enc_units,
            return_sequences=True,
            return_state=True,
        )

    def call(self, x):
        """Return ``(output, state_h, state_c)`` for the token ids ``x``."""
        embedded = self.embedding(x)
        sequence_output, final_hidden, final_cell = self.lstm(embedded)
        return sequence_output, final_hidden, final_cell

# Attention Layer
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: ``score = V(tanh(W1(query) + W2(values)))``."""

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)``.

        The weights are a softmax over axis 1 of ``values``; the context
        vector is the weighted sum of ``values`` along that same axis.
        """
        # A length-1 axis lets the projected query broadcast against every
        # position along axis 1 of `values`.
        query_with_time_axis = tf.expand_dims(query, 1)
        combined = self.W1(query_with_time_axis) + self.W2(values)
        score = self.V(tf.nn.tanh(combined))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights

# Decoder Model with Attention
class Decoder(tf.keras.Model):
    """LSTM decoder that attends over the encoder outputs on every call."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        # Stored for reference only; no method of this class reads them.
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(dec_units, return_sequences=True, return_state=True)
        # Projects each LSTM output to one logit per vocabulary entry.
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(dec_units)

    def call(self, x, hidden, enc_output, cell=None):
        """Run one decoding call.

        Args:
            x: Integer token ids. Their embeddings are concatenated with a
                single expanded context vector along the last axis, so one
                time step per call is expected (assumes shape (batch, 1)).
            hidden: Query for the attention layer, typically the previous
                hidden state.
            enc_output: Encoder outputs that attention is computed over.
            cell: Optional LSTM cell state. When given, the LSTM starts from
                ``[hidden, cell]`` so the recurrent state is carried across
                calls. When omitted the LSTM starts from its default initial
                state and ``hidden`` only influences the result through
                attention.

        Returns:
            ``(logits, state_h, state_c, attention_weights)`` where ``logits``
            has the time axis folded into the first axis.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        # Prepend the context features to the token embedding features.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        if cell is None:
            output, state_h, state_c = self.lstm(x)
        else:
            output, state_h, state_c = self.lstm(x, initial_state=[hidden, cell])
        # Fold (batch, time, units) into (batch * time, units) before the
        # vocabulary projection.
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state_h, state_c, attention_weights

# Initialize models
BATCH_SIZE = 64
EPOCHS = 10

# The summary tokenizer is capped at VOCAB_SIZE words, so ids VOCAB_SIZE and
# VOCAB_SIZE + 1 are free to mark the start and the end of a summary. The
# decoder vocabulary is widened by two to make room for them.
START_ID = VOCAB_SIZE
END_ID = VOCAB_SIZE + 1

encoder = Encoder(VOCAB_SIZE, EMBEDDING_DIM, UNITS, batch_sz=BATCH_SIZE)
decoder = Decoder(VOCAB_SIZE + 2, EMBEDDING_DIM, UNITS, batch_sz=BATCH_SIZE)

# =====================================
# 🚀 STEP 4: Train the Model
# =====================================
# Define optimizer and loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
optimizer = tf.keras.optimizers.Adam()


def add_summary_markers(sequences):
    """Wrap post-padded id rows as ``START_ID, tokens..., END_ID, padding``.

    Args:
        sequences: 2-D integer array whose rows are padded with 0 at the end.

    Returns:
        An int32 array two columns wider than ``sequences``.
    """
    sequences = np.asarray(sequences)
    # Real token ids are >= 1, so the non-zero count is the row's true length.
    lengths = np.count_nonzero(sequences, axis=1)
    framed = np.zeros((sequences.shape[0], sequences.shape[1] + 2), dtype=np.int32)
    framed[:, 0] = START_ID
    framed[:, 1:-1] = sequences
    framed[np.arange(sequences.shape[0]), lengths + 1] = END_ID
    return framed


def masked_loss(targets, logits):
    """Return (summed loss over non-padding targets, number of such targets)."""
    mask = tf.cast(tf.not_equal(targets, 0), logits.dtype)
    per_token = loss_object(targets, logits) * mask
    return tf.reduce_sum(per_token), tf.reduce_sum(mask)


@tf.function
def train_step(articles, targets):
    """Run one teacher-forced update on a batch and return its mean token loss."""
    with tf.GradientTape() as tape:
        enc_output, dec_hidden, _ = encoder(articles)
        loss_sum = tf.constant(0.0)
        token_count = tf.constant(0.0)
        # Teacher forcing: the gold token at t - 1 is the input used to
        # predict the gold token at t.
        for t in range(1, targets.shape[1]):
            dec_input = tf.expand_dims(targets[:, t - 1], 1)
            logits, dec_hidden, _, _ = decoder(dec_input, dec_hidden, enc_output)
            step_loss, step_tokens = masked_loss(targets[:, t], logits)
            loss_sum += step_loss
            token_count += step_tokens
        batch_loss = loss_sum / tf.maximum(token_count, 1.0)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(batch_loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss


# Mini-batches keep memory bounded; the full training set does not fit through
# the encoder in a single call.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, add_summary_markers(y_train)))
    .shuffle(10_000)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

# Training loop
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    num_batches = 0
    for articles, targets in train_dataset:
        epoch_loss += float(train_step(articles, targets))
        num_batches += 1

    if num_batches:
        print(f"Epoch {epoch+1} mean loss: {epoch_loss / num_batches:.4f}")
    print(f"Epoch {epoch+1} completed.")

# =====================================
# 📊 STEP 5: Model Evaluation
# =====================================
def summarize(articles, max_len=MAX_LEN_SUMMARY):
    """Greedily decode one summary string per article.

    Args:
        articles: List of raw article strings.
        max_len: Maximum number of tokens to generate per summary.

    Returns:
        A list of decoded summary strings, one per input article.
    """
    sequences = pad_sequences(
        text_tokenizer.texts_to_sequences(articles), maxlen=MAX_LEN_TEXT, padding="post"
    )
    enc_output, dec_hidden, _ = encoder(sequences)

    dec_input = tf.fill([len(articles), 1], START_ID)
    finished = np.zeros(len(articles), dtype=bool)
    generated = [[] for _ in articles]

    for _ in range(max_len):
        logits, dec_hidden, _, _ = decoder(dec_input, dec_hidden, enc_output)
        next_ids = tf.argmax(logits, axis=-1, output_type=tf.int32)
        for row, token_id in enumerate(next_ids.numpy()):
            if finished[row]:
                continue
            if token_id in (END_ID, 0):
                # End marker or padding: this summary is complete.
                finished[row] = True
            elif token_id != START_ID:
                generated[row].append(int(token_id))
        if finished.all():
            break
        # Feed the model's own prediction back in as the next input.
        dec_input = tf.expand_dims(next_ids, 1)

    return summary_tokenizer.sequences_to_texts(generated)


# Summarize test articles
test_articles = ["Breaking news: A major earthquake hit California", "New AI technology is revolutionizing healthcare"]
summaries = summarize(test_articles)

print("Generated Summaries:", summaries)
