In [1]:
!pip install pandas nltk scikit-learn rouge -qq

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from rouge import Rouge
import os
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

True

In [3]:
# Define the BiLSTM model
class BiLSTMSummarizer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(BiLSTMSummarizer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim * 2, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.fc.out_features

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(src.device)

        embedded = self.embedding(src)
        enc_output, (hidden, cell) = self.encoder(embedded)

        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1).unsqueeze(0)
        cell = torch.cat((cell[-2,:,:], cell[-1,:,:]), dim=1).unsqueeze(0)

        input = trg[:, 0]

        for t in range(1, trg_len):
            input_embedded = self.embedding(input).unsqueeze(1)
            output, (hidden, cell) = self.decoder(input_embedded, (hidden, cell))
            prediction = self.fc(output.squeeze(1))
            outputs[:, t] = prediction

            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = prediction.argmax(1)
            input = trg[:, t] if teacher_force else top1

        return outputs

In [4]:
# Custom dataset class
class SummarizationDataset(Dataset):
    def __init__(self, articles, summaries, vocab, max_length=100):
        self.articles = articles
        self.summaries = summaries
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        article = self.articles[idx]
        summary = self.summaries[idx]

        article_indices = [self.vocab['<sos>']] + [self.vocab.get(token, self.vocab['<unk>']) for token in article][:self.max_length-2] + [self.vocab['<eos>']]
        summary_indices = [self.vocab['<sos>']] + [self.vocab.get(token, self.vocab['<unk>']) for token in summary][:self.max_length-2] + [self.vocab['<eos>']]

        article_indices = article_indices + [self.vocab['<pad>']] * (self.max_length - len(article_indices))
        summary_indices = summary_indices + [self.vocab['<pad>']] * (self.max_length - len(summary_indices))

        return torch.tensor(article_indices), torch.tensor(summary_indices)

In [5]:
file_path = r"/kaggle/input/summarizationdataset/hindi_news_dataset.csv"

def load_data(file_path):
    df = pd.read_csv(file_path)
    return df['Headline'].tolist(), df['Content'].tolist()

def tokenize(text):
    return word_tokenize(text.lower())

def build_vocab(texts, min_freq=2):
    word_freq = Counter()
    for text in texts:
        word_freq.update(text)

    vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    
    for word, freq in word_freq.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)

    return vocab, {v: k for k, v in vocab.items()} 

In [6]:
articles, summaries = load_data(file_path)

tokenized_articles = [tokenize(article) for article in articles]
tokenized_summaries = [tokenize(summary) for summary in summaries]

vocab, inv_vocab = build_vocab(tokenized_articles + tokenized_summaries)

train_articles, test_articles, train_summaries, test_summaries = train_test_split(tokenized_articles, tokenized_summaries, test_size=0.2, random_state=42)
train_articles, val_articles, train_summaries, val_summaries = train_test_split(train_articles, train_summaries, test_size=0.1, random_state=42)

In [7]:
train_dataset = SummarizationDataset(train_articles, train_summaries, vocab, max_length=50)
val_dataset = SummarizationDataset(val_articles, val_summaries, vocab, max_length=50)
test_dataset = SummarizationDataset(test_articles, test_summaries, vocab, max_length=50)


# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)
test_loader = DataLoader(test_dataset, batch_size=128)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

vocab_size = len(vocab)   
embedding_dim = 300      
hidden_dim = 512          
output_dim = vocab_size   

model = BiLSTMSummarizer(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)

In [9]:
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

model = model.to(device)

Using 2 GPUs!


In [10]:
def train(model, iterator, optimizer, criterion, device, clip=1, teacher_forcing_ratio=0.5):
    model.train()
    epoch_loss = 0
    for batch in tqdm(iterator, desc="Training"):
        src, trg = batch
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [11]:
def evaluate(model, iterator, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in tqdm(iterator, desc="Evaluating"):
            src, trg = batch
            src, trg = src.to(device), trg.to(device)

            output = model(src, trg, 0)

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [12]:
def beam_search(model, src, vocab, inv_vocab, beam_width=3, max_length=100, min_length=10, device='cpu'):
    model.eval()
    with torch.no_grad():
        embedded = model.embedding(src)  
        enc_output, (hidden, cell) = model.encoder(embedded) 

        # In case of bi-directional LSTM, combine the hidden states
        if model.encoder.bidirectional:
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)  
            cell = torch.cat((cell[-2, :, :], cell[-1, :, :]), dim=1)        
        else:
            hidden = hidden[-1, :, :] 
            cell = cell[-1, :, :]    

        hidden = hidden.unsqueeze(0)  
        cell = cell.unsqueeze(0)     

        beam = [([vocab['<sos>']], 0, hidden[:, 0:1, :], cell[:, 0:1, :])]
        complete_hypotheses = []

        for t in range(max_length):
            new_beam = []
            for seq, score, hidden, cell in beam:
                if seq[-1] == vocab['<eos>'] and len(seq) >= min_length:
                    complete_hypotheses.append((seq, score))
                    continue

                input = torch.LongTensor([seq[-1]]).unsqueeze(0).to(device)  
                input_embedded = model.embedding(input)

                output, (hidden, cell) = model.decoder(input_embedded, (hidden, cell))
                predictions = model.fc(output.squeeze(1)) 

                if len(seq) < min_length:
                    predictions[0][vocab['<eos>']] = float('-inf')

                top_preds = torch.topk(predictions, beam_width, dim=1)

                for i in range(beam_width):
                    new_seq = seq + [top_preds.indices[0][i].item()]
                    new_score = score - top_preds.values[0][i].item() 
                    new_hidden = hidden.clone()
                    new_cell = cell.clone()
                    new_beam.append((new_seq, new_score, new_hidden, new_cell))

            beam = sorted(new_beam, key=lambda x: x[1])[:beam_width]

            if len(complete_hypotheses) >= beam_width:
                break

        complete_hypotheses = sorted(complete_hypotheses, key=lambda x: x[1])
        if complete_hypotheses:
            best_seq = complete_hypotheses[0][0]
        else:
            best_seq = beam[0][0]

    return [inv_vocab[idx] for idx in best_seq if idx not in [vocab['<sos>'], vocab['<eos>'], vocab['<pad>']]]

In [13]:
def save_model(model, vocab, filepath):
    torch.save({
        'model_state_dict': model.state_dict(),
        'vocab': vocab
    }, filepath)
    print(f"Model saved to {'/kaggle/working/'}")

In [14]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

In [15]:
num_epochs = 10
best_val_loss = float('inf')
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss = evaluate(model, val_loader, criterion, device)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {val_loss:.3f}')

    # Save model if validation loss improves
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        save_model(model, vocab, 'best_model.pth')

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Training: 100%|██████████| 1044/1044 [34:30<00:00,  1.98s/it]
Evaluating: 100%|██████████| 116/116 [01:56<00:00,  1.00s/it]


Epoch: 01
	Train Loss: 5.738
	 Val. Loss: 6.311
Model saved to /kaggle/working/


Training: 100%|██████████| 1044/1044 [34:30<00:00,  1.98s/it]
Evaluating: 100%|██████████| 116/116 [01:55<00:00,  1.00it/s]


Epoch: 02
	Train Loss: 3.955
	 Val. Loss: 5.722
Model saved to /kaggle/working/


Training: 100%|██████████| 1044/1044 [34:13<00:00,  1.97s/it]
Evaluating: 100%|██████████| 116/116 [01:54<00:00,  1.01it/s]


Epoch: 03
	Train Loss: 2.923
	 Val. Loss: 4.867
Model saved to /kaggle/working/


Training: 100%|██████████| 1044/1044 [34:37<00:00,  1.99s/it]
Evaluating: 100%|██████████| 116/116 [01:57<00:00,  1.01s/it]


Epoch: 04
	Train Loss: 2.284
	 Val. Loss: 4.189
Model saved to /kaggle/working/


Training: 100%|██████████| 1044/1044 [34:48<00:00,  2.00s/it]
Evaluating: 100%|██████████| 116/116 [01:58<00:00,  1.02s/it]


Epoch: 05
	Train Loss: 1.863
	 Val. Loss: 3.739
Model saved to /kaggle/working/


Training: 100%|██████████| 1044/1044 [34:45<00:00,  2.00s/it]
Evaluating: 100%|██████████| 116/116 [01:57<00:00,  1.01s/it]


Epoch: 06
	Train Loss: 1.556
	 Val. Loss: 3.359
Model saved to /kaggle/working/


Training: 100%|██████████| 1044/1044 [34:49<00:00,  2.00s/it]
Evaluating: 100%|██████████| 116/116 [01:58<00:00,  1.02s/it]


Epoch: 07
	Train Loss: 1.323
	 Val. Loss: 3.081
Model saved to /kaggle/working/


Training: 100%|██████████| 1044/1044 [34:52<00:00,  2.00s/it]
Evaluating: 100%|██████████| 116/116 [01:58<00:00,  1.02s/it]


Epoch: 08
	Train Loss: 1.140
	 Val. Loss: 2.849
Model saved to /kaggle/working/


Training: 100%|██████████| 1044/1044 [34:51<00:00,  2.00s/it]
Evaluating: 100%|██████████| 116/116 [01:58<00:00,  1.02s/it]


Epoch: 09
	Train Loss: 0.996
	 Val. Loss: 2.675
Model saved to /kaggle/working/


Training: 100%|██████████| 1044/1044 [34:54<00:00,  2.01s/it]
Evaluating: 100%|██████████| 116/116 [01:59<00:00,  1.03s/it]


Epoch: 10
	Train Loss: 0.874
	 Val. Loss: 2.505
Model saved to /kaggle/working/


In [16]:
def load_model(filepath, device):
    checkpoint = torch.load(filepath, map_location=device)
    vocab = checkpoint['vocab']
    model = BiLSTMSummarizer(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    return model, checkpoint

In [24]:
def load_model(path, device):
    checkpoint = torch.load(path, map_location=device)
    state_dict = checkpoint['model_state_dict']

    new_state_dict = {}
    for k, v in state_dict.items():
        new_key = k.replace('module.', '') if 'module.' in k else k
        new_state_dict[new_key] = v

    model = BiLSTMSummarizer(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)
    model.load_state_dict(new_state_dict)
    model.to(device)

    optimizer_state = checkpoint.get('optimizer_state_dict', None)

    return model, optimizer_state


In [25]:
best_model, _ = load_model('best_model.pth', device)
best_model.eval() 

# Evaluate the model
test_loss = evaluate(best_model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.3f}')

  checkpoint = torch.load(path, map_location=device)
Evaluating: 100%|██████████| 290/290 [06:15<00:00,  1.29s/it]

Test Loss: 2.477





In [26]:
rouge = Rouge()
best_model.eval() 

predictions = []
references = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Generating summaries"):
        src, trg = batch
        src = src.to(device)

        # Generate summary using beam search
        pred = beam_search(best_model, src, vocab, inv_vocab, min_length=10, device=device)
        predictions.append(' '.join(pred))

        # Prepare reference text
        reference = ' '.join([inv_vocab[idx.item()] for idx in trg[0] if idx.item() not in [vocab['<sos>'], vocab['<eos>'], vocab['<pad>']]])
        references.append(reference)

# Ensure all predictions meet the minimum length
min_length = 10  # Set your desired minimum length
predictions = [p if len(p.split()) >= min_length else p + ' ' + ' '.join(['<pad>'] * (min_length - len(p.split()))) for p in predictions]

# Compute ROUGE scores
scores = rouge.get_scores(predictions, references, avg=True)
print("ROUGE scores:")
print(scores)

Generating summaries: 100%|██████████| 290/290 [01:16<00:00,  3.82it/s]


ROUGE scores:
{'rouge-1': {'r': 0.797824581437502, 'p': 0.8224941994409078, 'f': 0.8085761290905409}, 'rouge-2': {'r': 0.7151748893102365, 'p': 0.7227557298798698, 'f': 0.7185470128082455}, 'rouge-l': {'r': 0.7767887418993216, 'p': 0.798882878044646, 'f': 0.7864074920678197}}


In [28]:
def summarize_text(model, vocab, inv_vocab, text, max_length=100, min_length=10, beam_width=3, device='cpu', debug=False):
    model.eval()
    tokens = tokenize(text)[:max_length]
    indices = [vocab['<sos>']] + [vocab.get(token, vocab['<unk>']) for token in tokens] + [vocab['<eos>']]
    src = torch.LongTensor(indices).unsqueeze(0).to(device)

    summary = beam_search(model, src, vocab, inv_vocab, beam_width, max_length, min_length, device)

    if debug:
        print("Input tokens:", tokens)
        print("Input indices:", indices)
        print("Generated indices:", [vocab[word] for word in summary])
        print("Summary length:", len(summary))

    return ' '.join(summary)

In [32]:
input_text = "चंद्रमा पर चंद्रयान-3 की सफल लैंडिंग के बाद, इसरो के पूर्व प्रमुख शरद ने खुशी व्यक्त की। उन्होंने कहा कि इस ऐतिहासिक उपलब्धि के लिए पूर्व प्रधानमंत्री नरेंद्र मोदी ने सोशल मीडिया पर अपनी बधाई दी। उन्होंने कहा कि अमेरिका ने भी इसरो को चंद्रयान-3 की सफल लैंडिंग पर बधाई दी है, जो भारत की अंतरिक्ष प्रगति का महत्वपूर्ण संकेत है।"
summary = summarize_text(trained_model, vocab, inv_vocab, input_text, min_length=10, device=device, debug=True)
print("Generated Summary:")
print(summary)
print("Summary length:", len(summary.split()))

Input tokens: ['भारतीय', 'अंतरिक्ष', 'अनुसंधान', 'संगठन', '(', 'isro', ')', 'ने', 'चंद्रयान-3', 'मिशन', 'को', 'सफलतापूर्वक', 'लॉन्च', 'किया', ',', 'जिसका', 'उद्देश्य', 'चंद्रमा', 'के', 'दक्षिणी', 'ध्रुव', 'पर', 'सुरक्षित', 'लैंडिंग', 'करना', 'है।', 'इस', 'मिशन', 'से', 'भारत', 'को', 'चंद्रमा', 'के', 'बारे', 'में', 'नई', 'जानकारी', 'प्राप्त', 'करने', 'और', 'अंतरिक्ष', 'अन्वेषण', 'में', 'अपनी', 'स्थिति', 'मजबूत', 'करने', 'की', 'उम्मीद', 'है।']
Input indices: [2, 392, 4036, 22731, 270, 6447, 17048, 6449, 83, 2231, 1410, 76, 4137, 2085, 32, 56, 1331, 9140, 2232, 12, 3266, 3267, 98, 512, 2235, 972, 28714, 1235, 1410, 37, 87, 76, 2232, 12, 4196, 10, 1480, 4197, 15901, 36, 73, 4036, 30152, 10, 405, 44, 13319, 36, 8, 2224, 28714, 3]
Generated indices: [2232, 8, 2234, 2235, 12, 30, 3180, 12, 355, 802, 2654, 83, 83, 211, 41, 852, 355, 5457, 6531, 26, 83, 87, 76, 4036, 983, 6447, 355, 10, 4606, 6449, 98, 1525, 56, 28715, 3180, 83, 2231, 8, 2235, 12, 4196, 10, 371, 12, 30, 634, 83, 747, 39, 65]
Sum