In [None]:
# Install into the running kernel's environment (%pip targets the kernel, !pip may not).
# beautifulsoup4 and scikit-learn are included because bs4 and sklearn are imported below.
%pip install transformers torch wikipedia-api beautifulsoup4 scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
from torch.utils.data import DataLoader


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizerFast, BertModel
from sklearn.model_selection import train_test_split
import wikipediaapi
from bs4 import BeautifulSoup

# Define the keyword extractor model
class KeywordExtractor(nn.Module):
    """Token-level keyword tagger: BERT features -> BiLSTM -> per-token linear head.

    An additive attention distribution over the sequence is also computed and
    returned for inspection; it does not feed into the token logits.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        lstm_hidden_dim: Hidden size of each LSTM direction.
        attention_units: Width of the hidden layer of the attention scorer.
        output_dim: Number of logits produced per token.
    """

    def __init__(self, bert_model_name, lstm_hidden_dim, attention_units, output_dim):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.lstm = nn.LSTM(
            self.bert.config.hidden_size,
            lstm_hidden_dim,
            bidirectional=True,
            batch_first=True,
        )
        # The LSTM is bidirectional, so each token vector is 2 * lstm_hidden_dim wide.
        self.W1 = nn.Linear(lstm_hidden_dim * 2, attention_units)
        self.W2 = nn.Linear(attention_units, 1)
        self.fc = nn.Linear(lstm_hidden_dim * 2, output_dim)

    def forward(self, input_ids, attention_mask):
        """Compute per-token logits and an attention distribution over tokens.

        Args:
            input_ids: Token ids, one row per sequence.
            attention_mask: Same shape as ``input_ids``; 0 marks padding. May be
                ``None``, in which case no position is excluded from attention.

        Returns:
            tuple: ``(output, attention_weights)`` where ``output`` is
            ``self.fc`` applied to every LSTM state and ``attention_weights``
            is a softmax over the sequence dimension.
        """
        # BERT runs under no_grad, so it acts as a fixed feature extractor here.
        with torch.no_grad():
            bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state
        H, _ = self.lstm(sequence_output)

        u = torch.tanh(self.W1(H))
        attention_scores = self.W2(u).squeeze(-1)
        if attention_mask is not None:
            # Keep padding out of the attention distribution. The most negative
            # finite value is used instead of -inf so that a fully padded row
            # still yields finite (uniform) weights rather than NaN.
            lowest = torch.finfo(attention_scores.dtype).min
            attention_scores = attention_scores.masked_fill(attention_mask == 0, lowest)
        attention_weights = torch.softmax(attention_scores, dim=-1)

        output = self.fc(H)  # Token-level classification
        return output, attention_weights

# Preprocess text
def preprocess_text(text, tokenizer, max_seq_length=512):
    """Tokenize ``text`` into fixed-length PyTorch tensors.

    The tokenizer is asked to pad to ``max_seq_length``, truncate anything
    longer, and return ``'pt'`` tensors.

    Returns:
        tuple: ``(input_ids, attention_mask)`` read from the tokenizer output.
    """
    encoded = tokenizer(
        text,
        max_length=max_seq_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    return encoded.input_ids, encoded.attention_mask


# Scrape Wikipedia articles and extract keywords
def _normalize_word(word):
    """Lower-case ``word`` and strip surrounding punctuation ('Learning,' -> 'learning')."""
    return word.strip(".,;:!?\"'()[]{}").lower()


def _label_words(text, link_texts):
    """Return one 0/1 label per whitespace-separated word of ``text``.

    A word gets label 1 when it belongs to a contiguous run of words that
    equals the words of one of ``link_texts`` after normalisation. This lets
    multi-word links and words followed by punctuation match.
    """
    words = [_normalize_word(word) for word in text.split()]
    word_labels = [0] * len(words)
    for link_text in link_texts:
        phrase = [_normalize_word(word) for word in link_text.split()]
        if not any(phrase):
            # Empty or punctuation-only link text carries no keyword.
            continue
        span = len(phrase)
        for start in range(len(words) - span + 1):
            if words[start:start + span] == phrase:
                word_labels[start:start + span] = [1] * span
    return word_labels


def scrape_wikipedia_articles(categories, max_articles=10, max_seq_length=512):
    """Collect intro paragraphs from Wikipedia categories with word-level keyword labels.

    For each category, up to ``max_articles`` member pages (sub-categories are
    skipped) contribute the text of their first non-empty ``<p>`` element.
    Words of that paragraph that appear as hyperlink text inside it are
    labelled 1, all other words 0.

    Args:
        categories: Category names without the ``"Category:"`` prefix.
        max_articles: Maximum number of articles taken per category.
        max_seq_length: Label lists are truncated to this many entries.

    Returns:
        tuple: ``(articles, labels)`` - a list of paragraph strings and a
        parallel list of 0/1 lists with one entry per whitespace-separated word.
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    wiki_wiki = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.HTML, user_agent=user_agent)
    articles = []
    labels = []
    for category in categories:
        category_page = wiki_wiki.page("Category:" + category)
        count = 0
        for page in category_page.categorymembers.values():
            if count >= max_articles:
                break
            if not page.exists() or page.namespace == wikipediaapi.Namespace.CATEGORY:
                continue

            soup = BeautifulSoup(page.text, 'html.parser')
            # The first <p> can be an empty placeholder, so take the first one with text.
            first_paragraph = next(
                (p for p in soup.find_all('p') if p.get_text(strip=True)),
                None,
            )
            if first_paragraph is None:
                continue

            text = first_paragraph.get_text()
            # NOTE(review): confirm the HTML extract actually contains <a> tags;
            # if the API strips links, every label below will be 0.
            hyperlinks = [a.get_text() for a in first_paragraph.find_all('a')]
            articles.append(text)
            # Truncate labels to match input sequence length
            labels.append(_label_words(text, hyperlinks)[:max_seq_length])
            count += 1
    return articles, labels

# Train the model
def train_model(model, criterion, optimizer, train_data, train_labels, tokenizer, max_seq_length=512, num_epochs=5, batch_size=8):
    """Train ``model`` on labelled texts and report the average loss per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(logits, _)``. The logits are flattened and compared
            position by position with the labels, so one logit per token is
            expected.
        criterion: Loss called as ``criterion(logits, targets)`` on 1-D tensors.
        optimizer: Optimizer already bound to the model parameters.
        train_data: List of input texts.
        train_labels: List of label lists, parallel to ``train_data``.
        tokenizer: Tokenizer handed to ``CustomDataset``.
        max_seq_length: Sequence length handed to ``CustomDataset``.
        num_epochs: Number of passes over the data.
        batch_size: Mini-batch size.

    Returns:
        list: The average loss of each epoch, in order.

    Raises:
        ValueError: If there are no training examples.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()

    train_dataset = CustomDataset(train_data, train_labels, tokenizer, max_seq_length)
    if len(train_dataset) == 0:
        raise ValueError("train_model received no training examples")
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    epoch_losses = []
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for batch in train_loader:
            batch_input_ids = batch['input_ids'].to(device)
            batch_attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs, _ = model(batch_input_ids, batch_attention_mask)

            # Score real tokens only. Padding positions always carry label 0;
            # including them lets the loss collapse by predicting 0 everywhere.
            active = batch_attention_mask.reshape(-1) == 1
            loss = criterion(outputs.reshape(-1)[active], batch_labels.reshape(-1)[active])
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch figure is a per-example average.
            epoch_loss += loss.item() * batch_input_ids.size(0)

        mean_loss = epoch_loss / len(train_dataset)
        epoch_losses.append(mean_loss)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss}')

    return epoch_losses
class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenized texts with token-aligned keyword labels.

    ``labels[i]`` holds one 0/1 entry per whitespace-separated word of
    ``texts[i]``. Each item expands those word labels to one label per
    tokenizer token, so they line up with the model's per-token logits.

    Requires a tokenizer whose output provides ``word_ids()`` (the "fast"
    tokenizers in Hugging Face Transformers).

    Args:
        texts: List of input strings.
        labels: List of per-word label lists, parallel to ``texts``.
        tokenizer: Callable tokenizer.
        max_seq_length: Length every item is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_seq_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for item ``idx``."""
        text = self.texts[idx]
        word_labels = self.labels[idx]
        words = text.split()

        tokenizer_kwargs = dict(
            padding='max_length',
            truncation=True,
            max_length=self.max_seq_length,
            return_tensors='pt',
        )
        if words:
            # Tokenize the same whitespace words the labels were built from, so
            # word_ids() indexes straight into ``word_labels``.
            encoding = self.tokenizer(words, is_split_into_words=True, **tokenizer_kwargs)
            word_ids = encoding.word_ids(batch_index=0)
        else:
            # Nothing to label: tokenize the raw text and label every position 0.
            encoding = self.tokenizer(text, **tokenizer_kwargs)
            word_ids = None

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        if word_ids is None:
            word_ids = [None] * input_ids.shape[0]

        # Special and padding tokens have word id None and get label 0; every
        # sub-word piece inherits the label of the word it came from.
        token_labels = [
            float(word_labels[word_id]) if word_id is not None and word_id < len(word_labels) else 0.0
            for word_id in word_ids
        ]
        label_tensor = torch.tensor(token_labels, dtype=torch.float32)

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': label_tensor}



# Extract keywords
def extract_keywords(text, model, tokenizer, threshold=0, max_seq_length=512, verbose=False):
    """Score every token of ``text`` and return those above ``threshold``.

    Padding positions (attention mask 0) and the tokenizer's special tokens
    are never returned, whatever their score.

    Args:
        text: Input string.
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(logits, _)`` with one logit per token.
        tokenizer: Tokenizer used by ``preprocess_text``; must also provide
            ``convert_ids_to_tokens``.
        threshold: Minimum sigmoid score (exclusive) for a token to be kept.
        max_seq_length: Sequence length handed to ``preprocess_text``.
        verbose: When true, print the tensor shapes and the selected keywords.

    Returns:
        list: ``(token, score)`` tuples in sequence order.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Inputs are moved to ``device`` below, so the model has to be there too.
    model.to(device)
    model.eval()

    input_ids, attention_mask = preprocess_text(text, tokenizer, max_seq_length)
    if verbose:
        print("Input IDs shape:", input_ids.shape)
        print("Attention mask shape:", attention_mask.shape)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    with torch.no_grad():
        output, _ = model(input_ids, attention_mask)
    scores = torch.sigmoid(output).reshape(-1).tolist()

    tokens = tokenizer.convert_ids_to_tokens(input_ids.reshape(-1).tolist())
    is_real_token = attention_mask.reshape(-1).tolist()
    special_tokens = set(getattr(tokenizer, 'all_special_tokens', ()))

    # Get the tokens exceeding the threshold
    keywords = [
        (token, score)
        for token, score, real in zip(tokens, scores, is_real_token)
        if real and token not in special_tokens and score > threshold
    ]

    if verbose:
        print("Keywords:", keywords)

    return keywords


In [None]:
def main(custom_text):
    """Train a keyword extractor on Wikipedia intros and rank the tokens of ``custom_text``.

    Scrapes two Wikipedia categories, trains ``KeywordExtractor`` on 80% of
    the scraped paragraphs, scores every token of ``custom_text`` and prints
    the ten highest-scoring ones.

    Args:
        custom_text: Text to extract keywords from after training.

    Returns:
        list: Up to ten ``(token, score)`` tuples, highest score first.

    Raises:
        ValueError: If fewer than two articles could be scraped, which is too
            few to split into train and test sets.
    """
    # Define parameters
    bert_model_name = 'bert-base-uncased'
    lstm_hidden_dim = 128
    attention_units = 64
    output_dim = 1
    num_epochs = 10
    batch_size = 8

    # Fix the seed so weight initialisation and batch shuffling repeat across runs.
    torch.manual_seed(42)

    # Load BERT tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)

    # Scrape Wikipedia articles
    categories = ["Machine_learning", "Natural_language_processing"]
    articles, labels = scrape_wikipedia_articles(categories, max_articles=1000)
    if len(articles) < 2:
        raise ValueError(
            f"Scraping returned {len(articles)} article(s); at least 2 are needed to train."
        )

    # Split data into train and test sets (the test split is held out and not used here)
    train_data, _test_data, train_labels, _test_labels = train_test_split(
        articles, labels, test_size=0.2, random_state=42
    )

    # Initialize model, criterion, and optimizer
    model = KeywordExtractor(bert_model_name, lstm_hidden_dim, attention_units, output_dim)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train the model
    train_model(model, criterion, optimizer, train_data, train_labels, tokenizer, num_epochs=num_epochs, batch_size=batch_size)

    # Extract keywords; threshold 0 keeps every candidate so the ranking below decides.
    keywords = extract_keywords(custom_text, model, tokenizer, threshold=0, max_seq_length=512)

    # Sort the keywords by descending order of importance
    sorted_keywords = sorted(keywords, key=lambda x: x[1], reverse=True)
    top_keywords = sorted_keywords[:10]

    print("Top Keywords:", top_keywords)
    return top_keywords





In [None]:
# Run the full pipeline (scrape, train, extract) on one sample sentence.
if __name__ == "__main__":
    custom_text = (
        "BERT is a powerful model for natural language understanding. It has revolutionized many NLP tasks."
    )
    main(custom_text)

Epoch 1/10, Loss: 0.05450168617521924
Epoch 2/10, Loss: 0.00023904201208554596
Epoch 3/10, Loss: 0.00016148818786002996
Epoch 4/10, Loss: 0.00011995375005668298
Epoch 5/10, Loss: 9.324607530856408e-05
Epoch 6/10, Loss: 7.484527035600189e-05
Epoch 7/10, Loss: 6.130578082507056e-05
Epoch 8/10, Loss: 5.183335813569885e-05
Epoch 9/10, Loss: 4.436436642198204e-05
Epoch 10/10, Loss: 3.860919397173589e-05
Input IDs shape: torch.Size([1, 512])
Attention mask shape: torch.Size([1, 512])
predictions tensor([[[1.4485e-03],
         [2.3213e-04],
         [5.8635e-05],
         [3.7038e-05],
         [3.2713e-05],
         [3.2063e-05],
         [3.2949e-05],
         [3.1819e-05],
         [3.3533e-05],
         [3.1493e-05],
         [3.6804e-05],
         [3.2013e-05],
         [3.1181e-05],
         [3.0626e-05],
         [3.1761e-05],
         [3.3764e-05],
         [3.3396e-05],
         [3.2393e-05],
         [3.2240e-05],
         [3.5627e-05],
         [3.2459e-05],
         [2.9585e-05],

NameError: name 'keywords' is not defined