In [None]:
!pip install transformers
!pip install torch
!pip install wikipedia-api


Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 5, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main.py", line 3, in <module>
    import locale
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 879, in exec_module
  File "<frozen importlib._bootstrap_external>", line 1012, in get_code
  File "<frozen importlib._bootstrap_external>", line 672, in _compile_bytecode
KeyboardInterrupt
^C
Collecting wikipedia-api
  Downloading Wikipedia_API-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.6.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizerFast, BertModel
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import wikipediaapi
from bs4 import BeautifulSoup

# Define the keyword extractor model
class KeywordExtractor(nn.Module):
    """Token-level tagger: frozen BERT encoder -> BiLSTM -> linear head.

    Alongside the per-token logits, the module computes additive attention
    weights over the BiLSTM states. The weights are returned for inspection;
    they do not feed into the token logits.

    Args:
        bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
        lstm_hidden_dim: Hidden size of each LSTM direction.
        attention_units: Width of the attention scoring layer.
        output_dim: Number of logits produced per token.
    """

    def __init__(self, bert_model_name, lstm_hidden_dim, attention_units, output_dim):
        super(KeywordExtractor, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.lstm = nn.LSTM(self.bert.config.hidden_size, lstm_hidden_dim, bidirectional=True, batch_first=True)
        # The LSTM is bidirectional, so every downstream layer sees 2 * hidden.
        self.W1 = nn.Linear(lstm_hidden_dim * 2, attention_units)
        self.W2 = nn.Linear(attention_units, 1)
        self.fc = nn.Linear(lstm_hidden_dim * 2, output_dim)

    def forward(self, input_ids, attention_mask):
        """Run the tagger on a batch.

        Args:
            input_ids: Token ids, forwarded to BERT.
            attention_mask: 1 for real tokens and 0 for padding (may be
                ``None``, in which case no position is masked).

        Returns:
            ``(output, attention_weights)`` where ``output`` holds one row of
            ``output_dim`` logits per token and ``attention_weights`` is a
            distribution over the sequence positions of each example.
        """
        # BERT acts as a fixed feature extractor: no gradients flow into it.
        with torch.no_grad():
            bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state
        H, _ = self.lstm(sequence_output)

        u = torch.tanh(self.W1(H))
        attention_scores = self.W2(u).squeeze(-1)
        if attention_mask is not None:
            # Padding must not receive attention mass. The dtype's most
            # negative finite value is used instead of -inf so a fully padded
            # row yields a uniform distribution rather than NaNs.
            fill_value = torch.finfo(attention_scores.dtype).min
            attention_scores = attention_scores.masked_fill(attention_mask == 0, fill_value)
        attention_weights = torch.softmax(attention_scores, dim=-1)

        output = self.fc(H)  # Token-level classification
        return output, attention_weights

# Preprocess text
def preprocess_text(text, tokenizer, max_seq_length=512):
    """Tokenize ``text`` into a fixed-length encoding.

    The tokenizer is asked to pad and truncate to ``max_seq_length`` and to
    return PyTorch tensors (``return_tensors='pt'``).

    Returns:
        ``(input_ids, attention_mask)`` as read from the tokenizer's result.
    """
    encoded = tokenizer(
        text,
        max_length=max_seq_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    return encoded.input_ids, encoded.attention_mask

# Load the provided dataset
def load_dataset():
    """Return the built-in labelled sample sentences.

    Each entry is a dict with a ``"text"`` sentence and a ``"labels"`` list
    holding one 0/1 flag per whitespace-separated word (1 marks a keyword).
    """
    # Sample dataset of 10 labeled sentences; every label list has exactly one
    # entry per word of its sentence.
    dataset = [
    {"text": "Machine learning algorithms can classify data efficiently", "labels": [1, 1, 1, 0, 1, 0, 0]},
    {"text": "Deep learning models can learn from large amounts of unlabeled data", "labels": [1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]},
    {"text": "Supervised learning requires labeled datasets for training", "labels": [1, 1, 0, 1, 0, 0, 0]},
    {"text": "Neural networks consist of interconnected layers of nodes", "labels": [1, 1, 0, 0, 1, 0, 0, 1]},
    {"text": "Unsupervised learning techniques aim to find hidden patterns in data", "labels": [1, 1, 0, 0, 0, 0, 0, 1, 0, 0]},
    {"text": "Support vector machines are effective for classification tasks", "labels": [1, 1, 1, 0, 0, 0, 1, 0]},
    {"text": "Reinforcement learning agents learn from feedback received from their environment", "labels": [1, 1, 0, 0, 0, 1, 0, 0, 0, 0]},
    {"text": "Feature engineering is crucial for improving model performance", "labels": [1, 1, 0, 0, 0, 0, 0, 1]},
    {"text": "Decision trees are a type of supervised learning algorithm", "labels": [1, 1, 0, 0, 0, 0, 1, 1, 0]},
    {"text": "Natural language processing enables computers to understand human language", "labels": [1, 1, 1, 0, 0, 0, 0, 1, 1]}]

    return dataset

# Combine datasets
def combine_datasets(wikipedia_articles, labeled_sentences):
    """Concatenate two collections into one new list.

    Items from ``wikipedia_articles`` come first, followed by the items from
    ``labeled_sentences``; neither input is modified.
    """
    return [*wikipedia_articles, *labeled_sentences]

# Train the model
def train_model(model, criterion, optimizer, train_data, tokenizer, max_seq_length=512, num_epochs=5, batch_size=8):
    """Train ``model`` for token tagging and print per-epoch losses.

    ``train_data`` is split 80/20 (fixed ``random_state=42``) into training
    and validation subsets, each wrapped in a ``CustomDataset``. The model is
    moved to CUDA when available, otherwise it stays on the CPU.

    The loss is evaluated only on positions whose attention mask is 1.
    Sequences are padded to ``max_seq_length``, so including padding (always
    labelled 0) would let it dominate the objective.

    Args:
        model: Module returning ``(logits, attention_weights)``.
        criterion: Loss applied to flattened logits and labels.
        optimizer: Optimizer stepping the model's parameters.
        train_data: Examples accepted by ``CustomDataset``.
        tokenizer: Tokenizer handed to ``CustomDataset``.
        max_seq_length: Padded/truncated sequence length.
        num_epochs: Number of passes over the training subset.
        batch_size: Mini-batch size for both loaders.
    """
    # Split data into train and validation sets
    train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

    train_dataset = CustomDataset(train_data, tokenizer, max_seq_length)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    val_dataset = CustomDataset(val_data, tokenizer, max_seq_length)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        for batch in train_loader:
            batch_input_ids = batch['input_ids'].to(device)
            batch_attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs, _ = model(batch_input_ids, batch_attention_mask)
            loss = _masked_token_loss(criterion, outputs, batch_labels, batch_attention_mask)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch figure is a per-example mean
            # even when the last batch is smaller.
            epoch_loss += loss.item() * batch_input_ids.size(0)

        print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {epoch_loss / len(train_dataset)}')

        # Evaluate on validation set
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                batch_input_ids = batch['input_ids'].to(device)
                batch_attention_mask = batch['attention_mask'].to(device)
                batch_labels = batch['labels'].to(device)

                outputs, _ = model(batch_input_ids, batch_attention_mask)
                loss = _masked_token_loss(criterion, outputs, batch_labels, batch_attention_mask)
                val_loss += loss.item() * batch_input_ids.size(0)

        print(f'Epoch {epoch + 1}/{num_epochs}, Val Loss: {val_loss / len(val_dataset)}')


def _masked_token_loss(criterion, logits, labels, attention_mask):
    """Apply ``criterion`` to the non-padding token positions only."""
    active = attention_mask.reshape(-1).bool()
    return criterion(logits.reshape(-1)[active], labels.reshape(-1)[active])

# Custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    """Dataset turning ``{'text', 'labels'}`` examples into model inputs.

    ``labels`` holds one value per whitespace-separated word of ``text``.
    Each item is tokenized to exactly ``max_seq_length`` positions and the
    word-level labels are spread over the resulting tokens, so the label
    tensor always has the same length as ``input_ids``.

    Args:
        data: Iterable of examples. Dicts are used as-is; anything else is
            converted to ``{'text': str(item), 'labels': []}``.
        tokenizer: Hugging Face style tokenizer. A "fast" tokenizer is needed
            for word-to-token alignment; otherwise labels fall back to a
            positional layout.
        max_seq_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_seq_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

        # Convert each element of data into a dictionary
        self.process_data()

    def process_data(self):
        """Normalise ``self.data`` so every element is a dict."""
        processed_data = []
        for item in self.data:
            # Convert item to a dictionary if it's not already
            if isinstance(item, dict):
                processed_data.append(item)
            else:
                processed_data.append({'text': str(item), 'labels': []})  # Assuming labels are empty for non-dictionary items
        self.data = processed_data

    def __len__(self):
        return len(self.data)

    def _token_labels(self, encoding, word_labels):
        """Expand per-word labels to one label per token position.

        Special and padding tokens, and words without a label, get 0. Every
        word-piece of a word inherits that word's label.
        """
        try:
            word_ids = encoding.word_ids(0)
        except (AttributeError, ValueError):
            # Alignment info is unavailable (e.g. a non-fast tokenizer): keep
            # the positional layout, clipped so the length never exceeds
            # max_seq_length and batches can still be stacked.
            clipped = word_labels[:self.max_seq_length]
            return clipped + [0] * (self.max_seq_length - len(clipped))

        return [
            word_labels[word_idx] if word_idx is not None and word_idx < len(word_labels) else 0
            for word_idx in word_ids
        ]

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for one example."""
        example = self.data[idx]

        # Tokenize the whitespace-split words so that word indices reported by
        # the tokenizer line up with the positions of the word-level labels.
        words = example['text'].split()
        word_labels = list(example['labels'])

        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_seq_length,
            return_tensors='pt',
        )
        # Drop only the leading batch axis added by return_tensors='pt'.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        label_tensor = torch.tensor(self._token_labels(encoding, word_labels), dtype=torch.float32)

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': label_tensor}






# Extract keywords
def extract_keywords(text, model, tokenizer, threshold=0, max_seq_length=512):
    """Score every token of ``text`` and return those above ``threshold``.

    Padding positions (attention mask 0) and the tokenizer's special tokens
    are never reported, since they are not part of the input text.

    Args:
        text: Raw input string.
        model: Module returning ``(logits, attention_weights)``.
        tokenizer: Tokenizer used by ``preprocess_text``; must also provide
            ``convert_ids_to_tokens``.
        threshold: Minimum sigmoid score (exclusive) for a token to be kept.
        max_seq_length: Padded/truncated sequence length.

    Returns:
        List of ``(token, score)`` tuples in sequence order. The list is also
        printed.
    """
    model.eval()
    # Run on whichever device the model already lives on, so inputs and
    # weights cannot end up on different devices.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids, attention_mask = preprocess_text(text, tokenizer, max_seq_length)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    with torch.no_grad():
        output, _ = model(input_ids, attention_mask)
    scores = torch.sigmoid(output).reshape(-1).tolist()

    tokens = tokenizer.convert_ids_to_tokens(input_ids.reshape(-1).tolist())
    is_real = attention_mask.reshape(-1).bool().tolist()
    special_tokens = set(getattr(tokenizer, 'all_special_tokens', ()))

    # Get the tokens exceeding the threshold
    keywords = [
        (token, score)
        for token, score, real in zip(tokens, scores, is_real)
        if real and token not in special_tokens and score > threshold
    ]

    print("Keywords:", keywords)

    return keywords
# Function to scrape Wikipedia articles
def scrape_wikipedia_articles(categories, max_articles=10):
    """Collect article texts from the given Wikipedia categories.

    Categories are visited in order and only main-namespace pages
    (``ns == 0``) with non-empty text are kept. Collection stops once
    ``max_articles`` articles have been gathered, without touching the
    remaining categories.

    Args:
        categories: Category names without the ``Category:`` prefix.
        max_articles: Upper bound on the number of returned articles.

    Returns:
        ``(articles, labels)``. ``articles`` is a list of
        ``{'text', 'labels'}`` dicts and ``labels`` is a parallel list of
        label lists. All labels are placeholder zeros, one per
        whitespace-separated token.
    """
    wiki_wiki = wikipediaapi.Wikipedia('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
    articles = []
    labels = []

    for category in categories:
        # Quota already met: skip looking up any further categories.
        if len(articles) >= max_articles:
            break

        cat = wiki_wiki.page(f"Category:{category}")
        for page in cat.categorymembers.values():
            if len(articles) >= max_articles:
                break
            if page.ns != 0:  # only get articles (namespace 0)
                continue

            # Strip any markup that may be present in the page text.
            cleaned_text = BeautifulSoup(page.text, 'html.parser').get_text()
            tokens = cleaned_text.split()
            if len(tokens) > 0:
                articles.append({'text': cleaned_text, 'labels': [0] * len(tokens)})  # Placeholder labels
                labels.append([0] * len(tokens))  # Placeholder labels

    return articles, labels


def main(custom_text):
    """Run the two-stage pipeline and print the top keywords of ``custom_text``.

    Stage 1 trains on scraped Wikipedia articles (placeholder labels),
    stage 2 fine-tunes on the built-in labelled sentences, and finally the
    ten highest-scoring tokens of ``custom_text`` are printed.
    """
    # Stage 1: Pre-training on Wikipedia Data
    bert_model_name = 'bert-base-uncased'
    lstm_hidden_dim = 128
    attention_units = 64
    output_dim = 1
    num_epochs_stage1 = 5
    # Previously referenced without being defined (NameError at stage 2).
    # NOTE(review): 5 mirrors stage 1 and train_model's default — confirm.
    num_epochs_stage2 = 5
    batch_size = 8

    # Load BERT tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)

    # Scrape Wikipedia articles
    categories = ["Machine_learning", "Natural_language_processing"]
    # Each article dict already carries its labels; the parallel label list
    # must not be mixed into the training examples, where it would be
    # stringified and treated as text.
    articles, _ = scrape_wikipedia_articles(categories, max_articles=1000)

    # Initialize model, criterion, and optimizer
    model = KeywordExtractor(bert_model_name, lstm_hidden_dim, attention_units, output_dim)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train the model on Wikipedia data
    if len(articles) >= 2:
        train_model(model, criterion, optimizer, articles, tokenizer, num_epochs=num_epochs_stage1, batch_size=batch_size)
    else:
        # The train/validation split needs at least two examples.
        print("Skipping stage 1: fewer than two Wikipedia articles were retrieved")

    # Stage 2: Fine-tuning on Custom Data
    # Load the provided dataset
    labeled_sentences = load_dataset()

    # Fine-tune the model on custom data
    train_model(model, criterion, optimizer, labeled_sentences, tokenizer, num_epochs=num_epochs_stage2, batch_size=batch_size)

    # Extract keywords
    keywords = extract_keywords(custom_text, model, tokenizer, threshold=0, max_seq_length=512)

    # Sort the keywords by descending order of importance
    sorted_keywords = sorted(keywords, key=lambda x: x[1], reverse=True)
    top_keywords = sorted_keywords[:10]

    print("Top Keywords:", top_keywords)

if __name__ == "__main__":
    # Single entry point: a second, duplicated guard used to run the whole
    # pipeline (scraping plus both training stages) a second time.
    custom_text = "Before Machine learning was invented, many tasks were difficult"
    main(custom_text)
