### Install newspaper3k

In [None]:
# newspaper3k provides the `Article` class that the dataset cell uses as a fallback scraper.
!pip install newspaper3k
# NOTE(review): presumably required because newspaper3k relies on lxml's HTML cleaner,
# which recent lxml releases ship as this separate package — confirm.
!pip install lxml_html_clean

## Dataset

In [4]:
from newspaper import Article
import requests
from transformers import BertTokenizer, BertModel
import torch
import json

# Loaded (tokenizer, model, device) triples keyed by model name. Loading BERT and
# moving it to the device is expensive, and the scraping loop calls
# preprocess_and_vectorize once per article, so each model is loaded only once.
_BERT_CACHE = {}


def _get_bert(model_name):
    """Return the cached (tokenizer, model, device) for `model_name`, loading it on first use."""
    if model_name not in _BERT_CACHE:
        # Load pre-trained BERT tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)

        # Move the model to GPU if available
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        model.eval()  # inference only: keep dropout disabled

        _BERT_CACHE[model_name] = (tokenizer, model, device)

    return _BERT_CACHE[model_name]


def preprocess_and_vectorize(text, model_name='bert-base-uncased', max_length=512):
    """
    Preprocesses the input text and converts it into a vector using BERT.

    The tokenizer and model are loaded once per `model_name` and reused on
    later calls.

    Args:
        text (str): The input text/document.
        model_name (str): The name of the pre-trained BERT model to use.
        max_length (int): Maximum length of the tokenized input; longer inputs
            are truncated and shorter ones are padded to this length.

    Returns:
        list: The [CLS] token embedding converted to a plain Python list
            (a flat list of floats for a single input string), so it can be
            serialized with `json`.
    """
    tokenizer, model, device = _get_bert(model_name)

    # Tokenize the input text
    inputs = tokenizer(
        text,
        return_tensors='pt',  # Return PyTorch tensors
        max_length=max_length,
        truncation=True,
        padding='max_length'
    )

    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get BERT embeddings
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)

    # Use the [CLS] token (position 0) embedding as the document representation;
    # squeeze() drops the batch dimension when a single text was passed.
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze()

    return cls_embedding.cpu().numpy().tolist()  # Convert to list for JSON serialization


# API and file setup
base_url = "https://content.guardianapis.com/search"
api_key = "test"  # Public API key (no authentication required)
# Raw string: the backslashes are literal characters of the path. Without the
# r-prefix, '\d' and '\p' are invalid escape sequences that Python warns about.
# NOTE(review): the model cell reads '/kaggle/working/documents_vectors.json';
# confirm that this output path is the intended one.
vector_file_path = r'\data\preprocessed\documents_vectors.json'
categories = ["culture", "sport", "technology", "science", "health", "world"]
articles_per_category = 1000  # Stop collecting a category once this many articles are saved
request_timeout_seconds = 30  # Avoid hanging forever on a stalled connection

# Open the file in append mode: one JSON object per line is added after whatever
# the file already contains, so re-running this cell appends further entries.
with open(vector_file_path, 'a', encoding='utf-8') as vector_file:
    for category in categories:
        page = 1
        total_articles_collected = 0

        while total_articles_collected < articles_per_category:
            params = {
                "section": category,
                "page-size": 50,  # Maximum allowed per request
                "page": page,
                "api-key": api_key,
                "show-fields": "body"  # Include full article content
            }
            try:
                response = requests.get(base_url, params=params, timeout=request_timeout_seconds)
            except requests.RequestException as e:
                # Network failure or timeout: give up on this category, as for a bad status code
                print(f"Failed to fetch articles for {category}: {e}")
                break

            if response.status_code != 200:
                print(f"Failed to fetch articles for {category}: {response.status_code}")
                break

            articles = response.json().get("response", {}).get("results", [])

            if not articles:
                break  # No more articles available

            for article in articles:
                url = article.get("webUrl")
                try:
                    # Extract the article content from the API response
                    content = article.get("fields", {}).get("body", "")

                    if not content:
                        # Fallback to newspaper3k if content is not available
                        news_article = Article(url)
                        news_article.download()
                        news_article.parse()
                        content = news_article.text

                    # Get the vector representation of the article
                    document_vector = preprocess_and_vectorize(content)

                    # Save the vector and metadata to the JSON file.
                    # 'id' is the running count within the current category.
                    vector_entry = {
                        'id': total_articles_collected + 1,
                        'vector': document_vector,
                        'category': category,
                        'url': url
                    }
                    json.dump(vector_entry, vector_file)
                    vector_file.write('\n')  # Add newline for each entry

                    total_articles_collected += 1

                    if total_articles_collected >= articles_per_category:
                        break  # Stop after collecting enough articles

                except Exception as e:
                    # Best effort: report the failing article and continue with the next one
                    print(f"Error processing article {url}: {e}")

            page += 1  # Move to the next page

        print(f"Collected {total_articles_collected} articles for category: {category}")

Collected 1000 articles for category: culture
Collected 1000 articles for category: sport
Collected 1000 articles for category: technology
Collected 1000 articles for category: science
Collected 0 articles for category: health
Collected 1000 articles for category: world


## The model

In [5]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.utils import to_categorical
from collections import Counter
import tensorflow as tf

def filter_categories_with_insufficient_samples(vectors, categories, min_samples=4):
    """
    Drop every sample whose category occurs fewer than `min_samples` times.

    Args:
        vectors (np.array): Document vectors, one row per sample.
        categories (np.array): Category label of each row in `vectors`.
        min_samples (int): Smallest number of samples a category must have to be kept.

    Returns:
        tuple: `(filtered_vectors, filtered_categories)` containing only the
            rows whose category met the threshold, in their original order.
    """
    occurrences = Counter(categories)

    # Labels that are frequent enough to keep
    kept_labels = [label for label in occurrences if occurrences[label] >= min_samples]

    # Boolean row selector shared by both arrays so they stay aligned
    keep_row = np.isin(categories, kept_labels)

    return vectors[keep_row], categories[keep_row]

def load_data_from_json(file_path):
    """
    Loads data (vectors and categories) from a JSON Lines file.

    Each non-blank line must be a JSON object with a 'vector' and a 'category'
    key. Blank lines (for example a trailing empty line) are skipped.

    Args:
        file_path (str): Path to the file, one JSON object per line.

    Returns:
        vectors (np.array): Array built from the 'vector' value of every entry.
        categories (np.array): Array of the corresponding 'category' values.
    """
    vectors = []
    categories = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # json.loads would raise on an empty line
            entry = json.loads(line)
            vectors.append(entry['vector'])
            categories.append(entry['category'])

    return np.array(vectors), np.array(categories)


def train_lstm_model(vectors, categories, num_classes=9, test_size=0.2, random_state=42):
    """
    Trains an LSTM model to predict the category of a document.

    Categories with too few samples are removed first (see
    `filter_categories_with_insufficient_samples`). The remaining labels are
    integer-encoded and one-hot encoded to `num_classes` columns, and the data
    is split with stratification. Each vector is fed to the LSTM as a sequence
    of length 1. The held-out split is passed to `fit` as validation data, so
    `history.history['val_*']` holds the held-out metrics per epoch.

    Args:
        vectors (np.array): Array of document vectors, one row per document.
        categories (np.array): Array of corresponding categories.
        num_classes (int): Width of the one-hot labels and of the softmax
            output layer. Must be at least the number of distinct categories
            that remain after filtering.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int): Random seed for reproducibility.

    Returns:
        model: Trained LSTM model.
        history: Training history.

    Raises:
        ValueError: If the data contains more distinct categories than `num_classes`.
    """
    # Filter out categories with insufficient samples
    vectors, categories = filter_categories_with_insufficient_samples(vectors, categories)

    # Encode categories into integers
    label_encoder = LabelEncoder()
    categories_encoded = label_encoder.fit_transform(categories)

    # Fail with a clear message instead of an indexing error inside to_categorical
    found_classes = len(label_encoder.classes_)
    if found_classes > num_classes:
        raise ValueError(
            f"Found {found_classes} categories but num_classes={num_classes}; "
            "num_classes must be at least the number of categories."
        )
    categories_one_hot = to_categorical(categories_encoded, num_classes=num_classes)

    # Split data into training and test sets, keeping class proportions equal in both
    X_train, X_test, y_train, y_test = train_test_split(
        vectors, categories_one_hot, test_size=test_size, stratify=categories, random_state=random_state
    )

    # Reshape input data for LSTM (samples, timesteps, features) with a single timestep
    X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
    X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

    # Build the model under MirroredStrategy so its variables are created in the
    # strategy's scope (replicated across the devices the strategy selects).
    strategy = tf.distribute.MirroredStrategy()

    with strategy.scope():
        # Build the LSTM model. An explicit Input layer replaces the deprecated
        # `input_shape=` argument on the first layer.
        model = Sequential()
        model.add(tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])))
        model.add(LSTM(128))
        model.add(Dropout(0.2))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(num_classes, activation='softmax'))

        # Compile the model
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model; the held-out split doubles as validation data
    history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

    return model, history


# Example usage
if __name__ == "__main__":
    # Load data from JSON file
    file_path = '/kaggle/working/documents_vectors.json'
    vectors, categories = load_data_from_json(file_path)

    print(Counter(categories))
    # Train the LSTM model
    model, history = train_lstm_model(vectors, categories)

    # Held-out metrics: train_lstm_model passes its test split as validation
    # data, so the last epoch's validation values are the final model's
    # results on data it was not trained on.
    test_loss = history.history['val_loss'][-1]
    test_accuracy = history.history['val_accuracy'][-1]
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Evaluate on the full dataset as well. This includes the training rows, so
    # it is reported separately and must not be read as a test score.
    # The same category filter used in training is applied first so that the
    # label encoding matches the one the model was trained with.
    eval_vectors, eval_categories = filter_categories_with_insufficient_samples(vectors, categories)
    full_loss, full_accuracy = model.evaluate(
        eval_vectors.reshape((eval_vectors.shape[0], 1, eval_vectors.shape[1])),
        to_categorical(LabelEncoder().fit_transform(eval_categories), num_classes=9)
    )
    print(f"Accuracy on full dataset (train + test): {full_accuracy:.4f}")

Counter({'culture': 1000, 'sport': 1000, 'technology': 1000, 'science': 1000, 'world': 1000})


  super().__init__(**kwargs)


Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 13ms/step - accuracy: 0.6108 - loss: 1.1071 - val_accuracy: 0.8920 - val_loss: 0.3279
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8906 - loss: 0.3318 - val_accuracy: 0.8920 - val_loss: 0.3429
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8976 - loss: 0.2848 - val_accuracy: 0.9180 - val_loss: 0.2513
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9278 - loss: 0.2144 - val_accuracy: 0.9160 - val_loss: 0.2608
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9281 - loss: 0.2015 - val_accuracy: 0.9120 - val_loss: 0.2555
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9375 - loss: 0.1953 - val_accuracy: 0.9100 - val_loss: 0.2807
Epoch 7/10
[1m125/125[0m 