In [1]:
from database_handling.DataDownload import DataDownloader
from database_handling.DataHandleAndOtherHelpers import DataHandler
from database_handling.DataUpload import DataUploader
from database_handling.KeycloakLogin import KeycloakLogin
from scrapers.SueddeutscheScraper import SueddeutscheScraper
from text_analysis.NEExtractor import NEExtractor
from text_analysis.Summarizer import Summarizer
from text_analysis.TopicExtractor import TopicExtractor
from text_analysis.Vectorizers import Vectorizer

In [2]:
import torch
import gc
import json

In [3]:
# Path to the JSON dump file holding the articles for this run
# (relative to the notebook's working directory).
json_file_path = 'sueddeutsche_articles.json'

# Decode explicitly as UTF-8: without `encoding=` Python falls back to the
# platform locale, which can mis-decode or reject non-ASCII characters.
with open(json_file_path, 'r', encoding='utf-8') as file:
    this_runs_articles = json.load(file)

In [4]:
def delete_and_free_memory(obj):
    """
    Drop this function's reference to ``obj`` and release unused memory.

    ``del obj`` only removes the local name inside this function. The object
    itself is freed only once the caller has dropped its own references too
    (e.g. ``del model`` in the calling scope), so callers must do that
    themselves for the memory to actually become reclaimable.

    Parameters:
    obj (object): The object whose local reference should be dropped.

    Returns:
    None
    """
    del obj  # Removes only the local reference; caller references are untouched
    # Collect first so objects kept alive only by reference cycles are
    # destroyed before the cache is emptied; otherwise the memory they free
    # would stay cached by PyTorch until the next call.
    gc.collect()
    torch.cuda.empty_cache()  # Hand unused cached GPU blocks back to the driver

In [5]:
def process_articles(processor_class, process_function_name, articles):
    """
    Process all articles using the specified processor class and function.

    A fresh processor is instantiated, the named method is called once with
    the whole ``articles`` object under ``torch.no_grad()``, and the processor
    is released afterwards, even when processing raises.

    Parameters:
    processor_class (class): The class of the processor (e.g., NEExtractor).
        It is instantiated without arguments.
    process_function_name (str): The name of the method to call on the processor.
    articles (list): List of articles to process.

    Returns:
    list: Whatever the processor method returns for ``articles``.

    Raises:
    AttributeError: If the processor has no attribute ``process_function_name``.
    Exception: Anything raised by the processor is propagated after cleanup.
    """
    processor = processor_class()
    try:
        process_function = getattr(processor, process_function_name)
        print(f"Starting {process_function_name.replace('_', ' ').capitalize()} on all articles")
        with torch.no_grad():  # Inference only: do not record autograd state
            return process_function(articles)
    finally:
        # Drop every local reference to the processor (the bound method holds
        # one as well) BEFORE collecting. Passing the object to a helper would
        # keep it alive through this frame until after the cache is emptied.
        process_function = None
        processor = None
        gc.collect()
        torch.cuda.empty_cache()

In [6]:
def upload_articles(articles, token):
    """
    Post every article to the database, one ``post_content`` call per article.

    Parameters:
    articles (list): Articles to upload, sent in list order.
    token (str): Authentication token handed to the DataUploader.

    Returns:
    list: One ``post_content`` result per article, in the same order.
    """
    uploader = DataUploader(token)
    results = []
    for position, item in enumerate(articles, start=1):
        # Progress line so a long upload shows where it currently is
        print(f"Uploading article {position} of {len(articles)}")
        results.append(uploader.post_content(item))
    return results

In [7]:
def remove_text_fields(articles, fields):
    """
    Strip the given keys from every article, modifying the articles in place.

    Keys that an article does not contain are silently skipped.

    Parameters:
    articles (list): Articles (dict-like) to strip.
    fields (list): Keys to delete from each article.

    Returns:
    list: The same ``articles`` object that was passed in, after stripping.
    """
    for record in articles:
        for key in fields:
            if key in record:
                del record[key]
    return articles

In [9]:
# Continue with the loaded articles under a shorter name. This binds a second
# name to the same object (no copy is made), so in-place changes made through
# `articles` are also visible through `this_runs_articles`.
articles = this_runs_articles

In [10]:
# Run NEExtractor.extract_entities over all articles and keep the result.
# NOTE(review): the recorded run of this cell ended in a CUDA OutOfMemoryError;
# the error message shows other processes already holding most of the GPU's
# memory, so make sure the GPU is free before re-running.
articles = process_articles(NEExtractor, 'extract_entities', articles)
#delete_and_free_memory(articles)

2024-08-21 06:21:12,437 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, B-PER, E-PER, S-LOC, B-MISC, I-MISC, E-MISC, S-PER, B-ORG, E-ORG, S-ORG, I-ORG, B-LOC, E-LOC, S-MISC, I-PER, I-LOC, <START>, <STOP>
Tagger instantiated successfully
Starting Extract entities on all articles


OutOfMemoryError: CUDA out of memory. Tried to allocate 658.00 MiB. GPU 0 has a total capacity of 23.68 GiB of which 324.69 MiB is free. Process 2212 has 4.50 GiB memory in use. Process 948357 has 18.80 GiB memory in use. Of the allocated memory 17.52 GiB is allocated by PyTorch, and 1010.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def print_vram_usage():
    """Print the GPU memory currently allocated and reserved by PyTorch."""
    # Defined here because no other cell of this notebook provides it; without
    # it the calls below raise NameError on a fresh kernel.
    if not torch.cuda.is_available():
        print("CUDA is not available; no VRAM usage to report")
        return
    allocated_mib = torch.cuda.memory_allocated() / 1024 ** 2
    reserved_mib = torch.cuda.memory_reserved() / 1024 ** 2
    print(f"VRAM allocated: {allocated_mib:.2f} MiB | reserved: {reserved_mib:.2f} MiB")


# Process articles using the remaining processors
articles = process_articles(TopicExtractor, 'extract_topics', articles)
articles = process_articles(Vectorizer, 'vectorize', articles)

# Remove main_text and lead_text from each article.
# The processed data lives in `articles`; the previously used name
# `sueddeutsche_articles_not_yet_in_db_list_of_dicts` is never defined in this
# notebook and raised NameError on a fresh kernel.
articles = remove_text_fields(articles, ['main_text', 'lead_text'])

# Upload each article to the database
keycloak_login = KeycloakLogin()
token = keycloak_login.return_token()
responses = upload_articles(articles, token)

# Save the responses to a json file
# NOTE(review): this assumes the values returned by post_content are
# JSON-serializable - confirm against DataUploader.
with open('responses.json', 'w', encoding='utf-8') as f:
    json.dump(responses, f)

# Free up memory after uploading and report VRAM usage before and after
print_vram_usage()
delete_and_free_memory(articles)
print_vram_usage()
