In [2]:
from database_handling.DataDownload import DataDownloader
from database_handling.DataHandleAndOtherHelpers import DataHandler
from database_handling.DataUpload import DataUploader
from database_handling.DataDelete import DataDeleter
from database_handling.KeycloakLogin import KeycloakLogin
from scrapers.SueddeutscheScraper import SueddeutscheScraper 
from text_analysis.NEExtractor import NEExtractor
from text_analysis.Summarizer import Summarizer
from text_analysis.TopicExtractor import TopicExtractor
from text_analysis.Vectorizers import Vectorizer

import transformers
import json
import gc
import torch

In [3]:
scraper = SueddeutscheScraper()
scraper.start_browser()
scraper.login()

In [4]:
scraper.driver.save_screenshot('screenshot.png')

True

In [5]:
this_runs_articles = scraper.scrape()
# TODO: (maybe) check and validate articles that are in the database right here, maybe even before accessing it, only asking the outlet server if there is something there. 
# That might 1. decrease computational/scraping time a little and 2. decrease our footprint on the target server, if we don't scrape/fully access ~1000 articles every day but only the really news one
# which are usually a lot let (50-200)

In [5]:
# Open the file and load its content
#with open("spiegel_articles.json", 'r') as file:
#    this_runs_articles = json.load(file)

In [6]:
#Save the responses to a json file
with open('sz_articles.json', 'w') as f:
    json.dump(this_runs_articles, f)


In [7]:
len(this_runs_articles)

1097

In [8]:
this_runs_articles[919]['main_text'].count('Tag 888')

0

In [9]:
this_runs_articles[918]

{'url': 'https://www.sueddeutsche.de/muenchen/landkreismuenchen/ottobrunn-polizei-randalierer-radfahrer-trunkenheit-lux.Xbx87HSgD9fq3tU15JPxmB',
 'main_text': 'Ein Polizist ist in der Nacht zum Mittwoch bei der Kontrolle eines 28-jährigen angetrunkenen, randalierenden Radfahrers in Ottobrunn schwerer verletzt worden. Drei Kollegen zogen sich leichte Verletzungen zu. Die Beamten hatten den Mann nach Angaben des Münchner Polizeipräsidiums zunächst gegen 22.45 Uhr angehalten und ihm ein verbotenes Einhandmesser abgenommen. Außerdem wiesen sie ihn darauf hin, dass er nicht mehr in der Lage sei, mit seinem Fahrrad zu fahren. Als er ihnen gegen 0 Uhr im Bereich der Kreuzung Mozartstraße und Altendorfstraße radelnd begegnete, eskalierte die Situation. Laut Polizeibericht wurde eine weitere Streife hinzugerufen und der 28-Jährige konnte erst nach minutenlanger Gegenwehr gefesselt werden. Er wurde unter anderem wegen Trunkenheit im Verkehr, Widerstands gegen Vollstreckungsbeamte und Beleidigung

In [10]:
# get the token for the database
keycloak_login = KeycloakLogin()
token = keycloak_login.return_token()

In [11]:
# Initialize the data downloader with the provided token
data_downloader = DataDownloader(token)

# Initialize variables for pagination
all_articles_in_db = []
page_size = 100  # Assuming the server returns 100 items per page by default
offset = 0
url = "https://www.sueddeutsche.de/"

# Loop until all pages are retrieved
while True:
    # Fetch articles from the current page
    articles_in_db = data_downloader.get_content(url=url, limit=page_size, offset=offset)
    
    # Check if articles were returned
    if not articles_in_db or 'items' not in articles_in_db:
        break
    
    # Add the current batch of articles to the complete list
    all_articles_in_db.extend(articles_in_db['items'])
    
    # Update the offset for the next page
    offset += page_size
    
    # Check if we have fetched all articles
    if len(all_articles_in_db) >= articles_in_db['count']:
        break

# The total number of articles retrieved
article_count = len(all_articles_in_db)

print(f"Total articles fetched: {article_count}")


Total articles fetched: 0


In [12]:
# instantiate the data handler
data_handler = DataHandler()

In [13]:
# for the articles that are already in the database, only update the last_verification_date
articles_for_last_verifcation_date_update = data_handler.find_scraped_articles_already_in_db(this_runs_articles, all_articles_in_db)
# safe the responses to the last verification date update
responses_to_last_verifcation_date_update = data_handler.patch_last_online_verification_date(token, articles_for_last_verifcation_date_update)
# get the articles that are not yet in the database
articles_not_yet_in_db = data_handler.find_scraped_articles_not_already_in_db(this_runs_articles, all_articles_in_db)
articles_not_yet_in_db_list_of_dicts = [article for article in this_runs_articles if article['url'] in articles_not_yet_in_db]

# Define the number of articles to process and upload per iteration
articles_per_iteration = 10

# Calculate the number of iterations
iterations = len(articles_not_yet_in_db_list_of_dicts) // articles_per_iteration + (len(articles_not_yet_in_db_list_of_dicts) % articles_per_iteration > 0)

responses = []

In [26]:
torch.cuda.empty_cache()
gc.collect()


0

In [23]:
del processor

NameError: name 'processor' is not defined

In [None]:
import torch
import gc
import json

def clear_gpu_memory():
    """Clears GPU memory and forces garbage collection."""
    torch.cuda.empty_cache()
    gc.collect()

def process_articles_in_batches(text_analysis_class, method_name, articles, batch_size):
    """Process articles in batches using the specified text analysis class and method."""
    processor = text_analysis_class()
    method = getattr(processor, method_name)
    
    for i in range(0, len(articles), batch_size):
        batch = articles[i:i + batch_size]
        batch = method(batch)
        
        # Reassign the processed batch back to the main list
        articles[i:i + batch_size] = batch

        del batch
        clear_gpu_memory()  # Clear memory after each batch
    
    del processor  # Delete the processor instance to free up GPU memory
    clear_gpu_memory()
    clear_gpu_memory()

# Define your batch size
batch_size = len(articles_not_yet_in_db_list_of_dicts)  # Adjust this based on your GPU capacity

# Process with Entity Extractor
print("Starting entity extraction...")
process_articles_in_batches(NEExtractor, 'extract_entities', articles_not_yet_in_db_list_of_dicts, batch_size)
print("Entity extraction completed.")

# Process with Topic Extractor
print("Starting topic extraction...")
process_articles_in_batches(TopicExtractor, 'extract_topics', articles_not_yet_in_db_list_of_dicts, batch_size)
print("Topic extraction completed.")

# Process with Vectorizer
print("Starting vectorization...")
process_articles_in_batches(Vectorizer, 'vectorize', articles_not_yet_in_db_list_of_dicts, batch_size)
print("Vectorization completed.")

# Remove main_text and lead_text from articles to save space before uploading
for article in articles_not_yet_in_db_list_of_dicts:
    article.pop('main_text', None)
    article.pop('lead_text', None)

# Upload each article to the database
print("Beginning article upload...")
responses = []
keycloak_login = KeycloakLogin()
token = keycloak_login.return_token()
data_uploader = DataUploader(token)

for article in articles_not_yet_in_db_list_of_dicts:
    response = data_uploader.post_content(article)
    responses.append(response)

# Save the responses to a JSON file
with open('responses.json', 'w') as f:
    json.dump(responses, f)

print("Article upload completed.")


Starting entity extraction...
2024-08-29 17:03:35,013 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, B-PER, E-PER, S-LOC, B-MISC, I-MISC, E-MISC, S-PER, B-ORG, E-ORG, S-ORG, I-ORG, B-LOC, E-LOC, S-MISC, I-PER, I-LOC, <START>, <STOP>
Tagger instantiated successfully


Map:   0%|          | 0/1097 [00:00<?, ? examples/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f70ba33df30>>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [None]:
#for i in range(iterations):
#    print("Processing articles", i*articles_per_iteration, "to", (i+1)*articles_per_iteration, "from", len(articles_not_yet_in_db_list_of_dicts))
#    # Get the articles for this iteration
#    articles = spiegel_articles_not_yet_in_db_list_of_dicts[i*articles_per_iteration:(i+1)*articles_per_iteration]#
#
#    print("Running text processing on articles", i*articles_per_iteration, "to", (i+1)*articles_per_iteration, "from", len(articles_not_yet_in_db_list_of_dicts))
    # Add the summaries, named entities, topics, and vectors to the articles dict
    #articles = summarizer.summarize(articles)
#    articles = entity_extractor.extract_entities(articles)
#    articles = topic_extractor.extract_topics(articles)
#    articles = vectorizer.vectorize(articles)

    # Remove main_text and lead_text from articles
#    for article in articles:
#        article.pop('main_text', None)
#        article.pop('lead_text', None)
        
#    print("Uploading articles", i*articles_per_iteration, "to", (i+1)*articles_per_iteration, "from", len(articles_not_yet_in_db_list_of_dicts))
    # Ensure that the token is still valid every n iterations
    # TODO: Tell Mario chuncking was done because I get a new token every 30 uploads to make sure the token is always valid
    # if we do that every 1 upload that takes much longer since it takes ~20 seconds to get a net token/ensure the token is valid#
#    keycloak_login = KeycloakLogin()
#    token = keycloak_login.return_token()
    
    # Loop over articles and put every article into the database
#    data_uploader = DataUploader(token)
    
#    for article in articles:
#        response = data_uploader.post_content(article)
#        responses.append(response)
    
#    print("Processed and uploaded articles", i*articles_per_iteration, "to", (i+1)*articles_per_iteration, "from", len(articles_not_yet_in_db_list_of_dicts))
    
        
    # save the responses to a json file
#    with open('responses.json', 'w') as f:
#        json.dump(responses, f)