In [None]:
import re
import tomllib
import os
import time
import spacy
import xml.etree.ElementTree as ET
from elasticsearch import Elasticsearch
from pathlib import Path
from datetime import datetime
from transformers import BartTokenizer, BartForConditionalGeneration

In [None]:
# Elastic connection
# Connection settings and miner options are read from ./config/elastic.toml
# (relative to the notebook's working directory).
with open(Path("./config/elastic.toml"), "rb") as f:
    config = tomllib.load(f)

# When 'disable_security' is set in the config, certificate verification and
# the SSL warnings are both switched off.
disable_security = config.get('disable_security', False)
es = Elasticsearch(
    config['instance'],
    basic_auth=(config['username'], config['password']),
    verify_certs=not disable_security,
    ssl_show_warn=not disable_security
)

# Fail fast: stop the notebook here when the cluster cannot be reached.
# NOTE(review): the client object is presumably always truthy, so the first
# guard likely never fires -- confirm before relying on it.
if not es:
    raise RuntimeError("Could not configure Elasticsearch instance")
if not es.ping():
    raise RuntimeError("Elasticsearch instance not available")

# Miner Configuration
index_pre = config['pre_index']                            # index the pages are read from
batch_size = config['batch_size']                          # used for the batch arithmetic of the mining loop
research_project_index = config['research_project_index']  # index for the mined research projects

# create index if not exist
# NOTE(review): newer elasticsearch-py releases may prefer
# es.options(ignore_status=400) over the per-call `ignore` argument -- confirm
# against the installed client version.
es.indices.create(index=research_project_index, ignore=400) # ignore 400 Index Already Exists exception

In [None]:
# Bart summarization
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint;
# both are used as module-level globals by the summarization helpers below.
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# NOTE(review): presumably a workaround for "duplicate OpenMP runtime" errors.
# It is set after the model has already been loaded -- confirm that it still
# takes effect at this point.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

def preprocess_text(text):
    """Tokenize a single text for the summarization model.

    The text is wrapped in a one-element batch and encoded with the
    module-level ``tokenizer``; input longer than 1024 tokens is truncated.

    Args:
        text: The text to encode.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for the one-element
        batch (requested with ``return_tensors='pt'``).
    """
    encoding_options = {
        'max_length': 1024,
        'truncation': True,
        'return_tensors': 'pt',
    }
    single_item_batch = [text]
    return tokenizer.batch_encode_plus(single_item_batch, **encoding_options)

def generate_summary(inputs):
    """Generate a summary string from already-encoded inputs.

    Runs beam search on the module-level ``model`` and decodes the first
    returned sequence with the module-level ``tokenizer``.

    Args:
        inputs: Mapping providing ``'input_ids'`` and ``'attention_mask'``
            (as produced by ``preprocess_text``).

    Returns:
        The decoded first sequence with special tokens stripped.
    """
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    generation_options = dict(
        attention_mask=attention_mask,
        num_beams=4,
        length_penalty=2.0,
        max_length=1024,
        min_length=200,
        no_repeat_ngram_size=3,
    )
    generated_sequences = model.generate(input_ids, **generation_options)

    # Only the first generated sequence is used.
    first_sequence = generated_sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def summarize(input_text):
    """Summarize a text and report how long it took.

    Encodes the text with ``preprocess_text``, generates the summary with
    ``generate_summary`` and prints the elapsed time in seconds.

    Args:
        input_text: The text to summarize.

    Returns:
        The summary returned by ``generate_summary``.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way a time.time() difference can.
    started_at = time.perf_counter()

    inputs = preprocess_text(input_text)
    summary = generate_summary(inputs)

    elapsed_seconds = time.perf_counter() - started_at
    print("Summary time elapsed: ", elapsed_seconds)

    return summary

In [None]:
# People Recognition
# One spaCy pipeline per language, loaded once. The mining loop uses
# english_nlp for documents whose language is "en" and german_nlp for all
# other documents.
english_nlp = spacy.load("en_core_web_lg")
german_nlp = spacy.load("de_core_news_lg")

def extract_people_names(text, nlp):
    """Extract person-name tokens from a text.

    Runs ``nlp`` over the text, keeps the entities labelled ``PERSON`` or
    ``PER`` and passes their texts through ``are_proper_nouns``.

    Args:
        text: The text to analyse.
        nlp: Callable pipeline returning a document with an ``ents`` attribute.

    Returns:
        The result of ``are_proper_nouns`` for the collected entity texts.
    """
    person_labels = ("PERSON", "PER")
    candidate_names = [
        ent.text
        for ent in nlp(text).ents
        if ent.label_ in person_labels
    ]
    return are_proper_nouns(candidate_names, nlp)

def are_proper_nouns(words, nlp):
    """Keep only the longer proper-noun tokens of the given words.

    The words are joined with single spaces, parsed again with ``nlp``, and
    every token tagged ``PROPN`` whose text is longer than four characters is
    kept.

    Args:
        words: Iterable of strings to check.
        nlp: Callable pipeline returning an iterable of tokens exposing
            ``pos_`` and ``text``.

    Returns:
        A set with the texts of the retained tokens.
    """
    joined_words = " ".join(words)
    return {
        token.text
        for token in nlp(joined_words)
        if token.pos_ == 'PROPN' and len(token.text) > 4
    }

In [None]:
# Extract Preprocessing data page from Elastic

def reformat_content(txt):
    """Strip URLs and PDF file names from a page's text content.

    Steps, in order:
      1. Put three spaces in front of every "http" and "mailto" and replace
         every "](/" with a single space.
      2. Remove everything matching the URL pattern.
      3. Remove everything that looks like a ``*.pdf`` file name
         (case-insensitive).

    Args:
        txt: The raw text content.

    Returns:
        The cleaned text.
    """
    substitutions = (
        ("http", "   http"),
        ("mailto", "   mailto"),
        ("](/", " "),
    )
    for needle, replacement in substitutions:
        txt = txt.replace(needle, replacement)

    # Drop URLs.
    without_urls = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '',
        without_urls if False else txt,
    )

    # Drop PDF file names.
    return re.sub(r'\b[\w-]+\.pdf\b', '', without_urls, flags=re.IGNORECASE)

def extract_data_from_index(elastic, name_input, query, last_sort_index):
    """Fetch one page of documents and yield them in a reduced form.

    The search is sorted by ``timestamp`` ascending. When ``last_sort_index``
    equals 0 the first page is requested; otherwise the value is passed as
    ``search_after`` to continue behind the previous page.

    Args:
        elastic: Client object exposing ``search``.
        name_input: Name of the index to search.
        query: Search body passed to the client unchanged.
        last_sort_index: 0 for the first page, else the sort value of the last
            hit that was already seen.

    Yields:
        ``(extracted_data, hit)`` tuples: a dict with the selected source
        fields (``content`` cleaned by ``reformat_content``) and the raw hit.
    """
    search_kwargs = {
        'index': name_input,
        'body': query,
        'sort': [{"timestamp":"asc"}],
    }
    is_first_page = last_sort_index == 0
    if not is_first_page:
        search_kwargs['search_after'] = last_sort_index

    response = elastic.search(**search_kwargs)

    for hit in response['hits']['hits']:
        source = hit['_source']
        yield {
            'url': source['url'],
            'title': source['title'],
            'content': reformat_content(source['content']),# content_txt
            'language': source['language'],
            'extracted_from': source['extracted_from'],
            'extracted_xml': source['extracted_xml'],
            #'person_names': source['person_names'],
        }, hit

In [None]:
# Helpers

def contains_keyword(keywords, text):
    """Tell whether any keyword starts a word somewhere in the text.

    A keyword matches case-insensitively when it appears at a word boundary;
    whatever follows it is irrelevant, so "prof" matches "Professor".

    The pattern used to end in ``\\w*\\b``. For keywords ending in a word
    character that suffix can always be satisfied and is therefore redundant,
    but for keywords ending in punctuation (e.g. "dr.") it demanded a word
    character right after the keyword, so "Dr. Smith" never matched. The
    suffix is dropped to fix that case without changing the others.

    Args:
        keywords: Iterable of literal keywords (escaped before matching).
        text: The text to search.

    Returns:
        True if at least one keyword is found, otherwise False.
    """
    for keyword in keywords:
        pattern = r"\b" + re.escape(keyword)
        if re.search(pattern, text, flags=re.IGNORECASE):
            return True

    return False

def should_be_ignored(data):
    """Decide whether a page should be skipped by the miner.

    A page is ignored when one of the keywords below is found (see
    ``contains_keyword``) in its URL or in the text of an h1/h2 heading of its
    extracted XML.

    Args:
        data: Mapping with at least ``'url'`` and ``'extracted_xml'`` (an XML
            string).

    Returns:
        True if the page should be ignored, otherwise False.

    Raises:
        xml.etree.ElementTree.ParseError: If ``extracted_xml`` is not
            well-formed XML.
    """
    # 1 - First ignore publications

    # List of keywords to check
    keywords = ["pdf", "prof", "download", "publication", "publikation", "archiv", "promov", "team", "talks",
                "vortraege", "associate",
                "meldungen", "office", "secretary",
                "student", "hilfskraft", "assistenz", "mitarbeiter", "angebote", "forum", "elnrw", "termin",
                "neuigkeit", "arbeitsgruppe", "dr.", "research group", "working group", "theme", "coop", "koop"]

    # In - URL
    if contains_keyword(keywords, data['url']):
        return True

    # In - XML content: all h1 headings first, then all h2 headings
    root = ET.fromstring(data['extracted_xml'])
    headings = root.findall(".//head[@rend='h1']") + root.findall(".//head[@rend='h2']")

    for heading in headings:
        heading_text = heading.text
        # A heading without direct text (e.g. one that only wraps a child
        # element) has text None; skip it instead of failing on .lower().
        if heading_text is None:
            continue
        if contains_keyword(keywords, heading_text.lower()):
            return True

    return False

In [None]:
# Mining logic
# Select documents with *project*, *projekt*, *research* or *forschung* in
# their title or in their url: one regexp clause per (field, term) pair.
#
# Disabled clauses, kept for reference -- they would also retrieve documents
# that do not necessarily have project in their url or title:
#   { "regexp": { "content_xml": ".*projektbeschreibung.*" } }, { "regexp": { "content_xml": ".*projektleitung.*" } },
#   { "regexp": { "content_xml": ".*projektkoordination.*" } }, { "regexp": { "content_xml": ".*projektförderung.*" } },
#   { "regexp": { "content_xml": ".*projektlaufzeit.*" } }, { "regexp": { "content_xml": ".*Projektbearbeiter.*" } },
#   { "regexp": { "content_xml": ".*project description.*" } }, { "regexp": { "content_xml": ".*project management.*" } },
#   { "regexp": { "content_xml": ".*project coordination.*" } }, { "regexp": { "content_xml": ".*project funding.*" } },
#   { "regexp": { "content_xml": ".*project duration.*" } }
search_query = {
    "query": {
        "bool": {
            "should": [
                {"regexp": {field: ".*" + term + ".*"}}
                for field in ("title", "url")
                for term in ("project", "projekt", "research", "forschung")
            ],
        }
    },
    "fields": ["title", "url", "content_xml"],
}

# Get the number of documents in the index
num_documents = es.count(index=index_pre)['count']
print("num_documents:",num_documents)

# Calculate the number of batches. This is an upper bound: num_documents counts
# every document of the index, not only those matching search_query.
num_batches = (num_documents + batch_size - 1) // batch_size
print("num_batches:",num_batches)

# Request batch_size hits per page so the page size agrees with the batch
# arithmetic above. Without an explicit size the search returns the server's
# default number of hits per page, which would not match num_batches.
paged_query = {**search_query, "size": batch_size}

# Last Sort: 0 requests the first page, afterwards it holds the sort value of
# the last hit seen and is used to continue behind it.
last_sort = 0

# Only for test: limit definition (number of documents to process)
index = 1
limit = 100
break_outer_loop = False

ignored_page = 0
considered_page = 0
for batch_number in range(num_batches):
    print("last_sort", last_sort)

    page_had_hits = False
    for data, raw_data  in extract_data_from_index(es, index_pre, paged_query, last_sort):
        page_had_hits = True
        last_sort = raw_data['sort']
        print("document_number --> ", index)
        print("URL:", data['url'])

        if should_be_ignored(data):
            #print("Filtering - Ignore this page: -- ", data['url'])
            #print("nothing")
            ignored_page += 1
        else:
            # English pipeline for English pages, German pipeline for everything else
            if data['language'] == "en":
                nlp = english_nlp
            else:
                nlp = german_nlp

            # Extract people
            people = list(extract_people_names(data['content'], nlp))

            research_data = {
                'id': data['extracted_from'] + '_' + raw_data['_id'],
                'url': data['url'],
                'title': data['title'],
                'language': data['language'],
                'type': 'research projects',
                'last_update': datetime.utcnow().isoformat(timespec='milliseconds') + "Z",
                'extracted_from': data['extracted_from'],
                'people': people,
                #'summary': summarize(data['content'])
            }
            considered_page += 1
            #print("Filtering - consider this page: --  -- ", data['url'])
            #print("Summary:", research_data['summary'])
            #es.index(index=research_project_index, body=research_data)
        index += 1
        # index counts from 1 and was just advanced past the processed document,
        # so stop once it exceeds limit: exactly `limit` documents are processed
        # (the previous `index == limit` check stopped one document early).
        if index > limit:
            break_outer_loop = True
            break
    # An empty page means the result set is exhausted; further requests would
    # only repeat the same empty search.
    if break_outer_loop or not page_had_hits:
        break

print("IGNORED:", ignored_page)
print("CONSIDERED:", considered_page)

"""
Prüfung von 100 Seiten (100 ersten Preprocessing Dokumenten nach Timestamp asc)

=== Script Ergebnisse ===
Keine Projektseiten: 57
Projektseiten: 43


=== Meine Ergebnisse ===
Keine Projektseiten: 63
Projektseiten: 37

Es gibt einige Seiten, die einen Titel und eine Url haben, die dem Titel oder URL eines Projekts entsprechen, aber entweder sind sie oft leer, enthalten keinen Projekttext oder sind nur eine Liste von mehreren Projekten, die ich als Mensch nicht als Projektseiten betrachte. Das erklärt den Unterschied.

Es ist möglich, dass wir in größeren Stichproben auch mehrere False Positives (falsch-positive Treffer) haben. Es gibt noch einige Verbesserungen, die wir vornehmen müssen.
"""