# AGROVOC

In [1]:
#imports
import pandas as pd
import pickle
import warnings

from multiprocessing import Process
import concurrent.futures as cf
from tqdm import tqdm

from keyword_extraction import DictLU_Create_Dict

import rdflib
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD

In [2]:
#settings
# Ignore every FutureWarning raised from here on, regardless of which library emits it.
# NOTE(review): this also hides deprecation notices, so an API removal only shows up
# as a hard error after a library upgrade - consider narrowing the filter.
warnings.simplefilter(action='ignore', category=FutureWarning)

## Get data from AGROVOC

In [None]:
#parse graph from RDF-file
def parse_graph(path, file_format):
    """Read the RDF file at ``path`` into a new rdflib graph.

    ``file_format`` is forwarded unchanged as the ``format`` argument of
    ``Graph.parse``. The populated graph is returned.
    """
    graph = rdflib.Graph()
    graph.parse(path, format=file_format)
    return graph

# Run parse_graph in a worker process and block until the parsed graph is returned
# to the notebook as `g`.
# NOTE(review): only one task is submitted, so max_workers=12 looks larger than needed - confirm.
# NOTE(review): the input is an absolute path on one machine; consider a configurable data directory.
with cf.ProcessPoolExecutor(max_workers=12) as executor:
    future = executor.submit(parse_graph, '/home/ubuntu/ullrich/my_code/data/agrovoc_2023-04-04_core.nt', 'nt11')
    g = future.result()

In [None]:
#save graph as pickle-file for faster loading
# Written below data/ so that the load cell, which reads 'data/AGROVOC_graph.p',
# picks up exactly the file produced here instead of a stale copy elsewhere.
with open('data/AGROVOC_graph.p', 'wb') as handle:
    pickle.dump(g, handle)

In [3]:
#load pickle file
# Restores the graph into the notebook-level name `g`.
# NOTE: pickle.load can execute arbitrary code while unpickling - only load a file
# that was produced by a trusted run of this notebook.
with open('data/AGROVOC_graph.p', 'rb') as handle:
    g = pickle.load(handle)

In [5]:
#extract terms and synonyms from the graph with a SPARQL query
def extract_terms(sparql_query, prefix1, namespace1, prefix2, namespace2, graph=None):
    """Run a SPARQL query and split every result row into a term and a synonym record.

    Parameters
    ----------
    sparql_query : str
        SELECT query whose result rows expose the attributes ``concept``,
        ``PrefLabel``, ``AltLabel``, ``lang`` and ``lang2``.
    prefix1, prefix2 : str
        Prefixes used inside the query.
    namespace1, namespace2
        Namespaces bound to ``prefix1`` / ``prefix2``; passed to the query as ``initNs``.
    graph : optional
        Object providing ``query(sparql, initNs=...)``. When omitted, the
        notebook-level graph ``g`` is used, as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(terms, synonyms)``. Both frames always have the columns ``id``, ``term``
        and ``lang`` (also when the query returns no rows). ``terms`` is built from
        ``PrefLabel``/``lang``, ``synonyms`` from ``AltLabel``/``lang2``.
    """
    columns = ['id', 'term', 'lang']
    progress_every = 100  # print a progress line after this many rows

    source = g if graph is None else graph
    res = source.query(sparql_query, initNs={prefix1: namespace1, prefix2: namespace2})

    # Collect plain records and build each frame once at the end: DataFrame.append
    # no longer exists in pandas >= 2.0 and copied the whole frame on every row.
    term_rows = []
    synonym_rows = []
    for i, x in enumerate(res, 1):
        term_rows.append({'id': x.concept, 'term': str(x.PrefLabel), 'lang': x.lang})
        synonym_rows.append({'id': x.concept, 'term': str(x.AltLabel), 'lang': x.lang2})

        if i % progress_every == 0:
            print(f"{progress_every} neue Elemente hinzugefügt. Gesamt: {i}")

    terms = pd.DataFrame(term_rows, columns=columns)
    synonyms = pd.DataFrame(synonym_rows, columns=columns)
    return terms, synonyms

In [6]:
#define namespace
# Prefix / namespace pairs for the SPARQL query; they are handed to extract_terms,
# which binds them through the query's initNs mapping.
prefix_skosxl = 'skosxl'
SKOSXL = Namespace("http://www.w3.org/2008/05/skos-xl#")
prefix_skos = 'skos'
# NOTE: this rebinds the name SKOS that the import cell took from rdflib.namespace.
SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')

# formulate SPARQL-query
# One row per (concept, preferred label, alternative label) combination in which both
# labels carry the same language: German, French or English.
# Both label patterns are mandatory (no OPTIONAL), so a concept without an alternative
# label produces no row.
# NOTE(review): LIMIT 1000 caps the result - presumably a development setting; confirm
# before a full extraction.
query = """
SELECT ?concept ?PrefLabel ?AltLabel (lang(?PrefLabel) as ?lang) (lang(?AltLabel) as ?lang2)
WHERE { 
  ?concept a skos:Concept .
  ?concept skosxl:prefLabel/skosxl:literalForm ?PrefLabel .
  ?concept skosxl:altLabel/skosxl:literalForm ?AltLabel .
  FILTER (
    ((langMatches(lang(?PrefLabel), "de") && langMatches(lang(?AltLabel), "de")) ||
    (langMatches(lang(?PrefLabel), "fr") && langMatches(lang(?AltLabel), "fr")) ||
    (langMatches(lang(?PrefLabel), "en") && langMatches(lang(?AltLabel), "en")))
  )
}
LIMIT 1000
"""
...
# start the process pool
# Runs extract_terms in a worker process and blocks until the (terms, synonyms) pair
# is available as `result_query`.
# extract_terms reads the notebook-level graph `g`, so the worker must be able to see it.
# NOTE(review): presumably relies on a fork-style process start - confirm on the target platform.
with cf.ProcessPoolExecutor(max_workers=12) as executor:
    future_query = executor.submit(extract_terms, query, prefix_skosxl, SKOSXL, prefix_skos, SKOS)
    result_query = future_query.result()

100 neue Elemente hinzugefügt. Gesamt: 100
100 neue Elemente hinzugefügt. Gesamt: 200
100 neue Elemente hinzugefügt. Gesamt: 300
100 neue Elemente hinzugefügt. Gesamt: 400
100 neue Elemente hinzugefügt. Gesamt: 500
100 neue Elemente hinzugefügt. Gesamt: 600
100 neue Elemente hinzugefügt. Gesamt: 700
100 neue Elemente hinzugefügt. Gesamt: 800
100 neue Elemente hinzugefügt. Gesamt: 900
100 neue Elemente hinzugefügt. Gesamt: 1000


## preprocess dataframes and create dictionary

In [None]:
#split column ID to get only the ID
def split_ID(df, position=5):
    """Replace each value in ``df['id']`` by one '/'-separated segment of itself, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``id``. The column is overwritten; nothing is returned.
    position : int, default 5
        Index of the segment to keep after splitting on '/'. For an id such as
        ``http://host/a/b/c_123`` index 5 is the trailing ``c_123``.

    Raises
    ------
    IndexError
        If a non-missing id has no segment at ``position``. The check runs before
        the column is modified, so ``df`` is left untouched in that case.
    """
    if df.empty:
        return  # nothing to split; also covers frames without an 'id' column

    segments = df['id'].str.split('/')
    # .str[...] on a column of lists yields NaN where the list is too short
    local_ids = segments.str[position]

    malformed = local_ids.isna() & df['id'].notna()
    if malformed.any():
        raise IndexError(
            f"{int(malformed.sum())} id value(s) have no segment at position "
            f"{position} after splitting on '/'"
        )

    df['id'] = local_ids

In [None]:
def process_df(result, languages=('de', 'en', 'fr'), out_dir='data/pickle/AGROVOC'):
    """Build and pickle one lookup dictionary per language.

    Parameters
    ----------
    result : sequence
        ``result[0]`` is the terms frame and ``result[1]`` the synonyms frame; both
        need the columns ``id`` and ``lang``.
    languages : iterable of str, default ('de', 'en', 'fr')
        Language codes to process. A row belongs to a language when its ``lang``
        value contains the code as a plain substring.
    out_dir : str, default 'data/pickle/AGROVOC'
        Directory for the output files; created if it does not exist.

    For every language the matching terms and synonyms are de-duplicated, their ids
    are shortened with ``split_ID``, both frames are concatenated and passed to
    ``DictLU_Create_Dict``. The resulting ``dicts_lower`` and ``dicts_upper``
    attributes are pickled as a two-element list to
    ``<out_dir>/AGROVOC_dict_<language>.p``. Nothing is returned.
    """
    import os  # local import so the notebook's import cell does not need to change

    all_terms, all_synonyms = result[0], result[1]

    # Create the target directory up front so a missing folder cannot abort the run
    # after the dictionaries have already been computed.
    os.makedirs(out_dir, exist_ok=True)

    for x in languages:
        #filter for language (literal substring match, not a regular expression)
        term_mask = all_terms['lang'].str.contains(x, regex=False)
        synonym_mask = all_synonyms['lang'].str.contains(x, regex=False)
        #drop duplicates; explicit copies because split_ID overwrites the id column in place
        terms = all_terms[term_mask].drop_duplicates().copy()
        synonyms = all_synonyms[synonym_mask].drop_duplicates().copy()
        #split the ids
        split_ID(terms)
        split_ID(synonyms)
        #concat list of terms and synonyms
        lookuplist = pd.concat([terms, synonyms]).reset_index(drop=True)
        print(f'    -> {len(lookuplist)} terms in total\n')
        # create dictionary
        DCC = DictLU_Create_Dict(lookuplist)
        dicts_lower = DCC.dicts_lower
        dicts_upper = DCC.dicts_upper
        # save AGROVOC dictionary as pickle
        target = os.path.join(out_dir, 'AGROVOC_dict_' + x + '.p')
        with open(target, 'wb') as handle:
            pickle.dump([dicts_lower, dicts_upper], handle)


In [None]:
# Build and pickle the per-language dictionaries from the (terms, synonyms) pair
# that the query cell stored in `result_query`.
process_df(result_query)

# TO DO
- progress bar ?
- IDs