# AGROVOC

In [52]:
#imports
import pandas as pd
import pickle
import warnings

from multiprocessing import Process
import concurrent.futures as cf
from tqdm import tqdm

from keyword_extraction import DictLU_Create_Dict

import rdflib
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD

In [2]:
#settings 
warnings.simplefilter(action='ignore', category=FutureWarning)

## get data from Agrovoc

In [3]:
#parse graph from RDF-file
def parse_graph(path,file_format):
    g = rdflib.Graph()
    g.parse(path, format= file_format)
    return g

with cf.ProcessPoolExecutor(max_workers=12) as executor:
    future = executor.submit(parse_graph, '/home/ubuntu/ullrich/my_code/data/agrovoc_2023-04-04_core.nt', 'nt11')
    result = future.result()

In [25]:
#extracte terms and synonyms from graph with sparql-query
def extract_terms(sparql_query, prefix1, namespace1, prefix2, namespace2):
    terms = pd.DataFrame()
    synonyms = pd.DataFrame()
    res = result.query(sparql_query, initNs={prefix1: namespace1, prefix2:namespace2})
    for x in res:
        terms = terms.append({'id': x.concept, 'term': x.dePrefLabel},ignore_index=True)
        synonyms = synonyms.append({'id': x.concept, 'term': x.deAltLabel},ignore_index=True)
    return terms, synonyms

In [26]:
#define namespace
prefix_skosxl = 'skosxl'
SKOSXL = Namespace("http://www.w3.org/2008/05/skos-xl#")
prefix_skos = 'skos'
SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')

# formulate SPARQL-query
query = """
SELECT ?concept ?dePrefLabel ?deAltLabel
WHERE { 
  ?concept a skos:Concept .
  ?concept skosxl:prefLabel/skosxl:literalForm ?dePrefLabel .
  ?concept skosxl:altLabel/skosxl:literalForm ?deAltLabel .
  FILTER (lang(?dePrefLabel) = 'de' && lang(?deAltLabel) = 'de')
}
LIMIT 1000
"""
...
# start the process pool
with cf.ProcessPoolExecutor(max_workers=12) as executor:
    future_query = executor.submit(extract_terms, query, prefix_skosxl, SKOSXL, prefix_skos, SKOS)
    result_query = future_query.result()

## preprocess dataframes

In [35]:
#drop duplicates
terms = result_query[0].drop_duplicates()
synonyms = result_query[1].drop_duplicates()

In [36]:
#split column ID to get only the ID
def split_ID(df):
    df['id'] = df['id'].str.split('/')
    liste = [] 
    for x in df['id']:
        liste.append(x[len(df['id'][0])-1])
    df['id'] = liste

In [37]:
split_ID(terms)
split_ID(synonyms)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = df['id'].str.split('/')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = liste


# TO DO
- Bereinigung der Wörter - z.B. (Tier)
- mehrere Sprachen einbauen
- extract keyword Funktion mit mehreren Themengebieten
- progress bar ?

In [72]:
terms

Unnamed: 0,id,term
0,c_4788,Methode
1,c_2208,Entwurf
2,c_27596,Fortpflanzungskontrolle
3,c_432,Tierhaltungsanlage
5,c_1375,Kastration
...,...,...
995,c_13555,Putenküken
996,c_11373,Rauhfusshuhn
997,c_2352,Hund
998,c_3698,Jagdhund


In [73]:
synonyms

Unnamed: 0,id,term
0,c_4788,Arbeitsverfahren
1,c_2208,Design
2,c_27596,Kontrazeption (Tier)
3,c_432,Stallung
4,c_432,Unterbringung (Tier)
...,...,...
995,c_13555,Truthahnküken
996,c_11373,Auerhuhn
997,c_2352,Canis familiaris
998,c_3698,Hetzhund


In [74]:
lookuplist = pd.concat([terms,synonyms]).reset_index(drop=True)
print(f'    -> {len(lookuplist)} terms in total\n')

    -> 1711 terms in total



In [78]:
print(lookuplist['term'].where(lookuplist['id'] == 'c_15968'))

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1706    NaN
1707    NaN
1708    NaN
1709    NaN
1710    NaN
Name: term, Length: 1711, dtype: object


In [76]:
# create dictionary
DCC = DictLU_Create_Dict(lookuplist)
dicts_lower = DCC.dicts_lower
dicts_upper = DCC.dicts_upper

In [77]:
for x in dicts_upper:
    print(x)

{rdflib.term.Literal('ELISA', lang='de'): 'c_15968', rdflib.term.Literal('RFLP', lang='de'): 'c_34255', rdflib.term.Literal('PCR', lang='de'): 'c_34079', rdflib.term.Literal('RAPD', lang='de'): 'c_36341', rdflib.term.Literal('IPM', lang='de'): 'c_34030', rdflib.term.Literal('BLUP', lang='de'): 'c_16525', rdflib.term.Literal('AFLP', lang='de'): 'c_331160', rdflib.term.Literal('RUSITEC', lang='de'): 'c_35848', rdflib.term.Literal('CCM', lang='de'): 'c_16171', rdflib.term.Literal('SCP', lang='de'): 'c_7078', rdflib.term.Literal('IKT', lang='de'): 'c_9000083', rdflib.term.Literal('NPN', lang='de'): 'c_24055'}


In [53]:
with open('AGROVOC_dict_deutsch.p', 'wb') as handle:
    pickle.dump([dicts_lower,dicts_upper], handle)