# General Purpose and DB dump datasets

The main purpose of this notebook is to analyze the presence, in the ACORDAR collection, of general-purpose datasets (Section 1) and database dumps (Section 2).

In [1]:
import os

In [2]:
# Common code for mapping query_id -> query text.

map_query_text = dict()

# NOTE: '__file__' is a quoted string here (not the module attribute), so realpath()
# resolves it against the current working directory and scriptDir is that directory.
scriptDir = os.path.dirname(os.path.realpath('__file__'))

# Columns used (tab-separated): 0 = query id, 1 = query text.
# The context manager releases the file handle even if a line fails to parse.
with open(os.path.join(scriptDir, "../../files/all_queries.txt"), "r") as queries_file:
    for line in queries_file:
        # Tolerate blank lines (e.g. a stray newline at the end of the file)
        # instead of crashing in int().
        if not line.strip():
            continue

        split = line.split("\t")
        query_id = int(split[0])
        query_text = split[1].strip()

        map_query_text[query_id] = query_text

In [3]:
# Common code for mapping qrels: dataset_id -> query_id -> relevance.

qrels = dict()

# Columns used (tab-separated): 0 = query id, 2 = dataset id, 3 = relevance score.
# The context manager releases the file handle even if a line fails to parse.
with open(os.path.join(scriptDir, "../../files/qrels.txt"), "r") as qrels_file:
    for line in qrels_file:
        # Tolerate blank lines (e.g. a stray newline at the end of the file)
        # instead of crashing in int().
        if not line.strip():
            continue

        split = line.split("\t")
        query_id = int(split[0])
        dataset_id = int(split[2])
        score = int(split[3])

        # Create the per-dataset dict on first sight, then record the judgment.
        qrels.setdefault(dataset_id, dict())[query_id] = score


## General Purpose datasets

These datasets are particularly interesting to analyze because they contain a lot of information from different fields (they are big knowledge-base graphs), so they can be relevant to multiple queries rather than to one in particular.

In [4]:
# Let's analyze how many files refer to DBpedia, FOAF or other big ontologies.
# These are the name fragments searched for in the dataset file names.

GENERAL_PURPOSE_ONTOLOGY = [
    "dbpedia",
    "freebase",
    "foaf",
    "rdfs",
    "wikidata",
]

In [5]:


# Root folder scanned for datasets; every sub-directory is treated as one dataset.
datasets_directory_path = "/media/manuel/Tesi/Datasets"

# file name -> list of (dataset DirEntry, file size in MiB rounded to 3 decimals)
general_purpose_ontology_datasets = dict()

# The context managers close the scandir iterators deterministically.
with os.scandir(datasets_directory_path) as dataset_entries:
    for dataset in dataset_entries:
        # A stray regular file next to the dataset folders would make the
        # inner scandir raise NotADirectoryError.
        if not dataset.is_dir():
            continue

        with os.scandir(dataset) as file_entries:
            for file in file_entries:
                # Only the part of the name before the first dot is matched.
                file_name = file.name.split(".")[0]

                # any() records the file once even when several ontology names match.
                if not any(ontology_name in file_name for ontology_name in GENERAL_PURPOSE_ONTOLOGY):
                    continue

                size_mib = round(os.path.getsize(file.path) / (1024**2), 3)
                general_purpose_ontology_datasets.setdefault(file.name, list()).append((dataset, size_mib))

In [6]:
# Display the whole mapping: file name -> list of (dataset DirEntry, file size in MiB)
general_purpose_ontology_datasets

{'foaf.ttl': [(<DirEntry 'dataset-16045'>, 0.052),
  (<DirEntry 'dataset-16068'>, 0.052),
  (<DirEntry 'dataset-16053'>, 0.052),
  (<DirEntry 'dataset-16038'>, 0.052),
  (<DirEntry 'dataset-16062'>, 0.052),
  (<DirEntry 'dataset-16046'>, 0.052),
  (<DirEntry 'dataset-16058'>, 0.052),
  (<DirEntry 'dataset-16050'>, 0.052),
  (<DirEntry 'dataset-16051'>, 0.052),
  (<DirEntry 'dataset-16061'>, 0.052),
  (<DirEntry 'dataset-16042'>, 0.052),
  (<DirEntry 'dataset-16065'>, 0.052),
  (<DirEntry 'dataset-16034'>, 0.052),
  (<DirEntry 'dataset-16077'>, 0.052),
  (<DirEntry 'dataset-16039'>, 0.052),
  (<DirEntry 'dataset-16040'>, 0.052),
  (<DirEntry 'dataset-16070'>, 0.052),
  (<DirEntry 'dataset-16043'>, 0.052),
  (<DirEntry 'dataset-16075'>, 0.052),
  (<DirEntry 'dataset-16054'>, 0.052),
  (<DirEntry 'dataset-13043'>, 0.003),
  (<DirEntry 'dataset-16074'>, 0.052),
  (<DirEntry 'dataset-16059'>, 0.052),
  (<DirEntry 'dataset-16048'>, 0.052),
  (<DirEntry 'dataset-16063'>, 0.052),
  (<DirEntry 

## DBPedia and Foaf Analysis

From the previous analysis we can see that a lot of datasets contain the DBPedia and the Foaf ontologies. We are now going to analyze the datasets that contain these two ontologies, in order to see for which queries they are considered relevant and whether each ontology is a limited or a full version. This is important because we want to know whether the given ontology is a reduced version or not.


### DBPedia

In [7]:
# List every file whose name mentions DBpedia, together with the datasets that ship it.
for ontology_file, occurrences in general_purpose_ontology_datasets.items():
    if "dbpedia" not in ontology_file:
        continue
    print(f"{ontology_file} -> {occurrences}")

dbpedia-3.5.1.owl -> [(<DirEntry 'dataset-13461'>, 0.443)]
dbpedia-2014.owl -> [(<DirEntry 'dataset-14530'>, 2.265)]
mapping-eat-dbpedia.rdf -> [(<DirEntry 'dataset-14079'>, 0.531)]
dbpedia-3.6.owl -> [(<DirEntry 'dataset-13388'>, 0.504), (<DirEntry 'dataset-14336'>, 0.504), (<DirEntry 'dataset-13369'>, 0.504), (<DirEntry 'dataset-15414'>, 0.504)]
dbpedia-2016-04.owl -> [(<DirEntry 'dataset-14084'>, 2.32)]


In [8]:
# Check if all the dbpedia-3.6 owl files are identical.
import filecmp

# Full path of the dbpedia-3.6.owl copy inside every dataset that ships it.
dbpedia_36_paths = [
    os.path.join(dataset.path, "dbpedia-3.6.owl")
    for dataset, size in general_purpose_ontology_datasets["dbpedia-3.6.owl"]
]

# The first copy is the reference the other copies are compared against.
file1_path = dbpedia_36_paths[0]

# shallow=False forces a comparison of the file contents. The default (shallow=True)
# treats two files as equal when their os.stat() signatures (type, size, mtime) match,
# without reading them, which is not a proof that the contents are identical.
different = not all(
    filecmp.cmp(file1_path, file2_path, shallow=False)
    for file2_path in dbpedia_36_paths[1:]
)

if different:
    print("Are different")
else:
    print("Are identical")


Are identical


In [9]:
# Search the relevance judgment scores for the datasets that contain the DBpedia files.
# For every positive relevance score we also print the text of the query it refers to.

for general_purpose_ontology, datasets_list in general_purpose_ontology_datasets.items():
    if "dbpedia" not in general_purpose_ontology:
        continue

    print(general_purpose_ontology)
    for dataset_entry, _size in datasets_list:
        print(f"{dataset_entry.name}")
        # The numeric id is the part after the dash in the folder name.
        # Named dataset_id so the builtin id() is not shadowed at notebook scope.
        dataset_id = int(dataset_entry.name.split("-")[1])
        if dataset_id in qrels:
            for query, score in qrels[dataset_id].items():
                # Only positive judgments are reported here.
                if score > 0:
                    print(f"{query} : {map_query_text[query]} score: {score}")
        else:
            print("Not present in the qrels")
        print()

dbpedia-3.5.1.owl
dataset-13461
Not present in the qrels

dbpedia-2014.owl
dataset-14530
1056 : English and Spanish terminology score: 1
1070 : Spanish Terminology score: 1

mapping-eat-dbpedia.rdf
dataset-14079

dbpedia-3.6.owl
dataset-13388
Not present in the qrels

dataset-14336

dataset-13369

dataset-15414

dbpedia-2016-04.owl
dataset-14084
1167 : wikidata alignment score: 1



In [10]:
"""
dataset-13461: file di dbpedia su Berlino ma metadati generici, non nei qrels
dataset-14530: 
    query 1056	English and Spanish terminology
    query 1070	Spanish Terminology
    file su termini spagnoli ma file di dbpedia totalmente general purpose
dataset-14079:
    irrilevante nei qrels:
        91:Dog Maulings
        1050:response of guardian school quality
        45: What Backing Does the National Rifle Association Have?
    associazione tra termini, abbastanza fined grained
dataset-13388:
    no nei qrels
    dpbedia general purpose ma file scaricati sono su parigi
dataset-14336
    irrilevante, query 218: Greek, philosophy, stoicism
    dbpedia in greco, general purpose
dataset-13369
    irrilevante, query 218:Greek, philosophy, stoicism
    dbpedia in greco, general purpose
dataset-15414
    irrilevante, query 1027: Canada Core Subject Thesaurus, English, French
    dataset-13461: file di dbpedia su Parigi ma metadati genericiì
dataset-14084
    1167	wikidata alignment
    general purpose dbpedia + file su Berlino
"""

'\ndataset-13461: file di dbpedia su Berlino ma metadati generici, non nei qrels\ndataset-14530: \n    query 1056\tEnglish and Spanish terminology\n    query 1070\tSpanish Terminology\n    file su termini spagnoli ma file di dbpedia totalmente general purpose\ndataset-14079:\n    irrilevante nei qrels:\n        91:Dog Maulings\n        1050:response of guardian school quality\n        45: What Backing Does the National Rifle Association Have?\n    associazione tra termini, abbastanza fined grained\ndataset-13388:\n    no nei qrels\n    dpbedia general purpose ma file scaricati sono su parigi\ndataset-14336\n    irrilevante, query 218: Greek, philosophy, stoicism\n    dbpedia in greco, general purpose\ndataset-13369\n    irrilevante, query 218:Greek, philosophy, stoicism\n    dbpedia in greco, general purpose\ndataset-15414\n    irrilevante, query 1027: Canada Core Subject Thesaurus, English, French\n    dataset-13461: file di dbpedia su Parigi ma metadati genericiì\ndataset-14084\n    

### Foaf

In [11]:
# List every file whose name mentions FOAF, together with the datasets that ship it.
for ontology_file, occurrences in general_purpose_ontology_datasets.items():
    if "foaf" not in ontology_file:
        continue
    print(f"{ontology_file} -> {occurrences}")

foaf.ttl -> [(<DirEntry 'dataset-16045'>, 0.052), (<DirEntry 'dataset-16068'>, 0.052), (<DirEntry 'dataset-16053'>, 0.052), (<DirEntry 'dataset-16038'>, 0.052), (<DirEntry 'dataset-16062'>, 0.052), (<DirEntry 'dataset-16046'>, 0.052), (<DirEntry 'dataset-16058'>, 0.052), (<DirEntry 'dataset-16050'>, 0.052), (<DirEntry 'dataset-16051'>, 0.052), (<DirEntry 'dataset-16061'>, 0.052), (<DirEntry 'dataset-16042'>, 0.052), (<DirEntry 'dataset-16065'>, 0.052), (<DirEntry 'dataset-16034'>, 0.052), (<DirEntry 'dataset-16077'>, 0.052), (<DirEntry 'dataset-16039'>, 0.052), (<DirEntry 'dataset-16040'>, 0.052), (<DirEntry 'dataset-16070'>, 0.052), (<DirEntry 'dataset-16043'>, 0.052), (<DirEntry 'dataset-16075'>, 0.052), (<DirEntry 'dataset-16054'>, 0.052), (<DirEntry 'dataset-13043'>, 0.003), (<DirEntry 'dataset-16074'>, 0.052), (<DirEntry 'dataset-16059'>, 0.052), (<DirEntry 'dataset-16048'>, 0.052), (<DirEntry 'dataset-16063'>, 0.052), (<DirEntry 'dataset-16066'>, 0.052), (<DirEntry 'dataset-16047

In [12]:
# Search the relevance judgment scores for the datasets that contain the FOAF files.
# For every relevance score (zero included) we also print the text of the query it refers to.

for general_purpose_ontology, datasets_list in general_purpose_ontology_datasets.items():
    if "foaf" not in general_purpose_ontology:
        continue

    print(general_purpose_ontology)
    for dataset_entry, _size in datasets_list:
        print(f"{dataset_entry.name}")
        # The numeric id is the part after the dash in the folder name.
        # Named dataset_id so the builtin id() is not shadowed at notebook scope.
        dataset_id = int(dataset_entry.name.split("-")[1])
        if dataset_id in qrels:
            # No score filter here: every judgment is printed.
            for query, score in qrels[dataset_id].items():
                print(f"{query} : {map_query_text[query]} score: {score}")
        else:
            print("Not present in the qrels")
        print()

foaf.ttl
dataset-16045
1034 : British National Bibliography, book and its author, topic, name, language score: 0
1140 : drones use type score: 0
1125 : person name in National Library of Russia score: 0
9 : Mitsubishi Heavy Industries Ltd. score: 0

dataset-16068
1059 : Mathematics Subject Classification, concept, notation, homepage, description, creator score: 0

dataset-16053
14 : Companies Capable of Producing Document Management Systems score: 0

dataset-16038
1026 : resource type of IEEE datasets score: 0
1141 : title and description of social web content score: 0
1140 : drones use type score: 0
1125 : person name in National Library of Russia score: 0
1024 : financial resource data score: 0
1059 : Mathematics Subject Classification, concept, notation, homepage, description, creator score: 0
1093 : IEEE papers, data resource score: 0
1110 : Financial Literacy Resource provided by organization name score: 0

dataset-16062
1094 : txn.owl creator score: 0
119 : Violent Juvenile Crime

## Database Dump Datasets 

In the following section we are going to analyze the Database Dump datasets. These datasets are plain RDF files obtained by directly dumping relational databases or other tabular data repositories.

In [13]:
# Root folder scanned for datasets; every sub-directory is treated as one dataset.
datasets_directory_path = "/media/manuel/Tesi/Datasets"

# dump file name -> list of dataset DirEntry objects that contain a file with that name
dump_datasets = dict()

# The context managers close the scandir iterators deterministically.
with os.scandir(datasets_directory_path) as dataset_entries:
    for dataset in dataset_entries:
        # A stray regular file next to the dataset folders would make the
        # inner scandir raise NotADirectoryError.
        if not dataset.is_dir():
            continue

        with os.scandir(dataset) as file_entries:
            for file in file_entries:
                # Only the part of the name before the first dot is matched.
                file_name = file.name.split(".")[0]
                if "rows" in file_name:
                    dump_datasets.setdefault(file.name, list()).append(dataset)

In [14]:
# Which dump file names were found, and how many datasets contain a rows.rdf file.
print(dump_datasets.keys())
rows_rdf_datasets = dump_datasets["rows.rdf"]
print(len(rows_rdf_datasets))

dict_keys(['rows.rdf'])
10557


In [15]:
# Search the relevance judgment scores for the datasets that contain the dump files.
# For every positive relevance score we also print the text of the query it refers to.

for dataset in dump_datasets["rows.rdf"]:
    print(f"{dataset.name}")
    # The numeric id is the part after the dash in the folder name.
    # Named dataset_id so the builtin id() is not shadowed at notebook scope.
    dataset_id = int(dataset.name.split("-")[1])
    if dataset_id in qrels:
        for query, score in qrels[dataset_id].items():
            # Only positive judgments are reported here.
            if score > 0:
                print(f"{query} : {map_query_text[query]} score: {score}")
    print()

dataset-10476

dataset-1388

dataset-5286

dataset-5113
151 : Viral Hepatitis score: 2

dataset-19400

dataset-963

dataset-8436

dataset-992

dataset-1818

dataset-10743

dataset-11972

dataset-16389

dataset-18489

dataset-16325

dataset-5526

dataset-19586

dataset-1502

dataset-3048

dataset-3345

dataset-19022

dataset-5430

dataset-4249

dataset-5727

dataset-1380

dataset-10718

dataset-7405

dataset-2828

dataset-1542

dataset-17476

dataset-5312

dataset-5871

dataset-4444

dataset-2082

dataset-10557

dataset-1972

dataset-1356

dataset-16785

dataset-1301

dataset-16615

dataset-7596

dataset-17830

dataset-3687

dataset-7246

dataset-7145

dataset-8845

dataset-6729

dataset-18855

dataset-12492

dataset-6625

dataset-12409

dataset-8592

dataset-7203

dataset-10293

dataset-7205

dataset-18660

dataset-2500

dataset-11808
1146 : Alderman application requirements score: 1
1052 : energy star markets score: 2

dataset-935

dataset-2824

dataset-5690

dataset-6552

dataset-676