# Parsing

In this notebook we compute statistics about the collection after the parsing phase.

We are actually testing 3 parsing strategies:
* Standard Parsing (Jena + RDFLib + LightRDF)
* Labels v1 Parsing (only RDFLib)
* Labels v2 Parsing (only RDFLib)

Labels v1 and Labels v2 use the same parser (RDFLib), so we treat them as a single parsing strategy.

In [5]:
import os
import json
import pathlib
import tqdm 
from tqdm import tqdm
import json

# File extensions (leading dot included) treated as RDF files; a file is matched
# by comparing pathlib.Path(<file>).suffix against this list.
SUFFIXES = [
    ".rdf",
    ".rdfs",
    ".ttl",
    ".owl",
    ".n3",
    ".nt",
    ".jsonld",
    ".xml",
    ".ntriples",
    ".nq",
    ".trig",
    ".trix",
]

## Standard Parsing


After the standard parsing phase we want to know: 
* number of files parsed by Jena
* number of files parsed by RDFLib
* number of files parsed by LightRDF
* number of correctly parsed files
* number of not parsed files
* how many datasets are full, partial and empty

We will also output three JSON files listing the empty, partial and full datasets.

In [6]:
# Collection-wide totals, as counted after the manual file recovery
N_FILES = 28537         # total number of files in the collection
N_NOT_RDF_FILES = 61    # number of files that are not RDF

# Categories a dataset can be assigned to
EMPTY, PARTIAL, FULL = range(3)

def postParsing():
    """Compute the statistics of the standard parsing phase (Jena + RDFLib + LightRDF).

    Every entry of ``datasets_directory_path`` is expected to be a dataset folder
    containing a ``dataset_metadata.json`` file. Each dataset is assigned to
    exactly one category:

    * empty   -- ``download_info["downloaded"]`` is 0 (and differs from
      ``download_info["total_URLS"]``), or no file was parsed by any parser;
    * full    -- ``downloaded`` equals ``total_URLS`` and the number of parsed
      files equals the number of files whose extension is in ``SUFFIXES``;
    * partial -- every other dataset with at least one parsed file.

    Parsed files are the entries of the optional metadata lists
    ``mined_files_jena``, ``mined_files_rdflib`` and ``mined_files_lightrdf``.

    Side effects:
        * prints a progress line every 1000 datasets;
        * appends the summary to ``output/post_standard_parsing_stats.txt``;
        * overwrites the empty/partial/full dataset lists stored as JSON in
          ``output/lists/standard/`` (the folders are created when missing).

    Reads the module-level names ``datasets_directory_path``, ``SUFFIXES``,
    ``N_FILES``, ``EMPTY``, ``PARTIAL`` and ``FULL``.

    Raises:
        OSError: if a dataset folder or its metadata file cannot be read.
        KeyError: if the metadata has no ``download_info`` section.
    """
    # Make sure the output folders exist before the long scan, so that a missing
    # directory cannot make the run fail after all the work has been done.
    os.makedirs("output/lists/standard", exist_ok=True)

    full_datasets_list = list()
    partial_datasets_list = list()
    empty_datasets_list = list()
    parsable_jena = 0
    parsable_rdflib = 0
    parsable_lightrdf = 0
    total_parsed_files = 0

    for datasets, dataset in enumerate(os.scandir(datasets_directory_path)):

        if datasets % 1000 == 0:
            print("Checked: "+str(datasets))

        # read the dataset metadata file (the handle is released even if parsing fails)
        with open(dataset.path+"/dataset_metadata.json", "r", encoding="utf-8") as dataset_metadata_file:
            dataset_metadata = json.load(dataset_metadata_file, strict = False)

        # check if the dataset is empty, partial or full based on the download info
        download_info = dataset_metadata["download_info"]
        if download_info["downloaded"] == download_info["total_URLS"]:
            dataset_download_type = FULL
        elif download_info["downloaded"] == 0:
            dataset_download_type = EMPTY
        else:
            dataset_download_type = PARTIAL

        if dataset_download_type == EMPTY:
            empty_datasets_list.append(dataset.name)
            continue

        # count how many rdf files are stored in the dataset folder
        n_rdf_files = sum(
            1 for file in os.scandir(dataset)
            if pathlib.Path(file.path).suffix in SUFFIXES
        )

        # a parser's list is absent when that parser mined nothing for the dataset
        n_jena = len(dataset_metadata.get("mined_files_jena", ()))
        n_rdflib = len(dataset_metadata.get("mined_files_rdflib", ()))
        n_lightrdf = len(dataset_metadata.get("mined_files_lightrdf", ()))

        parsable_jena += n_jena
        parsable_rdflib += n_rdflib
        parsable_lightrdf += n_lightrdf

        parsed_files = n_jena + n_rdflib + n_lightrdf
        total_parsed_files += parsed_files

        # One single chain: a dataset without parsed files is only empty, even
        # when it also has no RDF file (0 parsed == 0 RDF files must not make
        # it full or partial as well).
        if parsed_files == 0:
            empty_datasets_list.append(dataset.name)
        elif parsed_files == n_rdf_files and dataset_download_type == FULL:
            full_datasets_list.append(dataset.name)
        else:
            partial_datasets_list.append(dataset.name)

    with open("output/post_standard_parsing_stats.txt", "a") as output_file:
        output_file.write("Total number of parsable files for Jena: "+str(parsable_jena)+"\n")
        output_file.write("Increment of parsable files for RDFLib: "+str(parsable_rdflib)+"\n")
        output_file.write("Increment of parsable files for LightRDF: "+str(parsable_lightrdf)+"\n")
        output_file.write("Total number of parsed files: "+str(total_parsed_files)+"\n" )
        output_file.write("Total number of not parsable files: "+str(N_FILES - total_parsed_files)+"\n" )
        output_file.write("Total number of full datasets: "+str(len(full_datasets_list))+"\n" )
        output_file.write("Total number of partial datasets: "+str(len(partial_datasets_list))+"\n" )
        output_file.write("Total number of empty datasets: "+str(len(empty_datasets_list))+"\n" )

    # save the dataset lists as json files in the output folder
    with open('output/lists/standard/empty_datasets_standard.json', 'w', encoding='utf-8') as f:
        json.dump(empty_datasets_list, f, ensure_ascii=False, indent=4)

    with open('output/lists/standard/partial_datasets_standard.json', 'w', encoding='utf-8') as f:
        json.dump(partial_datasets_list, f, ensure_ascii=False, indent=4)

    with open('output/lists/standard/full_datasets_standard.json', 'w', encoding='utf-8') as f:
        json.dump(full_datasets_list, f, ensure_ascii=False, indent=4)

# Folder that contains one sub-folder per downloaded dataset
datasets_directory_path = "/media/manuel/Tesi/Datasets"

# Base folder for the miner error logs. `__name__` holds a module name, not a
# file path, so os.path.dirname() returns "" unless that name contains a path
# separator; the log paths below are then relative to the working directory.
dirname = os.path.dirname(__name__)

# Error logs written by the Jena, RDFLib and LightRDF miners
jena_error_log_path = os.path.join(dirname, "../../mining/logs/jena_miner_error.log")
rdflib_error_log_path = os.path.join(dirname, "../../mining/logs/rdflib_miner_error.log")
lightrdf_error_log_path = os.path.join(dirname, "../../mining/logs/lightrdf_miner_error.log")

# Compute and store the statistics of the standard parsing phase
postParsing()

Checked: 0
Checked: 1000
Checked: 2000
Checked: 3000
Checked: 4000
Checked: 5000
Checked: 6000
Checked: 7000
Checked: 8000
Checked: 9000
Checked: 10000
Checked: 11000
Checked: 12000
Checked: 13000
Checked: 14000
Checked: 15000
Checked: 16000
Checked: 17000
Checked: 18000
Checked: 19000
Checked: 20000
Checked: 21000
Checked: 22000
Checked: 23000
Checked: 24000
Checked: 25000
Checked: 26000
Checked: 27000
Checked: 28000
Checked: 29000
Checked: 30000
Checked: 31000


## Label Parsing v1 and v2

Since both label parsing versions use RDFLib in the same way, their stats are identical and are computed only once.


In [7]:
def postHRParsing():
    """Compute the statistics of the labels parsing phase (RDFLibHR).

    Every entry of ``datasets_directory_path`` is expected to be a dataset folder
    containing a ``dataset_metadata.json`` file. Each dataset is assigned to
    exactly one category:

    * empty   -- the optional metadata list ``mined_files_rdflibhr`` is absent
      or has no entry;
    * full    -- ``failed_download_urls`` is empty and the number of parsed
      files equals the number of distinct RDF file names in the folder;
    * partial -- every other dataset with at least one parsed file.

    RDF files are the ones whose extension is in ``SUFFIXES``; they are counted
    once per name, the name being the part before the first dot.

    Side effects:
        * prints a progress line every 1000 datasets;
        * appends the summary to ``output/post_parsing_labels_stats.txt``;
        * overwrites the empty/partial/full dataset lists stored as JSON in
          ``output/lists/labels/`` (the folders are created when missing).

    Reads the module-level names ``datasets_directory_path``, ``SUFFIXES``,
    ``EMPTY``, ``PARTIAL`` and ``FULL``.

    Raises:
        OSError: if a dataset folder or its metadata file cannot be read.
        KeyError: if the metadata has no ``failed_download_urls`` entry (or no
            ``downloaded_urls`` entry when some download failed).
    """
    # Make sure the output folders exist before the long scan, so that a missing
    # directory cannot make the run fail after all the work has been done.
    os.makedirs("output/lists/labels", exist_ok=True)

    empty_datasets_list = list()
    partial_datasets_list = list()
    full_datasets_list = list()
    total_parsed_files = 0

    for datasets, dataset in enumerate(os.scandir(datasets_directory_path)):

        if datasets % 1000 == 0:
            print("Checked: "+str(datasets))

        # read the dataset metadata file (the handle is released even if parsing fails)
        with open(dataset.path+"/dataset_metadata.json", "r", encoding="utf-8") as dataset_metadata_file:
            dataset_metadata = json.load(dataset_metadata_file, strict = False)

        # classify the download: no failure -> full, some failures and some
        # successes -> partial, only failures -> empty
        if len(dataset_metadata["failed_download_urls"]) == 0:
            dataset_download_type = FULL
        elif len(dataset_metadata["downloaded_urls"]) > 0:
            dataset_download_type = PARTIAL
        else:
            dataset_download_type = EMPTY

        parsed_files = len(dataset_metadata.get("mined_files_rdflibhr", ()))
        total_parsed_files += parsed_files

        if parsed_files == 0:
            empty_datasets_list.append(dataset.name)
            continue

        # distinct names of the rdf files stored in the dataset folder
        total_files_set = {
            file.name.split(".")[0]
            for file in os.scandir(dataset)
            if pathlib.Path(file.path).suffix in SUFFIXES
        }

        # Every dataset with parsed files must land in exactly one category:
        # it is full only when nothing is missing, partial otherwise. This also
        # covers metadata reporting no downloaded URL although files were
        # parsed, which would otherwise not be counted anywhere.
        if parsed_files == len(total_files_set) and dataset_download_type == FULL:
            full_datasets_list.append(dataset.name)
        else:
            partial_datasets_list.append(dataset.name)

    with open("output/post_parsing_labels_stats.txt", "a") as output_file:
        output_file.write("Total number of parsed files by RDFLibHR: "+ str(total_parsed_files)+"\n")
        output_file.write("Number of full datasets: "+str(len(full_datasets_list))+"\n")
        output_file.write("Number of partial datasets: "+str(len(partial_datasets_list))+"\n")
        output_file.write("Number of empty datasets: "+str(len(empty_datasets_list))+"\n")

    # save the dataset lists as json files in the output folder
    with open('output/lists/labels/empty_datasets_standard.json', 'w', encoding='utf-8') as f:
        json.dump(empty_datasets_list, f, ensure_ascii=False, indent=4)

    with open('output/lists/labels/partial_datasets_standard.json', 'w', encoding='utf-8') as f:
        json.dump(partial_datasets_list, f, ensure_ascii=False, indent=4)

    with open('output/lists/labels/full_datasets_standard.json', 'w', encoding='utf-8') as f:
        json.dump(full_datasets_list, f, ensure_ascii=False, indent=4)

# Folder that contains one sub-folder per downloaded dataset
datasets_directory_path = "/media/manuel/Tesi/Datasets"

# Base folder for the miner error log. `__name__` holds a module name, not a
# file path, so os.path.dirname() returns "" unless that name contains a path
# separator; the log path below is then relative to the working directory.
dirname = os.path.dirname(__name__)

# Error log written by the RDFLibHR miner
rdflibhr_error_log_path = os.path.join(dirname, "../../mining/logs/rdflibhr_miner_error.log")

# Compute and store the statistics of the labels parsing phase
postHRParsing()

Checked: 0
Checked: 1000
Checked: 2000
Checked: 3000
Checked: 4000
Checked: 5000
Checked: 6000
Checked: 7000
Checked: 8000
Checked: 9000
Checked: 10000
Checked: 11000
Checked: 12000
Checked: 13000
Checked: 14000
Checked: 15000
Checked: 16000
Checked: 17000
Checked: 18000
Checked: 19000
Checked: 20000
Checked: 21000
Checked: 22000
Checked: 23000
Checked: 24000
Checked: 25000
Checked: 26000
Checked: 27000
Checked: 28000
Checked: 29000
Checked: 30000
Checked: 31000
