# Collection Statistics

This notebook computes collection statistics at the various phases of the processing pipeline. In particular:
* after download phase (and download retry)
* after automatic file recovering phase
* after manual file recovering phase
* after indexing phase



In [2]:
import os
import json
import pathlib
import tqdm 
from tqdm import tqdm
import json

### Download

After collection download we want to know:
* how many datasets are complete (all the files for the dataset are downloaded)
* how many datasets are partial (not all the files for the dataset are downloaded)
* how many datasets are empty (no files for the dataset are downloaded)
* how many files do not have a valid RDF extension

In [9]:
def postDownloadStats():
    '''
    Append the post-download statistics of the collection to the output file.

    Every entry of the datasets directory is treated as a dataset folder holding a
    dataset_metadata.json file whose "download_info" object has the counters
    "downloaded" and "total_URLS". A dataset is:
    * empty   when "downloaded" is 0 (its folder name is also printed)
    * full    when it is not empty and "downloaded" equals "total_URLS"
    * partial in every other case
    The checker log contributes the number of lines whose text before the
    first ": " is exactly "File".

    The function takes no arguments, it reads the following module-level variables:
    @param datasets_directory_path path to the directory where there are all the datasets
    @param checker_error_log_path path of the log file of the dataset checker
    @param output_file_path file where to write the output statistics (opened in append mode)
    '''
    n_datasets = 0
    n_empty = 0
    n_full = 0

    # both files are opened before the scan, so a wrong path fails immediately
    # instead of after the whole collection has been walked
    with open(output_file_path, "a") as output_file:
        with open(checker_error_log_path, "r") as f_log_checker:

            #scan the datasets folders and count how many datasets are full, partial and empty
            with os.scandir(datasets_directory_path) as folders:
                for folder in folders:

                    n_datasets += 1

                    if n_datasets % 1000 == 0:
                        print("Scanned: "+str(n_datasets))

                    dataset_json_path = os.path.join(folder.path, "dataset_metadata.json")

                    with open(dataset_json_path, "r") as dataset_json_file:
                        download_info = json.load(dataset_json_file, strict=False)["download_info"]

                    # empty and full are mutually exclusive: a dataset with zero downloads
                    # and zero URLs must not be counted twice, otherwise the partial
                    # count computed below by subtraction comes out too low
                    if download_info["downloaded"] == 0:
                        n_empty += 1
                        print(folder.name)
                    elif download_info["downloaded"] == download_info["total_URLS"]:
                        n_full += 1

            #read the checker error_log
            n_file = sum(1 for line in f_log_checker if line.split(": ")[0] == "File")

        output_file.write("Number of datasets: "+str(n_datasets)+"\n")
        output_file.write("Number of full datasets: "+str(n_full)+"\n")
        output_file.write("Number of partial datasets: "+str(n_datasets-n_full-n_empty)+"\n")
        output_file.write("Number of empty datasets: "+str(n_empty)+"\n")
        output_file.write("Number of files that need to be assigned to an extension: "+str(n_file)+"\n")



# '__file__' is a quoted string literal here, not the module attribute: realpath() resolves
# that relative name against the current working directory, so scriptDir is normally the
# directory the kernel is running in.
scriptDir = os.path.dirname(os.path.realpath('__file__'))

datasets_directory_path = "/media/manuel/Tesi/Datasets"                                               #path to the folder of the downloaded datasets (machine-specific absolute path)

output_file_path = os.path.join(scriptDir, '../../logs/statistics/post_download_statistics.txt')      #path to the file the statistics are appended to

checker_error_log_path = os.path.join(scriptDir, '../../logs/checker_error_log.txt')                  #path to the checker error log file
# postDownloadStats takes no arguments: it reads the three path variables above as globals
postDownloadStats() 

dataset-19459
dataset-18355
dataset-73070
dataset-26917
dataset-21567
dataset-85506
dataset-49418
dataset-12050
dataset-23926
dataset-2732
dataset-82147
dataset-2114
dataset-49054
dataset-65946
dataset-2940
dataset-49455
dataset-24126
dataset-73095
dataset-72869
dataset-82757
dataset-44614
dataset-84254
dataset-83515
dataset-88501
dataset-88019
dataset-5614
dataset-65065
dataset-2929
dataset-2533
dataset-27213
dataset-23414
dataset-84952
dataset-14488
dataset-49323
dataset-73860
dataset-84557
dataset-82585
dataset-21512
dataset-12547
dataset-63348
dataset-48967
dataset-83243
dataset-73029
dataset-48948
dataset-42854
dataset-17160
dataset-3350
dataset-68861
dataset-45941
dataset-87852
dataset-11802
dataset-84070
dataset-48481
dataset-3933
dataset-26969
dataset-19318
dataset-53739
dataset-69025
dataset-84170
dataset-84311
dataset-84587
dataset-13255
dataset-5606
dataset-85925
dataset-4803
dataset-2772
dataset-86274
dataset-24629
dataset-68596
dataset-24330
dataset-66363
dataset-48959
dat

### Automatic File Recovering

After the automatic file recovering we want to know: 
* how many files have been recovered
* how many files need to be manually processed

In [6]:
def postAutomaticRecovering():
    '''
    Append the statistics of the automatic file recovering phase to the output file.

    Each line of the recover log is classified by the text before its first ": ":
    * "Dataset"            counts as a file that was not recovered
    * "Recover in Dataset" counts as a recovered file
    Every other line is ignored.

    The function takes no arguments, it reads the following module-level variables:
    @param recover_error_log_path path to the file recover error log
    @param output_file_path file where to write the output statistics (opened in append mode)
    '''
    n_unrecovered_file = 0
    n_recovered_file = 0

    # the context managers close both files even when reading or writing fails
    with open(recover_error_log_path, "r") as f_log_recover:
        with open(output_file_path, "a") as output_file:

            #read the recover error_log
            for line in f_log_recover:
                prefix = line.split(": ")[0]

                if prefix == "Dataset":
                    n_unrecovered_file += 1
                elif prefix == "Recover in Dataset":
                    n_recovered_file += 1

            output_file.write("Number of recovered files: "+str(n_recovered_file)+"\n")
            output_file.write("Number of unrecovered files: "+str(n_unrecovered_file)+"\n")


# '__file__' is a quoted string literal here, not the module attribute: realpath() resolves
# that relative name against the current working directory, so scriptDir is normally the
# directory the kernel is running in.
scriptDir = os.path.dirname(os.path.realpath('__file__'))

datasets_directory_path = "/media/manuel/Tesi/Datasets"                                #path to the folder of the downloaded datasets (not read by postAutomaticRecovering)

output_file_path = os.path.join(scriptDir, '../../logs/statistics/post_automatic_recovering_statistics.txt')     #path to the file the statistics are appended to

recover_error_log_path = os.path.join(scriptDir, '../../logs/recover_error_log.txt')                  #path to the recover error log file
# postAutomaticRecovering takes no arguments: it reads recover_error_log_path and output_file_path as globals
postAutomaticRecovering() 

### Manual File Recovering

After the manual file recovering we want to know: 
* total number of files in the collection
* total number of files without an RDF extension

In [5]:
# file extensions (compared case-sensitively, leading dot included) accepted as RDF
SUFFIXES = [".rdf", ".rdfs", ".ttl", ".owl", ".n3", ".nt", ".jsonld", ".xml", ".ntriples", ".nq", ".trig", ".trix"]

def postManualRecovering():
    '''
    Append the statistics of the manual file recovering phase to the output file.

    Every entry of the datasets directory is scanned as a dataset folder. Inside it,
    every entry except dataset_metadata.json and dataset_content.json is counted as
    a file of the collection; it is also counted as a NO RDF file when its suffix is
    not listed in SUFFIXES.

    The function takes no arguments, it reads the following module-level variables:
    @param datasets_directory_path path to the directory where there are all the datasets
    @param output_file_path file where to write the output statistics (opened in append mode)
    '''
    nFiles = 0
    nNoRDFFiles = 0
    nDataset = 0

    # the context managers release the output file and the directory iterators
    # even when one of the folders cannot be scanned
    with open(output_file_path, "a") as output_file:
        with os.scandir(datasets_directory_path) as datasets:
            for dataset in datasets:
                with os.scandir(dataset) as files:
                    for file in files:
                        # the two json files are bookkeeping, not collection data
                        if file.name in ("dataset_metadata.json", "dataset_content.json"):
                            continue
                        if pathlib.Path(file.path).suffix not in SUFFIXES:
                            nNoRDFFiles += 1
                        nFiles += 1

                nDataset += 1
                if nDataset % 1000 == 0:
                    print("Scanned: "+str(nDataset))

        output_file.write("Total number of files: "+str(nFiles)+"\n")
        output_file.write("Total number of NO RDF files: "+str(nNoRDFFiles)+"\n")


# '__file__' is a quoted string literal here, not the module attribute: realpath() resolves
# that relative name against the current working directory, so scriptDir is normally the
# directory the kernel is running in.
scriptDir = os.path.dirname(os.path.realpath('__file__'))

datasets_directory_path = "/media/manuel/Tesi/Datasets"                                #path to the folder of the downloaded datasets (machine-specific absolute path)

output_file_path = os.path.join(scriptDir, '../../logs/statistics/post_manual_recovering_statistics.txt')     #path to the file the statistics are appended to

# postManualRecovering takes no arguments: it reads the two path variables above as globals
postManualRecovering() 

Scanned: 1000
Scanned: 2000
Scanned: 3000
Scanned: 4000
Scanned: 5000
Scanned: 6000
Scanned: 7000
Scanned: 8000
Scanned: 9000
Scanned: 10000
Scanned: 11000
Scanned: 12000
Scanned: 13000
Scanned: 14000
Scanned: 15000
Scanned: 16000
Scanned: 17000
Scanned: 18000
Scanned: 19000
Scanned: 20000
Scanned: 21000
Scanned: 22000
Scanned: 23000
Scanned: 24000
Scanned: 25000
Scanned: 26000
Scanned: 27000
Scanned: 28000
Scanned: 29000
Scanned: 30000
Scanned: 31000


### Duplicated Datasets

In this section we analyze which datasets are duplicated. A dataset is considered duplicated if there is another dataset with the same set of download links. We then examine what the differences (if any) between the duplicated datasets are.


In [5]:
def findDuplicates(datasets_list_path, output_file_path):
    '''
    Find the groups of datasets that share exactly the same set of download links.

    The datasets list is a json object with a "datasets" array; every entry provides
    a "dataset_id" and a "download" list of links. Links are compared as a set, so
    their order and repetitions inside one entry do not matter. For every group with
    more than one dataset the list of its ids is appended as one line to the output
    file, and the number of groups found is printed.

    @param datasets_list_path path to the json file with the datasets list
    @param output_file_path file where to append the groups of duplicated datasets
    '''
    #load the json list
    with open(datasets_list_path, "r", encoding="utf-8") as dataset_list_file:
        datasets_list = json.load(dataset_list_file, strict=False)

    #create a dictionary of key: links, value: list of dataset ids that download those links
    links = dict()

    for entry in datasets_list["datasets"]:
        # a tuple keeps the boundaries between links: joining them into a single
        # string would make ["ab", "c"] and ["a", "bc"] look like the same dataset
        distinct_links = tuple(sorted(set(entry["download"])))
        links.setdefault(distinct_links, list()).append(entry["dataset_id"])

    # NOTE: i counts groups of duplicated datasets, not the single datasets in them
    i = 0
    with open(output_file_path, "a") as output_file:
        for datasets in links.values():
            if len(datasets) > 1:
                output_file.write(str(datasets)+"\n")
                i += 1
    print("Duplicated: "+str(i)+" datasets")
    



datasets_list_path = "/home/manuel/Tesi/ACORDAR/Data/datasets.json"      #path to the json file with the datasets list (machine-specific absolute path)

# '__file__' is a quoted string literal here, not the module attribute: realpath() resolves
# that relative name against the current working directory, so scriptDir is normally the
# directory the kernel is running in.
scriptDir = os.path.dirname(os.path.realpath('__file__'))

output_file_path = os.path.join(scriptDir, '../../logs/statistics/duplicated_datasets.txt')      #path to the file the groups of duplicated datasets are appended to


findDuplicates(datasets_list_path, output_file_path)

Duplicated: 438 datasets


### Parsing



After the parsing phase we want to know: 
* number of files parsed by JENA
* number of files parsed by RDFLib
* number of files parsed by LightRDF
* number of correctly parsed files
* number of not parsed files
* how many datasets are full, partial and empty


In [3]:
# from the post manual file recovering
N_FILES = 28537        #total number of files in the collection
N_NOT_RDF_FILES = 61   #number of not RDF files

def _count_error_lines(log_path):
    '''Return the number of lines of the log at log_path that contain "Error: ".'''
    with open(log_path, "r") as log_file:
        return sum(1 for line in log_file if "Error: " in line)

def postParsing():
    '''
    Append the statistics of the parsing phase to post_parsing_statistics.txt and
    save the names of the empty datasets in empty_datasets.json (overwritten at
    every run), both inside the output folder.

    For every dataset folder the number of parsed files is the sum of the lengths
    of the "mined_files_jena", "mined_files_rdflib" and "mined_files_lightrdf"
    lists found in its dataset_metadata.json. The dataset is:
    * empty   when no file was parsed
    * full    when the parsed files are at least as many as the files expected
    * partial in every other case

    The function takes no arguments, it reads the following module-level variables:
    @param datasets_directory_path path to the directory where there are all the datasets
    @param jena_error_log_path path to the jena error log file
    @param rdflib_error_log_path path to the rdflib error log file
    @param output_folder_path folder where to write the output statistics
    '''
    # (metadata key, entries of the dataset folder to discount when the key is present)
    # NOTE(review): the discounts are kept exactly as they were; presumably they stand for
    # auxiliary files left in the folder by each miner - confirm the 4 used for LightRDF
    miners = (
        ("mined_files_jena", 1),
        ("mined_files_rdflib", 1),
        ("mined_files_lightrdf", 4),
    )

    full_datasets = 0
    partial_datasets = 0
    empty_datasets = 0
    empty_datasets_list = list()

    datasets = 0
    total_parsed_files = 0

    # the output file is opened first so that a wrong output folder fails before the scan
    with open(output_folder_path+"/post_parsing_statistics.txt", "a") as output_file:

        #count the number of files not parsable by jena and rdflib
        notParsableJena = _count_error_lines(jena_error_log_path)
        notParsableRDFLib = _count_error_lines(rdflib_error_log_path)

        with os.scandir(datasets_directory_path) as dataset_folders:
            for dataset in dataset_folders:

                if datasets % 1000 == 0:
                    print("Checked: "+str(datasets))

                #load the dataset metadata file
                with open(dataset.path+"/dataset_metadata.json", "r", encoding="utf-8") as dataset_metadata_file:
                    dataset_metadata = json.load(dataset_metadata_file, strict=False)

                parsed_files = 0
                # every entry of the folder except dataset_metadata.json
                all_files = len(os.listdir(dataset)) - 1
                for key, discount in miners:
                    if key in dataset_metadata:
                        parsed_files += len(dataset_metadata[key])
                        all_files -= discount

                if parsed_files == 0:
                    empty_datasets += 1
                    empty_datasets_list.append(dataset.name)
                elif parsed_files >= all_files:
                    full_datasets += 1
                else:
                    partial_datasets += 1

                total_parsed_files += parsed_files
                datasets += 1

        output_file.write("Total number of parsable files for Jena: "+str(N_FILES-N_NOT_RDF_FILES-notParsableJena)+"\n")
        output_file.write("Increment of parsable files for RDFLib: "+str(notParsableJena-notParsableRDFLib)+"\n")
        output_file.write("Total number of parsed files: "+str(total_parsed_files)+"\n" )
        output_file.write("Total number of not parsable files: "+str(N_FILES - total_parsed_files)+"\n" )
        output_file.write("Total number of full datasets: "+str(full_datasets)+"\n" )
        output_file.write("Total number of partial datasets: "+str(partial_datasets)+"\n" )
        output_file.write("Total number of empty datasets: "+str(empty_datasets)+"\n" )

    #save the names of the empty datasets in a json file in the output folder
    with open(output_folder_path+'/empty_datasets.json', 'w', encoding='utf-8') as f:
        json.dump(empty_datasets_list, f, ensure_ascii=False, indent=4)


# '__file__' is a quoted string literal here, not the module attribute: realpath() resolves
# that relative name against the current working directory, so scriptDir is normally the
# directory the kernel is running in.
scriptDir = os.path.dirname(os.path.realpath('__file__'))

datasets_directory_path = "/media/manuel/Tesi/Datasets"                                          #path to the folder of the downloaded datasets (machine-specific absolute path)
jena_error_log_path = os.path.join(scriptDir, '../../logs/jena_miner_error_log.txt')             #path to the jena error log file
rdflib_error_log_path = os.path.join(scriptDir, '../../logs/rdflib_miner_error_log.txt')         #path to the rdflib error log file
lightrdf_error_log_path = os.path.join(scriptDir, '../../logs/lightrdf_miner_error_log.txt')     #path to the lightrdf error log file (bound here but not read by postParsing)

output_folder_path = os.path.join(scriptDir, '../../logs/statistics')     #path to the output folder

# postParsing takes no arguments: it reads the path variables above as globals
postParsing() 

Checked: 0
Checked: 1000
Checked: 2000
Checked: 3000
Checked: 4000
Checked: 5000
Checked: 6000
Checked: 7000
Checked: 8000
Checked: 9000
Checked: 10000
Checked: 11000
Checked: 12000
Checked: 13000
Checked: 14000
Checked: 15000
Checked: 16000
Checked: 17000
Checked: 18000
Checked: 19000
Checked: 20000
Checked: 21000
Checked: 22000
Checked: 23000
Checked: 24000
Checked: 25000
Checked: 26000
Checked: 27000
Checked: 28000
Checked: 29000
Checked: 30000
Checked: 31000
