# Collection Statistics

This notebook calculates collection statistics during the various phases of the processing. In particular: 
* after download phase (and download retry)
* after automatic file recovering phase
* after manual file recovering phase
* after indexing phase



In [1]:
import os
import json
import pathlib

### Download

After collection download we want to know:
* how many datasets are complete (all the files for the dataset are downloaded)
* how many datasets are partial (not all the files for the dataset are downloaded)
* how many datasets are empty (no files for the dataset are downloaded)
* how many files do not have a valid RDF extension 

In [4]:
def postDownloadStats():
    '''
    Append the post-download collection statistics to the statistics file.

    The function takes no arguments; it reads these module-level globals:
    datasets_directory_path  path to the directory where there are all the datasets
    checker_error_log_path   path of the log file of the dataset checker
    output_file_path         file where to write the output statistics (append mode)

    Every sub-directory of datasets_directory_path must contain a
    dataset_metadata.json with a "download_info" object holding the
    "downloaded" and "total_URLS" counters. A dataset is counted as:
    * empty   when downloaded == 0
    * full    when downloaded == total_URLS (and downloaded != 0)
    * partial otherwise
    '''
    n_datasets = 0
    n_empty = 0
    n_full = 0

    #scan the datasets folders and count how many datasets are full, partial and empty
    with os.scandir(datasets_directory_path) as entries:
        for folder in entries:

            #stray files in the collection root are not datasets
            if not folder.is_dir():
                continue

            n_datasets += 1

            if n_datasets % 1000 == 0:
                print("Scanned: "+str(n_datasets))

            dataset_json_path = os.path.join(folder.path, "dataset_metadata.json")

            #the with-block closes the file even if the JSON is malformed
            with open(dataset_json_path, "r", encoding="utf-8") as dataset_json_file:
                download_info = json.load(dataset_json_file, strict=False)["download_info"]

            #the two categories are mutually exclusive: a dataset with 0 downloaded
            #files out of 0 URLs is empty, not full, otherwise the partial count
            #(n_datasets - n_full - n_empty) would be off by one for each of them
            if download_info["downloaded"] == 0:
                n_empty += 1
            elif download_info["downloaded"] == download_info["total_URLS"]:
                n_full += 1

    #read the checker error_log: every line whose first ": "-separated field
    #is "File" reports one file that needs an extension
    with open(checker_error_log_path, "r") as f_log_checker:
        n_file = sum(1 for line in f_log_checker if line.split(": ", 1)[0] == "File")

    with open(output_file_path, "a") as output_file:
        output_file.write("Number of datasets: "+str(n_datasets)+"\n")
        output_file.write("Number of full datasets: "+str(n_full)+"\n")
        output_file.write("Number of partial datasets: "+str(n_datasets-n_full-n_empty)+"\n")
        output_file.write("Number of empty datasets: "+str(n_empty)+"\n")
        output_file.write("Number of files that need to be assigned to an extension: "+str(n_file)+"\n")



# Configuration globals read by postDownloadStats(), followed by the call itself.
# '__file__' is a quoted string literal here (not the __file__ variable), so
# realpath() resolves it against the current working directory and scriptDir
# is that directory.
scriptDir = os.path.dirname(os.path.realpath('__file__'))

datasets_directory_path = "/media/manuel/500GBHDD/Tesi/Datasets"                                #path to the folder of the downloaded datasets

output_file_path = os.path.join(scriptDir, '../../logs/statistics/post_download_statistics.txt')                 #path to the statistics output file (opened in append mode)

checker_error_log_path = os.path.join(scriptDir, '../../logs/checker_error_log.txt')                  #path to the checker error log file
postDownloadStats() 

Scanned: 1000
Scanned: 2000
Scanned: 3000
Scanned: 4000
Scanned: 5000
Scanned: 6000
Scanned: 7000
Scanned: 8000
Scanned: 9000
Scanned: 10000
Scanned: 11000
Scanned: 12000
Scanned: 13000
Scanned: 14000
Scanned: 15000
Scanned: 16000
Scanned: 17000
Scanned: 18000
Scanned: 19000
Scanned: 20000
Scanned: 21000
Scanned: 22000
Scanned: 23000
Scanned: 24000
Scanned: 25000
Scanned: 26000
Scanned: 27000
Scanned: 28000
Scanned: 29000
Scanned: 30000
Scanned: 31000


### Automatic File Recovering

After the automatic file recovering we want to know: 
* how many files have been recovered
* how many files need to be manually processed

In [5]:
def postAutomaticRecovering():
    '''
    Append the automatic-recovering statistics to the statistics file.

    The function takes no arguments; it reads these module-level globals:
    recover_error_log_path  path to the file recover error log
    output_file_path        file where to write the output statistics (append mode)

    Each log line is classified by its first ": "-separated field:
    "Dataset" counts as one unrecovered file, "Recover in Dataset" as one
    recovered file; every other line is ignored.
    '''
    n_unrecovered_file = 0
    n_recovered_file = 0

    #read the recover error_log; the with-block closes it even on a read error
    with open(recover_error_log_path, "r") as f_log_recover:
        for line in f_log_recover:
            prefix = line.split(": ", 1)[0]

            if prefix == "Dataset":
                n_unrecovered_file += 1
            elif prefix == "Recover in Dataset":
                n_recovered_file += 1

    with open(output_file_path, "a") as output_file:
        output_file.write("Number of recovered files: "+str(n_recovered_file)+"\n")
        output_file.write("Number of unrecovered files: "+str(n_unrecovered_file)+"\n")


# Configuration globals read by postAutomaticRecovering(), followed by the call itself.
# '__file__' is a quoted string literal here (not the __file__ variable), so
# realpath() resolves it against the current working directory and scriptDir
# is that directory.
scriptDir = os.path.dirname(os.path.realpath('__file__'))

datasets_directory_path = "/media/manuel/500GBHDD/Tesi/Datasets"                                #path to the folder of the downloaded datasets (not read by postAutomaticRecovering)

output_file_path = os.path.join(scriptDir, '../../logs/statistics/post_automatic_recovering_statistics.txt')     #path to the statistics output file (opened in append mode)

recover_error_log_path = os.path.join(scriptDir, '../../logs/recover_error_log.txt')                  #path to the recover error log file
postAutomaticRecovering() 

### Manual File Recovering

After the manual file recovering we want to know: 
* total number of files in the collection
* total number of files without an RDF extension

In [None]:
#file extensions (lower case, with the leading dot) that are accepted as RDF
SUFFIXES = [".rdf", ".rdfs", ".ttl", ".owl", ".n3", ".nt", ".jsonld", ".xml", ".ntriples", ".nq", ".trig", ".trix"]


def postManualRecovering():
    '''
    Append the manual-recovering statistics to the statistics file.

    The function takes no arguments; it reads these module-level globals:
    datasets_directory_path  path to the directory where there are all the datasets
    output_file_path         file where to write the output statistics (append mode)

    Every entry found directly inside a dataset folder is counted as a file,
    except dataset_metadata.json and dataset_content.json. A file is counted
    as "NO RDF" when its extension is not listed in SUFFIXES (the comparison
    is case sensitive).
    '''
    nFiles = 0
    nNoRDFFiles = 0

    with os.scandir(datasets_directory_path) as datasets:
        for dataset in datasets:

            #stray files in the collection root are not datasets
            if not dataset.is_dir():
                continue

            with os.scandir(dataset.path) as files:
                for file in files:
                    if file.name == "dataset_metadata.json" or file.name == "dataset_content.json":
                        continue

                    nFiles += 1
                    if pathlib.Path(file.path).suffix not in SUFFIXES:
                        nNoRDFFiles += 1

    #one statistic per line, like the other statistics files
    with open(output_file_path, "a") as output_file:
        output_file.write("Total number of files: "+str(nFiles)+"\n")
        output_file.write("Total number of NO RDF files: "+str(nNoRDFFiles)+"\n")


# Configuration globals read by postManualRecovering(), followed by the call itself.
# '__file__' is a quoted string literal here (not the __file__ variable), so
# realpath() resolves it against the current working directory and scriptDir
# is that directory.
scriptDir = os.path.dirname(os.path.realpath('__file__'))

datasets_directory_path = "/media/manuel/500GBHDD/Tesi/Datasets"                                #path to the folder of the downloaded datasets

# NOTE(review): the download and automatic-recovering cells write under
# '../../logs/statistics/', this one writes under 'logs/' - confirm which
# location is intended.
output_file_path = os.path.join(scriptDir, 'logs/post_manual_recovering_statistics.txt')     #path to the statistics output file (opened in append mode)

postManualRecovering() 

### Indexing

After the indexing phase we want to know: 
* total number of mined files
* total number of files that cannot be mined
* total number of full datasets
* total number of partial datasets
* total number of empty datasets


In [None]:
def postIndexing():
    '''
    Append the post-indexing statistics to the statistics file.

    The function takes no arguments; it reads these module-level globals:
    datasets_directory_path  path to the directory where there are all the datasets
    output_file_path         file where to write the output statistics (append mode)

    For every dataset folder the "mined_files" counter of its
    dataset_metadata.json is compared with the number of entries in the
    folder minus 2 (the original code subtracts 2, presumably for
    dataset_metadata.json and dataset_content.json - confirm). A dataset is:
    * empty   when mined_files == 0
    * partial when 0 < mined_files < number of data files
    * full    when mined_files == number of data files
    A dataset whose counter exceeds the number of data files is added to the
    mined total but to none of the three categories.
    '''
    full_datasets = 0
    empty_datasets = 0
    partial_dataset = 0
    mined_files = 0

    with os.scandir(datasets_directory_path) as datasets:
        for dataset in datasets:

            #stray files in the collection root are not datasets
            if not dataset.is_dir():
                continue

            #open the dataset metadata file
            metadata_path = os.path.join(dataset.path, "dataset_metadata.json")
            with open(metadata_path, "r", encoding="utf-8") as dataset_metadata_file:
                n_mined = json.load(dataset_metadata_file, strict=False)["mined_files"]

            mined_files += n_mined

            if n_mined == 0:
                empty_datasets += 1
                continue

            #list the folder once instead of once per comparison
            n_data_files = len(os.listdir(dataset.path)) - 2

            if 0 < n_mined < n_data_files:
                partial_dataset += 1
            elif n_mined == n_data_files:
                full_datasets += 1

    #one statistic per line, like the other statistics files
    with open(output_file_path, "a") as output_file:
        output_file.write("Total number of mined files: "+str(mined_files)+"\n")
        output_file.write("Total number of full datasets: "+str(full_datasets)+"\n")
        output_file.write("Total number of partial datasets: "+str(partial_dataset)+"\n")
        output_file.write("Total number of empty datasets: "+str(empty_datasets)+"\n")


# Configuration globals read by postIndexing(), followed by the call itself.
# '__file__' is a quoted string literal here (not the __file__ variable), so
# realpath() resolves it against the current working directory and scriptDir
# is that directory.
scriptDir = os.path.dirname(os.path.realpath('__file__'))

datasets_directory_path = "/media/manuel/500GBHDD/Tesi/Datasets"                                #path to the folder of the downloaded datasets

# NOTE(review): the download and automatic-recovering cells write under
# '../../logs/statistics/', this one writes under 'logs/' - confirm which
# location is intended.
output_file_path = os.path.join(scriptDir, 'logs/post_indexing_statistics.txt')     #path to the statistics output file (opened in append mode)

postIndexing() 