# Download Stats

In this notebook we gather some statistics about the download phase. All of the notebook's output is saved in the `output` directory.

After the collection has been downloaded, we want to know:
* how many links we tried to download
* how many of these links were downloaded successfully
* how many of these links resulted in an error
* how many datasets are complete (all the files of the dataset were downloaded)
* how many datasets are partial (only some of the files of the dataset were downloaded)
* how many datasets are empty (no files of the dataset were downloaded)
* how many files do not have a valid RDF extension

In [1]:
import os
import json
import pathlib
import tqdm 
from tqdm import tqdm
import json

# File suffixes of interest; presumably the "valid RDF extensions" mentioned in the
# introduction above (TODO confirm). Not referenced by the code visible in this notebook.
SUFFIXES = [".rdf", ".rdfs", ".ttl", ".owl", ".n3", ".nt", ".jsonld", ".xml", ".ntriples", ".nq", ".trig", ".trix"]

In [None]:
def _resolve_setting(name, value):
    """Return value, or the module-level variable called name when value is None."""
    if value is not None:
        return value
    try:
        return globals()[name]
    except KeyError:
        # Same error a direct reference to the missing global would have raised.
        raise NameError("name '" + name + "' is not defined") from None


def postDownloadStats(datasets_directory_path=None, checker_error_log_path=None, output_file_path=None):
    """
    Compute the post-download statistics and append them to the output file.

    Every argument is optional: when it is left as None the module-level
    variable with the same name is used, so the historical no-argument call
    ``postDownloadStats()`` keeps working.

    @param datasets_directory_path path to the directory where there are all the datasets
           (one sub-directory per dataset, each holding a dataset_metadata.json file)
    @param checker_error_log_path path of the log file of the dataset checker
    @param output_file_path file where to write the output statistics (opened in append mode)
    """
    datasets_directory_path = _resolve_setting("datasets_directory_path", datasets_directory_path)
    checker_error_log_path = _resolve_setting("checker_error_log_path", checker_error_log_path)
    output_file_path = _resolve_setting("output_file_path", output_file_path)

    n_datasets = 0
    n_empty = 0
    n_full = 0
    n_links = 0
    success_links = 0

    # Both files are opened up front (as before) so that a wrong path fails
    # immediately instead of after the long directory scan; the context
    # manager guarantees they are closed even when the scan raises.
    with open(output_file_path, "a") as output_file, \
            open(checker_error_log_path, "r") as f_log_checker:

        # scan the datasets folders and extract download info
        with os.scandir(datasets_directory_path) as entries:
            for folder in entries:

                # Only directories can hold a dataset; stray files are ignored.
                if not folder.is_dir():
                    continue

                n_datasets += 1

                if n_datasets % 1000 == 0:
                    print("Scanned: "+str(n_datasets))

                dataset_json_path = os.path.join(folder.path, "dataset_metadata.json")

                with open(dataset_json_path, "r") as dataset_json_file:
                    # strict=False accepts control characters inside JSON strings
                    download_info = json.load(dataset_json_file, strict=False)["download_info"]

                downloaded = download_info["downloaded"]
                total_urls = download_info["total_URLS"]

                if downloaded == 0:
                    n_empty += 1
                elif downloaded == total_urls:
                    n_full += 1

                n_links += total_urls
                success_links += downloaded

        # Every link that was tried but not downloaded counts as an error.
        error_links = n_links - success_links

        # read the checker error_log: count the lines whose first ": "-separated field is "File"
        n_file = sum(1 for line in f_log_checker if line.split(": ", 1)[0] == "File")

        output_file.writelines([
            "Number of tried links:"+str(n_links)+"\n",
            "Number of success links:"+str(success_links)+"\n",
            "Number of error links:"+str(error_links)+"\n",
            "Number of datasets: "+str(n_datasets)+"\n",
            "Number of full datasets: "+str(n_full)+"\n",
            # a dataset that is neither full nor empty is partial
            "Number of partial datasets: "+str(n_datasets-n_full-n_empty)+"\n",
            "Number of empty datasets: "+str(n_empty)+"\n",
            "Number of files that need to be assigned to an extension: "+str(n_file)+"\n",
        ])

# NOTE(review): __name__ is a module name (e.g. "__main__"), not a file path, so
# os.path.dirname() of it is normally "" and the relative paths below are resolved
# against the current working directory - confirm this is intended.
dirname = os.path.dirname(__name__)
# NOTE(review): logs_path is not used by postDownloadStats, and it goes up one directory
# level while checker_error_log_path below goes up two - confirm which one is right.
logs_path = os.path.join(dirname, '../download/logs/downloader_errors.log')

datasets_directory_path = "/media/manuel/Tesi/Datasets"                                               #path to the folder of the downloaded datasets

output_file_path = os.path.join("output", "post_download_statistics.txt")                             #path to the output file
# Opening the statistics file does not create missing parent directories, so make sure
# the output directory exists before the statistics are written.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

checker_error_log_path = os.path.join(dirname, '../../download/logs/check_datasets.log')              #path to the check_datasets.py log
postDownloadStats()

Scanned: 1000
Scanned: 2000
Scanned: 3000
Scanned: 4000
Scanned: 5000
Scanned: 6000
Scanned: 7000
Scanned: 8000
Scanned: 9000
Scanned: 10000
Scanned: 11000
Scanned: 12000
Scanned: 13000
Scanned: 14000
Scanned: 15000
Scanned: 16000
Scanned: 17000
Scanned: 18000
Scanned: 19000
Scanned: 20000
Scanned: 21000
Scanned: 22000
Scanned: 23000
Scanned: 24000
Scanned: 25000
Scanned: 26000
Scanned: 27000
Scanned: 28000
Scanned: 29000
Scanned: 30000
Scanned: 31000
