In [1]:
pip install rdflib



In [2]:
# `datetime` is part of the Python standard library, so nothing has to be installed for `import datetime`.



In [3]:
pip install tqdm



In [4]:
pip install psutil



In [5]:
import pandas as pd
import os
from tqdm import tqdm
import datetime
import re

from rdflib import Graph, Literal, RDF, RDFS, URIRef, Namespace
from rdflib.plugins.sparql import prepareQuery
from rdflib.namespace import XSD

In [6]:
# To measure the usage of RAM
import psutil

In [7]:
# Mount Google Drive at /content/drive (Google Colab only); the dataset paths
# configured later in this notebook live under /content/drive/MyDrive.
# Authorise the mount with your own personal Google account.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# NOTE: the former module-level `global ...` statements were removed: at module
# level `global` has no effect, every name assigned in a cell is already global.
# `pbar` (the tqdm progress bar that the *_process_chunk functions advance) is
# not created here; each populate cell assigns it before processing a file.

# Number of CSV rows read per chunk by the populate cells.
chunksize = 10000

# Namespace written as the default (':') prefix of every saved Turtle file.
BTP = Namespace('http://www.dei.unipd.it/~gdb/ontology/btp/')

# Reference coordinates of the three pollution control units, as
# [latitude, longitude]; coils close to one of them become :PollutionCoil.
viaChiarini_gp = [44.4997732567231, 11.2873095406444]
giardiniMargherita_gp = [44.4830615285162, 11.3528830371546] # via Medaro Bottonelli
portaSanFelice_gp = [44.4991470592725, 11.3270506316853]

# Maps str(row['codice spira']) -> unique station ID (as a string).
# Filled by coils_process_chunk and read back through get_coil_by_id.
coil_dict = dict()

In [9]:
# Helper + function to populate the coils dataset
def _coil_coordinate(value):
    """Return a coordinate as a float, repairing values that were read as text.

    Text values such as '44500438455000,00' have lost their decimal point:
    the commas are dropped and a '.' is re-inserted after the first two
    digits (-> 44.50043845500000). Non-string values are returned unchanged.
    """
    if type(value) == str:
        digits = value.replace(',', '')
        return float(digits[:2] + '.' + digits[2:])
    return value


def coils_process_chunk(chunk: pd.DataFrame, lat_tolerance: float = 0.0027, lon_tolerance: float = 0.0013) -> set:
    """Build the Turtle statements describing the coils found in one CSV chunk.

    Args:
        chunk: rows of a vehicle-flow CSV. Missing values must already be
            replaced by '' (the populate cells call ``fillna('')``). Columns
            read: 'Livello', 'tipologia', 'codice arco',
            'ID_univoco_stazione_spira', 'latitudine', 'longitudine', 'data'
            and 'codice spira'.
        lat_tolerance: half-height, in degrees of latitude, of the box around
            a control unit inside which a coil counts as a pollution coil.
        lon_tolerance: half-width of that box, in degrees of longitude.
            NOTE(review): the original comment says "around 300 m"; 0.0013
            degrees of longitude is only ~100 m at 44.5 N - confirm the
            intended size before changing the default.

    Returns:
        A set of Turtle statement strings (prefixes are added by save_graph).

    Side effects:
        Records 'codice spira' -> station ID in the global ``coil_dict`` and
        advances the global ``pbar`` by ``len(chunk)``.
    """
    chunk_set = set()

    # OWL - Object Properties
    for object_property in ('isNearTo', 'isObserved', 'hasObserve', 'hasLevel', 'hasType', 'isOn', 'isPlaced'):
        chunk_set.add(':' + object_property + ' a owl:ObjectProperty .')

    # OWL - DataType Properties
    chunk_set.add(':hasID a owl:DatatypeProperty .')

    # PollutionCoil and SimpleCoil are subclasses of Coil (schema-level, so
    # declared once per chunk next to the property declarations).
    chunk_set.add(':PollutionCoil rdfs:subClassOf :Coil .')
    chunk_set.add(':SimpleCoil rdfs:subClassOf :Coil .')

    # Control units in the order they are tested; the first match wins.
    stations = (
        (':viaChiariniControlUnit', viaChiarini_gp),
        (':giardiniMargheritaControlUnit', giardiniMargherita_gp),
        (':portaSanFeliceControlUnit', portaSanFelice_gp),
    )
    required = ('Livello', 'tipologia', 'codice arco', 'ID_univoco_stazione_spira')

    for _, row in chunk.iterrows():

        # A record is valid only if none of the required fields is missing.
        if any(row[column] == '' for column in required):
            continue

        ## COIL:
        # - uri: coil_ + id number.
        # - attributes: hasID
        # - object properties: hasLevel, hasType, isOn, and isPlaced.
        coil_id = str(int(row['ID_univoco_stazione_spira']))
        Coil = ':coil_' + coil_id

        latitudine = _coil_coordinate(row['latitudine'])
        longitudine = _coil_coordinate(row['longitudine'])

        # A coil inside the box around a control unit is a pollution coil.
        for PollutionStation, (station_lat, station_lon) in stations:
            if ((station_lat - lat_tolerance <= latitudine <= station_lat + lat_tolerance)
                    and (station_lon - lon_tolerance <= longitudine <= station_lon + lon_tolerance)):
                chunk_set.add(Coil + ' a :PollutionCoil .')
                chunk_set.add(PollutionStation + ' a :PollutionStation .')
                chunk_set.add(PollutionStation + ' :isNearTo ' + Coil + ' .')
                break
        else:
            chunk_set.add(Coil + ' a :SimpleCoil .')

        # One VehicleDetection per hourly slot 00-01 ... 23-24 of the day.
        # The date is parsed once per row instead of once per slot.
        date_tag = datetime.datetime.strptime(str(row['data']), '%Y-%m-%d').strftime('%Y_%m_%d')
        for hour in range(24):
            VehicleDetection = ':veDet_%s_%s_%02d_%02d' % (coil_id, date_tag, hour, hour + 1)
            chunk_set.add(VehicleDetection + ' a :VehicleDetection .')
            chunk_set.add(VehicleDetection + ' :isObserved ' + Coil + ' .')
            chunk_set.add(Coil + ' :hasObserve ' + VehicleDetection + ' .')

        Level = ':level' + str(int(row['Livello']))
        chunk_set.add(Level + ' a :Level .')
        chunk_set.add(Coil + ' :hasLevel ' + Level + ' .')

        Type = ':' + str(row['tipologia'])
        chunk_set.add(Type + ' a :Type .')
        chunk_set.add(Coil + ' :hasType ' + Type + ' .')

        coil_code = str(row['codice spira'])
        chunk_set.add(Coil + ' :hasID "' + coil_code + '"^^xsd:string .')

        RoadArch = ':roadarch_' + str(int(row['codice arco']))
        chunk_set.add(RoadArch + ' a :RoadArch .')
        chunk_set.add(Coil + ' :isOn ' + RoadArch + ' .')
        chunk_set.add(RoadArch + ' :isPlaced ' + Coil + ' .')

        # Remember which station ID belongs to this coil code.
        coil_dict[coil_code] = coil_id

    pbar.update(len(chunk))

    return chunk_set

In [10]:
# Italian weekday name (lower-case) -> DayWeek individual.
_DAY_WEEK_BY_ITALIAN_NAME = {
    'lunedì': ':monday',
    'martedì': ':tuesday',
    'mercoledì': ':wednesday',
    'giovedì': ':thursday',
    'venerdì': ':friday',
    'sabato': ':saturday',
    'domenica': ':sunday',
}


# Function that populates the vehicle count dataset
def vehicle_count_process_chunk(chunk: pd.DataFrame) -> set:
    """Build the Turtle statements with the hourly vehicle counts of one chunk.

    Args:
        chunk: rows of a vehicle-flow CSV with missing values replaced by ''.
            Columns read by name: 'Livello', 'tipologia',
            'ID_univoco_stazione_spira', 'data' and, when present,
            'Giorno della settimana' / 'giorno della settimana'. The 24
            hourly counts are read by position, from columns 2 to 25.

    Returns:
        A set of Turtle statement strings describing, for every valid row and
        every hourly slot, the VehicleDetection with its count and the Period
        with its start/end time and (when the weekday is known) its day.

    Side effects:
        Advances the global ``pbar`` by ``len(chunk)``.
    """
    vc_set = set()

    # OWL - Object Properties
    vc_set.add(':isObservedOnPeriod a owl:ObjectProperty .')
    vc_set.add(':hasObservedOnPeriod a owl:ObjectProperty .')
    vc_set.add(':onDay a owl:ObjectProperty .')

    # OWL - DataType Properties
    vc_set.add(':hasCount a owl:DatatypeProperty .')
    vc_set.add(':startTime a owl:DatatypeProperty .')
    vc_set.add(':endTime a owl:DatatypeProperty .')

    for _, row in chunk.iterrows():

        # A record is valid only if none of these fields is missing.
        if row['Livello'] == '' or row['tipologia'] == '' or row['ID_univoco_stazione_spira'] == '':
            continue

        # Everything below depends only on the row, not on the hour, so it is
        # computed once instead of once (or three times) per hourly slot.
        station_id = str(int(row['ID_univoco_stazione_spira']))
        day = datetime.datetime.strptime(str(row['data']), '%Y-%m-%d')
        day_iso = day.strftime('%Y-%m-%d')
        next_day_iso = (day + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        date_tag = day_iso.replace('-', '_')

        # Convert the weekday from Italian to English, e.g. lunedì -> monday.
        day_value = ''
        if 'Giorno della settimana' in row:
            day_value = str(row['Giorno della settimana']).lower()
        elif 'giorno della settimana' in row:
            day_value = str(row['giorno della settimana']).lower()
        DayWeek = _DAY_WEEK_BY_ITALIAN_NAME.get(day_value)  # None: no day provided
        if DayWeek is not None:
            vc_set.add(DayWeek + ' a :DayWeek .')

        for hour in range(24):
            slot = '%s_%02d_%02d' % (date_tag, hour, hour + 1)

            ## VEHICLEDETECTION:
            # - uri: veDet_ + id number + _ + date + _ + start hour + _ + end hour.
            # - attributes: hasCount.
            # - object properties: hasObservedOnPeriod.
            VehicleDetection = ':veDet_' + station_id + '_' + slot
            vc_set.add(VehicleDetection + ' a :VehicleDetection .')
            # The count of slot `hour` sits in positional column hour + 2.
            vc_set.add(VehicleDetection + ' :hasCount "' + str(int(row.iloc[hour + 2])) + '"^^xsd:integer .')

            ## PERIOD:
            # - uri: period_ + date + _ + start hour + _ + end hour.
            # - attributes: startTime and endTime.
            # - object properties: isObservedOnPeriod, onDay.
            Period = ':period_' + slot
            vc_set.add(Period + ' a :Period .')
            vc_set.add(Period + ' :isObservedOnPeriod ' + VehicleDetection + ' .')
            vc_set.add(VehicleDetection + ' :hasObservedOnPeriod ' + Period + ' .')

            vc_set.add(Period + ' :startTime "' + day_iso + 'T%02d:00:00' % hour + '"^^xsd:dateTime .')

            # The last slot ends at 24:00, i.e. at 00:00:00 of the next day.
            if hour == 23:
                end_stamp = next_day_iso + 'T00:00:00'
            else:
                end_stamp = day_iso + 'T%02d:00:00' % (hour + 1)
            vc_set.add(Period + ' :endTime "' + end_stamp + '"^^xsd:dateTime .')

            if DayWeek is not None:
                vc_set.add(Period + ' :onDay ' + DayWeek + ' .')

    pbar.update(len(chunk))

    return vc_set

In [11]:
# Function that populates the vehicle accuracy dataset
def vehicle_accuracy_process_chunk(chunk: pd.DataFrame) -> set:
    """Build the Turtle statements with the hourly coil accuracy of one chunk.

    Args:
        chunk: rows of a coil-accuracy CSV. Columns read by name: 'codice
            spira' and 'data'. The 24 hourly accuracies are read by position,
            from columns 2 to 25, and must be strings such as '100%'.

    Returns:
        A set of Turtle statement strings: for every row whose coil code is
        known to get_coil_by_id, one VehicleDetection per hourly slot with its
        :hasAccuracy value. Rows with an unknown coil code are skipped.

    Side effects:
        Advances the global ``pbar`` by ``len(chunk)``.
    """
    acc_set = set()

    # OWL - DataType Properties
    acc_set.add(':hasAccuracy a owl:DatatypeProperty .')

    for _, row in chunk.iterrows():

        # Resolve the unique station ID registered for this coil code. The
        # lookup does not depend on the hour, so it is done once per row.
        coil = get_coil_by_id(str(row['codice spira']))
        if coil is None:
            # Unknown coil -> skip the whole record.
            continue

        station_id = str(int(coil))
        date_tag = datetime.datetime.strptime(str(row['data']), '%Y-%m-%d').strftime('%Y_%m_%d')

        for hour in range(24):

            ## VEHICLEDETECTION:
            # - uri: veDet_ + id number + _ + date + _ + start hour + _ + end hour.
            # - attributes: hasAccuracy.
            VehicleDetection = ':veDet_%s_%s_%02d_%02d' % (station_id, date_tag, hour, hour + 1)
            acc_set.add(VehicleDetection + ' a :VehicleDetection .')

            # The accuracy of slot `hour` sits in positional column hour + 2.
            percentage = row.iloc[hour + 2].replace('%', '')
            acc_set.add(VehicleDetection + ' :hasAccuracy "' + str(float(percentage)) + '"^^xsd:float .')

    pbar.update(len(chunk))

    return acc_set

In [12]:
# Italian chemical name (lower-case) -> English name stored in :hasChemicalName.
_CHEMICAL_NAME_EN = {
    'benzene': 'Benzene',
    'monossido di carbonio': 'Carbon monoxide',
    'monossido di azoto': 'Nitrogen Monoxide',
    'biossido di azoto': 'Nitrogen dioxide',
    'ossidi di azoto': 'Nitrogen oxides',
    'ozono': 'Ozone',
}


def _split_timestamp(stamp):
    """Split 'yyyy-mm-ddThh:mm:ss+hh:mm' into ('yyyy-mm-dd', 'hh').

    The date part is validated by parsing it; the UTC offset is discarded.
    """
    date_part, time_part = str(stamp).split('T')[:2]
    date_iso = datetime.datetime.strptime(date_part, '%Y-%m-%d').strftime('%Y-%m-%d')
    hour = time_part.split('+')[0].split(':')[0]
    return date_iso, hour


# Function that populates the pollution data
def pollution_process_chunk(chunk: pd.DataFrame) -> set:
    """Build the Turtle statements describing the measurements of one chunk.

    Args:
        chunk: rows of a control-unit CSV. Columns read: 'COD_STAZ' (station
            name, words separated by spaces), 'AGENTE' (symbol, optionally
            followed by the Italian name in parentheses), 'DATA_INIZIO' and
            'DATA_FINE' (formatted 'yyyy-mm-ddThh:mm:ss+hh:mm'), 'VALORE' and
            'UM' (unit of measure).

    Returns:
        A set of Turtle statement strings: per row the PollutionStation, the
        Period, the ChemicalDetection with its quantity, and the
        ChemicalElement.

    Side effects:
        Advances the global ``pbar`` by ``len(chunk)``.
    """
    pol_set = set()

    # OWL - Object Properties
    for object_property in ('hasDetect', 'isDetected', 'isRegistered', 'hasRegister',
                            'isDetectedOnPeriod', 'hasDetectedOnPeriod'):
        pol_set.add(':' + object_property + ' a owl:ObjectProperty .')

    # OWL - DataType Properties
    for datatype_property in ('startTime', 'endTime', 'inQuantity', 'hasChemicalName'):
        pol_set.add(':' + datatype_property + ' a owl:DatatypeProperty .')

    for _, row in chunk.iterrows():

        ## POLLUTIONSTATION:
        # - uri: camelCased station name + ControlUnit.
        # - object properties: hasRegister.
        station_name = str(row['COD_STAZ'])
        station_words = station_name.split(' ')
        PollutionStation = (':' + station_words[0].lower()
                            + ''.join(word.capitalize() for word in station_words[1:])
                            + 'ControlUnit')
        pol_set.add(PollutionStation + ' a :PollutionStation .')

        ## PERIOD:
        # - uri: period_ + start date + _ + start hour + _ + end hour.
        # - attributes: startTime and endTime.
        # NOTE(review): a slot ending at midnight is named '..._23_00' here but
        # '..._23_24' by vehicle_count_process_chunk - confirm whether the two
        # are meant to be the same Period individual.
        start_date, startTime = _split_timestamp(row['DATA_INIZIO'])
        end_date, endTime = _split_timestamp(row['DATA_FINE'])
        slot = start_date.replace('-', '_') + '_' + startTime + '_' + endTime

        Period = ':period_' + slot
        pol_set.add(Period + ' a :Period .')

        agent_parts = row['AGENTE'].split('(')
        chemical_element = agent_parts[0].strip().upper()

        ## CHEMICALDETECTION:
        # - uri: chDet_ + station name + _ + date + _ + hours + _ + element.
        # - attributes: inQuantity (always in ug/m3), and hasChemicalName.
        # - object properties: isRegistered, hasDetectedOnPeriod, hasDetect.
        ChemicalDetection = (':chDet_' + station_name.lower().replace(' ', '')
                             + '_' + slot + '_' + chemical_element)
        pol_set.add(ChemicalDetection + ' a :ChemicalDetection .')
        pol_set.add(ChemicalDetection + ' :isRegistered ' + PollutionStation + ' .')
        pol_set.add(PollutionStation + ' :hasRegister ' + ChemicalDetection + ' .')

        pol_set.add(Period + ' :isDetectedOnPeriod ' + ChemicalDetection + ' .')
        pol_set.add(ChemicalDetection + ' :hasDetectedOnPeriod ' + Period + ' .')

        # Convert to a number BEFORE scaling: multiplying a string by 1000
        # would repeat it instead of converting mg/m3 to ug/m3.
        quantity = float(row['VALORE'])
        if row['UM'] == 'mg/m3':
            quantity = quantity * 1000
        pol_set.add(ChemicalDetection + ' :inQuantity "' + str(quantity) + '"^^xsd:float .')

        ## CHEMICALELEMENT:
        # - uri: chemical element symbol.
        # - object properties: isDetected.
        ChemicalElement = ':' + chemical_element
        pol_set.add(ChemicalElement + ' a :ChemicalElement .')
        pol_set.add(ChemicalDetection + ' :hasDetect ' + ChemicalElement + ' .')
        pol_set.add(ChemicalElement + ' :isDetected ' + ChemicalDetection + ' .')

        if len(agent_parts) > 1:
            chemical_element_name = agent_parts[1].replace(')', '').strip().lower()
            # Names without a translation are stored as they are.
            english_name = _CHEMICAL_NAME_EN.get(chemical_element_name, chemical_element_name)
            pol_set.add(ChemicalDetection + ' :hasChemicalName "' + english_name + '"^^xsd:string .')

        pol_set.add(Period + ' :startTime "' + start_date + 'T' + startTime + ':00:00' + '"^^xsd:dateTime .')
        # The end timestamp uses the date of DATA_FINE: a measurement that ends
        # at 00:00 of the following day must not end before it starts.
        pol_set.add(Period + ' :endTime "' + end_date + 'T' + endTime + ':00:00' + '"^^xsd:dateTime .')

    pbar.update(len(chunk))

    return pol_set

In [13]:
# Function to save a graph
def save_graph(set : set, path : str):
    """Write a collection of Turtle statements to `path` as a UTF-8 text file.

    The file starts with the @prefix declarations (the default ':' prefix is
    bound to the global BTP namespace), followed by an empty line and then one
    statement per line, in the iteration order of `set`.
    """
    prefixes = (
        ('', BTP),
        ('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'),
        ('rdfs', 'http://www.w3.org/2000/01/rdf-schema#'),
        ('xsd', 'http://www.w3.org/2001/XMLSchema#'),
        ('owl', 'http://www.w3.org/2002/07/owl#'),
    )

    with open(path, 'w', encoding="utf-8") as out:
        out.writelines(f'@prefix {label}: <{iri}> .\n' for label, iri in prefixes)
        out.write('\n')
        out.writelines(statement + '\n' for statement in set)

In [14]:
def get_coil_by_id(coil_id : str) -> str | None:
    """Return the value stored for `coil_id` in the global `coil_dict`, or None.

    `coil_dict` is filled by coils_process_chunk with
    str(row['codice spira']) as key and the unique station ID (as a string) as
    value, so despite its name the argument is the coil *code* and the result
    is the station ID. None is returned when the code was never registered.
    """

    return coil_dict.get(coil_id)

In [15]:
## Datasets

# Folder on the mounted Drive that holds the yearly CSV exports, and the years
# to load. For a quick test run, the original notebook pointed the 2019 entry
# of each list at the same file name inside the `test/` subfolder instead.
datasets_dir = '/content/drive/MyDrive/Colab Notebooks/Graph Database/datasets'
dataset_years = (2019, 2020, 2021, 2022)

# Vehicle-flow detection datasets ("rilevazione flusso veicoli")
rilevazione_flusso = [
    f'{datasets_dir}/rilevazione_flusso_veicoli_{year}.csv' for year in dataset_years
]

# Coil accuracy datasets ("accuratezza spire")
accuratezza_spire = [
    f'{datasets_dir}/accuratezza_spire_{year}.csv' for year in dataset_years
]

# Air-quality control unit datasets ("dati centraline")
centraline = [
    f'{datasets_dir}/dati_centraline_{year}.csv' for year in dataset_years
]

# Save path
save_path = '/content/drive/MyDrive/Colab Notebooks/Graph Database/rdf'

In [16]:
print('--- populating coils ---')

coils_set = set()

# Regular expressions (not referenced by the cells of this notebook; kept so
# that anything relying on these names keeps working)
re_a_type_pol = re.compile(r' a :PollutionStation .')
re_a_type_sm = re.compile(r' a :SimpleCoil .')
re_hasID = re.compile(r' :hasID ')

for namefile in rilevazione_flusso:

    # Year = text after the last '_' of the file name ('..._2019.csv' -> '2019').
    # Working on the base name keeps this independent of the directory path.
    year_dataset = os.path.splitext(os.path.basename(namefile))[0].rsplit('_', 1)[-1]
    piece = 0

    # Count the rows chunk by chunk, reading a single column with the real
    # separator, instead of loading the whole file in memory just for len().
    total_rows = sum(len(part) for part in pd.read_csv(namefile, sep=';', usecols=[0], chunksize=chunksize))

    # The *_process_chunk functions advance the global `pbar`; the context
    # manager closes it even if a chunk raises.
    with tqdm(total=total_rows) as pbar:

        for chunk in pd.read_csv(namefile, sep=';', chunksize=chunksize):

            # Manage NaN values: the processing functions test for ''
            chunk = chunk.fillna('')

            # Add the coil statements of this chunk to the set
            coils_set.update(coils_process_chunk(chunk))

            # Memory monitor: when RAM usage exceeds 85 %, flush what has been
            # collected so far to a numbered piece file and start again
            if psutil.virtual_memory().percent > 85:
                save_graph(coils_set, '/content/coils_populated_'+year_dataset+'_'+str(piece)+'.ttl')
                # Reset the set
                coils_set.clear()
                piece += 1

    # Save whatever is left for this year
    save_graph(coils_set, '/content/coils_populated_'+year_dataset+'_'+str(piece)+'.ttl')

    coils_set.clear()

--- populating coils ---


100%|██████████| 287747/287747 [05:02<00:00, 951.06it/s] 
100%|██████████| 284787/284787 [05:08<00:00, 922.66it/s]
100%|██████████| 290531/290531 [04:46<00:00, 1013.45it/s]
100%|██████████| 302872/302872 [05:19<00:00, 947.01it/s] 


In [17]:
# Free memory: drop the name of the coils set (it has already been emptied
# with clear() after each file was saved)
del coils_set

In [18]:
print('--- populating vehicle count ---')

vehicle_count_set = set()

for namefile in rilevazione_flusso:

    # Year = text after the last '_' of the file name ('..._2019.csv' -> '2019').
    # Working on the base name keeps this independent of the directory path.
    year_dataset = os.path.splitext(os.path.basename(namefile))[0].rsplit('_', 1)[-1]
    piece = 0

    # Count the rows chunk by chunk, reading a single column with the real
    # separator, instead of loading the whole file in memory just for len().
    total_rows = sum(len(part) for part in pd.read_csv(namefile, sep=';', usecols=[0], chunksize=chunksize))

    # The *_process_chunk functions advance the global `pbar`; the context
    # manager closes it even if a chunk raises.
    with tqdm(total=total_rows) as pbar:

        for chunk in pd.read_csv(namefile, sep=';', chunksize=chunksize):

            # Manage NaN values: the processing functions test for ''
            chunk = chunk.fillna('')

            # Add the vehicle-count statements of this chunk to the set
            vehicle_count_set.update(vehicle_count_process_chunk(chunk))

            # Memory monitor: when RAM usage exceeds 85 %, flush what has been
            # collected so far to a numbered piece file and start again
            if psutil.virtual_memory().percent > 85:
                save_graph(vehicle_count_set, '/content/vehicle_count_populated_'+year_dataset+'_'+str(piece)+'.ttl')
                # Reset the set
                vehicle_count_set.clear()
                piece += 1

    # Save whatever is left for this year
    save_graph(vehicle_count_set, '/content/vehicle_count_populated_'+year_dataset+'_'+str(piece)+'.ttl')
    vehicle_count_set.clear()

--- populating coils ---


100%|██████████| 287747/287747 [12:17<00:00, 390.10it/s]
100%|██████████| 284787/284787 [12:47<00:00, 371.09it/s]
100%|██████████| 290531/290531 [13:12<00:00, 366.45it/s]
100%|██████████| 302872/302872 [14:32<00:00, 347.10it/s]


In [19]:
# Free memory: drop the name of the vehicle-count set (it has already been
# emptied with clear() after each file was saved)
del vehicle_count_set

In [20]:
print('--- populating vehicle accuracy ---')

acc_set = set()

for namefile in accuratezza_spire:

    # Year = text after the last '_' of the file name ('..._2019.csv' -> '2019').
    # Working on the base name keeps this independent of the directory path.
    year_dataset = os.path.splitext(os.path.basename(namefile))[0].rsplit('_', 1)[-1]
    piece = 0

    # Count the rows chunk by chunk, reading a single column with the real
    # separator, instead of loading the whole file in memory just for len().
    total_rows = sum(len(part) for part in pd.read_csv(namefile, sep=';', usecols=[0], chunksize=chunksize))

    # The *_process_chunk functions advance the global `pbar`; the context
    # manager closes it even if a chunk raises.
    with tqdm(total=total_rows) as pbar:

        for chunk in pd.read_csv(namefile, sep=';', chunksize=chunksize):

            # Manage NaN values
            chunk = chunk.fillna('')

            # Add the accuracy statements of this chunk to the set
            acc_set.update(vehicle_accuracy_process_chunk(chunk))

            # Memory monitor: when RAM usage exceeds 85 %, flush what has been
            # collected so far to a numbered piece file and start again
            if psutil.virtual_memory().percent > 85:
                save_graph(acc_set, '/content/vehicle_accuracy_populated_'+year_dataset+'_'+str(piece)+'.ttl')
                # Reset the set
                acc_set.clear()
                piece += 1

    # Save whatever is left for this year
    save_graph(acc_set, '/content/vehicle_accuracy_populated_'+year_dataset+'_'+str(piece)+'.ttl')
    acc_set.clear()

--- populating vehicle accuracy ---


100%|██████████| 286974/286974 [06:24<00:00, 745.66it/s]
100%|██████████| 292845/292845 [06:42<00:00, 727.53it/s]
100%|██████████| 299130/299130 [06:53<00:00, 722.95it/s]
100%|██████████| 316458/316458 [07:10<00:00, 734.61it/s]


In [21]:
# Free memory: drop the accuracy set and the coil-code -> station-ID dictionary.
# NOTE: get_coil_by_id reads coil_dict, so after this cell the accuracy cell
# cannot be re-run until coil_dict has been re-created and re-filled (it is
# created in the configuration cell and filled by coils_process_chunk).
del acc_set, coil_dict

In [22]:
print('--- populating pollution data ---')

pollution_set = set()

for namefile in centraline:

    # Year = text after the last '_' of the file name ('..._2019.csv' -> '2019').
    # Working on the base name keeps this independent of the directory path.
    year_dataset = os.path.splitext(os.path.basename(namefile))[0].rsplit('_', 1)[-1]
    piece = 0

    # Count the rows chunk by chunk, reading a single column with the real
    # separator, instead of loading the whole file in memory just for len().
    total_rows = sum(len(part) for part in pd.read_csv(namefile, sep=';', usecols=[0], chunksize=chunksize))

    # The *_process_chunk functions advance the global `pbar`; the context
    # manager closes it even if a chunk raises.
    with tqdm(total=total_rows) as pbar:

        for chunk in pd.read_csv(namefile, sep=';', chunksize=chunksize):

            # Manage NaN values
            chunk = chunk.fillna('')

            # Add the pollution statements of this chunk to the set
            pollution_set.update(pollution_process_chunk(chunk))

            # Memory monitor: when RAM usage exceeds 85 %, flush what has been
            # collected so far to a numbered piece file and start again
            if psutil.virtual_memory().percent > 85:
                save_graph(pollution_set, '/content/pollution_populated_'+year_dataset+'_'+str(piece)+'.ttl')
                # Reset the set
                pollution_set.clear()
                piece += 1

    # Save whatever is left for this year
    save_graph(pollution_set, '/content/pollution_populated_'+year_dataset+'_'+str(piece)+'.ttl')
    pollution_set.clear()

--- populating pollution data ---


100%|██████████| 76989/76989 [00:17<00:00, 4377.34it/s]
100%|██████████| 77741/77741 [00:23<00:00, 3347.06it/s]
100%|██████████| 79033/79033 [00:20<00:00, 3924.15it/s]
100%|██████████| 79662/79662 [00:22<00:00, 3583.53it/s]


In [23]:
# Free memory: drop the name of the pollution set (it has already been emptied
# with clear() after each file was saved)
del pollution_set