In [None]:
## STANDARDIZZATORE ##

#FASI
#1. LETTURA estensione
#2. TRADUZIONE in JSON


#FORMATI DATASET (fonte: https://docs.italia.it/AgID/documenti-in-consultazione/lg-opendata-docs/it/bozza/allegato-b-standard-di-riferimento-e-formati-aperti.html)

#!! PRINCIPALI !!
##Formati aperti per i dati
#CSV (Comma Separated Values)
#JSON (JavaScript Object Notation)
#XML (eXtensible Markup Language)
#XLSX (Excel)
#!!


##Formati aperti più diffusi per i dati geografici
#Shapefile
#KML
#GeoJSON
#GML (Geography Markup Language)
#GeoPackage

##Formati aperti per i documenti
#ODF (Open Document Format)
#PDF
#Akoma Ntoso

##Formati per dati meteorologici
#BUFR (Binary Universal Form for the Representation of meteorological data)
#NetCDF (Network Common Data Form)
#ASCII (American Standard Code for Information Interchange)
#Avvisi Meteo: 
#CAP (Common Alerting Protocol), RSS (Really Simple Syndication)/Atom
#Radar: 
#HDF5 (Hierarchical Data Format)
#Modello NWP (Numerical weather prediction): 
#GRIB (General Representation of fields In Binary)


In [87]:
import pandas as pd
import json
import os
import xml.etree.ElementTree as ET
from bson import ObjectId
import random

In [115]:
def random_sampling(max_len, n_sample):
    """Pick distinct row positions, reproducibly, from ``range(max_len)``.

    Args:
        max_len: Number of rows available; positions are drawn from
            ``0 .. max_len - 1``.
        n_sample: Number of positions requested. It is capped at ``max_len``
            so the request can never exceed the population.

    Returns:
        A list of ``min(n_sample, max_len)`` distinct integers. The same
        arguments always produce the same list because the generator is
        seeded with 0.

    Raises:
        ValueError: If the capped sample size is negative (raised by
            ``random.Random.sample``).
    """
    # A private generator seeded with 0 yields the same picks as calling
    # random.seed(0) followed by random.sample, but it leaves the module-wide
    # generator untouched for any other code that relies on it.
    rng = random.Random(0)

    # Never ask for more rows than actually exist.
    actual_n_sample = min(n_sample, max_len)

    return rng.sample(range(max_len), actual_n_sample)

In [116]:
def extract(file_path, n_sample):
    """Sample rows from a file, choosing the reader from the file extension.

    Args:
        file_path: Path of the file to read.
        n_sample: Number of rows to request from the reader.

    Returns:
        The result of ``extract_csv`` for ``.csv`` files or of
        ``extract_excel`` for ``.xlsx`` files; ``None`` for every other
        extension (the comparison is case-sensitive).
    """
    extension = os.path.splitext(file_path)[1]

    # XML and JSON readers are not wired in yet, so those files fall through
    # to the final ``None`` like any other unsupported extension.
    if extension == '.csv':
        return extract_csv(file_path, n_sample)
    if extension == '.xlsx':
        return extract_excel(file_path, n_sample)
    return None
        
def extract_csv(file_path, n_sample):
    """Read a CSV file (decoded as latin1) and return a sample of its rows.

    Args:
        file_path: Path of the CSV file.
        n_sample: Number of rows to request from ``random_sampling``.

    Returns:
        A DataFrame holding the rows whose index labels were picked by
        ``random_sampling``.
    """
    frame = pd.read_csv(file_path, encoding='latin1')
    picked_rows = random_sampling(len(frame), n_sample)
    # Label-based selection of the picked rows.
    return frame.loc[picked_rows]
    
#def extract_json(file_path, n_sample):
#    with open(file_path, 'r') as file:
#        data = json.load(file)
#    return data

def extract_excel(file_path, n_sample):
    """Read an Excel file and return a sample of its rows.

    Args:
        file_path: Path of the Excel file.
        n_sample: Number of rows to request from ``random_sampling``.

    Returns:
        A DataFrame holding the rows whose index labels were picked by
        ``random_sampling``.
    """
    frame = pd.read_excel(file_path)
    picked_rows = random_sampling(len(frame), n_sample)
    # Label-based selection of the picked rows.
    return frame.loc[picked_rows]



In [117]:
# Checks whether a list is numeric even when its values are stored as strings.
def is_numeric_list(lst):
    """Tell whether every element of ``lst`` can be converted to ``float``.

    Args:
        lst: Iterable of values (numbers, numeric strings, or anything else).

    Returns:
        True when ``float()`` accepts every element (an empty iterable is
        therefore numeric); False as soon as one element is rejected.
    """
    try:
        for element in lst:
            float(element)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number.
        # TypeError: a value float() cannot handle at all, such as None.
        return False
    return True



# Identify the kind of variable among:
# - numerical with few distinct values (treated as categorical)
# - continuous numerical
# - categorical textual (few distinct labels) or free text (many distinct values)

def variable_type_detector(variable, max_categories=8):
    """Classify a pandas Series by dtype and by number of distinct values.

    Args:
        variable: The pandas Series to classify.
        max_categories: Largest number of distinct values (as counted by
            ``Series.nunique()``) for which the variable is still treated as
            categorical. Defaults to 8, an illustrative threshold.

    Returns:
        One of ``"categorical numerical"``, ``"continuous numerical"``,
        ``"categorical textual"`` or ``"text"``.
    """
    # The cardinality test is the same for numeric and non-numeric data;
    # only the label differs.
    is_categorical = variable.nunique() <= max_categories

    if pd.api.types.is_numeric_dtype(variable):
        return "categorical numerical" if is_categorical else "continuous numerical"
    return "categorical textual" if is_categorical else "text"
        

In [91]:
# Load the CSV file
file_path = 'kaggle_datasets/classification/coffee_maker.csv'
df = pd.read_csv(file_path)
# Classify the 'rating' column with the detector defined in this notebook.
variable_type_detector(df['rating'])

#df

5000
5


In [107]:
def create_file_node(file_name, sample_data):
    """Build the metadata record describing one sampled file.

    Args:
        file_name: File name including its extension, e.g. ``"birds.csv"``.
        sample_data: Sampled rows as a pandas DataFrame. It is only read when
            the extension is tabular (``csv`` or ``xlsx``).

    Returns:
        A dict with the keys ``file_name`` (name without the extension),
        ``file_extension`` (without the leading dot, original casing),
        ``file_type`` (``"tabular"``, ``"image"`` or ``"undefined"``),
        ``n_features`` and ``features`` (one entry per column for tabular
        files, otherwise ``0`` and an empty list).
    """
    # splitext only detaches the last suffix, so "a.b.csv" is seen as "csv",
    # and a name without any dot yields an empty extension instead of raising.
    # This is also how extract() reads the extension.
    stem, dotted_extension = os.path.splitext(file_name)
    extension = dotted_extension[1:]
    # Compare case-insensitively so "IMG.JPG" is recognised like "img.jpg".
    normalized_extension = extension.lower()

    # Default values for files that are neither tabular nor images.
    extension_type = "undefined"
    features = []
    n_features = 0

    if normalized_extension in ('csv', 'xlsx'):
        extension_type = "tabular"
        n_features = len(sample_data.columns)
        features = [
            {
                "feature_name": col,
                "feature_datatype": sample_data[col].dtype.name,
                "feature_type": variable_type_detector(sample_data[col]),
                "elements_sampled": sample_data[col].tolist(),
            }
            for col in sample_data.columns
        ]
    elif normalized_extension in ('jpg', 'png'):
        extension_type = "image"

    return {
        "file_name": stem,
        "file_extension": extension,
        "file_type": extension_type,
        "n_features": n_features,
        "features": features,
    }

In [108]:
def navigate_folders(root, subfolders_limit, files_limit, new_dataset_node, subfolder_count=0, n_sample=100):
    """Walk a dataset folder and append a node for every supported file.

    Each visited folder contributes at most ``files_limit`` files. At most
    ``subfolders_limit`` subfolders are entered over the whole walk, whatever
    their depth.

    Args:
        root: Folder where the walk starts.
        subfolders_limit: Maximum number of subfolders to descend into.
        files_limit: Maximum number of files examined in each folder.
        new_dataset_node: Dict with a ``"files"`` list; file nodes are
            appended to that list in place.
        subfolder_count: Number of subfolders already counted against
            ``subfolders_limit`` before this call.
        n_sample: Number of rows requested from ``extract`` for each file.

    Returns:
        None. Results are accumulated in ``new_dataset_node["files"]`` and the
        path of every file added is printed.
    """
    # os.walk already descends into subfolders by itself, so no recursive
    # call is needed; recursing on top of it would visit each subfolder twice.
    for current_root, dirs, files in os.walk(root):
        # Sorted so that the files kept under the limit do not depend on the
        # order in which the operating system lists them.
        for file in sorted(files)[:files_limit]:
            file_path = os.path.join(current_root, file)
            file_sample = extract(file_path, n_sample)
            if file_sample is not None:
                new_file = create_file_node(file, file_sample)
                new_dataset_node["files"].append(new_file)
                print(file_path)

        # Trimming ``dirs`` in place tells os.walk (top-down mode) which
        # subfolders it may still enter; once the budget is used up the list
        # is emptied and the walk stops going deeper.
        remaining = max(subfolders_limit - subfolder_count, 0)
        dirs.sort()
        del dirs[remaining:]
        subfolder_count += len(dirs)
            

In [113]:
# Navigation

# Expected layout: <root_directory>/<category>/<dataset>/...
root_directory = 'test_navigate'
# Extensions of interest (not used for filtering in this cell).
extensions = ['.csv', '.json', '.xlsx']
# Keep only the category folders: a stray file at this level would make the
# os.listdir call on it below fail.
directories = [
    dirc for dirc in os.listdir(root_directory)
    if os.path.isdir(os.path.join(root_directory, dirc))
]

# Final JSON (not populated in this cell)
datasets_json = {}


# Convert each dataset into its own JSON file
for directory in directories:
    category = directory
    # os.path.join keeps the path separators consistent on every platform.
    directory_path = os.path.join(root_directory, directory)
    # Each entry of a category folder is taken as the name of a dataset.
    datasets = os.listdir(directory_path)
    # A dataset folder can contain anything (folders, files, ...), so the
    # exploration is bounded: at most 5 subfolders are entered and at most
    # 10 files are examined in each folder.
    for dataset in datasets:
        dataset_path = os.path.join(directory_path, dataset)
        new_dataset_node = {"_id": dataset, "label": category, "files": []}
        navigate_folders(dataset_path, 5, 10, new_dataset_node)

        json_data = json.dumps(new_dataset_node, indent=2)
        with open("output " + dataset + ".json", "w") as json_file:
            json_file.write(json_data)
        

test_navigate/classification/atis_intents\atis_intents.csv
test_navigate/classification/atis_intents\atis_intents_test.csv
test_navigate/classification/atis_intents\atis_intents_train.csv
test_navigate/classification/bill_authentication\bill_authentication.csv
test_navigate/classification/birds\birds.csv
test_navigate/classification/bodyPerformance\bodyPerformance.csv
test_navigate/classification/breast-cancer-data\breast-cancer-data.csv


In [None]:
#tests..

In [39]:
# Load the CSV file
file_path = 'test dataset clustering/education/capital-project-schedules-and-budgets-1.csv'
df = pd.read_csv(file_path)

# Show the metadata. DataFrame.info() prints its report itself and returns
# None, so printing its return value would only add a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8185 entries, 0 to 8184
Data columns (total 14 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   Project Geographic District                                 8185 non-null   int64  
 1   Project Building Identifier                                 8185 non-null   object 
 2   Project School Name                                         8185 non-null   object 
 3   Project Type                                                8185 non-null   object 
 4   Project Description                                         8185 non-null   object 
 5   Project Phase Name                                          8184 non-null   object 
 6   Project Status Name                                         8185 non-null   object 
 7   Project Phase Actual Start Date                             8185 non-null   object 
 8 

In [40]:
# Count the distinct values of every column of df.
unique_values_count = df.nunique()
print(unique_values_count)

Project Geographic District                                     32
Project Building Identifier                                   1196
Project School Name                                           1483
Project Type                                                    20
Project Description                                           1363
Project Phase Name                                               8
Project Status Name                                              3
Project Phase Actual Start Date                               1069
Project Phase Planned End Date                                1168
Project Phase Actual End Date                                  888
Project Budget Amount                                         3613
Final Estimate of Actual Costs Through End of Phase Amount    5730
Total Phase Actual Spending Amount                            5060
DSF Number(s)                                                 3554
dtype: int64
