In [175]:
from rdflib import Graph, Literal, Namespace, URIRef
from rdflib.namespace import RDF, RDFS, OWL, XSD, DC

import os
import json

# Folder that is scanned for the raw input files
directory = "ABOX"

# Attribute names found per data file, filled in by read_headers
file_attrs = {}

# Project namespaces: the schema root and its l/, f/ and e/ sub-namespaces
ns = Namespace("http://www.fraudanalytix.com/schema/")
l = Namespace("http://www.fraudanalytix.com/schema/l/")
f = Namespace("http://www.fraudanalytix.com/schema/f/")
e = Namespace("http://www.fraudanalytix.com/schema/e/")

# One graph for the schema (TBOX) and one for the instance data (ABOX)
tbox, abox = Graph(), Graph()

# Register the prefixes on the ABOX graph
abox.bind('fa', ns)
abox.bind('l', l)
abox.bind('f', f)
abox.bind('e', e)

# Load the schema from its Turtle file
tbox.parse("FA-tbox.ttl", format="ttl")

In [174]:
def read_headers(file_dict, data_file, base_dir=None):
    """Record the attribute names found in ``data_file`` under its key in ``file_dict``.

    For a file whose name ends in ``.json`` the attribute names are the union
    of the keys of every JSON object in the file (either a list of objects or
    a single top-level object; entries that are not objects are ignored).
    For any other file the attribute names are the fields of the first line,
    parsed as a CSV header row.

    Args:
        file_dict: Mapping that is updated in place; the key is the path of
            ``data_file`` relative to the base directory, written with ``'\\'``
            as the separator, and the value is the set of attribute names.
        data_file: Path of the file to inspect.
        base_dir: Directory the key is made relative to. Defaults to the
            module-level ``directory``.

    Returns:
        None. The result is stored in ``file_dict``.
    """
    import csv  # local import so the block does not depend on the import cell

    attribute_names = set()
    # newline="" is what the csv module expects; json.load is unaffected by it
    with open(data_file, "r", newline="") as file:
        # Decide on the real extension, not on '.json' appearing anywhere in the path
        if data_file.lower().endswith('.json'):
            data = json.load(file)
            # A lone top-level object is treated like a one-element list
            records = data if isinstance(data, list) else [data]
            for item in records:
                if isinstance(item, dict):
                    attribute_names.update(item.keys())
        else:
            # csv.reader copes with quoted header names; an empty file or a
            # blank first line yields no attributes instead of {''}
            header = next(csv.reader(file), [])
            attribute_names.update(name.strip() for name in header)

    base = directory if base_dir is None else base_dir
    full_path = os.path.normpath(data_file)
    prefix = os.path.normpath(base) + os.sep
    if full_path.startswith(prefix):
        # Strip only the leading base directory and always key with '\\',
        # which is the separator the later cells split on
        key = full_path[len(prefix):].replace(os.sep, '\\')
    else:
        # Not located under the base directory: keep the path as given
        key = data_file
    file_dict[key] = attribute_names

In [167]:

# Function to recursively loop through files in a directory
def process_files(directory, parent = ''):
    """Walk ``directory`` recursively and record the headers of every file.

    Each regular file found is handed to ``read_headers``, which stores its
    attribute names in the module-level ``file_attrs`` mapping.

    Args:
        directory: Folder to scan; sub-folders are scanned recursively.
        parent: Unused; kept so existing callers that pass it keep working.
    """
    # os.listdir returns entries in arbitrary order. Sorting makes the
    # insertion order of file_attrs reproducible, which matters to code that
    # keeps "the last file seen" for a given key.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if os.path.isdir(file_path):
            # Recursively call the function for subdirectories
            process_files(file_path)
        else:
            # Process the file
            read_headers(file_attrs, file_path)

# Scan the input folder (and everything below it), filling file_attrs
process_files(directory)

# Show each file key followed by its set of attribute names, one per line
for i, k in file_attrs.items():
    print(i, k, sep="\n")



clientA\cardprograms_2023-05-28_11-30-01.csv
{'currency', 'withdraw_limit', 'bank', 'card_no', 'last_updated', 'expiration_months', 'card_program', 'min_load_amount', 'start_date', 'program_id', 'pin_length', 'max_load_amount', 'max_transfer_amount', 'is_virtual_available', 'min_transfer_amount'}
clientA\cardprograms_2023-05-31_10-57-05.csv
{'currency', 'withdraw_limit', 'bank', 'card_no', 'last_updated', 'expiration_months', 'card_program', 'min_load_amount', 'start_date', 'program_id', 'pin_length', 'max_load_amount', 'max_transfer_amount', 'is_virtual_available', 'min_transfer_amount'}
clientA\cards_2023-05-28_11-30-01.csv
{'ccv_status', 'work_phone_no', 'birth_country_code', 'pos_ol_withd_limit', 'tot_trans_amount', 'zip_postal_code', 'nlast_declines', 'last_act_on', 'name_on_card', 'gender', 'card_status_atm', 'address2', 'atm_of_withd_limit', 'card_status', 'state_code', 'card_status_pos', 'mobile_no', 'status_map_no', 'atm_ol_withd_limit', 'home_phone_no', 'is_main_card', 'avs_s

Create each provider only once, so that its triples are not added more than once (RDFLib can handle duplicates, but we avoid them for performance reasons).

In [168]:
# Names of the providers whose triples have already been added to `abox`;
# createProviderFile checks this set so each provider is only added once.
clients = set()

In [169]:
# Root individual of the ABOX. Providers and provider-less data files are
# attached to it by other cells.
root = l.DataSource1
# The class is l.DataSource (singular): that is the class the schema cell
# declares and uses as the domain of l.hasDataFile and l.hasDataProvider.
abox.add((root, RDF.type, l.DataSource))
abox.add((root, RDFS.label, Literal('DataSource1', lang="en")))


<Graph identifier=N8671d00729174e2da21dfb0a95d7a72b (<class 'rdflib.graph.Graph'>)>

In [170]:
def createProviderFile(provider):
    """Return the URI of ``provider``, adding its triples to ``abox`` on first use.

    The first call for a given name records it in ``clients`` and adds three
    triples: the provider's type (``l.DataProvider``), its English label, and
    the ``l.hasDataProvider`` link from ``root``. Later calls for the same
    name only return the URI.

    Args:
        provider: Provider name; also used as the local part of the URI.

    Returns:
        The provider URI in the ``l`` namespace.
    """
    provider_uri = l[provider]
    if provider in clients:
        # Already added by an earlier call
        return provider_uri

    clients.add(provider)
    abox.add((provider_uri, RDF.type, l.DataProvider))
    abox.add((provider_uri, RDFS.label, Literal(provider, lang="en")))
    abox.add((root, l.hasDataProvider, provider_uri))
    return provider_uri

# Landing Zone

In [171]:
# Group the scanned files by logical data file: the part of the file name
# before the first '_' (the rest is the snapshot timestamp), prefixed with the
# provider folder when there is one. When several snapshots share a key, the
# attributes of the one iterated last are kept.
landing_files = dict()
for file, attrs in file_attrs.items():
    # Keys in file_attrs use '\\' as the path separator
    path_parts = file.split('\\')
    base_name = path_parts[-1].split('_')[0]
    if len(path_parts) > 1:
        # The first component is taken as the provider folder. Deeper nesting
        # is collapsed so the key always has the form "<provider>\<name>".
        # NOTE(review): assumes intermediate folders carry no meaning - confirm.
        fkey = path_parts[0] + '\\' + base_name
    else:
        fkey = base_name
    landing_files[fkey] = attrs

print(landing_files.keys())

dict_keys(['clientA\\cardprograms', 'clientA\\cards', 'clientA\\customers', 'clientA\\geodata', 'clientA\\transactions', 'mcc'])


In [172]:
# Add one l.DataFile individual per landing file and attach it to its owner:
# the provider (via l.providesDataFile) when the key has a provider folder,
# otherwise the root data source (via l.hasDataFile).
for file, attrs in landing_files.items():
    # Names are chosen so the builtins `dir` and `object` are not shadowed
    provider_name, has_provider, filename = file.partition('\\')

    if has_provider:
        print(filename, provider_name)
        owner = createProviderFile(provider_name)
        link = l.providesDataFile
    else:
        # No separator: partition() returned the whole key as the first item
        filename = file
        print(file)
        owner = root
        link = l.hasDataFile

    data_file_uri = l[filename]
    abox.add((data_file_uri, RDF.type, l.DataFile))
    abox.add((data_file_uri, RDFS.label, Literal(filename, lang="en")))
    abox.add((owner, link, data_file_uri))



cardprograms clientA
cards clientA
customers clientA
geodata clientA
transactions clientA
mcc


# Formatted Zone

In [67]:
# Add one individual per scanned file that sits in a provider folder and link
# it to that provider. Files without a provider folder are skipped here.
for file, attrs in file_attrs.items():
    # Keys in file_attrs use '\\' as the path separator
    path_parts = file.split('\\')
    if len(path_parts) < 2:
        continue

    # Names are chosen so the builtins `dir` and `object` are not shadowed
    provider_name, filename = path_parts[0], path_parts[-1]
    print(filename, provider_name)

    provider = createProviderFile(provider_name)

    file_uri = l[provider_name + '_' + filename]
    # l.DataFile is the class the landing-zone cell uses and the range of
    # l.providesDataFile in the schema cell.
    # NOTE(review): l.FileVersion / f.FormattedFile may be the intended class
    # for per-snapshot files - confirm against the TBOX.
    abox.add((file_uri, RDF.type, l.DataFile))
    # Label the file with its own name, as the landing-zone cell does
    abox.add((file_uri, RDFS.label, Literal(filename, lang="en")))
    abox.add((provider, l.providesDataFile, file_uri))

# TODO: the attribute names in `attrs` are not written to the graph yet
# (ns.Attribute individuals linked with ns.hasAttribute).

# Write the ABOX graph to FA_ABOX.nt as N-Triples so the content matches the
# file extension. N-Triples is a subset of Turtle, so readers that parse the
# file as Turtle still work.
abox.serialize("FA_ABOX.nt", format="nt")




cardprograms_2023-05-28_11-30-01.csv clientA
cardprograms_2023-05-31_10-57-05.csv clientA
cards_2023-05-28_11-30-01.csv clientA
cards_2023-05-31_10-57-05.csv clientA
customers_2023-05-28_11-30-01.csv clientA
customers_2023-05-31_10-57-05.csv clientA
geodata_2023-05-28_11-30-01.json clientA
geodata_2023-05-31_10-57-05.json clientA
transactions_2023-05-28_11-30-01.csv clientA
transactions_2023-05-31_10-57-05.csv clientA


<Graph identifier=Nd7dc0749af0647e9bc0c03eeca46769e (<class 'rdflib.graph.Graph'>)>

In [17]:
# Write the ABOX graph to FA_ABOX.nt as N-Triples so the content matches the
# file extension. N-Triples is a subset of Turtle, so readers that parse the
# file as Turtle still work.
abox.serialize("FA_ABOX.nt", format="nt")

<Graph identifier=Nc60274697aff40828ac2db1e760d65c2 (<class 'rdflib.graph.Graph'>)>

In [None]:
# Schema (TBOX) axioms, added to the `tbox` graph created in the setup cell.
# Adding a triple that is already present leaves the graph unchanged.

# (class, English label, parent class or None)
tbox_classes = [
    (ns.Attribute, "Attribute", None),
    (ns.File, "File", None),
    (e.PreprocessedFile, "PreprocessedFile", ns.File),
    (f.FormattedFile, "FormattedFile", ns.File),
    (l.DataFile, "DataFile", ns.File),
    (l.DataProvider, "DataProvider", None),
    (l.DataSource, "DataSource", None),
    (l.FileVersion, "FileVersion", f.FormattedFile),
]

for cls, cls_label, cls_parent in tbox_classes:
    tbox.add((cls, RDF.type, OWL.Class))
    tbox.add((cls, RDFS.label, Literal(cls_label, lang="en")))
    if cls_parent is not None:
        tbox.add((cls, RDFS.subClassOf, cls_parent))

# (property, English label, domain, range, parent property or None)
# NOTE(review): l.hasDataFile and l.hasDataProvider are declared
# sub-properties of l.providesDataFile although their domain (l.DataSource)
# and, for l.hasDataProvider, range (l.DataProvider) differ from the parent's.
# Under RDFS entailment a data source would then also be a l.DataProvider and
# a provider also a l.DataFile - confirm that this is intended.
tbox_object_properties = [
    (e.fileGeneratedFrom, "fileGeneratedFrom", e.PreprocessedFile, f.FormattedFile, None),
    (ns.hasAttribute, "hasAttribute", ns.File, ns.Attribute, None),
    (l.hasDataFile, "hasDataFile", l.DataSource, l.DataFile, l.providesDataFile),
    (l.hasDataProvider, "hasDataProvider", l.DataSource, l.DataProvider, l.providesDataFile),
    (l.hasFileVersion, "hasFileVersion", l.DataFile, l.FileVersion, None),
    (l.providesDataFile, "providesDataFile", l.DataProvider, l.DataFile, None),
]

for prop, prop_label, prop_domain, prop_range, prop_parent in tbox_object_properties:
    tbox.add((prop, RDF.type, OWL.ObjectProperty))
    tbox.add((prop, RDFS.domain, prop_domain))
    tbox.add((prop, RDFS.label, Literal(prop_label, lang="en")))
    tbox.add((prop, RDFS.range, prop_range))
    if prop_parent is not None:
        tbox.add((prop, RDFS.subPropertyOf, prop_parent))