## Indexing Brick Ontology

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Use the fully qualified option key: the short form "max_rows" matches more
# than one registered option in current pandas and is rejected as ambiguous.
pd.set_option("display.max_rows", 600)
from pathlib import Path  
import glob
from sklearn.metrics.pairwise import cosine_similarity
import numpy

In [2]:
from rdflib import Graph
from flashtext import KeywordProcessor
import re

# Load the Brick schema (Turtle serialization) into an in-memory RDF graph.
# NOTE(review): the path is machine-specific; consider making it configurable.
brick_ttl_path = "/Users/slz/git_repo/vizbrick/Brick.ttl"
g = Graph()
g.parse(brick_ttl_path, format="turtle")

<Graph identifier=Nf556b7670c5f4643ba779ce06b5136d1 (<class 'rdflib.graph.Graph'>)>

In [11]:
def get_relationships(graph=None):
    """Print and return the Brick relationship (property) identifiers.

    A triple is selected when its subject contains ``"brick"``, its predicate
    contains ``"22-rdf-syntax-ns#type"`` and its object contains either
    ``"AsymmetricProperty"`` or ``"ObjectProperty"``.

    Parameters
    ----------
    graph : iterable of 3-tuples, optional
        Triples to scan. Defaults to the module-level graph ``g``.

    Returns
    -------
    list
        Matching subjects in iteration order. A subject appears once per
        matching triple, so duplicates are possible.
    """
    triples = g if graph is None else graph
    rel_list = []
    for subject, predicate, obj in triples:
        is_type_triple = "22-rdf-syntax-ns#type" in predicate
        is_property = "AsymmetricProperty" in obj or "ObjectProperty" in obj
        if "brick" in subject and is_type_triple and is_property:
            print(subject)
            rel_list.append(subject)
    print("*done")
    return rel_list

https://brickschema.org/schema/Brick#isFedBy
https://brickschema.org/schema/Brick#hasTag
https://brickschema.org/schema/Brick#isMeasuredBy
https://brickschema.org/schema/Brick#hasLocation
https://brickschema.org/schema/Brick#isTagOf
https://brickschema.org/schema/Brick#value
https://brickschema.org/schema/Brick#hasUnit
https://brickschema.org/schema/Brick#isRegulatedBy
https://brickschema.org/schema/Brick#isRegulatedBy
https://brickschema.org/schema/Brick#hasAssociatedTag
https://brickschema.org/schema/Brick#isLocationOf
https://brickschema.org/schema/Brick#hasOutputSubstance
https://brickschema.org/schema/Brick#hasAssociatedTag
https://brickschema.org/schema/Brick#isTagOf
https://brickschema.org/schema/Brick#isLocationOf
https://brickschema.org/schema/Brick#hasOutputSubstance
https://brickschema.org/schema/Brick#isPartOf
https://brickschema.org/schema/Brick#timeseries
https://brickschema.org/schema/Brick#latitutde
https://brickschema.org/schema/Brick#isAssociatedWith
https://bricksche

In [None]:
def vectorize(list_of_docs):
    """Fit a TF-IDF model on ``list_of_docs`` and return it with its dense matrix.

    Returns
    -------
    (pandas.DataFrame, TfidfVectorizer)
        A frame with one row per document and one column per feature name,
        plus the fitted vectorizer so that new text can be projected later.
    """
    tfidf = TfidfVectorizer()
    doc_term_matrix = tfidf.fit_transform(list_of_docs)
    dense_rows = doc_term_matrix.todense().tolist()
    frame = pd.DataFrame(dense_rows, columns=tfidf.get_feature_names_out())
    return frame, tfidf

In [None]:
def indexing_brick():
    """Build TF-IDF indexes over Brick class names and their definitions.

    Scans the module-level graph ``g``. For every triple whose predicate
    contains ``"definition"`` the indexed text is the definition followed by
    the class name. Subjects of triples whose object contains ``"#Location"``
    get a fallback text made of the class name only; the fallback is used
    only when the class has no definition of its own.

    Definitions of the form ``"See Other_Class ..."`` are replaced by the text
    of the referenced class when that class is known.

    Returns
    -------
    tuple
        ``(df_class, df_def, vectorizer_class, vectorizer_def,
        list_of_classname, list_of_definitions)`` where the two lists are
        aligned with the rows of the two data frames.
    """
    brick_prefix = "https://brickschema.org/schema/Brick#"
    definitions = {}
    location_fallbacks = {}

    for subject, predicate, obj in g:
        if "definition" in predicate:
            class_name = subject.replace(brick_prefix, "").replace("_", " ")
            definitions[class_name] = obj + " " + class_name
        elif "#Location" in obj:
            raw_name = subject.replace(brick_prefix, "")
            class_name = raw_name.replace("_", " ")
            location_fallbacks[class_name] = raw_name + " " + class_name

    # Graph iteration order is not fixed, so apply the fallbacks only after
    # the scan: a real definition must never be overwritten by a bare name.
    for class_name, fallback_text in location_fallbacks.items():
        definitions.setdefault(class_name, fallback_text)

    # Resolve "See <Class>" redirects. Values are replaced but no key is added
    # or removed, so iterating the dict while assigning is safe.
    for class_name in definitions:
        text = str(definitions[class_name])
        if not text.startswith("See"):
            continue
        words = text.split(" ")
        if len(words) < 2:
            continue
        target = words[1].rstrip(".,;:").replace("_", " ")
        # Keep the original text when the target is unknown instead of
        # failing with a KeyError.
        if target in definitions:
            definitions[class_name] = definitions[target]

    list_of_classname = list(definitions.keys())
    list_of_definitions = list(definitions.values())

    print(len(list_of_classname), len(list_of_definitions))

    df_class, vectorizer_class = vectorize(list_of_classname)
    df_def, vectorizer_def = vectorize(list_of_definitions)

    return df_class, df_def, vectorizer_class, vectorizer_def, list_of_classname, list_of_definitions
        
# Build the indexes once and keep the pieces as module-level names.
(
    df_class,
    df_def,
    vectorizer_class,
    vectorizer_def,
    list_of_classname,
    list_of_definitions,
) = indexing_brick()
print("* Done")

    

In [None]:
# Sanity check: show the indexed text stored for one class.
hvac_position = list_of_classname.index("HVAC System")
list_of_definitions[hvac_position]

## Keyword Search Test for Suggestions

In [None]:
def search(query, hint="", topk=5):
    """Rank Brick classes against a free-text query.

    The score of a class is the sum of two cosine similarities: the query
    against the class definition (``vectorizer_def`` / ``df_def``) and the
    query against the class name (``vectorizer_class`` / ``df_class``).

    Parameters
    ----------
    query : str
        Free-text query.
    hint : str, optional
        Accepted for interface compatibility; currently not used.
    topk : int, optional
        Maximum number of results. Values below 1 (or ``None``) return the
        full ranking, as before.

    Returns
    -------
    list of (str, float)
        ``(class_name, score)`` pairs sorted by descending score. Ties keep
        the order of ``list_of_classname``.
    """
    # Project the query once per index and score every row in a single call
    # instead of densifying the query and calling cosine_similarity per row.
    def_query = vectorizer_def.transform([query]).toarray()
    class_query = vectorizer_class.transform([query]).toarray()
    def_scores = cosine_similarity(df_def.values, def_query).ravel()
    class_scores = cosine_similarity(df_class.values, class_query).ravel()
    combined_scores = def_scores + class_scores

    # sorted() is stable, so equal scores stay in index order.
    ranked = sorted(
        zip(list_of_classname, combined_scores),
        key=lambda item: item[1],
        reverse=True,
    )

    if topk is None or topk < 1:
        return ranked
    return ranked[:topk]

In [None]:
# Noisy query: a raw column label followed by its description, plus extra hint keywords.
query = "]'DHW_ClothesWasherColdFlow The cumulative volume of cold water flowing into the clothes washer starting at midnigh"
# The leading space keeps the hint from fusing with the last word of the description.
query += " water usage sensor"
search(query, topk=10)

In [None]:
# Same request with the label already split into words.
query = (
    "DHW Clothes Washer Cold Flow "
    "The cumulative volume of cold water flowing into "
    "the clothes washer starting at midnight"
)
search(query, topk=20)

## Loading an input metadata table, cleaning metadata table

In [None]:
# Read the raw metadata table, then replace missing cells with empty strings.
dataset = pd.read_csv("NIST_meta.csv")
dataset = dataset.fillna("")

In [None]:
# Display the loaded metadata table.
dataset

In [None]:
# Source columns of the metadata table, grouped by the field they are merged into.
Location_cols = [
    "Measurement Location",
]
Description_cols = [
    "Subsystem",
    "Measured Parameter",
    "Description",
    "Units",
]
Data_Label_cols = [
    "Data Label",
]

In [None]:
# List the column names available in the metadata table.
dataset.columns

In [None]:
def get_a_merged_df(df, cols, colname):
    """Join several columns of ``df`` into one space-separated column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. It is not modified.
    cols : list of str
        Columns to join, in order. Must contain at least one name.
    colname : str
        Name of the single column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        One-column frame sharing the index of ``df``.
    """
    # Start from a copy and rebind on every step: an in-place ``+=`` on the
    # selected column can write back into the source frame.
    merged = df[cols[0]].copy()
    for col in cols[1:]:
        merged = merged + " " + df[col]
    return merged.to_frame(name=colname)

In [None]:
# Merge the descriptive columns into a single "Description" text column.
df_desc = get_a_merged_df(df=dataset, cols=Description_cols, colname="Description")

In [None]:
# Extract the label column as a one-column frame named "Data Label".
df_data_label = get_a_merged_df(df=dataset, cols=Data_Label_cols, colname="Data Label")

In [None]:
# Extract the location column as a one-column frame named "Location".
df_location = get_a_merged_df(df=dataset, cols=Location_cols, colname="Location")

In [None]:
# Peek at the first merged description row.
df_desc.values[0]

In [None]:
# Assemble the cleaned table: label, location and description side by side.
metadata_parts = [df_data_label, df_location, df_desc]
metadata = pd.concat(metadata_parts, axis="columns")

In [None]:
# Persist the cleaned metadata table to CSV.
metadata.to_csv("NIST_metadata_cleaned.csv")

In [None]:
# Show the description text of the first metadata row.
metadata['Description'].values[0]

In [None]:
# One tuple per metadata row holding its first three column values. Positional
# access goes through ``.iloc`` because integer keys on a label-indexed Series
# are deprecated as positional lookups.
list_of_rows = [
    (row.iloc[0], row.iloc[1], row.iloc[2])
    for _, row in metadata.iterrows()
]

## Analyzing locations

In [None]:
# Distinct location names in first-seen order. ``unique()`` keeps the result
# stable across kernel restarts, which ``list(set(...))`` does not for strings.
locations = metadata["Location"].unique().tolist()

In [None]:
# Show the distinct location names found in the metadata.
locations

In [None]:
# Collect the subjects of hasAssociatedTag triples whose object contains
# "#Location", with the Brick namespace prefix removed.
list_of_locations = [
    subject.replace("https://brickschema.org/schema/Brick#", "")
    for subject, predicate, obj in g
    if "#Location" in obj and "https://brickschema.org/schema/Brick#hasAssociatedTag" in predicate
]
print(list_of_locations)

In [None]:
def search_locations(query, topk=1):
    """Suggest Brick location classes for a free-text location name.

    The query is scored against the class-name index (``vectorizer_class`` /
    ``df_class``) and only classes listed in ``list_of_locations`` are kept.

    Parameters
    ----------
    query : str
        Location name as written in the metadata.
    topk : int, optional
        Maximum number of suggestions. Values below 1 (or ``None``) return
        every location class, as before.

    Returns
    -------
    list of (str, float)
        ``(class_name, score)`` pairs sorted by descending score. When no
        location class matches, or the best score is zero, the generic
        ``[("Location", 1.0)]`` is returned.
    """
    # Repeat the word "room" so that compound names containing it weigh more
    # towards room classes.
    if "room" in query.lower():
        query += " room"

    query_vec = vectorizer_class.transform([query]).toarray()
    scores = cosine_similarity(df_class.values, query_vec).ravel()
    ranked = sorted(
        zip(list_of_classname, scores),
        key=lambda item: item[1],
        reverse=True,
    )

    # Indexed class names use spaces while ``list_of_locations`` keeps Brick's
    # underscores; normalise so multi-word location classes can match too.
    known_locations = {name.replace("_", " ") for name in list_of_locations}
    matches = [(name, score) for name, score in ranked if name in known_locations]
    if topk is not None and topk >= 1:
        matches = matches[:topk]

    if not matches or matches[0][1] == 0.0:
        return [("Location", 1.0)]
    return matches

In [None]:
# Suggest a Brick location class for each distinct location name.
for location in locations:
    suggestion = search_locations(location)
    print(location, suggestion)

## Analyzing Column Names

In [None]:
# Keep the label and description columns as standalone Series.
data_labels, label_desc = metadata["Data Label"], metadata["Description"]

In [None]:
# Show the data-label column of the cleaned metadata.
data_labels

In [None]:
import re
def parse_colname(term):
    """Replace every character other than letters, digits, newline and '.' with a space."""
    # Raw string: "\." inside a normal string literal is an invalid escape.
    return re.sub(r'[^a-zA-Z0-9\n.]', ' ', term)

def parse_label(label):
    """Split a compact column label into space-separated words.

    Underscores are removed, remaining punctuation becomes spaces, and a
    space is inserted before every capital letter that follows a word
    character. Runs of single characters produced by that split (for
    example the letters of an acronym) are glued back together.

    Returns the words joined by single spaces, in their original order.
    """
    parsed = parse_colname(label.replace("_", ""))
    parsed = re.sub(r"(?<=\w)([A-Z])", r" \1", parsed)

    terms = []
    pending = ""  # single characters collected since the last full word
    for item in parsed.split(" "):
        if len(item) > 1:
            # Emit the collected characters before the word that follows
            # them, so "DHWClothes" yields "DHW Clothes".
            if pending != "":
                terms.append(pending)
                pending = ""
            terms.append(item)
        else:
            pending += item
    if pending != "":
        terms.append(pending)
    return ' '.join(terms)

In [None]:
def frequency_parse(data_labels):
    """Count how often each term occurs across a collection of labels.

    Each label is cleaned with ``parse_colname`` and split before capital
    letters that follow a word character. Pieces longer than one character
    are terms; runs of shorter pieces are concatenated into one term.

    Parameters
    ----------
    data_labels : iterable of str
        Labels to analyse.

    Returns
    -------
    dict
        Mapping ``term -> count`` in first-seen order.
    """
    terms = []
    for label in data_labels:
        parsed = parse_colname(label)
        parsed = re.sub(r"(?<=\w)([A-Z])", r" \1", parsed)
        pending = ""  # single characters collected since the last full word
        for item in parsed.split(" "):
            if len(item) > 1:
                terms.append(item)
                if pending != "":
                    terms.append(pending)
                    pending = ""
            else:
                pending += item
        if pending != "":
            terms.append(pending)

    # Count with dict.get instead of a bare ``except`` so that unrelated
    # errors are not swallowed.
    freq = {}
    for term in terms:
        freq[term] = freq.get(term, 0) + 1
    return freq

In [None]:
# Count how often each parsed term occurs across all data labels.
freq = frequency_parse(data_labels)

In [None]:
# Smoke test: query the index with a class name and default topk.
search("HVAC System")

In [None]:
# Look up Brick suggestions for every label term, most frequent term first.
freq_result = dict(sorted(freq.items(), key=lambda item: item[1], reverse=True))
for term, count in freq_result.items():
    suggestions = search(term)
    # A best score of zero means no indexed text shares a token with the term.
    if not suggestions or suggestions[0][1] == 0:
        print(term, "not found")
    else:
        print(term, count, suggestions)

In [None]:
# Suggest Brick classes for every metadata row from its parsed label plus its
# description. zip() pairs the two Series by position, so the loop does not
# depend on the index being 0..n-1.
for label, description in zip(data_labels, label_desc):
    query = parse_label(label) + " " + description
    print(label, search(query, topk=3))