## Indexing Brick Ontology

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
pd.set_option("max_rows", 600)
from pathlib import Path  
import glob
from sklearn.metrics.pairwise import cosine_similarity
import numpy

In [2]:
from rdflib import Graph
from flashtext import KeywordProcessor
import re

# Loadinb Brick Schema
g = Graph()
g.parse("/Users/slz/git_repo/vizbrick/Brick.ttl", format="turtle")

<Graph identifier=N8141944eb0c44561adbb862f6182515e (<class 'rdflib.graph.Graph'>)>

In [3]:
def get_relationships():
    rel_list = []
    for triple in g:
        if "brick" in triple[0] and "22-rdf-syntax-ns#type" in triple[1] and ("AsymmetricProperty" in triple[2] or "ObjectProperty" in triple[2]):
            rel_list.append(triple[0].replace("https://brickschema.org/schema/Brick#",""))
    print("*done")
    return rel_list

In [4]:
def vectorize(list_of_docs):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(list_of_docs)
    feature_names = vectorizer.get_feature_names_out()
    dense = vectors.todense()
    denselist = dense.tolist()
    df = pd.DataFrame(denselist, columns=feature_names)
    return df, vectorizer

In [5]:
def indexing_brick():
    definitions = {}
    
    for triple in g:
        if "definition" in triple[1]:
            class_name = triple[0].replace("https://brickschema.org/schema/Brick#","")
            definition = triple[2]
            class_name = class_name.replace("_"," ")
            definitions[class_name] = definition+" "+ class_name
        
        elif "#Location" in triple[2]:
            class_name = triple[0].replace("https://brickschema.org/schema/Brick#","")
            definition = class_name
            class_name = class_name.replace("_"," ")
            definitions[class_name] = definition +" " + class_name
    
    for key in definitions.keys():
        if str(definitions[key]).startswith("See"):
            definitions[key] = definitions[definitions[key].split(" ")[1].replace("_"," ")]
            #print("->",definitions[key])
    
    #print(definitions)
    list_of_classname = []
    list_of_definitions = []
    
    for key in definitions.keys():
        list_of_classname.append(key)
        list_of_definitions.append(definitions[key])

    print(len(list_of_classname), len(list_of_definitions))
    
    df_class, vectorizer_class = vectorize(list_of_classname)
    df_def, vectorizer_def = vectorize(list_of_definitions)

    return df_class, df_def, vectorizer_class, vectorizer_def, list_of_classname, list_of_definitions
        
df_class, df_def, vectorizer_class, vectorizer_def, list_of_classname, list_of_definitions = indexing_brick()
print("* Done")

    

1027 1027
* Done


In [6]:
list_of_definitions[list_of_classname.index("HVAC System")]

rdflib.term.Literal('The equipment, distribution systems and terminals that provide, either collectively or individually, the processes of heating, ventilating or air conditioning to a building or portion of a building Heating Ventilation Air Conditioning System', lang='en')

## Keyword Search Test for Suggestions

In [7]:
def search(query, hint="", topk=5):
    return_list = []
    '''
    parsed_query = query.split(" ")
    for classname in list_of_classname:
        cnt = 0
        for term in parsed_query:
            if term in classname:
                cnt+=1
        if cnt==len(parsed_query):
            return_list.append((classname,cnt))
    '''
    
    
    search_result = {}
    idx = 0
    
    query_vec = vectorizer_def.transform([query])
    
    for index, row in df_def.iterrows():
        class_name = list_of_classname[idx]
        similarity = cosine_similarity(row.values.reshape(1,len(row.values)), query_vec.toarray())[0][0]
        search_result[class_name]=similarity
        idx+=1

    idx = 0
    query_vec = vectorizer_class.transform([query])
    
    for index, row in df_class.iterrows():
        class_name = list_of_classname[idx]
        similarity = cosine_similarity(row.values.reshape(1,len(row.values)), query_vec.toarray())[0][0]
        search_result[class_name]+=similarity
        idx+=1

    search_result = dict(sorted(search_result.items(), key=lambda item: item[1], reverse=True))
    
    rank=1
    for key in search_result.keys():
        return_list.append((key,search_result[key]))
        if rank==topk: break
        rank+=1
    return return_list

In [8]:
query = "]'DHW_ClothesWasherColdFlow The cumulative volume of cold water flowing into the clothes washer starting at midnigh"
query+="water usage sensor"
search(query, topk=10)

[('Water Usage Sensor', 0.7749837513546974),
 ('volume', 0.7547574114418181),
 ('Hot Water Usage Sensor', 0.6807491741640774),
 ('Usage Sensor', 0.6415175181403583),
 ('Energy Usage Sensor', 0.5447254801890808),
 ('Cold Box', 0.5273226850442712),
 ('Steam Usage Sensor', 0.5115996614241947),
 ('Variable Air Volume Box', 0.4782982950606183),
 ('Water Temperature Sensor', 0.44077627683758014),
 ('Water Flow Sensor', 0.41277219200805126)]

In [9]:
query = "DHW Clothes Washer Cold Flow The cumulative volume of cold water flowing into the clothes washer starting at midnight"
search(query, topk=20)

[('Cold Box', 0.8200476378402841),
 ('volume', 0.6123005070969189),
 ('Water Flow Sensor', 0.46693017168654116),
 ('Water Flow Setpoint', 0.4342440413453723),
 ('Variable Air Volume Box', 0.39003658358689575),
 ('Discharge Water Flow Sensor', 0.37546719691675334),
 ('Supply Water Flow Sensor', 0.3725053125532621),
 ('Supply Water Flow Setpoint', 0.36409999914116253),
 ('Discharge Water Flow Setpoint', 0.35411980548809796),
 ('Cooling Valve', 0.35119293064661994),
 ('Hot Water Flow Setpoint', 0.3497345226602519),
 ('Hot Water Flow Sensor', 0.34475471288277826),
 ('Chilled Water Flow Setpoint', 0.3336340458260073),
 ('Chilled Water Flow Sensor', 0.32937639090139575),
 ('Freezer', 0.3242294639406801),
 ('Hot Water Supply Flow Sensor', 0.3235921392453537),
 ('Hot Water Discharge Flow Sensor', 0.3198481022003142),
 ('Outside Air Flow Sensor', 0.318999470581222),
 ('Bypass Water Flow Sensor', 0.3161083134570281),
 ('Flow Setpoint', 0.3114351077642703)]

## Loading an input metadata table, cleaning metadata table

In [10]:
dataset = pd.read_csv("NIST_meta.csv").fillna("")

In [11]:
dataset

Unnamed: 0,Data Label,Subsystem,Measurement Location,Measured Parameter,Description,Units
0,DHW_ClothesWasherColdFlow,DHW,Utility,Flow_Water,The cumulative volume of cold water flowing in...,Gallons
1,DHW_ClothesWasherHotFlow,DHW,Utility,Flow_Water,The cumulative volume of hot water flowing int...,Gallons
2,DHW_DishwasherHotFlow,DHW,Kitchen,Flow_Water,The cumulative volume of hot water flowing int...,Gallons
3,DHW_SHWGlycolTempIn,DHW,Basement,Temp_Glycol,The instantaneous temperature of the glycol-wa...,°C
4,DHW_SHWGlycolTempOut,DHW,Basement,Temp_Glycol,The instantaneous temperature of the glycol-wa...,°C
5,DHW_HeatPumpWaterHeaterEnergyTotal,DHW,Basement,Energy_Electrical,Cumulative energy consumption by heat pump wat...,Wh
6,DHW_HeatPumpWaterHeaterPowerTotal,DHW,Basement,Power_Electrical,Instantaneous power consumption by heat pump w...,W
7,DHW_ManifoldColdFlow,DHW,Basement,Flow_Water,The cumulative volume of water flowing into th...,Gallons
8,DHW_ManifoldHotFlow,DHW,Basement,Flow_Water,The cumulative volume of water flowing into th...,Gallons
9,DHW_MixValveColdFlow,DHW,Basement,Flow_Water,The cumulative volume of mains water flowing i...,Gallons


In [12]:
Location_cols = ["Measurement Location"]
Description_cols = ["Subsystem","Measured Parameter","Description","Units"]
Data_Label_cols = ["Data Label"]

In [13]:
dataset.columns

Index(['Data Label', 'Subsystem', 'Measurement Location', 'Measured Parameter',
       'Description', 'Units'],
      dtype='object')

In [14]:
def get_a_merged_df(df, cols, colname):
    tmp = dataset[cols[0]]
    for i in range(1,len(cols)):
        tmp+=" "+ dataset[cols[i]]
    desc_df = pd.DataFrame(tmp)
    desc_df.columns =[colname]
    return desc_df

In [15]:
df_desc = get_a_merged_df(dataset, Description_cols, "Description")

In [16]:
df_data_label = get_a_merged_df(dataset, Data_Label_cols, "Data Label")

In [17]:
df_location = get_a_merged_df(dataset, Location_cols, "Location")

In [18]:
df_desc.values[0]

array(['DHW Flow_Water The cumulative volume of cold water flowing into the clothes washer starting at midnight Gallons'],
      dtype=object)

In [19]:
metadata = pd.concat([df_data_label, df_location, df_desc], axis=1)

In [20]:
metadata.to_csv('NIST_metadata_cleaned.csv');

In [21]:
metadata['Description'].values[0]

'DHW Flow_Water The cumulative volume of cold water flowing into the clothes washer starting at midnight Gallons'

In [22]:
list_of_rows = []
for index, row in metadata.iterrows():
    list_of_rows.append((row[0],row[1],row[2]))

## Analyzing locations

In [23]:
locations = metadata['Location']
locations = list(set(locations.tolist()))

In [24]:
locations

['Attic',
 'Bath2',
 'Kitchen',
 'Entry Hallway',
 'Bedroom3',
 'Bedroom4',
 'Mudroom',
 'Dining Room',
 'Master Bedroom',
 'Basement',
 'Outdoor',
 'Bath1',
 'Utility',
 'MBath',
 'Living Room',
 'Bedroom2',
 'Multiple',
 'MBedroom',
 'Bedroom 2']

In [25]:
list_of_locations = []
for triple in g:
    if "#Location" in triple[2] and "https://brickschema.org/schema/Brick#hasAssociatedTag" in triple[1]:
        list_of_locations.append(triple[0].replace("https://brickschema.org/schema/Brick#",""))
print(list_of_locations)

['Pump_Room', 'Waste_Storage', 'Wing', 'Conference_Room', 'Janitor_Room', 'Environment_Box', 'Shower', 'Outside', 'Electrical_Room', 'Security_Service_Room', 'Lobby', 'Lounge', 'Zone', 'Control_Room', 'Freezer', 'Cold_Box', 'Massage_Room', 'Private_Office', 'Equipment_Room', 'Room', 'IDF', 'Generator_Room', 'Open_Office', 'Majlis', 'Switch_Room', 'Region', 'Laboratory', 'Ticketing_Booth', 'Plumbing_Room', 'Media_Production_Room', 'Server_Room', 'Common_Space', 'Mechanical_Room', 'Workshop', 'Distribution_Frame', 'Break_Room', 'Space', 'Battery_Room', 'Restroom', 'Energy_Zone', 'Hospitality_Box', 'Library', 'Elevator_Space', 'Employee_Entrance_Lobby', 'Riser', 'Team_Room', 'Exercise_Room', 'Elevator_Shaft', 'Ablutions_Room', 'TETRA_Room', 'Concession', 'Vertical_Space', 'Broadcast_Room', 'Reception', 'Enclosed_Office', 'Visitor_Lobby', 'Floor', 'Bench_Space', 'Basement', 'Office_Kitchen', 'Wardrobe', 'Rooftop', 'Fire_Zone', 'Gatehouse', 'Lighting_Zone', 'Sports_Service_Room', 'Water_Tan

In [26]:
def search_locations(query, topk=1):
    if "room" in query.lower():
        query+=" room"
    query_vec = vectorizer_class.transform([query])
    search_result = {}
    idx = 0
    for index, row in df_class.iterrows():
        class_name = list_of_classname[idx]
        similarity = cosine_similarity(row.values.reshape(1,len(row.values)), query_vec.toarray())[0][0]
        search_result[class_name]=similarity
        idx+=1

    search_result = dict(sorted(search_result.items(), key=lambda item: item[1], reverse=True))
    return_list = []
    rank=1
    for key in search_result.keys():
        if key in list_of_locations:
            return_list.append((key,search_result[key]))
            if rank==topk: break
            rank+=1
    if return_list[0][1]==0.0:
        return_list= [('Location', 1,0)]
    return return_list

In [27]:
## Suggesting Brick Location Classes based on Location Names
for location in locations:
    print(location, search_locations(location))

Attic [('Location', 1, 0)]
Bath2 [('Location', 1, 0)]
Kitchen [('Location', 1, 0)]
Entry Hallway [('Hallway', 1.0)]
Bedroom3 [('Room', 1.0)]
Bedroom4 [('Room', 1.0)]
Mudroom [('Room', 1.0)]
Dining Room [('Room', 1.0)]
Master Bedroom [('Room', 1.0)]
Basement [('Basement', 1.0)]
Outdoor [('Location', 1, 0)]
Bath1 [('Location', 1, 0)]
Utility [('Location', 1, 0)]
MBath [('Location', 1, 0)]
Living Room [('Room', 1.0)]
Bedroom2 [('Room', 1.0)]
Multiple [('Location', 1, 0)]
MBedroom [('Room', 1.0)]
Bedroom 2 [('Room', 1.0)]


## Analyzing Column Names

In [28]:
data_labels = metadata['Data Label']
label_desc = metadata['Description']

In [29]:
data_labels

0                 DHW_ClothesWasherColdFlow
1                  DHW_ClothesWasherHotFlow
2                     DHW_DishwasherHotFlow
3                       DHW_SHWGlycolTempIn
4                      DHW_SHWGlycolTempOut
5        DHW_HeatPumpWaterHeaterEnergyTotal
6         DHW_HeatPumpWaterHeaterPowerTotal
7                      DHW_ManifoldColdFlow
8                       DHW_ManifoldHotFlow
9                      DHW_MixValveColdFlow
10                 DHW_RoomTempBasementHPWH
11            DHW_SHWPumpsEnergywithStandby
12             DHW_SHWPumpsPowerwithStandby
13             DHW_StatusSolenoidColdMBATub
14          DHW_StatusSolenoidColdMBAShower
15        DHW_StatusSolenoidColdKitchenSink
16         DHW_StatusSolenoidHotKitchenSink
17              DHW_StatusSolenoidHotMBATub
18           DHW_StatusSolenoidHotMBAShower
19               DHW_WaterTempBAShowerMixed
20                   DHW_WaterTempBAShwCold
21                    DHW_WaterTempBAShwHot
22                  DHW_WaterTem

In [None]:
import re
def parse_colname(term):
    parsed = re.sub('[^a-zA-Z0-9\n\.]', ' ', term)
    return parsed

def parse_label(label):
    terms = []
    parsed = par
    se_colname(label.replace("_",""))
    parsed =re.sub(r"(?<=\w)([A-Z])", r" \1", parsed)
    tmp=""
    for item in parsed.split(" "):
        if len(item)>1:
            terms.append(item)
            if tmp!="":
                terms.append(tmp)
                tmp=""
        else:
            tmp +=item
    if tmp!="":
        terms.append(tmp)
        tmp=""
    return ' '.join(terms)

In [None]:
def frequency_parse(data_labels):
    freq = {}
    terms = []
    for label in data_labels:
        parsed = parse_colname(label)
        parsed =re.sub(r"(?<=\w)([A-Z])", r" \1", parsed)
        tmp=""
        for item in parsed.split(" "):
            if len(item)>1:
                terms.append(item)
                if tmp!="":
                    terms.append(tmp)
                    tmp=""
            else:
                tmp +=item
        if tmp!="":
            terms.append(tmp)
            tmp=""
    
    for label in terms:
        try:
            freq[label]+=1
        except:
            freq[label]=1
    return freq

In [None]:
freq = frequency_parse(data_labels)

In [None]:
search("HVAC System")

In [None]:
freq_result = dict(sorted(freq.items(), key=lambda item: item[1], reverse=True))
rank = 1
for key in freq_result.keys():
    tmp = search(key)
    if tmp[0][1]==0:
        print(key, "not found")
    else:
        print(key, freq_result[key], tmp)

In [None]:
for i in range(0,len(data_labels)):
    query = parse_label(data_labels[i])+" "+label_desc[i]
    print(data_labels[i], search(query,topk=3))