In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
ifc_info = pd.read_csv("IFC_processed.csv")
ifc_embed = np.load("embeddings_ifc_en.npy")

ebkp_info = pd.read_csv("eBKP_processed1.csv")
ebkp_embed = np.load("embeddings_ebkp_en.npy")

mf_info = pd.read_csv("MF_processed.csv")
mf_embed = np.load("embeddings_mf_en.npy")

uni_info = pd.read_excel("Uniclass2015_Pr.xlsx", header=2)
uni_embed = np.load("embeddings_uni_en.npy")

In [3]:
ENCODE_EMBED = False
if ENCODE_EMBED:
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    def encoding(sent):
        return model.encode(sent, convert_to_numpy=True)

    embedding = uni_info["Title"].apply(model.encode)
    embedding = np.array(embedding)
    embedding = np.vstack(embedding)
    np.save("embeddings_graph.npy", embedding)


In [4]:
embedding = np.load("embeddings_graph.npy")

In [5]:
threshold = 0.5


similarity_uni_ebkp = cosine_similarity(embedding, ebkp_embed)
max_indices = np.argmax(similarity_uni_ebkp, axis=1)
mask = similarity_uni_ebkp[np.arange(len(similarity_uni_ebkp)), max_indices] > threshold
similarity_uni_ebkp = np.where(mask, max_indices,  np.nan)

similarity_uni_mf = cosine_similarity(embedding, mf_embed)
max_indices = np.argmax(similarity_uni_mf, axis=1)
mask = similarity_uni_mf[np.arange(len(similarity_uni_mf)), max_indices] > threshold
similarity_uni_mf = np.where(mask, max_indices,  np.nan)

similarity_uni_ifc = cosine_similarity(embedding, ifc_embed)
max_indices = np.argmax(similarity_uni_ifc, axis=1)
mask = similarity_uni_ifc[np.arange(len(similarity_uni_ifc)), max_indices] > threshold
similarity_uni_ifc = np.where(mask, max_indices,  np.nan)


In [6]:
table_data = {
    'Uni_Code': uni_info['Code'],
    'Uni': uni_info['Title'],
    'IFC_code': ifc_info.iloc[similarity_uni_ifc]['raw'].reset_index(drop=True),
    'IFC': ifc_info.iloc[similarity_uni_ifc]['IFC'].reset_index(drop=True),
    'MF_code': mf_info.iloc[similarity_uni_mf]['code'].reset_index(drop=True),
    'MF': mf_info.iloc[similarity_uni_mf]['label'].reset_index(drop=True),
    'eBKP_code': ebkp_info.iloc[similarity_uni_ebkp]['Code'].reset_index(drop=True),
    'eBKP': ebkp_info.iloc[similarity_uni_ebkp]['Element designation_EN'].reset_index(drop=True)
}
result_table = pd.DataFrame(table_data)


In [7]:
result_table.IFC[np.isnan(similarity_uni_ifc)] = np.nan
result_table.eBKP[np.isnan(similarity_uni_ebkp)] = np.nan
result_table.MF[np.isnan(similarity_uni_mf)] = np.nan

result_table = result_table[result_table['Uni_Code'].str.contains("Pr_20|Pr_25|Pr_30|Pr_40|Pr_65|Pr_70|Pr_80")]
result_table

Unnamed: 0,Uni_Code,Uni,IFC_code,IFC,MF_code,MF,eBKP_code,eBKP
105,Pr_20,Structure and general products,IfcStructuralItem,structural item,13 30 00,Special Structures,J03.01,General textile products
106,Pr_20_29,Fastener products,IfcFastener,fastener,34 11 36.13,Direct-Fixation Fasteners,A,
107,Pr_20_29_03,Anchors and components,IfcReinforcingBar.ANCHORING,anchoring,31 51 00,Anchor Tiebacks,F02.03,"Side protection, anchor points"
108,Pr_20_29_03_04,Anchor blocks,IfcReinforcingBar.ANCHORING,anchoring,31 51 00,Anchor Tiebacks,F02.03,"Side protection, anchor points"
109,Pr_20_29_03_05,Anchor rails,IfcReinforcingBar.ANCHORING,anchoring,31 51 00,Anchor Tiebacks,B03.01,Guard rails
...,...,...,...,...,...,...,...,...
8213,Pr_80_77_94_50,Mat mountings,IfcActuator,,11 66 23.56,Mat Storage,A,
8214,Pr_80_77_94_60,Pad mountings,IfcDiscreteAccessory.RAILPAD,rail pad,00 00 00,,A,
8215,Pr_80_77_94_65,Pneumatic isolation mounts,IfcActuator.PNEUMATICACTUATOR,pneumatic actuator,40 05 57.53,Pneumatic Actuators,A,
8216,Pr_80_77_94_74,Rubber bellows,IfcVibrationDamper.RUBBER,rubber,44 53 36,Crumb Rubber Systems,A,


In [8]:
IFC_Uni = result_table[~result_table.IFC.isin([np.nan])]
IFC_Uni = IFC_Uni[["Uni_Code", "IFC_code"]].reset_index().drop(columns=["index"])
IFC_Uni.to_csv("IFC_Uni.csv", index=False)

In [9]:
ebkp_Uni = result_table[~result_table.eBKP.isin([np.nan])]
ebkp_Uni = ebkp_Uni[["Uni_Code", "eBKP_code"]].reset_index().drop(columns=["index"])
ebkp_Uni.to_csv("eBKP_Uni.csv", index=False)

In [10]:
MF_Uni = result_table[~result_table.MF.isin([np.nan])]
MF_Uni = MF_Uni[["Uni_Code", "MF_code"]].reset_index().drop(columns=["index"])
MF_Uni.to_csv("MF_Uni.csv", index=False)

In [11]:
uni_info['Group'] = uni_info['Group'].fillna(method='ffill')
uni_info['Sub group'] = uni_info['Sub group'].fillna(method='ffill')
uni_info['Section'] = uni_info['Section'].fillna(method='ffill')

# Define a function to find the parent code
def find_parent(code):
    parts = code.split('_')
    if len(parts) > 1 and parts[0]=="Pr":
        return '_'.join(parts[:-1])
    return np.nan

# Create the Parent column
uni_info['Parent'] = uni_info['Code'].apply(find_parent)

# Create the final table
tableuni = uni_info[['Code', 'Parent', 'Title']]
tableuni.columns = ["Code", "Parent", "Title"]
tableuni = tableuni.dropna().reset_index()
tableuni = tableuni[["Code", "Parent", "Title"]]
result_table["Uni_Parent"] = tableuni.Parent

In [12]:
parent_uni = result_table[["Uni_Parent", "Uni_Code"]].reset_index().drop(columns=["index"])
parent_uni.to_csv("parent_Uni.csv", index=False)


In [13]:
G = nx.MultiDiGraph()

for index, row in ifc_info.iterrows():
    G.add_node(row['raw'], IFC=row['IFC'])

for index, row in mf_info.iterrows():
    G.add_node(row['code'], MF=row['label'])

for index, row in ebkp_info.iterrows():
    G.add_node(row['Code'], MF=row['Element designation_EN'])

for index, row in uni_info.iterrows():
    G.add_node(row['Code'], Uni=row['Title'])

for index, row in result_table.iterrows():
    if not pd.isna(row['IFC']):
        G.add_edge(row['Uni_Code'], row['IFC_code'], name='equivalent_classification')
        G.add_edge(row['IFC_code'], row['Uni_Code'], name='equivalent_classification')

    if not pd.isna(row['MF']):
        G.add_edge(row['Uni_Code'], row['MF_code'], name='equivalent_classification')
        G.add_edge(row['MF_code'], row['Uni_Code'], name='equivalent_classification')

    if not pd.isna(row['eBKP']):
        G.add_edge(row['Uni_Code'], row['eBKP_code'], name='equivalent_classification')
        G.add_edge(row['eBKP_code'], row['Uni_Code'], name='equivalent_classification')
        
# Add edges representing parent relationships
for index, row in result_table.iterrows():
    if row['Uni_Code'] != row['Uni_Parent'] and any(char.isdigit() for char in row['Uni_Parent']):
        G.add_edge(row['Uni_Parent'], row['Uni_Code'], name='parent')
        G.add_edge(row['Uni_Code'], row['Uni_Parent'], name='parent')

In [14]:
ifc_info

Unnamed: 0.1,Unnamed: 0,IFC,raw
0,0,actuator,IfcActuator
1,1,alarm,IfcAlarm
2,2,controller,IfcController
3,3,flow instrument,IfcFlowInstrument
4,4,sensor,IfcSensor
...,...,...,...
2951,2951,voiding feature,IfcVoidingFeature
2952,2952,voiding feature,IfcVoidingFeature
2953,2953,voiding feature,IfcVoidingFeature
2954,2954,voiding feature,IfcVoidingFeature


In [15]:
classifications = ifc_info.raw.unique()
classifications

array(['IfcActuator', 'IfcAlarm', 'IfcController', ...,
       'IfcVoidingFeature.NOTCH', 'IfcVoidingFeature.USERDEFINED',
       'IfcVoidingFeature.NOTDEFINED'], dtype=object)

In [16]:
import json

IFC = pd.read_csv("IFC_processed.csv")


classifications = ifc_info.raw.unique()

with open('IFC.json', 'r') as file:
    IFC_ATTRIBUTES = json.load(file)


prop_names = list()
prop_types = list()
pset_names = list()
class_ = list()

for classification in classifications:
    for pset_name, pset_details in IFC_ATTRIBUTES["Domain"]["Classifications"][classification].get("Psets", {}).items():
        # Add nodes for psets
        for prop_name, prop_details in pset_details.get("Properties", {}).items():
            if prop_details["type"] == "string" or prop_details["type"] == "real":
                prop_type = prop_details["type"]
                prop_names.append(prop_name)
                prop_types.append(prop_type)
                class_.append(classification)

            elif "values" in prop_details.keys():
                prop_type = str(prop_details["values"])
                prop_names.append(prop_name)
                prop_types.append(prop_type)
                class_.append(classification)


In [17]:
classifications

array(['IfcActuator', 'IfcAlarm', 'IfcController', ...,
       'IfcVoidingFeature.NOTCH', 'IfcVoidingFeature.USERDEFINED',
       'IfcVoidingFeature.NOTDEFINED'], dtype=object)

In [18]:
table_form = pd.DataFrame([class_, prop_names, prop_types], index=['Classifications', 'Property Names', 'Property Types']).T


In [19]:
table_form.to_csv("form_ifc.csv", index=False)

In [20]:
nx.write_edgelist(G, "data_graph.tsv", delimiter='\t', data=True)