In [1]:
# We will use the pandas library to explore the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import requests
import json
import re

# Display all availablecolumns in the dataframes
pd.set_option("display.max_columns", None)

In [10]:
print("NumPy version:", np.__version__)
print("Pandas version:", pd.__version__)
print("Matplotlib version:", plt.__version__)

NumPy version: 1.26.4
Pandas version: 2.2.2


AttributeError: module 'matplotlib.pyplot' has no attribute '__version__'

In [13]:
# Read the CSV file into a pandas dataframe; the file is encoded in ISO-8859-1 and uses ; as the delimiter
obd_2024 = pd.read_csv('data/OBD/OBD_2024_I.csv', sep=';', encoding='ISO-8859-1', low_memory=False)

# Convert numeric columns to float automatically
for col in obd_2024.select_dtypes(include=['object']).columns:
    try:
        obd_2024[col] = pd.to_numeric(obd_2024[col].str.replace(',', '.'))
    except:
        pass

# Convert to int64 but ignore NaN values
obd_2024["Referenzjahr"] = pd.to_numeric(obd_2024["Referenzjahr"], errors='coerce').fillna(obd_2024["Referenzjahr"]).astype('Int64')
obd_2024["Gueltig bis"] = pd.to_numeric(obd_2024["Gueltig bis"], errors='coerce').fillna(obd_2024["Gueltig bis"]).astype('Int64')

# Get basic information about the dataframe
obd_2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14985 entries, 0 to 14984
Data columns (total 81 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   UUID                                           14985 non-null  object 
 1   Version                                        14985 non-null  object 
 2   Name (de)                                      12848 non-null  object 
 3   Name (en)                                      14945 non-null  object 
 4   Kategorie (original)                           14985 non-null  object 
 5   Kategorie (en)                                 14985 non-null  object 
 6   Konformität                                    14985 non-null  object 
 7   Laenderkennung                                 14982 non-null  object 
 8   Typ                                            14985 non-null  object 
 9   Referenzjahr                                   149

In [14]:
obd_2024 = obd_2024.drop(columns=obd_2024.columns[obd_2024.isna().all()])
obd_2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14985 entries, 0 to 14984
Data columns (total 79 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   UUID                                           14985 non-null  object 
 1   Version                                        14985 non-null  object 
 2   Name (de)                                      12848 non-null  object 
 3   Name (en)                                      14945 non-null  object 
 4   Kategorie (original)                           14985 non-null  object 
 5   Kategorie (en)                                 14985 non-null  object 
 6   Konformität                                    14985 non-null  object 
 7   Laenderkennung                                 14982 non-null  object 
 8   Typ                                            14985 non-null  object 
 9   Referenzjahr                                   149

In [25]:
# Replace empty strings with NaN in the columns before converting to Int64
obd_2024["Referenzjahr"] = pd.to_numeric(obd_2024["Referenzjahr"].replace('', np.nan), errors='coerce').fillna(obd_2024["Referenzjahr"]).astype('Int64')
obd_2024["Gueltig bis"] = pd.to_numeric(obd_2024["Gueltig bis"].replace('', np.nan), errors='coerce').fillna(obd_2024["Gueltig bis"]).astype('Int64')

specific_data = obd_2024[obd_2024["Typ"] == "specific dataset"]
generic_data = obd_2024[obd_2024["Typ"] == "generic dataset"]

specific = pd.DataFrame(specific_data).drop_duplicates(subset=['UUID'])
generic = pd.DataFrame(generic_data).drop_duplicates(subset=['UUID'])

# === Step 2: Text Similarity ===

# Fill missing text
specific[['UUID', 'Name (en)', 'Kategorie (en)', 'Bezugseinheit']] = specific[['UUID', 'Name (en)', 'Kategorie (en)', 'Bezugseinheit']].fillna(value="", downcast='infer')
generic[['UUID', 'Name (en)', 'Kategorie (en)', 'Bezugseinheit']] = generic[['UUID', 'Name (en)', 'Kategorie (en)', 'Bezugseinheit']].fillna(value="", downcast='infer')


  specific[['UUID', 'Name (en)', 'Kategorie (en)', 'Bezugseinheit']] = specific[['UUID', 'Name (en)', 'Kategorie (en)', 'Bezugseinheit']].fillna(value="", downcast='infer')
  generic[['UUID', 'Name (en)', 'Kategorie (en)', 'Bezugseinheit']] = generic[['UUID', 'Name (en)', 'Kategorie (en)', 'Bezugseinheit']].fillna(value="", downcast='infer')


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util


# Load a lightweight yet effective model
model = SentenceTransformer('all-MiniLM-L6-v2')

def name_sim(name_1, name_2):
    # Ensure the inputs are strings
    if not isinstance(name_1, str) or not isinstance(name_2, str):
        return 0

    # Encode names into vector embeddings
    embeddings = model.encode([name_1, name_2], convert_to_tensor=True)

    # Compute cosine similarity
    similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
    return similarity

def cat_sim(cat_1, cat_2):
    if not isinstance(cat_1, str) or not isinstance(cat_2, str):
        return 0  # Return 0 if either category is not a string

    # Encode categories into vector embeddings
    embeddings = model.encode([cat_1, cat_2], convert_to_tensor=True)

    # Compute cosine similarity
    similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
    return similarity


def year_bucket_match(year1, year2):
    try:
        if pd.isna(year1) or pd.isna(year2):
            return 0
        year1 = int(year1)
        year2 = int(year2)
        return 1 if abs(year1 - year2) <= 2 else 0
    except ValueError:
        print(f"Invalid year value: {year1} or {year2}")
        return 0

def unit_match(unit1, unit2):
    return 1 if unit1.strip().lower() == unit2.strip().lower() else 0



In [29]:
import torch

# Precompute embeddings for generics
generic_name_embeddings = model.encode(generic["Name (en)"].fillna("").tolist(), convert_to_tensor=True)
generic_cat_embeddings = model.encode(generic["Kategorie (en)"].fillna("").tolist(), convert_to_tensor=True)

In [32]:
def calculate_scores_for_specific_uuid(specific_uuid):
    # Filter the specific material by UUID
    specific_material = specific[specific['UUID'] == specific_uuid]
    if specific_material.empty:
        print(f"No specific material found with UUID: {specific_uuid}")
        return

    # Encode the specific material once
    spec_name_emb = model.encode(specific_material.iloc[0]["Name (en)"], convert_to_tensor=True)
    spec_cat_emb = model.encode(specific_material.iloc[0]["Kategorie (en)"], convert_to_tensor=True)

    # Compute similarity in one shot
    name_similarities = util.cos_sim(spec_name_emb, generic_name_embeddings).cpu().numpy().flatten()
    cat_similarities = util.cos_sim(spec_cat_emb, generic_cat_embeddings).cpu().numpy().flatten()

    results = []
    for i, (_, gen_row) in enumerate(generic.iterrows()):
        name_score = name_similarities[i]
        cat_score = cat_similarities[i]
        year_score = year_bucket_match(specific_material.iloc[0]["Referenzjahr"], gen_row["Referenzjahr"])
        unit_score = unit_match(specific_material.iloc[0]["Bezugseinheit"], gen_row["Bezugseinheit"])

        final_score = 0.5 * name_score + 0.2 * cat_score + 0.1 * year_score + 0.2 * unit_score

        results.append({
            "Generic_UUID": gen_row["UUID"],
            "Generic_Name": gen_row["Name (en)"],
            "Name_Similarity": round(name_score, 3),
            "Category_Similarity": round(cat_score, 3),
            "Year_Match": year_score,
            "Unit_Match": unit_score,
            "Final_Score": round(final_score, 3)
        })

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Get top 3 rows with the highest final_score
    top_results = results_df.nlargest(3, "Final_Score")

    # Print results
    if not top_results.empty:
        print(top_results)
    else:
        print(f"No matches found for specific UUID: {specific_uuid}")

In [33]:
calculate_scores_for_specific_uuid('1d9ca1d8-7ff3-4533-b828-a4c96bf31066')

                             Generic_UUID  \
496  1b0a3488-9b02-4c98-b421-8c746d350f97   
492  7b69e7b4-68dd-49a3-b2dc-ae608b66eece   
494  deeb0bda-20fa-412a-b945-1a589638db21   

                                    Generic_Name  Name_Similarity  \
496                   Gypsum fibre board (10 mm)            0.903   
492                    Gypsum wallboard (100 mm)            0.673   
494  Gypsum plaster board (impregnated, 12.5 mm)            0.634   

     Category_Similarity  Year_Match  Unit_Match  Final_Score  
496                0.808           1           1        0.913  
492                0.808           1           1        0.798  
494                0.808           1           1        0.779  
