In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from gliner import GLiNER
import ollama
from node2vec import Node2Vec
import networkx as nx
import plotly
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import ast
from langchain.text_splitter import RecursiveCharacterTextSplitter
from datetime import datetime
import pickle
from tqdm.notebook import tqdm
import re

In [None]:
# Load the pretrained GLiNER checkpoint "urchade/gliner_mediumv2.1";
# it is used further down through `model.predict_entities(...)`.
model = GLiNER.from_pretrained("urchade/gliner_mediumv2.1")
def create_splitter(chunk_size=500, chunk_overlap=150):
    """Build the text splitter used to chunk record texts.

    Args:
        chunk_size: Value forwarded as ``chunk_size``; lengths are measured
            with ``len`` (i.e. in characters). Defaults to 500.
        chunk_overlap: Value forwarded as ``chunk_overlap``. Defaults to 150.

    Returns:
        A ``RecursiveCharacterTextSplitter`` configured with the given sizes
        and with separators treated as plain strings (not regexes).
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

# Entity types passed to the extraction model, in the order they are queried.
labels = [
    "person",
    "award",
    "date",
    "competitions",
    "teams",
    "organization",
    "location",
    "event",
    "product",
    "quantity",
    "money",
    "percent",
    "time",
    "gpe",
    "facility",
    "language",
    "work_of_art",
    "law",
    "nationality",
    "title",
    "field_of_study",
    "measurement",
    "technology",
]

# Shared splitter instance; the entity-extraction loops use it to chunk each record text.
splitter = create_splitter()

# Paths of the embedding CSVs that are later read into `buy_embd` / `abt_embd`.
BUY_EMBEDDING_DF = "../assets/buy_embeddings_stella_1.5b.csv"
ABT_EMBEDDING_DF = "../assets/abt_embeddings_stella_1.5b.csv"

# Download pre-trained embeddings

[buy](https://drive.google.com/file/d/1kgsQWlAom-7iWgqi7gf6_uO6V-F33R0H/view?usp=sharing)

[abt](https://drive.google.com/file/d/1gvBrm7quph1AIIC9t48QqQ0Wl0jGPJTm/view?usp=sharing)

In [4]:
def clean_text(text):
    """Strip punctuation from ``text`` and return it lower-cased.

    Every character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is removed; whitespace itself is kept as is.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()

# Abt records: missing cells become empty strings, then the text columns are normalised.
abt_df = pd.read_csv('../datasets/abt_buy/Abt.csv', encoding='unicode_escape')
abt_df.fillna('', inplace=True)
abt_df['name'] = abt_df['name'].map(clean_text)
abt_df['description'] = abt_df['description'].map(clean_text)

# Buy records: same treatment as above.
buy_df = pd.read_csv('../datasets/abt_buy/Buy.csv', encoding='unicode_escape')
buy_df.fillna('', inplace=True)
buy_df['name'] = buy_df['name'].map(clean_text)
buy_df['description'] = buy_df['description'].map(clean_text)

In [None]:
# Row counts of both tables (one per line), followed by a one-row preview of Abt.
print(len(abt_df), len(buy_df), sep='\n')
abt_df.head(1)

# Semantic Embedding

In [None]:
# Disabled code, kept for reference: requests one embedding per record from the
# Ollama model "Losspost/stella_en_1.5b_v5" using the same text template as the
# entity-extraction loops.
# NOTE(review): presumably the CSVs loaded in the next cell were produced this way,
# but no code here writes them to disk — confirm.
# buy_embeddings = []
# for idx, row in tqdm(buy_df.iterrows()):
#     t = f"The Title is: {row['name']}. The price is: {row['price']}. And now the description: {row['description']}"
#     response = ollama.embeddings(model="Losspost/stella_en_1.5b_v5", prompt=t)
#     buy_embeddings.append(response['embedding'])
# abt_embeddings = []
# for idx, row in tqdm(abt_df.iterrows()):
#     t = f"The Title is: {row['name']}. The price is: {row['price']}. And now the description: {row['description']}"
#     response = ollama.embeddings(model="Losspost/stella_en_1.5b_v5", prompt=t)
#     abt_embeddings.append(response['embedding'])

#### Load Precomputed Embeddings

In [84]:
# Read both embedding tables from the CSV paths configured at the top of the notebook.
buy_embd, abt_embd = (
    pd.read_csv(csv_path) for csv_path in (BUY_EMBEDDING_DF, ABT_EMBEDDING_DF)
)

# ER Extraction

In [6]:
def extract_entity_nodes(df, source_tag, threshold=0.5):
    """Run entity extraction over every row of ``df`` and collect node tuples.

    For each row a text is built from the ``name``, ``price`` and
    ``description`` columns, chunked with the notebook-level ``splitter`` and
    passed chunk by chunk to ``model.predict_entities`` together with the
    notebook-level ``labels``.

    Args:
        df: DataFrame with ``id``, ``name``, ``price`` and ``description`` columns.
        source_tag: String stored as the last tuple element to mark the origin.
        threshold: Score threshold forwarded to ``predict_entities``.

    Returns:
        List of ``(entity_text_lower, entity_label_lower, row_id, source_tag)``
        tuples, one per predicted entity (duplicates are kept).
    """
    collected = []
    # `total` lets the progress bar show completion instead of a bare counter.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        text = f"The Title is: {row['name']}. The price is: {row['price']}. And now the description: {row['description']}"
        # Load Text
        documents = splitter.create_documents([text])
        # Split Text
        pages = splitter.split_documents(documents)
        for page in pages:
            for entity in model.predict_entities(page.page_content, labels, threshold=threshold):
                collected.append(
                    (entity["text"].lower(), entity["label"].lower(), row['id'], source_tag)
                )
    return collected


# Buy records carry the tag 'google', Abt records the tag 'amazon'.
buy_nodes = extract_entity_nodes(buy_df, 'google')
print(len(set(buy_nodes)))  # number of distinct tuples

abt_nodes = extract_entity_nodes(abt_df, 'amazon')
print(len(set(abt_nodes)))

nodes = buy_nodes + abt_nodes

# KG Generation

In [14]:
def increment_edge_weight(G, u, v):
    """Count one more occurrence of the edge ``u``-``v`` in graph ``G``.

    A missing edge is created with ``weight=1``; an existing edge has its
    ``weight`` attribute increased by one. The graph is modified in place.
    """
    if not G.has_edge(u, v):
        # First time this pair is seen: create the edge with weight 1.
        G.add_edge(u, v, weight=1)
        return
    G[u][v]['weight'] += 1

In [None]:
# Create Graph
G = nx.Graph()

# Every extracted tuple contributes three nodes (entity text, entity label,
# record id) and the three edges between them; repeated tuples raise the
# edge weights via increment_edge_weight.
for node in nodes:
    # Normalise once and use the same key for the existence check and the
    # insertion, so the guard always tests the key that is actually added.
    entity_key = node[0].lower()
    label_key = node[1].lower()
    record_id = node[2]

    # Existing nodes keep the `type` they were first registered with.
    if not G.has_node(entity_key):
        G.add_node(entity_key, type="entity")
    if not G.has_node(label_key):
        G.add_node(label_key, type="label")
    if not G.has_node(record_id):
        G.add_node(record_id, type="id")

    # Add tuple edges
    increment_edge_weight(G, entity_key, label_key)
    increment_edge_weight(G, label_key, record_id)
    increment_edge_weight(G, entity_key, record_id)

# Example: show the number of nodes and edges
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

In [22]:
# Disabled: optional export of the graph to a GEXF file.
# NOTE(review): this path starts with ./assets while the embedding CSV constants use ../assets — confirm the intended directory.
# nx.write_gexf(G, "./assets/n2v_abt_buy_graph_full.gexf")

# Graph Embedding

In [None]:
node2vec = Node2Vec(G, dimensions=128, walk_length=40, num_walks=100, workers=2)
model = node2vec.fit(window=10, min_count=1)

In [25]:
model.wv.save_word2vec_format('./assets/KG_abt_buy_n2v_embedding.csv')
model.save('./assets/KG_abt_buy_n2v_model')

In [26]:
# Retrieve node embeddings
node_ids = model.wv.index_to_key  # list of node IDs
node_labels = model.wv.key_to_index
node_embeddings = model.wv.vectors

In [27]:
# Label the embedding rows with `node_ids` (the ordered key list) rather than
# with the `key_to_index` dict, whose iteration order is only incidentally the
# same. The key ends up in a regular column named 'product'.
n2v_embd = (
    pd.DataFrame(node_embeddings, index=node_ids)
    .reset_index()
    .rename(columns={'index': 'product'})
)

In [None]:
# Record ids are stringified before being matched against the 'product' column
# (presumably because the embedding keys are strings — confirm).
buy_idxs = [str(record_id) for record_id in buy_df['id']]
abt_idxs = [str(record_id) for record_id in abt_df['id']]

# .copy() makes these independent frames, so adding columns to them later does
# not write into a filtered view of `n2v_embd` (SettingWithCopyWarning).
n2v_embd_buy = n2v_embd[n2v_embd['product'].isin(buy_idxs)].copy()
n2v_embd_abt = n2v_embd[n2v_embd['product'].isin(abt_idxs)].copy()
print(len(n2v_embd_buy))
print(len(n2v_embd_abt))

In [42]:
# NOTE(review): the comprehension below uses the dataset tag ('google'/'amazon') as the
# dict key, so the result holds at most two entries, each mapped to the last position
# seen for that tag. An id -> tag mapping may have been intended — confirm.
# `id_dataset` is not referenced by any other cell visible in this notebook.
l=list(n2v_embd_buy['product'].values) + list(n2v_embd_abt['product'].values)
id_dataset = {('google' if str(x).split(':')[0] == 'http' else 'amazon'): k for k, x in enumerate(l)}

In [43]:
# Tag each subset with its source. `.assign` returns a new frame, so no column
# is written into a filtered slice of `n2v_embd` (avoids SettingWithCopyWarning)
# and the cell can be re-run safely.
n2v_embd_buy = n2v_embd_buy.assign(type='google')
n2v_embd_abt = n2v_embd_abt.assign(type='amazon')

In [None]:
# Stack the Abt rows on top of the Buy rows (row-wise concatenation).
n2v_combined = pd.concat([n2v_embd_abt, n2v_embd_buy], axis=0)
n2v_combined.head(1)

In [None]:
# Pairwise cosine similarity between all node embeddings, labelled on both axes
# with the 'product' key of each row.
n2v_feature_frame = n2v_combined.drop(columns=['product', 'type'])
n2v_vectors = np.array(n2v_feature_frame.to_numpy())
cosine_n2v = cosine_similarity(n2v_vectors)
n2v_product_keys = n2v_combined['product']
cosine_n2v_df = pd.DataFrame(cosine_n2v, index=n2v_product_keys, columns=n2v_product_keys)
cosine_n2v_df.head(3)

In [46]:
# True marks rows whose key is a Buy id (the records tagged 'google');
# invert the mask to select the Abt ('amazon') rows.
buy_idx_set = set(buy_idxs)  # set lookup instead of scanning the id list for every row
cosine_dataset_id = [x in buy_idx_set for x in cosine_n2v_df.index.values]

In [76]:
# For every record, find at which position (rank 0 = best) its true match from
# the perfect mapping appears among the other dataset's records, ordered by
# descending cosine similarity. Only the best-ranked true match is recorded.
perfect_matching = pd.read_csv('../datasets/abt_buy/abt_buy_perfectMapping.csv')
matching_scores = []

# Loop-invariant helpers, built once instead of on every iteration.
buy_id_lookup = set(buy_idxs)
abt_id_lookup = set(abt_idxs)
buy_row_mask = pd.Series(cosine_dataset_id).values

for idx, row in cosine_n2v_df.iterrows():
    if idx in buy_id_lookup:
        f = buy_row_mask  # kept bound: a later cell evaluates `f`
        dataset, own_col, other_col, candidate_ids = 'buy', 'idBuy', 'idAbt', abt_idxs
    elif idx in abt_id_lookup:
        f = ~buy_row_mask  # kept bound: a later cell evaluates `f`
        dataset, own_col, other_col, candidate_ids = 'abt', 'idAbt', 'idBuy', buy_idxs
    else:
        continue

    # Ground-truth partner ids, stringified to match the string keys of the matrix.
    partner_ids = perfect_matching.loc[perfect_matching[own_col] == int(idx), other_col].values
    true_matches = [str(x) for x in partner_ids]

    # `row` already holds the similarities of `idx` to every key; keep only the
    # other dataset's records and order them from most to least similar.
    ranked = row[row.index.isin(candidate_ids)].sort_values(ascending=False)
    hits = np.where(np.isin(ranked.index.values, true_matches))[0]
    if len(hits) > 0:
        best_rank = hits[0]
        matching_scores.append((best_rank, ranked.iloc[best_rank], dataset, idx, true_matches))

In [None]:
result_df = pd.DataFrame(matching_scores, columns=['rank', 'score', 'dataset', 'index', 'perfect_match'])
# Share of records whose true match is ranked first (rank 0). Comparing against 0
# works even when no record has rank 0 (value_counts()[0] would raise KeyError),
# and rounding after scaling avoids artefacts such as 56.800000000000004.
top1_share_n2v = (result_df['rank'] == 0).mean() if len(result_df) else 0.0
print(f"Ideal Matching (%): {round(top1_share_n2v * 100, 1)}")
result_df

In [None]:
# Seaborn look for the figure: custom palette, 'ticks' style, 'talk' context.
custom_palette = sns.color_palette(['#FF33CC', '#39FF14', '#00FFFF', '#998650', '#E0BE36'])
sns.set_palette(custom_palette)
sns.set_style('ticks')
sns.set_context('talk')

# Rank of the true match vs. its similarity score, coloured by dataset, with a
# rug plot marking the marginal distributions on both axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(data=result_df, x="rank", y="score", hue='dataset', ax=ax)
sns.rugplot(data=result_df, x="rank", y="score", color='black', ax=ax)

# Description Analysis

In [86]:
# Tag the description embeddings with their source: Buy -> 'google', Abt -> 'amazon'.
buy_embd['type'], abt_embd['type'] = 'google', 'amazon'

In [112]:
# Rebind the id collections to the raw 'id' column values of the embedding
# tables (arrays here, whereas the earlier cell built lists of strings).
buy_idxs, abt_idxs = buy_embd['id'].values, abt_embd['id'].values

In [113]:
# Stack the Buy rows on top of the Abt rows (row-wise concatenation).
description_embd_combined = pd.concat([buy_embd, abt_embd], axis=0)

In [None]:
# Pairwise cosine similarity between all description embeddings, labelled on
# both axes with the record 'id'.
description_feature_frame = description_embd_combined.drop(columns=['id', 'type'])
description_vectors = np.array(description_feature_frame.to_numpy())
cosine_descriptions = cosine_similarity(description_vectors)
description_record_ids = description_embd_combined['id']
cosine_descriptions_df = pd.DataFrame(
    cosine_descriptions, index=description_record_ids, columns=description_record_ids
)
cosine_descriptions_df.tail(3)

In [None]:
# NOTE(review): looks like a leftover debug expression. `f` is only bound as a side
# effect of the matching loop in an earlier cell, so this cell shows whatever mask
# that loop assigned last and fails if that loop has not run — consider removing.
f

In [118]:
# Same ranking evaluation as for the graph embeddings, applied to the
# description-embedding similarities: for every record, record the best rank
# (0 = most similar) at which one of its true matches appears among the other
# dataset's records.
perfect_matching = pd.read_csv('../datasets/abt_buy/abt_buy_perfectMapping.csv')
matching_scores = []

# Loop-invariant lookups, built once instead of scanning the id arrays per row.
buy_id_lookup = set(buy_idxs)
abt_id_lookup = set(abt_idxs)

# True marks rows that belong to the Buy embeddings.
cosine_dataset_id = [x in buy_id_lookup for x in cosine_descriptions_df.index.values]

for idx, row in cosine_descriptions_df.iterrows():
    if idx in buy_id_lookup:
        dataset, own_col, other_col, candidate_ids = 'buy', 'idBuy', 'idAbt', abt_idxs
    elif idx in abt_id_lookup:
        dataset, own_col, other_col, candidate_ids = 'abt', 'idAbt', 'idBuy', buy_idxs
    else:
        continue

    # Ground-truth partner ids for this record (may be empty).
    true_matches = perfect_matching[perfect_matching[own_col] == idx][other_col].values

    # `row` already holds the similarities of `idx` to every record; keep only
    # the other dataset's records and order them from most to least similar.
    ranked = row[row.index.isin(candidate_ids)].sort_values(ascending=False)
    hits = np.where(np.isin(ranked.index.values, true_matches))[0]
    if len(hits) > 0:
        best_rank = hits[0]
        matching_scores.append((best_rank, ranked.iloc[best_rank], dataset, idx, true_matches))

In [None]:
result_df_description = pd.DataFrame(matching_scores, columns=['rank', 'score', 'dataset', 'index', 'perfect_match'])
# Share of records whose true match is ranked first (rank 0). Comparing against 0
# works even when no record has rank 0 (value_counts()[0] would raise KeyError),
# and rounding after scaling avoids artefacts such as 56.800000000000004.
top1_share_description = (
    (result_df_description['rank'] == 0).mean() if len(result_df_description) else 0.0
)
print(f"Ideal Matching (%): {round(top1_share_description * 100, 1)}")
result_df_description

In [None]:
# Seaborn look for the figure: custom palette, 'ticks' style, 'talk' context.
custom_palette = sns.color_palette(['#FF33CC', '#39FF14', '#00FFFF', '#998650', '#E0BE36'])
sns.set_palette(custom_palette)
sns.set_style('ticks')
sns.set_context('talk')

# Rank of the true match vs. its similarity score for the description
# embeddings, coloured by dataset, with a rug plot on both axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(data=result_df_description, x="rank", y="score", hue='dataset', ax=ax)
sns.rugplot(data=result_df_description, x="rank", y="score", color='black', ax=ax)

# Combination

In [None]:
# Combine both approaches per record: keep the better (lower) of the two ranks.
# The two result tables list records in different order and may differ in
# length, so they are aligned on (dataset, record id) rather than on row
# position. Ids are compared as strings because the graph results store string
# keys while the description results store the raw 'id' values.
description_ranks = (
    result_df_description
    .assign(_match_key=result_df_description['index'].astype(str))
    .groupby(['dataset', '_match_key'], as_index=False)['rank']
    .min()
    .rename(columns={'rank': 'rank_description'})
)

final_result = (
    result_df
    .assign(_match_key=result_df['index'].astype(str))
    .merge(description_ranks, on=['dataset', '_match_key'], how='left')
)
# Row-wise minimum skips missing values, so records without a description
# result keep their graph-embedding rank.
final_result['rank'] = (
    final_result[['rank', 'rank_description']]
    .min(axis=1)
    .astype(result_df['rank'].dtype)
)
final_result = final_result.drop(columns=['_match_key', 'rank_description'])
final_result.head(2)

In [None]:
# Share of records whose true match is ranked first (rank 0) after combining.
# Comparing against 0 works even when no record has rank 0 (value_counts()[0]
# would raise KeyError), and rounding after scaling avoids float artefacts.
top1_share_combined = (final_result['rank'] == 0).mean() if len(final_result) else 0.0
print(f"Ideal Matching (%): {round(top1_share_combined * 100, 1)}")

In [None]:
# Seaborn look for the figure: custom palette, 'ticks' style, 'talk' context.
custom_palette = sns.color_palette(['#FF33CC', '#39FF14', '#00FFFF', '#998650', '#E0BE36'])
sns.set_palette(custom_palette)
sns.set_style('ticks')
sns.set_context('talk')

# Plot the combined result. This cell previously re-plotted
# `result_df_description`, so the combination was never visualised.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(data=final_result, x="rank", y="score", hue='dataset', ax=ax)
sns.rugplot(data=final_result, x="rank", y="score", color='black', ax=ax)