In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from gliner import GLiNER
import ollama
from node2vec import Node2Vec
import networkx as nx
import plotly
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import ast
from langchain.text_splitter import RecursiveCharacterTextSplitter
from datetime import datetime
import pickle
from tqdm.notebook import tqdm
import re

In [None]:
# Load the pretrained GLiNER checkpoint "urchade/gliner_mediumv2.1"; the ER-extraction
# cell calls `model.predict_entities` on this object.
# NOTE(review): the graph-embedding section later rebinds `model` to the fitted node2vec
# model, so this cell has to be re-run before the entity extraction can be repeated.
model = GLiNER.from_pretrained("urchade/gliner_mediumv2.1")
def create_splitter():
    """Build the text splitter used to chunk the product texts.

    Returns:
        A ``RecursiveCharacterTextSplitter`` configured for chunks of 500 characters
        with 150 characters of overlap, measuring length with ``len`` and treating
        separators as plain strings (not regular expressions).
    """
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 150,
        "length_function": len,
        "is_separator_regex": False,
    }
    return RecursiveCharacterTextSplitter(**splitter_options)

# Entity types GLiNER is asked to detect in every text chunk.
labels = [
    "person",
    "award",
    "date",
    "competitions",
    "teams",
    "organization",
    "location",
    "event",
    "product",
    "quantity",
    "money",
    "percent",
    "time",
    "gpe",
    "facility",
    "language",
    "work_of_art",
    "law",
    "nationality",
    "title",
    "field_of_study",
    "measurement",
    "technology",
]

# Shared splitter instance used to chunk the product texts.
splitter = create_splitter()

# File names of the precomputed description-embedding tables, one per catalogue.
GOOGLE_EMBEDDING_DF = "google_embeddings_stella_1.5b.csv"
AMAZON_EMBEDDING_DF = "amazon_embeddings_stella_1.5b.csv"

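# Download precomputed embeddings

The "Semantic Embedding" section reads the description embeddings from two CSV files, `amazon_embeddings_stella_1.5b.csv` and `google_embeddings_stella_1.5b.csv`, using paths relative to the notebook's working directory. Download both files before running that section:

[amazon](https://drive.google.com/file/d/1JSn-fFtP_o7T3XxKdRrwLP1hyy27wzpf/view?usp=sharing)

[google](https://drive.google.com/file/d/1IYr8Of23hrdkDkc24hTE7ixWVIthCSg2/view?usp=sharing)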

In [2]:
def clean_text(text):
    """Normalise a text for matching.

    Every character that is neither a word character nor whitespace (i.e. all
    punctuation) is removed, then the result is lower-cased.

    Args:
        text: The string to normalise.

    Returns:
        The punctuation-free, lower-cased string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()

# Read both product catalogues, replace missing values with empty strings and
# normalise the free-text columns with `clean_text`.
amazon_df = pd.read_csv('../datasets/amazon-google/Amazon.csv', encoding='unicode_escape').fillna('')
for text_column in ('title', 'description'):
    amazon_df[text_column] = amazon_df[text_column].map(clean_text)

# The Google catalogue stores the product title in the 'name' column.
google_df = pd.read_csv('../datasets/amazon-google/GoogleProducts.csv', encoding='unicode_escape').fillna('')
for text_column in ('name', 'description'):
    google_df[text_column] = google_df[text_column].map(clean_text)

In [None]:
# Row counts of the two catalogues (Amazon first, then Google).
for catalogue in (amazon_df, google_df):
    print(len(catalogue))

# Semantic Embedding

In [10]:
# google_embeddings = []
# for idx, row in tqdm(google_df.iterrows()):
#     t = f"The manufcaturer of the product is: {row['manufacturer']}. The Title is: {row['name']}. The price is: {row['price']}. And now the description: {row['description']}"
#     response = ollama.embeddings(model="Losspost/stella_en_1.5b_v5", prompt=t)
#     google_embeddings.append(response['embedding'])
# amazon_embeddings = []
# for idx, row in tqdm(amazon_df.iterrows()):
#     t = f"The manufcaturer of the product is: {row['manufacturer']}. The Title is: {row['title']}. The price is: {row['price']}. And now the description: {row['description']}"
#     response = ollama.embeddings(model="Losspost/stella_en_1.5b_v5", prompt=t)
#     amazon_embeddings.append(response['embedding'])

In [11]:
# google_embd = pd.DataFrame(google_embeddings, index=google_df['id']).reset_index()
# google_embd.to_csv(GOOGLE_EMBEDDING_DF)

# amazon_embd = pd.DataFrame(amazon_embeddings, index=amazon_df['id']).reset_index()
# amazon_embd.to_csv(AMAZON_EMBEDDING_DF)

In [68]:
# Load the precomputed description embeddings from the files named by
# GOOGLE_EMBEDDING_DF / AMAZON_EMBEDDING_DF.
# NOTE(review): the commented-out export above calls `to_csv` without `index=False`; if the
# files were produced that way they presumably contain an extra "Unnamed: 0" column — confirm.
google_embd = pd.read_csv(GOOGLE_EMBEDDING_DF)
amazon_embd = pd.read_csv(AMAZON_EMBEDDING_DF)

# ER Extraction

In [13]:
def _extract_entity_nodes(frame, title_column, source, ner_model, text_splitter, entity_labels, threshold=0.5):
    """Run entity extraction over every product row of ``frame``.

    Args:
        frame: Product table with the columns 'id', 'manufacturer', 'price',
            'description' and ``title_column``.
        title_column: Name of the column that holds the product title.
        source: Tag stored as the last tuple element ('google' or 'amazon').
        ner_model: Object exposing ``predict_entities(text, labels, threshold=...)``.
        text_splitter: Splitter exposing ``create_documents(list_of_texts)``.
        entity_labels: Entity types passed to the model.
        threshold: Minimum score passed to ``predict_entities``.

    Returns:
        A list of ``(entity_text, entity_label, product_id, source)`` tuples with
        entity text and label lower-cased; duplicates are kept.
    """
    found = []
    # `total` lets tqdm show real progress; a bare iterrows() generator has no length.
    for _, row in tqdm(frame.iterrows(), total=len(frame), desc=source):
        # The wording (including the "manufcaturer" spelling) is kept identical to the
        # prompt in the commented-out embedding cell so both pipelines see the same text.
        text = f"The manufcaturer of the product is: {row['manufacturer']}. The Title is: {row[title_column]}. The price is: {row['price']}. And now the description: {row['description']}"
        # create_documents already returns one document per chunk, so a second
        # split_documents pass over those chunks is not needed.
        for page in text_splitter.create_documents([text]):
            for entity in ner_model.predict_entities(page.page_content, entity_labels, threshold=threshold):
                found.append((entity["text"].lower(), entity["label"].lower(), row['id'], source))
    return found


google_nodes = _extract_entity_nodes(google_df, 'name', 'google', model, splitter, labels)
# Number of distinct (entity, label, product, source) tuples.
print(len(set(google_nodes)))

amazon_nodes = _extract_entity_nodes(amazon_df, 'title', 'amazon', model, splitter, labels)
print(len(set(amazon_nodes)))

In [14]:
# Merge the entity tuples of both catalogues (Google first) and persist them.
nodes = [*google_nodes, *amazon_nodes]

with open('./assets/google_amazon_er_liste.pkl', 'wb') as pickle_file:
    pickle.dump(nodes, pickle_file)

# KG Generation

In [17]:
def increment_edge_weight(G, u, v):
    """Count one more co-occurrence on the edge between ``u`` and ``v``.

    Args:
        G: Graph exposing ``has_edge``, ``add_edge`` and ``G[u][v]`` attribute access.
        u: First endpoint.
        v: Second endpoint.

    The edge is created with ``weight=1`` when it does not exist yet; otherwise its
    ``'weight'`` attribute is increased by one. Nothing is returned.
    """
    if not G.has_edge(u, v):
        # First occurrence of this pair: create the edge with an initial weight of 1.
        G.add_edge(u, v, weight=1)
        return
    G[u][v]['weight'] += 1

In [None]:
# Create Graph
G = nx.Graph()
## Add Nodes
for node in nodes:
    if not G.has_node(node[0]):
        G.add_node(
            node[0].lower(),
            type="entity"
        )
    if not G.has_node(node[1]):
        G.add_node(
            node[1].lower(),
            type="label"
        )
    if not G.has_node(node[2]):
        G.add_node(
            node[2].lower(),
            type="product"
        )
    # if not G.has_node(node[3]):
    #     G.add_node(
    #         node[3].lower(),
    #         type="dataset"
    #     )
    # Add tuple edges
    increment_edge_weight(G, node[0].lower(), node[1].lower())
    increment_edge_weight(G, node[1].lower(), node[2].lower())
    increment_edge_weight(G, node[0].lower(), node[2].lower())
# Beispiel: Anzahl der Knoten und Kanten anzeigen
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

In [19]:
# nx.write_gexf(G, "./graphs/n2v_amazon_google_graph_full.gexf")

# Graph Embedding

In [20]:
# Configure node2vec on G (128 dimensions, 100 walks of length 40 per node, 2 workers)
# and fit the embedding model with a context window of 10.
# NOTE(review): this rebinds `model`, which until here referred to the GLiNER extractor
# loaded at the top of the notebook; re-running the ER-extraction cell afterwards
# requires reloading GLiNER first.
# NOTE(review): no seed is passed — presumably the embeddings differ between runs; confirm.
node2vec = Node2Vec(G, dimensions=128, walk_length=40, num_walks=100, workers=2)
model = node2vec.fit(window=10, min_count=1)

In [46]:
# Persist the fitted node2vec result twice: the vectors via `save_word2vec_format`
# and the complete model via `save`.
# NOTE(review): the first file has a ".csv" suffix although it is written by
# `save_word2vec_format` — presumably not a regular CSV table; confirm before
# reading it with pandas.
model.wv.save_word2vec_format('./assets/KG_amazon_google_n2v_embedding.csv')
model.save('./assets/KG_amazon_google_n2v_model')

In [25]:
# Retrieve node embeddings
node_ids = model.wv.index_to_key  # list of node IDs
node_labels = model.wv.key_to_index
node_embeddings = model.wv.vectors

In [26]:
# One row per graph node: the node name in 'product', followed by the embedding
# dimensions. The row labels come from the ordered key list `node_ids` rather than
# from the `key_to_index` mapping, so the row order no longer depends on how pandas
# iterates a mapping. Naming the index up front replaces the in-place rename.
n2v_embd = (
    pd.DataFrame(node_embeddings, index=pd.Index(node_ids, name='product'))
    .reset_index()
)

In [None]:
# Product ids of each catalogue; they identify the product nodes in the embedding table.
google_idxs = list(google_df['id'])
amazon_idxs = list(amazon_df['id'])

# Keep only the embedding rows whose node name is a product id of the respective catalogue.
n2v_embd_google = n2v_embd.loc[n2v_embd['product'].isin(google_idxs)]
n2v_embd_amazon = n2v_embd.loc[n2v_embd['product'].isin(amazon_idxs)]
for embedded_products in (n2v_embd_google, n2v_embd_amazon):
    print(len(embedded_products))

In [30]:
# All embedded product ids, Google products first.
l = list(n2v_embd_google['product'].values) + list(n2v_embd_amazon['product'].values)
# Map every product id to its catalogue. Ids that start with the "http" scheme are
# treated as Google ids, everything else as Amazon ids. The original comprehension had
# key and value swapped, which collapsed the mapping to at most two entries.
id_dataset = {
    product_id: ('google' if str(product_id).split(':')[0] == 'http' else 'amazon')
    for product_id in l
}

In [31]:
# Tag each subset with its catalogue. `assign` returns a new frame, which avoids
# writing into a filtered slice of `n2v_embd` (SettingWithCopyWarning) and keeps the
# cell idempotent when re-run.
n2v_embd_google = n2v_embd_google.assign(type='google')
n2v_embd_amazon = n2v_embd_amazon.assign(type='amazon')

In [None]:
# Stack the tagged subsets row-wise, Amazon products first, and preview one row.
n2v_combined = pd.concat(objs=[n2v_embd_amazon, n2v_embd_google], axis=0)
n2v_combined.head(1)

In [None]:
# Pairwise cosine similarity between all embedded products, labelled by product id
# on both axes.
n2v_vectors = np.array(n2v_combined.drop(columns=['product', 'type']).to_numpy())
cosine_n2v = cosine_similarity(n2v_vectors)
product_labels = n2v_combined['product']
cosine_n2v_df = pd.DataFrame(cosine_n2v, index=product_labels, columns=product_labels)
cosine_n2v_df.head(3)

In [34]:
# True for rows that belong to the Google catalogue; invert the mask for Amazon.
# A set makes each membership test constant-time instead of scanning the id list.
google_id_lookup = set(google_idxs)
cosine_dataset_id = [x in google_id_lookup for x in cosine_n2v_df.index.values]

In [35]:
# Ground-truth pairs (columns 'idAmazon' and 'idGoogleBase').
perfect_matching = pd.read_csv('../datasets/amazon-google/Amzon_GoogleProducts_perfectMapping.csv')
matching_scores = []

# Hash-based lookups and column masks are computed once; the original loop scanned
# the id lists and re-filtered the whole similarity matrix for every row.
google_id_set, amazon_id_set = set(google_idxs), set(amazon_idxs)
column_is_google = cosine_n2v_df.columns.isin(google_idxs)
column_is_amazon = cosine_n2v_df.columns.isin(amazon_idxs)

for idx, row in cosine_n2v_df.iterrows():
    # `row` holds the similarities of product `idx` to every product; only products
    # of the *other* catalogue are candidates for a match.
    if idx in google_id_set:
        dataset, candidate_mask = 'google', column_is_amazon
        true_matches = perfect_matching.loc[perfect_matching['idGoogleBase'] == idx, 'idAmazon'].values
    elif idx in amazon_id_set:
        dataset, candidate_mask = 'amazon', column_is_google
        true_matches = perfect_matching.loc[perfect_matching['idAmazon'] == idx, 'idGoogleBase'].values
    else:
        continue

    # Candidates ordered from most to least similar; position 0 is the best guess.
    ranked = row[candidate_mask].sort_values(ascending=False)
    hits = np.flatnonzero(np.isin(ranked.index.values, true_matches))
    if hits.size > 0:
        # Keep only the best-ranked true match:
        # (rank, similarity, source catalogue, product id, all true match ids).
        best = hits[0]
        matching_scores.append((best, ranked.iloc[best], dataset, idx, true_matches))

In [None]:
result_df = pd.DataFrame(matching_scores, columns=['rank', 'score', 'dataset', 'index', 'perfect_match'])
# Share of products whose true match is ranked first (rank 0). Comparing against 0
# instead of indexing `value_counts()[0]` avoids a KeyError when no product reaches
# rank 0, and rounding after scaling avoids output such as 57.49999999999999.
ideal_share = (result_df['rank'] == 0).mean() if len(result_df) else 0.0
print(f"Ideal Matching (%): {round(ideal_share * 100, 1)}")
# Median similarity of the best-ranked true matches (cell output).
result_df.score.median()

In [None]:
# Seaborn theme used for the figures of this notebook.
custom_palette = sns.color_palette(['#FF33CC', '#39FF14', '#00FFFF', '#998650', '#E0BE36'])
sns.set_palette(custom_palette)
sns.set_style('ticks')
sns.set_context('talk')

# Rank of the best true match against its similarity for the graph embeddings,
# coloured by source catalogue, with marginal rugs on both axes.
fig, ax = plt.subplots(figsize=(8, 10))
sns.scatterplot(data=result_df, x="rank", y="score", hue='dataset', ax=ax)
sns.rugplot(data=result_df, x="rank", y="score", color='black', ax=ax)

# Description Analysis

In [38]:
# Tag each description-embedding table with its catalogue.
google_embd = google_embd.assign(type='google')
amazon_embd = amazon_embd.assign(type='amazon')

In [39]:
# Stack both description-embedding tables row-wise, Google products first.
description_embd_combined = pd.concat(objs=[google_embd, amazon_embd], axis=0)

In [None]:
# Keep only the embedding dimensions. Besides 'id' and 'type', a stray 'Unnamed: 0'
# column appears when the CSV was written together with its row index (the pandas
# `to_csv` default); leaving it in would feed row numbers into the cosine similarity.
non_embedding_columns = [
    column for column in ('Unnamed: 0', 'id', 'type')
    if column in description_embd_combined.columns
]
description_vectors = description_embd_combined.drop(columns=non_embedding_columns).to_numpy()

# Pairwise cosine similarity between all product descriptions, labelled by product id.
cosine_descriptions = cosine_similarity(description_vectors)
cosine_descriptions_df = pd.DataFrame(cosine_descriptions, index=description_embd_combined['id'], columns=description_embd_combined['id'])
cosine_descriptions_df.tail(3)

In [41]:
# Ground-truth pairs (columns 'idAmazon' and 'idGoogleBase').
perfect_matching = pd.read_csv('../datasets/amazon-google/Amzon_GoogleProducts_perfectMapping.csv')
matching_scores = []

# Hash-based lookups and column masks are computed once; the original loop scanned
# the id lists and re-filtered the whole similarity matrix for every row.
google_id_set, amazon_id_set = set(google_idxs), set(amazon_idxs)
# True for rows that belong to the Google catalogue (kept for later inspection).
cosine_dataset_id = [x in google_id_set for x in cosine_descriptions_df.index.values]
column_is_google = cosine_descriptions_df.columns.isin(google_idxs)
column_is_amazon = cosine_descriptions_df.columns.isin(amazon_idxs)

for idx, row in cosine_descriptions_df.iterrows():
    # `row` holds the similarities of product `idx` to every product; only products
    # of the *other* catalogue are candidates for a match.
    if idx in google_id_set:
        dataset, candidate_mask = 'google', column_is_amazon
        true_matches = perfect_matching.loc[perfect_matching['idGoogleBase'] == idx, 'idAmazon'].values
    elif idx in amazon_id_set:
        dataset, candidate_mask = 'amazon', column_is_google
        true_matches = perfect_matching.loc[perfect_matching['idAmazon'] == idx, 'idGoogleBase'].values
    else:
        continue

    # Candidates ordered from most to least similar; position 0 is the best guess.
    ranked = row[candidate_mask].sort_values(ascending=False)
    hits = np.flatnonzero(np.isin(ranked.index.values, true_matches))
    if hits.size > 0:
        # Keep only the best-ranked true match:
        # (rank, similarity, source catalogue, product id, all true match ids).
        best = hits[0]
        matching_scores.append((best, ranked.iloc[best], dataset, idx, true_matches))

In [None]:
result_df_description = pd.DataFrame(matching_scores, columns=['rank', 'score', 'dataset', 'index', 'perfect_match'])
# Share of products whose true match is ranked first (rank 0). Comparing against 0
# instead of indexing `value_counts()[0]` avoids a KeyError when no product reaches
# rank 0, and rounding after scaling avoids output such as 57.49999999999999.
ideal_share_description = (result_df_description['rank'] == 0).mean() if len(result_df_description) else 0.0
print(f"Ideal Matching (%): {round(ideal_share_description * 100, 1)}")
# Median similarity of the best-ranked true matches (cell output).
result_df_description.score.median()

In [None]:
# Seaborn theme used for the figures of this notebook.
custom_palette = sns.color_palette(['#FF33CC', '#39FF14', '#00FFFF', '#998650', '#E0BE36'])
sns.set_palette(custom_palette)
sns.set_style('ticks')
sns.set_context('talk')

# Rank of the best true match against its similarity for the description embeddings,
# coloured by source catalogue, with marginal rugs on both axes.
fig, ax = plt.subplots(figsize=(8, 10))
sns.scatterplot(data=result_df_description, x="rank", y="score", hue='dataset', ax=ax)
sns.rugplot(data=result_df_description, x="rank", y="score", color='black', ax=ax)

# Combination

In [None]:
final_result = result_df.copy()
# Combine both approaches per product: take the better (smaller) of the graph-based
# and the description-based rank. The two result tables list products in different
# order and number, so the ranks are aligned on the product id in the 'index' column
# instead of on the row position.
description_rank = result_df_description.groupby('index')['rank'].min()
aligned_rank = final_result['index'].map(description_rank)
# Products without a description-based rank keep their graph-based rank.
aligned_rank = aligned_rank.fillna(final_result['rank'])
final_result['rank'] = np.minimum(final_result['rank'], aligned_rank).astype(final_result['rank'].dtype)
# NOTE: 'score' still holds the graph-based similarity of each row.
final_result.head(2)

In [None]:
# Share of products whose true match is ranked first (rank 0) after combining both
# approaches. Comparing against 0 instead of indexing `value_counts()[0]` avoids a
# KeyError when no product reaches rank 0.
ideal_share_final = (final_result['rank'] == 0).mean() if len(final_result) else 0.0
print(f"Ideal Matching (%): {round(ideal_share_final * 100, 1)}")

In [None]:
# Seaborn theme used for the figures of this notebook.
custom_palette = sns.color_palette(['#FF33CC', '#39FF14', '#00FFFF', '#998650', '#E0BE36'])
sns.set_palette(custom_palette)
sns.set_style('ticks')
sns.set_context('talk')

# Marker mapping per catalogue; defined here but not passed to the plots below.
markers = {"amazon": "X", "google": "X"}

# Combined rank against the similarity stored in `final_result`, coloured by source
# catalogue, with marginal rugs on both axes.
fig, ax = plt.subplots(figsize=(8, 10))
sns.scatterplot(data=final_result, x="rank", y="score", hue='dataset', ax=ax)
sns.rugplot(data=final_result, x="rank", y="score", color='black', ax=ax)