In [22]:
# from helpers import *
import ast
import pickle
import random
from datetime import datetime

import networkx as nx
import numpy as np
import ollama
import pandas as pd
import plotly
import seaborn as sns
from gensim.models import Word2Vec
from gliner import GLiNER
from langchain.text_splitter import RecursiveCharacterTextSplitter
from node2vec import Node2Vec
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

In [7]:
# Read the Kaggle metadata export and replace every missing value with ''.
kaggle_dataset = pd.read_csv("../datasets/kaggle_dataset_metadata_10000.csv").fillna('')
# `keywords` arrives as text; ast.literal_eval turns each cell back into a Python literal.
kaggle_dataset['keywords'] = kaggle_dataset['keywords'].map(ast.literal_eval)
# Dataset ids in file order; used further down to pick dataset nodes out of the graph embedding.
dataset_idxs = list(kaggle_dataset['id'])

In [None]:
# Load the GLiNER checkpoint referenced by the (commented-out) entity-extraction loop.
# NOTE(review): the name `model` is rebound to a Word2Vec model further down in this
# notebook, so any GLiNER use must happen before that cell runs.
model = GLiNER.from_pretrained("urchade/gliner_mediumv2.1")
def create_splitter(chunk_size=500, chunk_overlap=150):
    """Build the text splitter used to chunk dataset descriptions.

    Args:
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between consecutive chunks.

    Returns:
        A configured ``RecursiveCharacterTextSplitter``. Calling the function
        without arguments yields the same 500/150 configuration as before.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

# Label vocabulary handed to the entity-extraction call (see the commented-out loop).
labels = [
    "person",
    "award",
    "date",
    "competitions",
    "teams",
    "organization",
    "location",
    "event",
    "product",
    "quantity",
    "money",
    "percent",
    "time",
    "gpe",
    "facility",
    "language",
    "work_of_art",
    "law",
    "nationality",
    "title",
    "field_of_study",
    "measurement",
    "technology",
]

# Shared splitter instance with the default chunking configuration.
splitter = create_splitter()

# Entity Recognition (ER)

#### Extract own Entities

In [None]:
# nodes = []

# for idx, row in tqdm(df_sample.iterrows()):
#     text = row['titleNullable'] + row['subtitleNullable'] + row['descriptionNullable']
#     # Load Text
#     texts = splitter.create_documents([text])
#     # Split Text
#     pages = splitter.split_documents(texts)
#     for p in pages:
#         entities = model.predict_entities(p.page_content, labels, threshold=0.5)
#         for entity in entities:
#             nodes.append((entity["text"].lower(), entity["label"].lower(), row['id']))
#             # print(entity["text"].lower(), "=>", entity["label"].lower())

# print(len(list(set(nodes))))

In [9]:
# with open('../assets/er_liste.pkl', 'wb') as f:
#     pickle.dump(nodes, f)

#### Load pre-extracted Entities

In [13]:
# Restore the pre-extracted node records from disk.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open('../assets/er_liste.pkl', mode='rb') as handle:
    nodes = pickle.load(handle)

# Create Graph

In [14]:
def increment_edge_weight(G, u, v):
    """Count one more co-occurrence on the edge between ``u`` and ``v``.

    A missing edge is created with ``weight=1``; an existing edge has its
    ``weight`` attribute raised by one. ``G`` is modified in place and nothing
    is returned.
    """
    if not G.has_edge(u, v):
        # First time this pair is seen: add the edge with weight 1.
        G.add_edge(u, v, weight=1)
        return
    edge_attributes = G[u][v]
    edge_attributes['weight'] = edge_attributes['weight'] + 1

In [None]:
# Build an undirected graph from the node records: every record contributes an
# entity node, a label node and a dataset node, plus the three edges between them.
G = nx.Graph()

for node in nodes:
    # Normalise once so the existence check and the insertion use the same key.
    # (Previously the check used the raw string while the lower-cased string was
    # stored, so a key containing upper-case characters was re-added on every
    # occurrence and its `type` attribute overwritten.)
    entity_key = node[0].lower()
    label_key = node[1].lower()
    dataset_key = node[2].lower()

    for key, node_type in (
        (entity_key, "entity"),
        (label_key, "label"),
        (dataset_key, "dataset"),
    ):
        # A node keeps the type it was first registered with.
        if not G.has_node(key):
            G.add_node(key, type=node_type)

    # Connect the triple; repeated occurrences raise the edge weight.
    increment_edge_weight(G, entity_key, label_key)
    increment_edge_weight(G, label_key, dataset_key)
    increment_edge_weight(G, entity_key, dataset_key)

# Example: show the number of nodes and edges.
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

In [31]:
# Store Graph
# nx.write_gexf(G, "../assets/n2v_graph_full.gexf")

# Semantic Embedding

In [None]:
# description_emebeddings = []
# for idx, row in tqdm(kaggle_dataset.iterrows()):
#     t = row['title'] + row['subtitleNullable'] + row['descriptionNullable']
#     response = ollama.embeddings(model="Losspost/stella_en_1.5b_v5", prompt=t)
#     description_emebeddings.append(response['embedding'])

# description_embd = pd.DataFrame(description_emebeddings, index=kaggle_dataset['id']).reset_index()
# description_embd.to_csv('../assets/kaggle_embeddings_stella_1.5b.csv')

#### Load pre-computed Embeddings

Please download the file *kaggle_embeddings_stella_1.5b.csv* [here](https://drive.google.com/file/d/1138deudt3d6coLpm-Hd8fY5CU-qWlMa_/view?usp=sharing) and place it at `../assets/kaggle_embeddings_stella_1.5b.csv`.

In [None]:
# Load the stored description embeddings and rename the id column to `dataset`.
description_embd = (
    pd.read_csv('../assets/kaggle_embeddings_stella_1.5b.csv')
    .drop('Unnamed: 0', axis=1)
    .rename(columns={'id': 'dataset'})
)
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
description_embd.info()

# Every column except `dataset` is treated as one embedding dimension.
vectors_descriptions = description_embd.drop('dataset', axis=1).to_numpy()
cosine_sim_descriptions = cosine_similarity(vectors_descriptions)

# Square similarity table labelled by dataset id on both axes.
cosine_sim_descriptions_df = pd.DataFrame(
    cosine_sim_descriptions,
    index=description_embd['dataset'],
    columns=description_embd['dataset'],
)
cosine_sim_descriptions_df.head(3)

# Graph Embedding

In [None]:
# node2vec = Node2Vec(G, dimensions=128, walk_length=40, num_walks=100, workers=2)
# model = node2vec.fit(window=10, min_count=1)
# model.wv.save_word2vec_format('../assets/KG_n2v_embedding.csv')
# model.save('../assets/n2v_model')  # must match the path passed to Word2Vec.load below

#### Load Pretrained Model

In [18]:
# Load the stored node2vec (gensim Word2Vec) model; `Word2Vec` is imported in the
# notebook's import cell.
# NOTE(review): this rebinds `model`, which previously held the GLiNER model.
model = Word2Vec.load('../assets/n2v_model')

In [19]:
# Pull the learned node vectors out of the loaded model.
keyed_vectors = model.wv
node_ids = keyed_vectors.index_to_key          # node keys, in embedding-row order
node_labels = keyed_vectors.key_to_index       # mapping: node key -> embedding row
node_embeddings = keyed_vectors.vectors        # embedding matrix, one row per node

In [None]:
# One row per graph node. `node_ids` lists the node keys in the same order as the
# rows of `node_embeddings`; it replaces the key->index dict that was previously
# passed as the index and only worked through implicit coercion to its keys.
n2v_embd = (
    pd.DataFrame(node_embeddings, index=node_ids)
    .reset_index()
    .rename(columns={'index': 'dataset'})
)
# Keep only the nodes whose key is one of the Kaggle dataset ids.
n2v_embd = n2v_embd[n2v_embd['dataset'].isin(dataset_idxs)]
print(len(n2v_embd))
n2v_embd.head(2)

In [None]:
# Pairwise cosine similarity between the dataset nodes' graph embeddings.
vectors_n2v = np.array(n2v_embd.drop(columns='dataset').to_numpy())
cosine_sim_n2v = cosine_similarity(vectors_n2v)

# Label both axes of the similarity table with the dataset ids.
n2v_dataset_ids = n2v_embd['dataset']
cosine_sim_n2v_df = pd.DataFrame(
    cosine_sim_n2v,
    index=n2v_dataset_ids,
    columns=n2v_dataset_ids,
)
cosine_sim_n2v_df.head(3)

# Compare Results

In [58]:
# Number of nearest neighbours compared per dataset.
TOP_K = 10


def _title_similarity(reference_title, candidate_title):
    """Return the TF-IDF cosine similarity between two titles.

    A fresh vectorizer is fitted on just the two strings. When neither title
    yields a token, TfidfVectorizer raises ValueError (empty vocabulary); that
    case is scored as 0.0 instead of aborting the whole comparison. Empty
    titles are possible because missing values were replaced by ''.
    """
    try:
        tfidf = TfidfVectorizer().fit_transform([reference_title, candidate_title])
    except ValueError:
        return 0.0
    return cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]


def _first_by_key(keys, values):
    """Map each key to the first value paired with it (later duplicates ignored)."""
    lookup = {}
    for key, value in zip(keys, values):
        lookup.setdefault(key, value)
    return lookup


def _top_k_labels(similarity_row, column_labels, k=TOP_K):
    """Return the column labels of the ``k`` largest entries (unordered)."""
    k = min(k, len(similarity_row))
    top_positions = np.argpartition(similarity_row, -k)[-k:]
    return [column_labels[pos] for pos in top_positions]


# Built once instead of filtering the full frames inside the loops.
title_by_id = _first_by_key(kaggle_dataset['id'], kaggle_dataset['title'])
domain_by_id = _first_by_key(kaggle_dataset['id'], kaggle_dataset['domains'])

desc_matrix = cosine_sim_descriptions_df.to_numpy()
desc_columns = cosine_sim_descriptions_df.columns
desc_row_of = _first_by_key(cosine_sim_descriptions_df.index, range(len(desc_matrix)))

n2v_matrix = cosine_sim_n2v_df.to_numpy()
n2v_columns = cosine_sim_n2v_df.columns
n2v_row_of = _first_by_key(cosine_sim_n2v_df.index, range(len(n2v_matrix)))

# Only datasets present in both similarity tables can be compared.
idxs = list(set(cosine_sim_n2v_df.index) & set(cosine_sim_descriptions_df.index))

results = {}
top_10_mappings = {}

for idx in idxs:
    # NOTE(review): a dataset's own id is expected to be among its top-k neighbours
    # (self-similarity is maximal) — confirm whether it should be excluded.
    descriptions_ = _top_k_labels(desc_matrix[desc_row_of[idx]], desc_columns)
    graphs_ = _top_k_labels(n2v_matrix[n2v_row_of[idx]], n2v_columns)

    # Create Top 10 Mapping
    top_10_mappings[idx] = {
        'semantic': descriptions_,
        'contextual': graphs_
    }

    out_dict = {
        "semantic": [],
        "semantic_similarity": [],
        "graph": [],
        "graph_similarity": []
    }
    reference_string = title_by_id[idx]
    for d, g in zip(descriptions_, graphs_):
        graph_string = title_by_id[g]
        semantic_string = title_by_id[d]

        out_dict['graph'].append(graph_string)
        out_dict['graph_similarity'].append(
            round(_title_similarity(reference_string, graph_string), 2)
        )
        out_dict['semantic'].append(semantic_string)
        out_dict['semantic_similarity'].append(
            round(_title_similarity(reference_string, semantic_string), 2)
        )

    # Overlap is counted on dataset ids: distinct datasets that merely share a
    # title must not be treated as the same neighbour.
    count_common_entries = len(set(descriptions_) & set(graphs_))

    # Average over the pairs actually scored (equals TOP_K in the normal case).
    n_pairs = max(len(out_dict['graph']), 1)

    results[idx] = {
        'common_entries': count_common_entries,
        'graph_similarity': round(sum(out_dict['graph_similarity']) / n_pairs, 2),
        'semantic_similarity': round(sum(out_dict['semantic_similarity']) / n_pairs, 2),
        'domain': domain_by_id[idx]
    }

results_df = pd.DataFrame(results).T

In [None]:
# Show the datasets whose two neighbour lists overlap the most first.
results_df.sort_values(by='common_entries', ascending=False)