In [None]:
import pandas as pd
# from transformers import pipeline
import json
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AffinityPropagation
import networkx as nx
import community as community_louvain

In [3]:
# pip install scikit-learn
# pip install community
# pip install transformers

In [2]:
file_path = "props.json"
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)
# data

df = pd.DataFrame([(item["id"], item["label"], item["description"]) for item in data], columns=["PID", "Label", "Description"])
df["PID_numeric"] = df["PID"].str.extract(r'P(\d+)').astype(int)
df = df.sort_values(by="PID_numeric").drop(columns=["PID_numeric"])

df["PID"] = df["PID"].astype(str)
df_main = df.reset_index(drop=True)

# csv_file_path = "wikidata_properties_labels_sorted.csv"
# df_main.to_csv(csv_file_path, index=False, encoding="utf-8")

pd.set_option("display.max_colwidth", None)
df_main

Unnamed: 0,PID,Label,Description
0,P6,head of government,"head of the executive power of this town, city, municipality, state, country, or other governmental body"
1,P10,video,"relevant video. For images, use the property P18. For film trailers, qualify with ""object has role"" (P3831)=""trailer"" (Q622550)"
2,P14,traffic sign,"graphic symbol describing the item, used at the side of or above roads to give instructions or provide information to road users"
3,P15,route map,image of route map at Wikimedia Commons
4,P16,transport network,network the infrastructure is a part of
...,...,...,...
12439,P13326,Toki Pona headnoun,Toki Pona common noun for which the name serves as a proper modifier
12440,P13327,Wine AppDB ID developer ID,identifier for this software or video game company at Wine AppDB
12441,P13328,Brussels Inventory of Natural Heritage site ID,identifier of natural sites in the Brussels-Capital Region
12442,P13329,Brussels Inventory of Natural Heritage tree ID,identifier for remarkable trees in the Brussels-Capital Region


In [3]:
# classifier = pipeline("zero-shot-classification", model = "facebook/bart-large-mnli")
# clabels = ["movies", "TV programs", "sports programs", "news programs"]

# cthreshold = 0.5

# def category(description):
#     if "identifier" in description or "ID" in description:
#         return "negative"
#     res = classifier(description, clabels)
#     plabels = res["labels"][0]
#     cscore = res["scores"][0]
#     if cscore < cthreshold:
#         return "negative"
#     return plabels

# df["predicted_labels"] = df["Description"].apply(category)
# df

In [4]:
df_id = pd.read_csv("pids_relevant_file.csv")
df_id

Unnamed: 0,PID,Label,Description,predicted_labels
0,P10,video,"relevant video. For images, use the property P18. For film trailers, qualify with ""object has role"" (P3831)=""trailer"" (Q622550)",movies
1,P54,member of sports team,sports teams or clubs that the subject represents or represented,sports programs
2,P57,director,"director(s) of film, TV-series, stageplay, video game or similar",TV programs
3,P115,home venue,home stadium or venue of a sports team or applicable performing arts organization,sports programs
4,P118,league or competition,"league or competition in which team or player has played, or in which an event occurs",sports programs
...,...,...,...,...
362,P13102,Damehåndbolddatabasen ID,handball female player page on the Damehåndbolddatabasen website,sports programs
363,P13146,picture of this person doing their job,"picture of a person in action, especially for a sportsperson, visual artist, musican, actor. P18 is normally used for portraits",sports programs
364,P13243,Game Jolt username,username on Game Jolt,sports programs
365,P13258,Presisov večjezični slovar ID,entry for a lexeme in the online edition of Presisov večjezični slovar,news programs


In [5]:
descriptions = df_id["Description"].to_list()
descriptions

['relevant video. For images, use the property P18. For film trailers, qualify with "object has role" (P3831)="trailer" (Q622550)',
 'sports teams or clubs that the subject represents or represented',
 'director(s) of film, TV-series, stageplay, video game or similar',
 'home stadium or venue of a sports team or applicable performing arts organization',
 'league or competition in which team or player has played, or in which an event occurs',
 'organization or person responsible for publishing books, periodicals, printed music, podcasts, games or software',
 'actor in the subject production [use "character role" (P453) and/or "name of the character role" (P4633) as qualifiers] [use "voice actor" (P725) for voice-only role] - [use "recorded participant" (P11108) for non-fiction productions]',
 'person(s) who produced the film, musical work, theatrical production, etc. (for film, this does not include executive producers, associate producers, etc.) [for production company, use P272, video

In [6]:
model = SentenceTransformer("all-MiniLM-L6-v2")
embedding = model.encode(descriptions)

NameError: name 'SentenceTransformer' is not defined

In [None]:
sim_matrix = cosine_similarity(embedding)
# af = AffinityPropagation(affinity='precomputed')
# direct_cluster = af.fit_predict(sim_matrix)
# df["D_cluster"] = direct_cluster
# df.head(5)

G = nx.Graph()
for i, entity in enumerate(df["Named Entity"]):
    G.add_node(i, name=entity)
for i in range(len(sim_matrix)):
    for j in range(i+1, len(sim_matrix)):
        if sim_matrix[i,j] > 0.5:
            G.add_edge(i,j,weight=sim_matrix[i,j])
partition = community_louvain.best_partition(G)
df["Graph_cluster"] = df.index.map(partition)

In [None]:
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)
df["x"] = embeddings_2d[:,0]
df["y"] = embeddings_2d[:,1]
plt.figure(figsize=(12,5))

# plt.subplot(1,2,1)
# sns.scatterplot(x="X", y="Y", hue = "D_cluster", data=df, palette="tab10", s=100, legend = False)
# plt.title("Direct cluster")
# plt.xlabel("tsne dim1")
# plt.ylabel("tsne dim2")
# for i, row in df.iterrows():
#     plt.text(row["x"], row["y"], row["Named Entity"], fontsize = 9, ha="right")

# plt.subplot(1,2,2)
sns.scatterplot(x="X", y="Y", hue = "Graph_cluster", data=df, palette="tab10", s=100, legend = False)
plt.title("Graph based cluster")
plt.xlabel("tsne dim1")
plt.ylabel("tsne dim2")
for i, row in df.iterrows():
    plt.text(row["x"], row["y"], row["Named Entity"], fontsize = 9, ha="right")

plt.tight_layout()
plt.show()