In [1]:
import pandas as pd
import requests
import math
import json
import re

In [2]:
# Health check: confirm the Ollama server answers before any prompt is sent.
# The timeout keeps the cell from hanging forever on an unreachable host, and
# connection-level failures are reported as "not running" instead of surfacing
# as a traceback.
try:
    response = requests.get("http://107.99.236.45:11434", timeout=10)
    server_up = response.status_code == 200
except requests.RequestException:
    server_up = False

if server_up:
    print("llm server is running")
else:
    print("llm server is not running")

llm server is running


In [3]:
def user_message(pds):
    """Build the relevance-classification prompt for one batch of properties.

    Args:
        pds: The batch of property rows to classify. It is interpolated into
            the prompt with plain f-string formatting, so whatever ``str()``
            of the object produces is what the model sees.

    Returns:
        str: The full prompt text, including the worked example table and the
        batch placed after ``show description:``.
    """
    # The template is kept verbatim (including its indentation) because the
    # model's output format depends on it.
    prompt = f"""
        You are an expert in knowledge graph construction.
        You will be given the Property descriptions which is the description of the properties of wikidata.
        Treat each description as independent.
        Extract the PID (property id) field that can be used for the QIDs of any influential people, person, place, or entity 
        that frequently appears in media, social media, TV programs, movies, sports programs, news programs, or other public spaces etc
        from each program description.
        For each of these add a new column as Relevant with information as "yes" or "no".
        Special Instructions:
            Extract the most relevant PID for each description. If the description does not fulfill the criteria then add "no" to the corresponding
            Relevant column.
            
        You can refer the below example for your reference:
        ------------------------------------------------------------------------------------------------------------------
        PID | Label | Description
        ------------------------------------------------------------------------------------------------------------------
        P6 | head of government | head of the executive power of this town, city, municipality, state, country, or other governmental body
        ------------------------------------------------------------------------------------------------------------------
        P10 | video | relevant video. For images, use the property P18. For film trailers, qualify with "object has role" (P3831)="trailer" (Q622550)
        ------------------------------------------------------------------------------------------------------------------
        P16 | transport network | network the infrastructure is a part of
        ------------------------------------------------------------------------------------------------------------------

        
        Provide output in the format below without any explanation for the results:
        ------------------------------------------------------------------------------------------------------------------
        PID | Label | Description | Relevant
        ------------------------------------------------------------------------------------------------------------------
        P6 | head of government | head of the executive power of this town, city, municipality, state, country, or other governmental body | yes
        ------------------------------------------------------------------------------------------------------------------
        P10 | video | relevant video. For images, use the property P18. For film trailers, qualify with "object has role" (P3831)="trailer" (Q622550) | yes
        ------------------------------------------------------------------------------------------------------------------
        P16 | transport network | network the infrastructure is a part of | no
        ------------------------------------------------------------------------------------------------------------------


        NOTE: Each description should be in a new row. Do not include any explanation for the results. Remove the <think> part.
        <<<>>> 
        <<<
        show description: {pds}
        >>>
        """
    return prompt


def process_response(data):
    """Parse the pipe-separated table in an LLM reply into a DataFrame.

    The reply is expected to contain rows of the form
    ``PID | Label | Description | Relevant``. The parser is tolerant of the
    ways the model deviates from that format:

    * anything up to the last ``</think>`` tag (reasoning text) is ignored;
    * the table may be inside code fences, with or without a language tag, or
      not fenced at all;
    * a header row is dropped only when it really is one (first cell ``PID``),
      so replies without a header do not lose their first data row;
    * separator lines made only of ``-``, ``|``, ``+``, ``:`` are skipped;
    * markdown-style outer pipes (``| a | b | c | d |``) are accepted, and any
      extra ``|`` is treated as part of the description.

    Args:
        data: Raw text returned by the model. ``None`` or an empty string is
            treated as "no rows".

    Returns:
        pandas.DataFrame: Columns ``PID``, ``Label``, ``Description`` and
        ``Relevant`` (all stripped strings), one row per parsed table row,
        with a default 0-based index. Empty when nothing could be parsed.
    """
    columns = ['PID', 'Label', 'Description', 'Relevant']
    if not data:
        return pd.DataFrame(columns=columns)

    # Keep only what follows the reasoning section, if the model emitted one.
    answer = data.rsplit('</think>', 1)[-1]

    # Prefer fenced content; `[^\n]*` lets fences such as ```markdown through.
    fenced = re.findall(r'```[^\n]*\n(.*?)```', answer, flags=re.DOTALL)
    table_text = '\n'.join(fenced) if fenced else answer

    rows = []
    for line in table_text.splitlines():
        row = line.strip()
        # Blank lines and pure separator lines carry no data.
        if not row.strip('-|+: \t'):
            continue
        # Markdown tables wrap every row in pipes; remove that outer pair only,
        # so a row with a genuinely empty last cell keeps its column count.
        if row.startswith('|') and row.endswith('|'):
            row = row[1:-1]
        cells = [cell.strip() for cell in row.split('|')]
        if len(cells) < 4:
            continue
        if cells[0].lower() == 'pid':
            # Header row, wherever it appears.
            continue
        # First two cells and the last one are fixed; whatever sits between
        # them belongs to the description (which may itself contain '|').
        rows.append((cells[0], cells[1], ' | '.join(cells[2:-1]), cells[-1]))

    return pd.DataFrame(rows, columns=columns)

    

# genres_to_filter = ["news", "sports", "home improvement", "weather", "documentary", "reality", "crime",
#                     "mystery", "football", "entertainment", "outdoors", "basketball", "comedy", "adventure",
#                     "hunting", "drama", "game show", "children", "action", "soccer", 
#                     "animated", "history", "shopping", "baseball", "sitcom", 
#                     "hockey", "gardening", "talk", "consumer", "science"] 


In [4]:
# Load the raw property dump: a JSON list whose entries carry "id", "label"
# and "description" keys.
file_path = "props.json"
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)
# data

records = [(entry["id"], entry["label"], entry["description"]) for entry in data]
df = pd.DataFrame(records, columns=["PID", "Label", "Description"])

# Sort on the numeric part of the id so that P10 comes after P6 rather than
# before it, as a plain string sort would give; the helper column is dropped
# again straight away.
df = (
    df.assign(PID_numeric=df["PID"].str.extract(r'P(\d+)', expand=False).astype(int))
      .sort_values(by="PID_numeric")
      .drop(columns=["PID_numeric"])
)

df["PID"] = df["PID"].astype(str)
df_main = df.reset_index(drop=True)

# csv_file_path = "wikidata_properties_labels_sorted.csv"
# df_main.to_csv(csv_file_path, index=False, encoding="utf-8")

# Show full cell text instead of truncating long descriptions.
pd.set_option("display.max_colwidth", None)
df_main

Unnamed: 0,PID,Label,Description
0,P6,head of government,"head of the executive power of this town, city, municipality, state, country, or other governmental body"
1,P10,video,"relevant video. For images, use the property P18. For film trailers, qualify with ""object has role"" (P3831)=""trailer"" (Q622550)"
2,P14,traffic sign,"graphic symbol describing the item, used at the side of or above roads to give instructions or provide information to road users"
3,P15,route map,image of route map at Wikimedia Commons
4,P16,transport network,network the infrastructure is a part of
...,...,...,...
12439,P13326,Toki Pona headnoun,Toki Pona common noun for which the name serves as a proper modifier
12440,P13327,Wine AppDB ID developer ID,identifier for this software or video game company at Wine AppDB
12441,P13328,Brussels Inventory of Natural Heritage site ID,identifier of natural sites in the Brussels-Capital Region
12442,P13329,Brussels Inventory of Natural Heritage tree ID,identifier for remarkable trees in the Brussels-Capital Region


In [5]:
# def complete(myquestion,session):
#     prompt =myquestion
#     model_name = 'llama3.1-405b'
#     # model_name = 'mixtral-8x7b' 
#     # model_name = 'llama3.1-405b'
#     # model_name = 'claude-3-5-sonnet'
#     cmd = """
#             select snowflake.cortex.complete(?, ?) as response        
#           """
#     df_response = session.sql(cmd, params=[model_name, prompt]).collect()
#     return df_response

from langchain_community.llms import Ollama

# Shared client for the remote Ollama server. `complete` reads it as an
# ordinary module-level name, so no `global` declaration is needed (a
# module-level `global` after the assignment is a SyntaxError when this
# notebook is exported and run as a plain script).
llm = Ollama(base_url = "http://107.99.236.45:11434", model="deepseek-r1:32b")


def complete(myquestion):
    """Send one prompt to the shared `llm` client and return its reply.

    Args:
        myquestion: Prompt text passed unchanged to ``llm.invoke``.

    Returns:
        Whatever ``llm.invoke`` returns for the prompt. The reply is also
        printed so progress is visible while batches run.
    """
    response = llm.invoke(myquestion)
    print(response)
    return response

  llm = Ollama(base_url = "http://107.99.236.45:11434", model="deepseek-r1:32b")


In [None]:
selected_pids = []  # NOTE(review): never populated in this cell
total_rows = df_main.shape[0]
batch_size = 10
max_batches = 5  # trial cap on how many batches are sent to the model
n_loop = math.ceil(total_rows/batch_size)

# Collect the per-batch frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies every earlier row on each
# iteration.
batch_frames = []
# Never run more batches than the table actually has, so a short table does
# not send empty slices to the model.
for i in range(min(n_loop, max_batches)):
    start_idx = i * batch_size
    end_idx = min((i+1) * batch_size, total_rows)
    print(f"[{start_idx}, {end_idx}]")
    description = df_main.iloc[start_idx:end_idx].values
    # The raw reply stays in `data` so it can be inspected after the loop.
    data = complete(user_message(description))
    batch_frames.append(process_response(data))

if batch_frames:
    temp_df = pd.concat(batch_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the empty-frame result instead.
    temp_df = pd.DataFrame()
temp_df.to_csv("PIDs_check_from_llm.csv")
temp_df

[0, 10]
<think>
Alright, let's tackle this problem step by step. The user has provided a list of Wikidata properties and wants me to determine which are relevant for influential people, persons, places, or entities that appear frequently in media, social media, TV programs, movies, sports programs, news programs, or other public spaces.

First, I need to understand what each property represents by looking at its label and description. Then, I'll evaluate if it's relevant based on the given criteria.

Starting with P6: "head of government." This seems highly relevant because heads of government are prominent figures in media and politics, so tracking them makes sense. So, I'll mark this as yes.

Next is P10: "video." Since videos are a common way to feature individuals or places in media, this property could be useful. However, the description mentions it's more about relevant videos, which might not always tie directly to influential entities, but it can still be relevant. I'll tentati

In [35]:
import re

# Pull out everything the model wrapped in bare ``` fences. Several fenced
# sections are glued together with newlines; joining an empty match list
# already yields '', which covers the "no fence found" case.
pattern = r'```\n(.*?)\n```'
matches = re.findall(pattern, data, flags=re.DOTALL)
dg = '\n'.join(matches)
dg

'PID  | Label          | Description                                                                                     | Relevant\n-----|----------------|-------------------------------------------------------------------------------------------------|---------\nP6   | head of government | head of the executive power of this town, city, municipality, state, country, or other governmental body | yes\nP10  | video          | relevant video. For images, use the property P18. For film trailers, qualify with...           | yes\nP14  | traffic sign   | graphic symbol describing the item, used at the side of or above roads to give instructions...   | no'

In [36]:
# Break the extracted table text into one string per row so each row can be
# inspected and parsed individually.
lines = dg.split("\n")
lines

['PID  | Label          | Description                                                                                     | Relevant',
 '-----|----------------|-------------------------------------------------------------------------------------------------|---------',
 'P6   | head of government | head of the executive power of this town, city, municipality, state, country, or other governmental body | yes',
 'P10  | video          | relevant video. For images, use the property P18. For film trailers, qualify with...           | yes',
 'P14  | traffic sign   | graphic symbol describing the item, used at the side of or above roads to give instructions...   | no']

In [41]:
# `parts` is not defined by any earlier cell on a fresh top-to-bottom run, so
# build it here from the first table line (the header row) before looking at
# its fourth column.
parts = lines[0].split('|')
parts[3].strip()

'Relevant'

In [43]:
# Parse the most recent raw model reply with the shared parser instead of
# keeping a second copy of its row-splitting logic here, so the two cannot
# drift apart.
df = process_response(data)
df

Unnamed: 0,PID,Label,Description,Relevant
1,P6,head of government,"head of the executive power of this town, city, municipality, state, country, or other governmental body",yes
2,P10,video,"relevant video. For images, use the property P18. For film trailers, qualify with...",yes
3,P14,traffic sign,"graphic symbol describing the item, used at the side of or above roads to give instructions...",no


In [None]:
import requests

# Direct HTTP call to the Ollama server, bypassing LangChain.
model_name = 'deepseek-r1:32b'
url = "http://107.99.236.45:11434"

apiBase = url
headers = {"Content-Type": "application/json"}
# Named `payload` rather than `data` so the raw LLM reply kept in `data` by
# the batch loop is not overwritten.
payload = {
    # Ollama expects the bare model tag; an "ollama/" prefix is a router-style
    # name that the server itself does not recognise.
    "model": model_name,
    "messages": [
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ],
    # Without this the server streams newline-delimited JSON chunks, which
    # response.json() cannot parse as a single document.
    "stream": False,
}

# Chat requests go to the /api/chat endpoint, not the server root.
response = requests.post(f"{apiBase}/api/chat", headers=headers, json=payload, timeout=300)
response.raise_for_status()
response.json()

In [1]:
# pip install langchain_community