In [None]:
!pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m528.1/528.1 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isodate<0.7.0,>=0.6.0 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 rdflib-6.3.2


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import random
import csv
import urllib.parse
import re
from SPARQLWrapper import JSON, SPARQLWrapper
from collections import defaultdict
import pandas as pd

WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")


prefix = """
	PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wds: <http://www.wikidata.org/entity/statement/>
    PREFIX wdv: <http://www.wikidata.org/value/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    PREFIX p: <http://www.wikidata.org/prop/>
    PREFIX ps: <http://www.wikidata.org/prop/statement/>
    PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX bd: <http://www.bigdata.com/rdf#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
"""

test_num = 100

def write_csv(fname, data):
	with open(fname, 'w') as f:
		write = csv.writer(f)
		write.writerows(data)

def query_sparql(query):
	sparql.setQuery(prefix+query)
	sparql.setReturnFormat(JSON)

	results = sparql.query().convert()

	return results



def print_examples(training_data, test):
	num = 2
	print("training_data_example")
	for x in range(num):
		print(training_data[x])

	print()
	print("test_data_example")
	for x in range(num):
		print(test[x])

def getPronouns(gender, val_type="subj"):
	if gender == "female":
		if val_type == "subj":
			return "she"
		elif val_type == "poss" or val_type == "obj":
			return "her"
	elif gender == "male":
		if val_type == "subj":
			return "he"
		elif val_type == "poss":
			return "his"
		elif val_type == "obj":
			return "him"
	else:
		if val_type == "subj":
			return "they"
		elif val_type == "poss":
			return "their"
		elif val_type == "obj":
			return "them"
	return "they"

In [None]:
import os
log_dir = '/content/drive/MyDrive/'
def get_logging(logfile):
    if not os.path.isdir(log_dir):
        os.mkdir(log_dir)
    log_path = os.path.join(log_dir, logfile)
    logger = logging.getLogger(log_path)

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        datefmt='%m-%d %H:%M',
                        # handlers=[logging.FileHandler(logfile)],
                        filename=log_path,
                        filemode='w'
                        )
    return logger

In [None]:
import logging
#import logging_helper
from collections import defaultdict
from SPARQLWrapper import SPARQLWrapper, JSON
#PREFIXES = constants.PREFIXES
PREFIXES = prefix
WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")

logger = get_logging('/content')


def clean_wikidata_id(wikidata_link):
    return wikidata_link.replace('http://www.wikidata.org/entity/', '')

class WikidataHelper():
    def __init__(self, logger=logger, wikidata_host=WIKIDATA_ENDPOINT):
        self.sparql = SPARQLWrapper(wikidata_host)
        self.logger = logger   # if logger else logging_helper.get_logging('{}.log'.format(self.__class__.__name__))

    def get_wikidata_id_by_label(self, entity_label, instance_of_id=None):
        retval = None

        if not entity_label:
            return retval

        instance_of_query = """
        BIND(wd:%s AS ?idType)
        {?s wdt:P31 ?idType. }
            UNION
            {?s wdt:P31 ?idNode .
            ?idNode wdt:P279 ?idType.}
                UNION
            {?s wdt:P31 ?idNode1 .
            ?idNode1 wdt:P279 ?idNode2.
            ?idNode2 wdt:P279 ?idType.}
        """ % instance_of_id if instance_of_id else ""

        query = """
        %s

        SELECT ?s (COUNT(?oedge) as ?count)
        WHERE
        {


            ?s rdfs:label '%s'@en .
            ?s wdt:P31 ?instance_of.
            FILTER (?instance_of not in ( wd:Q4167410 )  ) # filter disambiguation page
            %s
            ?s ?oedge ?other .

        } GROUP BY ?s ORDER BY DESC(?count)
        """ % (PREFIXES, entity_label, instance_of_query)
        self.sparql.setQuery(query)
        self.sparql.setReturnFormat(JSON)
        try:
            results = self.sparql.query().convert()
            retval = clean_wikidata_id(results['results']['bindings'][0]['s']['value'])
        except Exception as e:
            self.logger.info("[get_wikidata_id_by_label] Exception: {}\nquery={}".format(e, query))
        return retval


    def get_instance_of(self, wikidata_id):
        """

        :param wikidata_id:
        :return:
        """
        retval = []
        if not wikidata_id:
            return retval
        query = """
        %s
        SELECT DISTINCT ?oLabel
        WHERE
        {   wd:%s wdt:P31 ?o .
            ?o rdfs:label ?oLabel
            FILTER ( lang(?oLabel) = "en" )


        }
        """ % (PREFIXES, wikidata_id)
        self.sparql.setQuery(query)
        self.sparql.setReturnFormat(JSON)
        try:
            results = self.sparql.query().convert()
            retval = [x['oLabel']['value'] for x in results['results']['bindings']]
        except Exception as e:
            self.logger.info("[get_instance_of] Exception: {}\nquery={}".format(e, query))
        return retval

    def get_entity_all_outgoing_relations(self, wikidata_id):
        """
        all info includes all statements and identifiers in Wikidata page. Exclude alias, description
        :param wikidata_id:
        :return:
        """
        retval = {}

        if not wikidata_id:
            return retval

        query = """
        %s
        SELECT DISTINCT ?sLabel ?rel ?relName ?o ?oLabel
        WHERE
        {   wd:%s rdfs:label ?sLabel .
            wd:%s ?directClaimP ?o .          # Get the truthy triples.
            ?rel wikibase:directClaim ?directClaimP . # Find the Wikibase properties linked
                             # to the truthy triples' predicates

         FILTER (lang(?sLabel) = 'en')

         OPTIONAL {
           ?rel rdfs:label ?relName .
            FILTER ( lang(?relName) = "en" )  }
         OPTIONAL {
            ?o rdfs:label ?oLabel
            FILTER ( lang(?oLabel) = "en" )
           }

        }
        """% (PREFIXES, wikidata_id, wikidata_id)
        self.sparql.setQuery(query)
        self.sparql.setReturnFormat(JSON)
        try:
            results = self.sparql.query().convert()
            entity_lable = results['results']['bindings'][0]['sLabel']['value']
            retval['label'] = entity_lable
            relations = {}

            for r in results['results']['bindings']:
                relation_id = clean_wikidata_id(r['rel']['value'])
                object_id = clean_wikidata_id(r['o']['value']) if 'oLabel' in r else None
                object_label = r['oLabel']['value'] if 'oLabel' in r else r['o']['value']
                if relation_id in relations:
                    relations[relation_id]['values'].append({'object_id': object_id, 'object_label': object_label})
                else:
                    relations[relation_id] = {}
                    relations[relation_id]['relation_label'] = r['relName']['value']
                    relations[relation_id]['values'] = [{'object_id': object_id, 'object_label': object_label}]

            retval['relations'] = relations
        except Exception as e:
            self.logger.info("[get_entity_all_info] Exception: {}\nquery={}".format(e, query))
        return retval

    def get_entity_all_incoming_relations(self, wikidata_id):
        """
        get all direct incoming relations
        :param wikidata_id:
        :return:
        """
        retval = {}

        if not wikidata_id:
            return retval

        query = """
        %s
        SELECT DISTINCT  ?s ?sLabel ?rel ?relName ?oLabel
        WHERE
        {   wd:%s rdfs:label ?oLabel .

            ?s ?directClaimP wd:%s .          # Get the truthy triples.
            ?rel wikibase:directClaim ?directClaimP . # Find the Wikibase properties linked to the truthy triples' predicates


         FILTER (lang(?oLabel) = 'en')

         OPTIONAL {
           ?rel rdfs:label ?relName .
            FILTER ( lang(?relName) = "en" )  }
         OPTIONAL {
            ?s rdfs:label ?sLabel
            FILTER ( lang(?sLabel) = "en" )
           }
        }
        """ % (PREFIXES, wikidata_id, wikidata_id)
        self.sparql.setQuery(query)
        self.sparql.setReturnFormat(JSON)
        try:
            results = self.sparql.query().convert()
            entity_lable = results['results']['bindings'][0]['oLabel']['value']
            retval['label'] = entity_lable
            relations = {}

            for r in results['results']['bindings']:
                relation_id = clean_wikidata_id(r['rel']['value'])
                subject_id = clean_wikidata_id(r['s']['value']) if 'sLabel' in r else None
                subject_label = r['sLabel']['value'] if 'sLabel' in r else r['s']['value']
                if relation_id in relations:
                    relations[relation_id]['values'].append({'subject_id': subject_id, 'subject_label': subject_label})
                else:
                    relations[relation_id] = {}
                    relations[relation_id]['relation_label'] = r['relName']['value']
                    relations[relation_id]['values'] = [{'subject_id': subject_id, 'subject_label': subject_label}]

            retval['relations'] = relations
        except Exception as e:
            self.logger.info("[get_entity_incoming_edges] Exception: {}\nquery={}".format(e, query))
        return retval


    def get_entity_relation(self, wikidata_id, relation_id, is_outgoing_relation=True):
        """
        Get specific incoming/outgoing relation info
        :param wikidata_id: wikidata id for the node
        :param relation_id: wikidata id for the specific relation
        :param is_outgoing_relation: True for outgoing relations, False for incoming ones.
        :return: list of linked nodes with the wikidata_id and title.
        """
        retval = []

        if not wikidata_id:
            return retval

        relation_query = "wd:{} wdt:{} ?otherNode".format(wikidata_id, relation_id) if is_outgoing_relation \
            else '?otherNode wdt:{} wd:{}'.format(relation_id, wikidata_id)

        query = """
        %s
        SELECT DISTINCT  ?otherNode ?otherNodeLabel
        WHERE
        {   wd:%s rdfs:label ?nodeLabel .

            %s

         FILTER (lang(?nodeLabel) = 'en')

         OPTIONAL {
            ?otherNode rdfs:label ?otherNodeLabel
            FILTER ( lang(?otherNodeLabel) = "en" )
           }
        }
        """ % (PREFIXES, wikidata_id, relation_query)
        self.sparql.setQuery(query)
        self.sparql.setReturnFormat(JSON)
        try:
            results = self.sparql.query().convert()

            for r in results['results']['bindings']:
                retval.append({
                    'label': clean_wikidata_id(r['otherNodeLabel']['value']),
                    'wikidata_id': clean_wikidata_id(r['otherNode']['value'])
                })
        except Exception as e:
            self.logger.info("[get_entity_relation] Exception: {}\nquery={}".format(e, query))
        return retval


    def get_two_hop_path(self, source_node_id, target_node_id):
        """
        Find 2 hop path between source node and target node


        :param source_node_id:
        :param target_node_id:
        :return:
        """
        query = f"""
        {PREFIXES}
        SELECT DISTINCT  ?sourceNodeLabel ?relation1Label ?bridgeNode ?bridgeNodeLabel ?relation2Label ?targetNodeLabel
        WHERE
        {{   wd:{source_node_id} rdfs:label ?sourceNodeLabel .
            wd:{target_node_id} rdfs:label ?targetNodeLabel .
         FILTER (lang(?sourceNodeLabel) = 'en')
         FILTER (lang(?targetNodeLabel) = 'en')

         # relation: source -> relation1 -> bridge node -> relation2 -> target node
        {{ wd:{source_node_id} ?directClaimP ?bridgeNode .
            ?relations1 wikibase:directClaim ?directClaimP .
           ?bridgeNode ?otherDirectClaimP wd:{target_node_id} .
            ?relations2 wikibase:directClaim ?otherDirectClaimP .}}

         # relation: source -> relation1 -> bridge node <- relation2 <- target node
         UNION {{
            wd:{source_node_id} ?directClaimP ?bridgeNode .
           ?relations1 wikibase:directClaim ?directClaimP .
           wd:{target_node_id} ?otherDirectClaimP ?bridgeNode  .
           ?relations2 wikibase:directClaim ?otherDirectClaimP .
         }}

         # relation: source <- relations1 <- bridge node <- relations2 <- target node
        UNION {{
           ?bridgeNode ?directClaimP wd:{source_node_id}.
           ?relations1 wikibase:directClaim ?directClaimP .
           wd:{target_node_id} ?otherDirectClaimP ?bridgeNode  .
           ?relations2 wikibase:directClaim ?otherDirectClaimP .
         }}

         # relation: source <- relations1 <- bridge node -> relations2 -> target node
        UNION {{
         ?bridgeNode ?directClaimP wd:{source_node_id}.
           ?relations1 wikibase:directClaim ?directClaimP .
           ?bridgeNode ?otherDirectClaimP wd:{target_node_id}   .
           ?relations2 wikibase:directClaim ?otherDirectClaimP .
         }}


         OPTIONAL {{
            ?bridgeNode rdfs:label ?bridgeNodeLabel
            FILTER ( lang(?bridgeNodeLabel) = "en" )
           }}

         OPTIONAL {{
           ?relations1 rdfs:label ?relation1Label
                      FILTER ( lang(?relation1Label) = "en" )
         }}
         OPTIONAL {{
           ?relations2 rdfs:label ?relation2Label
                           FILTER ( lang(?relation2Label) = "en" )
         }}
        }}

        """
        retval = []
        self.sparql.setQuery(query)
        self.sparql.setReturnFormat(JSON)
        try:
            results = self.sparql.query().convert()

            for r in results['results']['bindings']:
                retval.append({
                    'sourceNodeLabel': r['sourceNodeLabel']['value'],
                    'relation1Label': r['relation1Label']['value'],
                    'bridgeNode': clean_wikidata_id(r['bridgeNode']['value']),
                    'bridgeNodeLabel': r['bridgeNodeLabel']['value'],
                    'relation2Label': r['relation2Label']['value'],
                    'targetNodeLabel': r['targetNodeLabel']['value']})

        except Exception as e:
            self.logger.info("[get_two_hop_path] Exception: {}\nquery={}".format(e, query))
        return retval

In [2]:
pop_df = pd.read_csv("/content/drive/MyDrive/KG-NLG Capstone 2023/popularity_data/sports_popularity.csv")
pop_df['entity_type'].unique()
filter_list = ['table_tennis_player', 'golfer', 'field_hockey_player', 'canadian_football_player', 'badminton_player', 'volleyball_player', 'swimmer', 'boxer', 'rugby_player',
                'tennis_player', 'ice_hockey_player',
                'baseball_player', 'cricketer', 'american_football_player',
                'soccer_player', 'basketball_player']

# Filter the DataFrame based on the filter list
filtered_pop_df = pop_df[pop_df['entity_type'].isin(filter_list)]

NameError: ignored

# get one hop data for 30 popular sports figure and 30 not so popular figures

In [None]:
player1 = filtered_pop_df.sort_values("count", ascending=False).reset_index()  # Sort the DataFrame by count
player2 = filtered_pop_df.sort_values("count", ascending=False)
player2 = player2[int(len(player2)/64):].reset_index()

NameError: ignored

In [None]:
player1

Unnamed: 0.1,level_0,Unnamed: 0,index,entity_type,wikidata_label,wikidata_label_clean,wikidata_description,imdb_id,gender,alias,date,wikipedia_link,wikipedia_title,wikipedia_title_clean,wikidata_id,count,domain_mapping
0,432611,1137394,2,soccer_player,Lionel Messi,Lionel Messi,Argentine association football player,nm2177779,male,Messi | Leo Messi | Lionel Andres Messi | Lion...,1987-06-24,https://en.wikipedia.org/wiki/Lionel_Messi,Lionel_Messi,Lionel Messi,Q615,3816771.0,sports
1,142129,236549,150,soccer_player,Cristiano Ronaldo,Cristiano Ronaldo,Portuguese footballer (born 1985),nm1860184,male,Ronaldo | CR7 | Cristiano Ronaldo dos Santos A...,1985-02-05,https://en.wikipedia.org/wiki/Cristiano_Ronaldo,Cristiano_Ronaldo,Cristiano Ronaldo,Q11571,3382718.0,sports
2,6794,9711,2825,tennis_player,George VI,George VI,"King of the United Kingdom from 1936 to 1952, ...",,male,Bertie | Albert Windsor | George Windsor | Alb...,1895-12-14,https://en.wikipedia.org/wiki/George_VI,George_VI,George VI,Q280856,2274593.0,sports
3,280839,703013,214075,soccer_player,Erling Haaland,Erling Haaland,Norwegian footballer (born 2000),nm10994643,male,Erling Braut Haaland | Erling Braut Håland | E...,2000-07-21,https://en.wikipedia.org/wiki/Erling_Haaland,Erling_Haaland,Erling Haaland,Q28967995,1684726.0,sports
4,201112,426833,7994,basketball_player,Brittney Griner,Brittney Griner,American basketball player,nm5330249,female,Brittney Yevette Griner,1990-10-18,https://en.wikipedia.org/wiki/Brittney_Griner,Brittney_Griner,Brittney Griner,Q2925780,1534569.0,sports
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738648,783107,2282266,137253,basketball_player,Megan Pinske,Megan Pinske,Canadian basketball player,,female,,1988-10-31,,,,Q112800121,,sports
738649,783108,2282267,137254,basketball_player,Laurelle Weigl,Laurelle Weigl,Canadian basketball player,,female,,1988-04-05,,,,Q112800421,,sports
738650,783109,2282268,137255,basketball_player,Leighann Doan,Leighann Doan,Canadian basketball player,,female,,1978-11-06,,,,Q112800963,,sports
738651,783110,2282269,137256,basketball_player,Alex Anderson,Alex Anderson,American basketball player,,female,,1986-02-21,,,,Q112691972,,sports


In [None]:
helper = WikidataHelper()
helper.get_entity_all_incoming_relations('Q169138')

In [None]:
from datetime import datetime
import time
cutoff_values = [40, 5, 0]
rating_val = ['excellent', 'good', 'okay']
# rating_val = ['excellent', 'good', 'okay', 'poor']
def check_value(dictionary, value):
    for inner_dict in dictionary.values():
        if value in inner_dict.values():
            return True
    return False

def process_data(data):
    for key, value in data.items():
        if isinstance(value, str) and value == "":
            data[key] = "N/A"
        elif isinstance(value, list) and value == []:
            data[key] = ["N/A"]
    return data

def get_topn_data(df, n, req_lst):
    helper = WikidataHelper()
    full_data = []
    count = 0
    unique = []
    for index, player in df.iterrows():
        player_data = {"name":"", "sport":"", "team_positions":[], "awards":[],
                       "dob":"", "pob":"", "sports_teams":[], "height":"", "start_time":"",
                       "num_awards":0, "stats_leader": [], "rating":"", "wiki_id":""}
        player_data['wiki_id'] = player['wikidata_id']
        player_q = helper.get_entity_all_outgoing_relations(player['wikidata_id'])
        incoming_player_q = helper.get_entity_all_incoming_relations(player['wikidata_id'])
        count_flg = 0
        while 'label' not in player_q and count_flg <= 1:
            time.sleep(1)
            player_q = helper.get_entity_all_outgoing_relations(player['wikidata_id'])
            count_flg+=1
        if count_flg >= 2:
            continue

        count_flg = 0
        while 'label' not in incoming_player_q and count_flg <= 1:
            time.sleep(1)
            incoming_player_q = helper.get_entity_all_incoming_relations(player['wikidata_id'])
            count_flg+=1
        if count_flg >= 2:
            continue

        player_data['name'] = df['wikidata_label_clean']

        if all(check_value(player_q['relations'], value) for value in req_lst) and player_q['label'] not in unique:
            print(count, ": ", n)
            unique.append(player_q['label'])
            if 'label' in incoming_player_q:
                for relation in incoming_player_q['relations'].values():
                    if relation['relation_label'] == 'statistical leader':
                        for val in relation['values']:
                            player_data['stats_leader'].append(val['object_label'])
            for relation in player_q['relations'].values():
                if relation['relation_label'] == 'place of birth':
                    player_data['pob'] = relation['values'][0]['object_label']
                elif relation['relation_label'] == 'position played on team / speciality':
                    for val in relation['values']:
                        player_data['team_positions'].append(val['object_label'])
                elif relation['relation_label'] == 'statistical leader':
                    for val in relation['values']:
                        player_data['stats_leader'].append(val['object_label'])
                elif relation['relation_label'] == 'work period (start)':
                    try:
                        dt = datetime.strptime(relation['values'][0]['object_label'], '%Y-%m-%dT%H:%M:%SZ')
                        output_string = dt.strftime("%Y")
                        player_data["start_time"] = output_string
                    except ValueError:
                        player_data["start_time"] = relation['values'][0]['object_label']
                        pass
                elif relation['relation_label'] == 'member of sports team':
                    for val in relation['values']:
                        player_data['sports_teams'].append(val['object_label'])
                elif relation['relation_label'] == 'sport':
                    player_data['sport'] = relation['values'][0]['object_label']
                elif relation['relation_label'] == 'award received':
                    for val in relation['values']:
                        player_data['awards'].append(val['object_label'])
                    player_data['num_awards'] = len(player_data['awards'])
                elif relation['relation_label'] == 'height':
                    player_data['height'] = relation['values'][0]['object_label']
                elif relation['relation_label'] == 'date of birth':
                    try:
                        dt = datetime.strptime(relation['values'][0]['object_label'], '%Y-%m-%dT%H:%M:%SZ')
                        output_string = dt.strftime('%B %d, %Y').replace(' 0', ' ')
                        player_data["dob"] = output_string
                    except ValueError:
                        player_data["dob"] = relation['values'][0]['object_label']
                        pass
                for i, v in enumerate(cutoff_values):
                    if player_data['num_awards'] >= v:
                        player_data["rating"] = rating_val[i]
                        break
            player_data = process_data(player_data)
            full_data.append(player_data)
            count += 1
            if count == n:
                return pd.DataFrame(full_data)
        else:
            continue
    return pd.DataFrame(full_data)

required_attribute_list = ['place of birth',
                           'work period (start)',
                           'sport', 'height', 'date of birth']
print("first")
df1 = get_topn_data(player1, 30, required_attribute_list)
print("second")
df2 = get_topn_data(player2, 30, required_attribute_list)

first
0 :  30
1 :  30
2 :  30
3 :  30
4 :  30
5 :  30
6 :  30
7 :  30
8 :  30
9 :  30
10 :  30
11 :  30
12 :  30
13 :  30
14 :  30
15 :  30
16 :  30
17 :  30
18 :  30
19 :  30
20 :  30
21 :  30
22 :  30
23 :  30
24 :  30
25 :  30
26 :  30
27 :  30
28 :  30
29 :  30
second
0 :  30
1 :  30
2 :  30
3 :  30
4 :  30
5 :  30
6 :  30
7 :  30
8 :  30
9 :  30
10 :  30
11 :  30
12 :  30
13 :  30
14 :  30
15 :  30
16 :  30
17 :  30
18 :  30
19 :  30
20 :  30
21 :  30
22 :  30
23 :  30
24 :  30
25 :  30
26 :  30
27 :  30
28 :  30
29 :  30


KeyboardInterrupt: ignored

In [None]:
df = pd.concat([df1, df2])
df.to_csv('/content/drive/MyDrive/sport_select.csv',index=False)

# multiHOP


In [None]:
# get popularity data
def top_n_pruning(lst, n, visited):
    count_dict = dict(zip(pop_df['wikidata_id'], pop_df['count']))
    for item in lst:
        object_id = item.get('object_id', None)
        subject_id = item.get('subject_id', None)
        if object_id is not None:
            count = count_dict.get(object_id, 0)
        elif subject_id is not None:
            count = count_dict.get(subject_id, 0)
        else:
            count = 0
        item['count'] = count
    sorted_list = sorted(lst, key=lambda x: x['count'], reverse=True)
    filtered_data = [item for item in sorted_list if ('object_id' not in item or item['object_id'] not in visited) and ('subject_id' not in item or item['subject_id'] not in visited)]
    if len(filtered_data) == 0:
        return None
    else:
        return filtered_data[:n]

def filter_sentences(sentences, mandatory):
    filtered_sentences = []
    for path_str, words in sentences:
        if all(word in path_str for word in mandatory):
            filtered_sentences.append((path_str, words))
    return filtered_sentences


from typing import List, Tuple
from datetime import datetime
SAVED_NODES_OUT = {}
SAVED_NODES_IN = {}
def generate_n_hop_paths(helper, q_node: str, n_hops: int, cur_str: str = '', cur_hop: int = 0, visited: set = None,
                         mandatory_list: set = None, excluded_list: set = None, keep_list: set = None, branch: bool = False) -> List[Tuple[str, List[str]]]:
    if visited is None:
        visited = set()
    visited.add(q_node)


    if cur_hop == n_hops:
        return [(cur_str, [q_node])]

    if q_node not in SAVED_NODES_OUT:
        time.sleep(1)
        outgoing_relations = helper.get_entity_all_outgoing_relations(q_node)
        SAVED_NODES_OUT[q_node] = outgoing_relations
    else:
        outgoing_relations = SAVED_NODES_OUT[q_node]
    if q_node not in SAVED_NODES_IN:
        time.sleep(1)
        incoming_relations = helper.get_entity_all_incoming_relations(q_node)
        SAVED_NODES_IN[q_node] = incoming_relations
    else:
        incoming_relations = SAVED_NODES_IN[q_node]
    paths = []
    #keep_list = ['part of', 'performer', 'tracklist', 'record label', 'genre']
    # Process outgoing relations
    for r in outgoing_relations:
        if r == 'label' and cur_hop == 0:
            start_node = outgoing_relations[r]
            cur_str = start_node
        elif r != 'label':
            for rels in outgoing_relations[r]:
                relation_label = outgoing_relations[r][rels]['relation_label']
                if relation_label in keep_list:
                    if len(outgoing_relations[r][rels]['values']) > 10:
                        pruned_list = top_n_pruning(outgoing_relations[r][rels]['values'], 10, visited)
                    else:
                        pruned_list = outgoing_relations[r][rels]['values']
                    if pruned_list is None:  # Check if pruned_list is None
                        continue
                    for val in pruned_list:
                        if relation_label == "publication date":
                            try:
                                dt = datetime.strptime(val["object_label"], '%Y-%m-%dT%H:%M:%SZ')
                                updated_str = cur_str + " --> " + relation_label + ' --> ' + dt.strftime("%Y")
                            except ValueError:
                                pass
                        else:
                            updated_str = cur_str + " --> " + relation_label + ' --> ' + val['object_label']
                        next_q_node = val['object_id']
                        new_visited = visited.copy()
                        new_visited.add(next_q_node)
                        sub_paths = generate_n_hop_paths(helper, next_q_node, n_hops,
                                                         updated_str, cur_hop + 1, new_visited,
                                                         mandatory_list, excluded_list, keep_list, branch)
                        for sub_path in sub_paths:
                            path_str, q_nodes = sub_path
                            paths.append((path_str, [q_node] + q_nodes))

    # Process incoming relations
    for r in incoming_relations:
        if r != 'label':
            for rels in incoming_relations[r]:
                relation_label = incoming_relations[r][rels]['relation_label']
                if relation_label in keep_list:
                    if len(incoming_relations[r][rels]['values']) > 10:
                        pruned_list = top_n_pruning(incoming_relations[r][rels]['values'], 10, visited)
                    else:
                        pruned_list = incoming_relations[r][rels]['values']
                    if pruned_list is None:  # Check if pruned_list is None
                        continue
                    for val in pruned_list:
                        updated_str = cur_str + " --> " + "~" + relation_label + ' --> ' + val['subject_label']
                        prev_q_node = val['subject_id']
                        new_visited = visited.copy()
                        new_visited.add(prev_q_node)
                        sub_paths = generate_n_hop_paths(helper, prev_q_node,
                                                         n_hops, updated_str, cur_hop + 1,
                                                         new_visited, mandatory_list,
                                                         excluded_list, keep_list, branch)
                        for sub_path in sub_paths:
                            path_str, q_nodes = sub_path
                            paths.append((path_str, [q_node] + q_nodes))
    paths = filter_sentences(paths, mandatory_list)
    if len(paths) > 10:
        paths = paths[:10]
    return paths

import random
def generate_trees(shape_inp, shape_num_hops, start_q_node, n_hops, mandatory_list, excluded_list, full_list, branch):
    helper = WikidataHelper()
    ori_path_list = generate_n_hop_paths(helper, start_q_node, n_hops,
                                         mandatory_list=mandatory_list,
                                         excluded_list=excluded_list,
                                         keep_list=full_list)
    #if len(shape_inp)==0:
    #    return ori_path_list
    aditional_path_list = []
    final_path = []
    for (path, nodes) in ori_path_list:
        skip = False
        temp_str = path
        temp_nodes = []
        for i, shape in enumerate(shape_inp):
            additions = generate_n_hop_paths(helper, nodes[shape], shape_num_hops[i],
                                             mandatory_list=mandatory_list,
                                             excluded_list=excluded_list,
                                             keep_list=full_list, branch=branch)
            addition_path = [t[0] for t in additions]   # [1, 2, 3]
            addition_q_node = [t[1] for t in additions]
            non_overlapping_nodes = []
            non_overlapping_paths = []
            for n, p in zip(addition_q_node, addition_path):
                if n[1] not in nodes and n[1] not in temp_nodes:
                    non_overlapping_nodes.append(n)
                    non_overlapping_paths.append(p)
            if len(non_overlapping_paths) != 0:
                index = random.randint(0, len(non_overlapping_paths)-1)
                temp_str+= ' | ' + non_overlapping_paths[index]
                temp_nodes.append(non_overlapping_nodes[index])
            else:
                skip = True
                break
        if skip == True:
            continue
        final_path.append(temp_str)
    return final_path

if __name__ == "__main__":

    helper = WikidataHelper()
    """
    n_hops = 2
    shape = [0]
    n_hops_shape = [1]

    """
    n_hops = 2
    shape = [0, 0]
    n_hops_shape = [1, 2]
    start_q_node = 'Q18786715'

    # path_list = generate_n_hop_paths(helper, start_q_node, n_hops)
    #path_list = generate_trees(shape, n_hops_shape, start_q_node, n_hops)
    #for p in path_list:
    #   print(p)
   # result = helper.get_two_hop_path(source_node_id='Q37175', target_node_id='Q295463')
   # for r in result:
   #    print(r)

In [None]:
import ast
import pandas as pd
df_sum = pd.read_csv("/content/drive/MyDrive/athlete_select.csv")

In [None]:
import json

# Open the JSON file
with open('/content/drive/MyDrive/athlete_path_constraints.json', 'r') as file:
    # Load the JSON data
    slot_dict = json.load(file)

In [None]:
import time
from datetime import datetime
import pickle
def remove_prefix(strings):
    return [" --> ".join(string.split(' --> ')[2:]) for string in strings]

def replace(strings, rating):
    flag = False
    final_out = []
    name = strings[0].split(' | ')[0].split(' --> ')[0]
    for i, string in enumerate(strings):
        seperate_lst = string.split(' | ')
        modified = []
        for item in seperate_lst:
            if len(item.split(' --> '))==3 and flag == False:
                modified.append(name + " --> rating --> " + rating)
                flag = True
            else:
                modified.append(item)
        final_out.append(" | ".join(modified))
    return final_out


def add_rating(strings, rating):
    return [rating + " --> ~rating --> " + string for string in strings]
def add_specifiers(strings, specifiers):
    return [random.choice(specifiers) + " --> ~specifier --> " + string for string in strings]

def generate_paths_with_da_constraints(path_to_shapes, slot_dict, specifiers):
    info = pd.read_csv(path_to_shapes)
    paths = []
    rating_df = pd.read_csv("/content/drive/MyDrive/athlete_select.csv")
    for da, value in slot_dict.items():
        mand = value['mandatory']
        excld = value['excluded']
        full = value['full']
        min_slots = value['min_slots']
        max_slots = value['max_slots']
        for index, row in info.iterrows():
            if row['not generate'] == 'X' or row['num_nodes'] < min_slots or row['num_nodes'] > max_slots:
                continue
            branch_loc = ast.literal_eval(row['branch_loc'])
            n_hops_shape = ast.literal_eval(row['n_hops_shape'])
            n_hops = row["Depth/Hops"]
            shape = row['Shape']
            for n, song in df_sum.iterrows():
                print("DA:{} shape:{} {}:60".format(da, shape, n))
                print(song['wiki_id'])
                if row['branch'] == "no":
                    branch = False
                else:
                    branch = True

                if da == "give_opinion" or da == "verify_attribute":
                    path_list = generate_trees(branch_loc, n_hops_shape, song['wiki_id'], n_hops-1, mand, excld, full, branch)
                else:
                    path_list = generate_trees(branch_loc, n_hops_shape, song['wiki_id'], n_hops, mand, excld, full, branch)
                count = 0
                while len(path_list) == 0 and count <=3:
                    del SAVED_NODES_OUT[song['wiki_id']]
                    del SAVED_NODES_IN[song['wiki_id']]
                    if da == "give_opinion":
                        path_list = generate_trees(branch_loc, n_hops_shape, song['wiki_id'], n_hops-1, mand, excld, full, branch)
                    else:
                        path_list = generate_trees(branch_loc, n_hops_shape, song['wiki_id'], n_hops, mand, excld, full, branch)
                    count += 1
                if da == "request_explanation":
                    path_list = remove_prefix(path_list)
                    rating=rating_df.loc[n]['rating']
                    path_list = add_rating(path_list, rating)
                elif da == "request":
                    path_list = remove_prefix(path_list)
                    path_list = add_specifiers(path_list, specifiers)
                elif da == "give_opinion" or da == "verify_attribute":
                    rating=rating_df.loc[n]['rating']
                    path_list = add_rating(path_list, rating)
                elif da == "inform" and row['inform replace rating']=='yes' and random.random() < 1/10:
                    rating=rating_df.loc[n]['rating']
                    if len(path_list) == 0:
                        continue
                    path_list = replace(path_list, rating)

                for p in path_list:
                    path_data = {"da":da, "shape":shape, "path":p, "n_hops":n_hops}
                    paths.append(path_data)
                df = pd.DataFrame(paths)
                df.to_csv("/content/drive/MyDrive/athlete_path_with_shapes.csv", index=False)
            print(len(SAVED_NODES_OUT))
            print(len(SAVED_NODES_IN))
            with open("/content/drive/MyDrive/out_info.json", 'wb') as json_file:
                pickle.dump(SAVED_NODES_OUT, json_file)

            with open("/content/drive/MyDrive/in_info.json", 'wb') as json_file_1:
                pickle.dump(SAVED_NODES_IN, json_file_1)

            with open("/content/drive/MyDrive/out_info.json", 'rb') as file:
                SAVED_NODES_OUT = pickle.load(file)

            with open("/content/drive/MyDrive/in_info.json", 'rb') as file:
                SAVED_NODES_IN = pickle.load(file)

specifier_list = ['gifted', 'accomplished', 'seasoned', 'boring', 'inadequate', 'unlucky', "underated", "overrated"]
path_to_shapes = "/content/drive/MyDrive/shapes.csv"

generate_paths_with_da_constraints(path_to_shapes, slot_dict, specifier_list)

NameError: ignored

In [None]:
df = pd.read_csv('/content/drive/MyDrive/org_paths.csv')
import pandas as pd
import re
def add_pseudo_mr_columns(df):
    def get_pseudo_mr_1(path):
        tokens = path.split(" --> ")
        return " ".join([tokens[i] for i in range(0, len(tokens), 2)])

    def get_pseudo_mr_2(path):
        old_toks = path.split(" --> ")
        tokens = []
        for i, item in enumerate(old_toks):
            tokens.append(item)
            if i!=0 and i % 2 == 0 and i!=len(old_toks)-1:
                tokens.append(old_toks[i])
        for i, tok in enumerate(tokens):
            if '~' in tok:
                tokens[i] = tokens[i][1:]
                tokens[i+1], tokens[i-1] = tokens[i-1], tokens[i+1]
        cur_str = ''
        for i in range(0, len(tokens), 3):
            cur_str += tokens[i] + " " + tokens[i+1] + " " + tokens[i+2] + ". "

        return cur_str


    df["pseudo_mr_1"] = df["path"].apply(get_pseudo_mr_1)
    df["pseudo_mr_2"] = df["path"].apply(get_pseudo_mr_2)
    return df



adjusted_df=add_pseudo_mr_columns(df)
adjusted_df.to_csv('/content/drive/MyDrive/mr_paths.csv')

In [None]:
 class SongQueries(object):
    SONG_ALL  = """
    SELECT ?song ?songLabel ?publication ?genreLabel ?labelLabel ?performerLabel ?view ?producerLabel ?instanceLabel ?partLabel ?tracklistLabel
    {
        BIND(wd:%s AS ?song)

        ?song rdfs:label ?songLabel .
        FILTER(LANG(?songLabel) = "en") .
        OPTIONAL
        {
            ?song wdt:P31 ?instance.
            ?instance rdfs:label ?instanceLabel .
            FILTER(LANG(?instanceLabel) = "en") .
        }
        OPTIONAL
        {
            ?song p:P1651 ?youtubeID .
            ?youtubeID pq:P5436 ?view.
        }
        OPTIONAL
        {
            ?song wdt:P162 ?producer.
            ?producer rdfs:label ?producerLabel .
            FILTER(LANG(?producerLabel) = "en") .
        }
        OPTIONAL
        {
            ?song wdt:P175 ?performer.
            ?performer rdfs:label ?performerLabel .
            FILTER(LANG(?performerLabel) = "en") .
        }
        OPTIONAL
        {
            ?song wdt:P136 ?genre.
            ?genre rdfs:label ?genreLabel .
            FILTER(LANG(?genreLabel) = "en") .
        }
        OPTIONAL
        {
            ?song wdt:P264 ?label.
            ?label rdfs:label ?labelLabel .
            FILTER(LANG(?labelLabel) = "en") .
        }
        OPTIONAL
        {
            ?song wdt:P577 ?publication.
        }
        OPTIONAL
        {
            ?song wdt:P361 ?part.
            ?part rdfs:label ?partLabel .
            FILTER(LANG(?partLabel) = "en") .
        }
        OPTIONAL
        {
            ?song wdt:P361 ?album.
            ?album wdt:P658 ?tracklist.
            ?tracklist rdfs:label ?tracklistLabel .
            FILTER(LANG(?tracklistLabel) = "en") .
        }
    }
    """

In [None]:
import pandas as pd

songs = pd.read_csv("/content/drive/MyDrive/Copy of music_popularity.csv")

  songs = pd.read_csv("/content/drive/MyDrive/Copy of music_popularity.csv")


In [None]:

songs

'Blurred Lines'

In [None]:
song = songs[songs.entity_type == "song"]
song

Unnamed: 0.1,Unnamed: 0,index,entity_type,wikidata_label,wikidata_label_clean,wikidata_description,imdb_id,gender,alias,date,wikipedia_link,wikipedia_title,wikipedia_title_clean,wikidata_id,count,domain_mapping
22712,292460,1,song,Never Gonna Give You Up,Never Gonna Give You Up,song written and composed by Stock Aitken Wate...,,,,1987-01-01,https://en.wikipedia.org/wiki/Never_Gonna_Give...,Never_Gonna_Give_You_Up,Never Gonna Give You Up,Q57,175164.0,music
22713,292461,27,song,Goldfinger,Goldfinger,theme song of the James Bond film,,,Theme from « Goldfinger »,1964-01-01,https://en.wikipedia.org/wiki/Goldfinger_(Shir...,Goldfinger_(Shirley_Bassey_song),Goldfinger Shirley Bassey song,Q14716,8991.0,music
22714,292462,50,song,The Way You Make Me Feel,The Way You Make Me Feel,original song written and composed by Michael ...,,,,1987-01-01,https://en.wikipedia.org/wiki/The_Way_You_Make...,The_Way_You_Make_Me_Feel,The Way You Make Me Feel,Q14651,23275.0,music
22715,292463,61,song,Im Wartesaal zum großen Glück,Im Wartesaal zum großen Glück,Walter Andreas Schwarz song,,,"Das Lied vom großen Glück | ""Im Wartesaal zum ...",1956-01-01,https://en.wikipedia.org/wiki/Im_Wartesaal_zum...,Im_Wartesaal_zum_gro%C3%9Fen_Gl%C3%BCck,Im Wartesaal zum gro%C3%9Fen Gl%C3%BCck,Q82525,,music
22716,292464,121,song,Feel the Love,Feel the Love,Rudimental song,,,,2012-05-14,https://en.wikipedia.org/wiki/Feel_the_Love_(R...,Feel_the_Love_(Rudimental_song),Feel the Love Rudimental song,Q21083,3007.0,music
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456263,1709193,139986,song,Hello Future,Hello Future,,,,,2018-04-25,,,,Q113044415,,music
456264,1709194,139987,song,Re:RISE -e.p.- 2,ReRISE e.p. 2,,,,,2020-08-05,,,,Q113045163,,music
456265,1709195,139988,song,Re:RISE -e.p.-,ReRISE e.p.,,,,,2019-10-23,,,,Q113045180,,music
456266,1709196,139989,song,Sayonara namida/Hoshi no kakera,Sayonara namidaHoshi no kakera,2020 Spira Spica single,,,,2020-12-09,,,,Q113045146,,music


In [None]:
# Get the value of the "count" column for each quartile cutoff row
cutoff_values = [10000000, 100000, 10000, 1000]
rating_val = ['excellent', 'good', 'mediocre', 'bad']

In [None]:
song = song.sort_values("count", ascending=False)  # Sort the DataFrame by count

# extract the middle 200 values from the "count" column
song = song[int(len(song)/8):].reset_index()
song

NameError: ignored

  songs = pd.read_csv("/content/drive/MyDrive/Copy of music_popularity.csv")


60

DA:inform 0:60
['genres'] ['publication date'] ['genres', 'producer', 'record label', 'performer', 'instance of', 'part of', 'track list']
DA:inform 1:60
['genres'] ['publication date'] ['genres', 'producer', 'record label', 'performer', 'instance of', 'part of', 'track list']
DA:inform 2:60
['genres'] ['publication date'] ['genres', 'producer', 'record label', 'performer', 'instance of', 'part of', 'track list']
DA:inform 3:60
['genres'] ['publication date'] ['genres', 'producer', 'record label', 'performer', 'instance of', 'part of', 'track list']


KeyboardInterrupt: ignored

['Taylor Swift --> genre --> pop music',
 'Taylor Swift --> genre --> country music',
 'Taylor Swift --> genre --> pop rock',
 'Taylor Swift --> genre --> synth-pop',
 'Taylor Swift --> genre --> indie folk',
 'Taylor Swift --> genre --> country pop']

In [None]:
from datetime import datetime
import math
import time
all_data = []
songleng = len(song)
counter = 0
max = 1
pop_df = pd.read_csv("/content/drive/MyDrive/Copy of music_popularity.csv")
song_ids = []
for n, songId in enumerate(song["wikidata_id"]):
    if counter == 30:
        break
    # print(n, "--", songleng)
    songName = song["wikipedia_title"][n]

   # print(bookId, bookName)
   # ?songLabel ?performerLabel ?publication ?genreLabel ?labelLabel

    songs_data = {"name":"", "performer":[], "producer":[], "publication_date":"",
                  "publication_year":"", "genres":[], "record_label":[], "rating":"",
                  "instance_of":[], "is_from_album":"", "from_album":"", "songs_in_album":[],
                  "two_hop":[], "three_hop":[], "four_hop":[], "five_hop":[]}
    # performersQ = query_sparql(SongQueries.SONG_PERFORMER  % (songId))
    othersQ = query_sparql(SongQueries.SONG_ALL  % (songId))
    # print(songName)
    # producersQ = query_sparql(SongQueries.SONG_PRODUCER  % (songId))
    # viewQ = query_sparql(SongQueries.SONG_VIEW  % (songId))
    # print(othersQ)
    flag = False
    for p in othersQ["results"]["bindings"]:
        if "performerLabel" in p:
            performer = p["performerLabel"]["value"]
            songs_data["performer"].append(performer)
        else:
            flag = True
            break
        if "genreLabel" in p:
            genre = p["genreLabel"]["value"]
            songs_data["genres"].append(genre)
        else:
            flag = True
            break
        if "labelLabel" in p:
            record_label = p["labelLabel"]["value"]
            songs_data["record_label"].append(record_label)
        else:
            flag = True
            break
        if "producerLabel" in p:
            producer = p["producerLabel"]["value"]
            songs_data["producer"].append(producer)
        else:
            flag = True
            break
        if "view" in p:
            #songs_data["yotube_view"] = p["view"]["value"]
            for i, cutoff in enumerate(cutoff_values):
                if int(p["view"]["value"]) >= cutoff:
                    songs_data["rating"]= rating_val[i]
                    break
                elif int(p["view"]["value"]) < cutoff_values[-1]:
                    songs_data["rating"]= rating_val[-1]
                    break
        else:
            flag = True
            break
        if "instanceLabel" in p:
            inst = p["instanceLabel"]["value"]
            songs_data["instance_of"].append(inst)
        else:
            flag = True
            break
        if "publication" in p:
            # Convert input string to datetime object
            try:
                dt = datetime.strptime(p["publication"]["value"], '%Y-%m-%dT%H:%M:%SZ')
                songs_data["publication_year"] = dt.strftime("%Y")
                # Format datetime object to desired string format
                output_string = dt.strftime('%B %d, %Y').replace(' 0', ' ')
                songs_data["publication_date"] = output_string
            except ValueError:
                # Ignore input string if it is not in the expected format
                pass
        else:
            flag = True
            break
        if "partLabel" in p:
            part = p["partLabel"]["value"]
            songs_data["from_album"] = part
            songs_data["is_from_album"] = "is from album"
        else:
            songs_data["from_album"] = 'N/A'
            songs_data["is_from_album"] = "is a single"
        if "tracklistLabel" in p:
            songs = p["tracklistLabel"]["value"]
            songs_data["songs_in_album"].append(songs)

    if flag == True:
        continue

    counter+=1
    songs_data["name"] = re.sub(r'_', ' ', urllib.parse.unquote(songName)).split(" (")[0]
    songs_data["performer"] = set(songs_data["performer"])
    songs_data["genres"] = set(songs_data["genres"])
    songs_data["instance_of"] = set(songs_data["instance_of"])
    songs_data["record_label"] = set(songs_data["record_label"])
    songs_data["producer"] = set(songs_data["producer"])
    songs_data["songs_in_album"] = set(songs_data["songs_in_album"])
    all_data.append(songs_data)
    song_ids.append(songId)
    print("bar: ", len(all_data), 30)


  pop_df = pd.read_csv("/content/drive/MyDrive/Copy of music_popularity.csv")


bar:  1 30
bar:  2 30
bar:  3 30
bar:  4 30
bar:  5 30
bar:  6 30
bar:  7 30
bar:  8 30
bar:  9 30
bar:  10 30
bar:  11 30
bar:  12 30
bar:  13 30
bar:  14 30
bar:  15 30
bar:  16 30
bar:  17 30
bar:  18 30
bar:  19 30
bar:  20 30
bar:  21 30
bar:  22 30
bar:  23 30
bar:  24 30
bar:  25 30
bar:  26 30
bar:  27 30
bar:  28 30
bar:  29 30
bar:  30 30


In [None]:
full_list = full + song_ids
full_list

['Q18786715',
 'Q111397510',
 'Q112654939',
 'Q308895',
 'Q76566134',
 'Q192023',
 'Q111784795',
 'Q626490',
 'Q2411420',
 'Q161407',
 'Q1995194',
 'Q155894',
 'Q18208944',
 'Q592696',
 'Q651472',
 'Q62587323',
 'Q111622181',
 'Q1046717',
 'Q1999714',
 'Q108072305',
 'Q2298481',
 'Q1338452',
 'Q1164996',
 'Q1149738',
 'Q1165404',
 'Q581952',
 'Q1330171',
 'Q1420378',
 'Q12207092',
 'Q957616',
 'Q18518749',
 'Q16897727',
 'Q19893560',
 'Q19892483',
 'Q27929592',
 'Q3179698',
 'Q2269723',
 'Q15920900',
 'Q18699094',
 'Q384872',
 'Q6412295',
 'Q2277496',
 'Q42854800',
 'Q18844782',
 'Q651633',
 'Q74479041',
 'Q1170299',
 'Q1446083',
 'Q6899069',
 'Q3742015',
 'Q7113407',
 'Q2600214',
 'Q2710119',
 'Q15357783',
 'Q64504565',
 'Q30020390',
 'Q2837589',
 'Q5351571',
 'Q21203631',
 'Q1747485']

In [None]:
df = pd.DataFrame(all_data)[["two_hop", "three_hop", "four_hop", "five_hop"]]
df

Unnamed: 0,two_hop,three_hop,four_hop,five_hop
0,[Style --> performer --> Taylor Swift --> genr...,[Style --> performer --> Taylor Swift --> genr...,[Style --> performer --> Taylor Swift --> genr...,[Style --> performer --> Taylor Swift --> genr...
1,[As It Was --> record label --> Columbia Recor...,[As It Was --> record label --> Columbia Recor...,[As It Was --> record label --> Columbia Recor...,[As It Was --> record label --> Columbia Recor...
2,[Break My Soul --> genre --> house music --> ~...,[Break My Soul --> genre --> house music --> ~...,[Break My Soul --> genre --> house music --> ~...,[Break My Soul --> genre --> house music --> ~...


In [None]:
df.to_csv('song_select.csv', index=False)

In [None]:
df1 = pd.DataFrame(all_data)[["two_hop", "three_hop"]]

In [None]:
combined_df = pd.concat([df, df1])
combined_df.to_csv('song_select.csv', index=False)

In [None]:
import pandas as pd

# assume your original dataframe is called "df"

# create an empty list to store the results
results = []

# iterate over each row in the original dataframe
for index, row in df.iterrows():
    # iterate over each string in the "two_hop" column
    for path in row['two_hop']:
        # add a new dictionary to the results list
        results.append({'path': path, 'num_hops': 2})

    # iterate over each string in the "three_hop" column
    for path in row['three_hop']:
        # add a new dictionary to the results list
        results.append({'path': path, 'num_hops': 3})

    for path in row['four_hop']:
        # add a new dictionary to the results list
        results.append({'path': path, 'num_hops': 4})
    for path in row['five_hop']:
        # add a new dictionary to the results list
        results.append({'path': path, 'num_hops': 5})

# create a new dataframe from the results list
new_df = pd.DataFrame(results)

# print the resulting dataframe
new_df.head(10)
new_df.to_csv('/content/drive/MyDrive/org_paths.csv', index=False)


In [None]:
songs[songs.entity_type == "song"].sort_values("count", ascending=False)

In [None]:
songs.groupby("entity_type").count()

In [None]:
songs[["entity_type", "count"]].plot()

In [None]:
songs[songs.entity_type == "song"].reset_index()[["entity_type", "count"]].plot()

In [None]:
song_edit = song.dropna(subset=["count"])

bucket_size = len(song_edit) // 10  # Determine the size of each bucket
sorted_song = song_edit.sort_values("count", ascending=False)  # Sort the DataFrame by count
buckets = [sorted_song[i:i+bucket_size] for i in range(0, len(sorted_song), bucket_size)]  # Create a list of DataFrames for each bucket

In [None]:
buckets[0].head(20)

In [None]:
buckets[1].head(20)

In [None]:
buckets[2].head(20)