# Mapping tags to DBPedia nodes using DBPedia Spotlight

In [4]:
import pandas as pd 
import numpy as np 
import requests

In [5]:
def find_dbpedia_node(tag): 
    """Find corresponding DBpedia node. """
    URL = "https://api.dbpedia-spotlight.org/en/annotate" 
    PARAMS = {'text': tag} 
    HEADERS = {'Accept': 'application/json'}
    try: 
        r = requests.get(url = URL, params = PARAMS, headers=HEADERS) 
        data = r.json() 
        nodes = [] 
        if 'Resources' in data.keys(): 
            for r in data['Resources']: 
                nodes.append(r['@URI'])
        else: 
            nodes = []
            
        return nodes
    except Exception as e: 
        return r

In [6]:
find_dbpedia_node("Stephen King")

['http://dbpedia.org/resource/Stephen_King']

In [35]:
type([1,2,3])

list

In [7]:
df_tags = pd.read_csv("data/tags_experiments_2_dbpedia.csv", 
#                      dtype={'tag_id': np.int32, 'tag_name': str, 'cleaned_tag': str, 'DBPedia_nodes': list} 
                     )

In [8]:
df_tags.head()

Unnamed: 0,tag_id,tag_name,cleaned_tag,DBPedia_nodes
0,28583,started-but-didn-t-finish,Started But Didn T Finish,[]
1,28584,started-but-never-finished,Started But Never Finished,[]
2,28585,started-but-not-finished,Started But Not Finished,[]
3,28586,started-didn-t-finish,Started Didn T Finish,[]
4,28587,started-didn-t-like-enough-to-fini,Started Didn T Like Enough To Fini,[]


In [9]:
len(df_tags)

102

In [10]:
df_tags["DBPedia_nodes"][22]

"['http://dbpedia.org/resource/Stay_Hungry_Stay_Foolish', 'http://dbpedia.org/resource/Foolish_(Ashanti_song)']"

In [11]:
def read_as_list(value): 
    value = value.replace("[","").replace("]","").replace("'","").split(", ")
    if value == ['']: 
        return []
    else: 
        return value

In [12]:
df_tags["DBPedia_nodes"] = df_tags["DBPedia_nodes"].apply(read_as_list)

In [13]:
df_tags["DBPedia_nodes"][22]

['http://dbpedia.org/resource/Stay_Hungry_Stay_Foolish',
 'http://dbpedia.org/resource/Foolish_(Ashanti_song)']

In [14]:
df_tags["DBPedia_nodes"][0]

[]

In [15]:
is_not_empty = df_tags["DBPedia_nodes"].apply(len) > 0

In [16]:
is_not_empty

0      False
1      False
2      False
3      False
4      False
       ...  
97     False
98      True
99      True
100    False
101     True
Name: DBPedia_nodes, Length: 102, dtype: bool

In [17]:
df_nodes = df_tags[is_not_empty]
df_nodes

Unnamed: 0,tag_id,tag_name,cleaned_tag,DBPedia_nodes
7,28590,started-reading,Started Reading,[http://dbpedia.org/resource/Reading_F.C.]
22,28605,stay-hungry-stay-foolish,Stay Hungry Stay Foolish,[http://dbpedia.org/resource/Stay_Hungry_Stay_...
26,28609,steam-punk,Steam Punk,[http://dbpedia.org/resource/Steam]
27,28610,steampunk,Steampunk,[http://dbpedia.org/resource/Steampunk]
36,28619,stefan-s-diaries,Stefan S Diaries,[http://dbpedia.org/resource/Stefan_Salvatore]
37,28620,stefan-zweig,Stefan Zweig,[http://dbpedia.org/resource/Stefan_Zweig]
39,28622,steig-larsson,Steig Larsson,[http://dbpedia.org/resource/Henrik_Larsson]
42,28625,stella-gibbons,Stella Gibbons,[http://dbpedia.org/resource/Stella_Gibbons]
45,28628,stendhal,Stendhal,[http://dbpedia.org/resource/Stendhal]
54,28637,stephanie,Stephanie,[http://dbpedia.org/resource/Stephanie_McMahon]


In [18]:
def explode(df, lst_cols, fill_value='', preserve_index=False):
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values    
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF
    res = (pd.DataFrame({
                col:np.repeat(df[col].values, lens)
                for col in idx_cols},
                index=idx)
             .assign(**{col:np.concatenate(df.loc[lens>0, col].values)
                            for col in lst_cols}))
    # append those rows that have empty lists
    if (lens == 0).any():
        # at least one list in cells is empty
        res = (res.append(df.loc[lens==0, idx_cols], sort=False)
                  .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:        
        res = res.reset_index(drop=True)
    return res

In [19]:
df_nodes = explode(df_tags, ["DBPedia_nodes"], fill_value='')
df_nodes

Unnamed: 0,cleaned_tag,tag_id,tag_name,DBPedia_nodes
0,Started But Didn T Finish,28583,started-but-didn-t-finish,
1,Started But Never Finished,28584,started-but-never-finished,
2,Started But Not Finished,28585,started-but-not-finished,
3,Started Didn T Finish,28586,started-didn-t-finish,
4,Started Didn T Like Enough To Fini,28587,started-didn-t-like-enough-to-fini,
...,...,...,...,...
101,Stephen_King,28680,stephen_king,
102,Stephenie Meyer,28681,stephenie-meyer,http://dbpedia.org/resource/Stephenie_Meyer
103,Stephenie Meyer Books,28682,stephenie-meyer-books,http://dbpedia.org/resource/Stephenie_Meyer
104,Stephenking,28683,stephenking,


In [22]:
df_bt = pd.read_csv("data/book_tags_reduced.csv", index_col=0)

In [23]:
df_bt.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [24]:
len(df_bt)

200000

In [25]:
#t = pd.read_csv("tags.csv")

In [26]:
#merge with tags name
df_bt = df_bt.merge(df_nodes, on = 'tag_id', copy=False)
df_bt

Unnamed: 0,goodreads_book_id,tag_id,count,cleaned_tag,tag_name,DBPedia_nodes
0,25,28601,20,States,states,
1,26,28601,8,States,states,
2,1067,28601,22,States,states,
3,2203,28601,25,States,states,
4,4138,28601,22,States,states,
...,...,...,...,...,...,...
376,25148,28591,1,Started_But_Not_Finished,started_but_not_finished,
377,26506,28616,17,Steel Danielle,steel-danielle,
378,26506,28614,17,Steel,steel,
379,26506,28615,3,Steel Danielle,steel--danielle,


In [29]:
b = pd.read_csv("data/books.csv")

In [28]:
#marge with books titles
df_bt = df_bt.merge(b[[ 'goodreads_book_id', 'title']], on = 'goodreads_book_id', copy=False)

NameError: name 'b' is not defined

In [225]:
df_new = df_bt.replace({'': None})
df_new.head()

Unnamed: 0,goodreads_book_id,tag_id,count,cleaned_tag,tag_name,DBPedia_nodes,title
0,25,28601,20,States,states,,I'm a Stranger Here Myself: Notes on Returning...
1,26,28601,8,States,states,,The Lost Continent: Travels in Small Town America
2,1067,28601,22,States,states,,1776
3,2203,28601,25,States,states,,John Adams
4,2203,28598,22,State,state,,John Adams


In [226]:
df_new

Unnamed: 0,goodreads_book_id,tag_id,count,cleaned_tag,tag_name,DBPedia_nodes,title
0,25,28601,20,States,states,,I'm a Stranger Here Myself: Notes on Returning...
1,26,28601,8,States,states,,The Lost Continent: Travels in Small Town America
2,1067,28601,22,States,states,,1776
3,2203,28601,25,States,states,,John Adams
4,2203,28598,22,State,state,,John Adams
5,4138,28601,22,States,states,,Naked
6,4952,28601,21,States,states,,What Is the What
7,9742,28601,21,States,states,,The Audacity of Hope: Thoughts on Reclaiming t...
8,9742,28598,21,State,state,,The Audacity of Hope: Thoughts on Reclaiming t...
9,10878,28601,11,States,states,,The Great Shark Hunt: Strange Tales from a Str...


In [227]:
# df_bt.head = df_bt.dropna

In [228]:
#df_bt["dbpedia_nodes"] = df_bt["clean_tag"].apply(find_dbpedia_node)

In [97]:
# print(max(df_bt.goodreads_book_id))
# print(max(df_bt.goodreads_book_id))
# print(min(df_bt.tag_id))
# print(max(df_bt.tag_id))

We will have the following node IDs: 
- **[0, 100,000]** - Goodreads book IDs
- **[100,000 - 200,000]** - Tag IDs 
- **[200,000 - 300,000]** - DBPedia Nodes 


In [229]:
# create list of all nodes 
nodes = list(df_new["DBPedia_nodes"])
set_nodes = set([_ for _ in nodes if _ is not None])
set_nodes

{'http://dbpedia.org/resource/Lance_Stephenson',
 'http://dbpedia.org/resource/Steam',
 'http://dbpedia.org/resource/Steampunk',
 'http://dbpedia.org/resource/Stendhal',
 'http://dbpedia.org/resource/Stephanie_McMahon',
 'http://dbpedia.org/resource/Stephanie_Plum',
 'http://dbpedia.org/resource/Stephen_Clarke_(swimmer)',
 'http://dbpedia.org/resource/Stephen_Hawking',
 'http://dbpedia.org/resource/Stephen_King',
 'http://dbpedia.org/resource/Stephen_R._Donaldson'}

In [230]:
def generate_node_ids(set_nodes, node_to_id=None, current_id=200000): 
    """Function to generate IDs fo nodes for our graph from DBPedia nodes.
    arguments:  node_to_id - dictionary of node:id to use
               current_id - id to start
    returns: dictionary node_id
    """
    if node_to_id == None: 
        node_to_id = dict()
    #create dictionary with ids 
    for node in set_nodes: 
        node_to_id[node] = current_id 
        current_id +=1 
        
    return node_to_id

In [231]:
node_to_id = generate_node_ids(set_nodes)
node_to_id

{'http://dbpedia.org/resource/Stephen_Hawking': 200000,
 'http://dbpedia.org/resource/Stendhal': 200001,
 'http://dbpedia.org/resource/Stephanie_McMahon': 200002,
 'http://dbpedia.org/resource/Stephen_King': 200003,
 'http://dbpedia.org/resource/Stephanie_Plum': 200004,
 'http://dbpedia.org/resource/Stephen_R._Donaldson': 200005,
 'http://dbpedia.org/resource/Stephen_Clarke_(swimmer)': 200006,
 'http://dbpedia.org/resource/Steam': 200007,
 'http://dbpedia.org/resource/Steampunk': 200008,
 'http://dbpedia.org/resource/Lance_Stephenson': 200009}

In [232]:
def map_node_to_id(node, mapping = node_to_id): 
    if node is None: 
        return None 
    else: 
        return node_to_id[node]

In [233]:
df_bt["node_id"] = df_new["DBPedia_nodes"].apply(map_node_to_id)

In [239]:
#df_for_rdf = df_bt[["title", "count", "cleaned_tag", "DBPedia_nodes", "count"]]

In [240]:
#df_for_rdf

In [241]:
df_bt

Unnamed: 0,goodreads_book_id,tag_id,count,cleaned_tag,tag_name,DBPedia_nodes,title,node_id
0,25,28601,20,States,states,,I'm a Stranger Here Myself: Notes on Returning...,
1,26,28601,8,States,states,,The Lost Continent: Travels in Small Town America,
2,1067,28601,22,States,states,,1776,
3,2203,28601,25,States,states,,John Adams,
4,2203,28598,22,State,state,,John Adams,
5,4138,28601,22,States,states,,Naked,
6,4952,28601,21,States,states,,What Is the What,
7,9742,28601,21,States,states,,The Audacity of Hope: Thoughts on Reclaiming t...,
8,9742,28598,21,State,state,,The Audacity of Hope: Thoughts on Reclaiming t...,
9,10878,28601,11,States,states,,The Great Shark Hunt: Strange Tales from a Str...,


In [242]:
df_bt.to_csv('tags_with_nodes_ids_temp.csv')

In [None]:
#free text 
#books:book_id? books:has_tag ? books:

In [None]:
# count how many of the tags have representing DBPedia nodes 
print(len(df_bt[df_bt["dbpedia_nodes"] is not None]))
#or if this does not work: 
df_bt_dropna = df_bt.dropna()
print(df_bt_dropna)

In [None]:
# for every node in DBPedia, create node_id
#TODO: 
#1. Create mapping from DBPedia nodes to node_ids
#2. Create a new dataframe, such that: 
# - tags are replaced by their corresponding id 
# - if one edge tag-node occurs several times, add up all the count values and write a single edge

In [None]:
# create list of all nodes 
nodes = np.array(df_bt["authors_list"]).flatten()

In [2]:
import hashlib