# Create Tine BIO

## Import library and load data

In [2]:
import pywikibot

In [None]:
site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()


In [None]:
import pandas as pd

df = pd.read_csv('Datasets/14_graph.tsv', sep='\t', header=None)

## Functions for each column

In [None]:
special_conditions = {'<http://schema.org/description>': 'description',
       '<http://www.w3.org/2000/01/rdf-schema#label>': 'label',
       '<http://ddis.ch/atai/tag>': 'tag', '<http://ddis.ch/atai/rating>': 'rating'}

In [110]:

def divide(repo, x):
    if 'Q' in x:
        Q2name_func(repo, x)
    else:
        P2label_func(repo, x)

def Q2name_func(repo, Q):
    Q = Q[32:-1]
    item = pywikibot.ItemPage(repo, Q)
    if item.isRedirectPage():
        item = item.getRedirectTarget()
    if not item.exists():
        return None
    item_dict = item.get()
    labels = item_dict['labels']
    if 'en' not in list(labels.keys()):
        return None
    name = labels['en']
    return name

def P2label_func(repo, P: str):
    if P in special_conditions.keys():
        return special_conditions[P]
    if 'ent' in P:
        P = P.removeprefix('<http://www.wikidata.org/entity/').removesuffix('>')
    else:
        P = P.removeprefix('<http://www.wikidata.org/prop/direct/').removesuffix('>')
    prop = pywikibot.PropertyPage(repo, P)
    # if not prop.exists():
    #     return None
    prop_dict = prop.get()
    labels = prop_dict['labels']
    if 'en' not in list(labels.keys()):
        return None
    label = labels['en']
    return label

In [106]:
def transfer_column2(repo, item: str):
    if '/Q' in item:
        return Q2name_func(repo, item)
    elif '/P' in item:
        return P2label_func(repo, item)
    else:
        return item

## Condition search for df column 2

In [100]:
df3 = df.loc[~(df.iloc[:, 2].str.contains('/Q') | df.iloc[:, 2].str.contains('/P'))]

In [104]:
df4 = df3.loc[df3.iloc[:, 2].str.contains('http')]

In [None]:
df4.loc[~df4.iloc[:, 2].str.contains('https://commons.wikimedia.org/wiki/File')]

Column 2 condition summary:
1. /Q, /P
2. string, description
3. http file link
4. Special condition:
   1. https://theglobalstardom.com/catherine-missal/ 
   2. description containing one link: "license that is ..."

## Condition search for df column 0, 1, and clean

In [31]:
len(df.loc[df.iloc[:, 0].str.contains('entity/Q') | df.iloc[:, 0].str.contains('P')])/len(df.iloc[:, 0])

1.0

In [28]:
df1 = df.loc[~(df.iloc[:, 0].str.contains('entity/Q') | df.iloc[:, 0].str.contains('P'))]

In [29]:
df1.index

Int64Index([], dtype='int64')

In [25]:
df.iloc[list(df1.index)]

Unnamed: 0,0,1,2
277841,<http://schema.org/description>,<http://schema.org/description>,textual entity description
516258,<http://www.w3.org/2000/01/rdf-schema#label>,<http://www.w3.org/2000/01/rdf-schema#label>,node label
518667,<http://schema.org/description>,<http://www.w3.org/2000/01/rdf-schema#label>,node description
975864,<http://www.w3.org/2000/01/rdf-schema#label>,<http://schema.org/description>,short label describing an entity


In [27]:
df.drop(list(df1.index), axis=0, inplace=True)

In [36]:
df.drop(df[df.iloc[:, 0].str.contains('P')].index, inplace=True)

In [None]:
# df_new = df.apply(lambda x: pd.Series([Q2name_func(repo, x[0]), P2label_func(repo, x[1])], index=['1', '2']), axis=1)

## Transfer column 0, 1

In [98]:
df[0][:50] = df[0][:50].apply(lambda x: Q2name_func(repo, x))

In [99]:
df[1][:50] = df[1][:50].apply(lambda x: P2label_func(repo, x))

In [107]:
df[2][:50] = df[2][:50].apply(lambda x: transfer_column2(repo, x))

In [108]:
tiny_graph = df[:50]
tiny_graph.to_csv('Datasets/tiny_graph.tsv', sep='\t', header=None)

In [97]:
# df_copy = df[:5].copy()
# df_copy[0][:5] = df_copy[0][:5].apply(lambda x: Q2name_func(repo, x))
# df_copy.head(10)

Unnamed: 0,0,1,2
0,Kothanodi,<http://www.wikidata.org/prop/direct/P5021>,<http://www.wikidata.org/entity/Q105729789>
1,Dan Hennessey,<http://schema.org/description>,Canadian actor
2,Back to Burgundy,<http://www.wikidata.org/prop/direct/P495>,<http://www.wikidata.org/entity/Q142>
3,The Ketchup Effect,<http://www.wikidata.org/prop/direct/P345>,tt0385751
4,Simon Norrthon,<http://www.wikidata.org/prop/direct/P31>,<http://www.wikidata.org/entity/Q5>


In [None]:
df[0][51:5000] = df[0][51:5000].apply(lambda x: transfer_column2(repo, x))

In [None]:
df[1][51:5000] = df[1][51:5000].apply(lambda x: P2label_func(repo, x))

In [None]:
df[2][51:5000] = df[2][51:5000].apply(lambda x: transfer_column2(repo, x))

In [None]:
tiny_graph2 = df[51:5000]
tiny_graph2.to_csv('Datasets/tiny_graph2.tsv', sep='\t', header=None)

In [118]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import networkx as nx
import rdflib
import pywikibot

graph = rdflib.Graph()
graph.parse('Datasets/14_graph.nt', format='turtle')




In [132]:

wdt = Namespace('http://www.wikidata.org/prop/direct/')
wd = Namespace('http://www.wikidata.org/entity/')
lbl = Namespace('http://www.w3.org/2000/01/rdf-schema#')

special_conditions = {'<http://schema.org/description>': 'description',
       '<http://www.w3.org/2000/01/rdf-schema#label>': 'label',
       '<http://ddis.ch/atai/tag>': 'tag', '<http://ddis.ch/atai/rating>': 'rating'}


In [235]:
def transfer(graph, item):
    if not item:
        return item
    if ('/Q' and '<' in item) or ('/P' and '<' in item):
        tr = (URIRef(item.removeprefix('<').removesuffix('>')), lbl['label'], None)
        ts = [t for t in graph.triples(tr)]
        if len(ts) < 1:
            # print (Q2name_func(repo, item))
            return None
        return ts[0][2]
    # elif '/P' in item:
    #     return P2label_func(repo, item)
    elif item in special_conditions.keys():
        return special_conditions[item]
    else:
        return item

In [230]:
df = df.applymap(lambda x: transfer(graph, x))

Def: population >= 200, < 200 m between buildings, etc does not look like a valid URI, trying to serialize this will break.
each instance is a subclass in the hierarchy under <astronomical object>(Q6999); such a subclass's instances in turn are particular identified objects in Our Universe does not look like a valid URI, trying to serialize this will break.


In [231]:
df.to_csv('Datasets/Big_graph_clean.tsv', sep='\t', header=None)

In [232]:
"Congratulation!"

'Congratulation!'

In [233]:
df[df.iloc[:, 2].str.contains('nm0926924') == True].iloc[:, 2]

1121969    nm0926924
Name: 2, dtype: object

In [234]:
df[1121960-470:1121970-470]

Unnamed: 0,0,1,2
1121961,The Internet's Own Boy: The Story of Aaron Swartz,instance of,film
1121962,Simone Jolivet,occupation,writer
1121963,The Omen,assessment,Russ Test
1121964,Rodrigo Grande,instance of,human
1121965,Staffan Lindberg,occupation,actor
1121966,Shawn Pyfrom,description,American actor
1121967,The House of Ghosts,publication date,1908-01-01
1121968,Prithviraj Sukumaran,description,
1121969,Christopher Wicking,IMDb ID,nm0926924
1121970,Maverick,cast member,Corey Feldman
