# Create Tine BIO

## Import library and load data

In [None]:
import pywikibot

In [None]:
site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()


In [None]:
import pandas as pd

df = pd.read_csv('Datasets/14_graph.tsv', sep='\t', header=None)

## Functions for each column

In [None]:
special_conditions = {'<http://schema.org/description>': 'description',
       '<http://www.w3.org/2000/01/rdf-schema#label>': 'label',
       '<http://ddis.ch/atai/tag>': 'tag', '<http://ddis.ch/atai/rating>': 'rating'}

## Condition search for df column 2

In [74]:
df3 = df.loc[~(df.iloc[:, 2].str.contains('/Q') | df.iloc[:, 2].str.contains('/P'))]
df3.head()

Unnamed: 0,0,1,2
1,<http://www.wikidata.org/entity/Q2358294>,<http://schema.org/description>,Canadian actor
3,<http://www.wikidata.org/entity/Q897357>,<http://www.wikidata.org/prop/direct/P345>,tt0385751
7,<http://www.wikidata.org/entity/Q278053>,<http://www.w3.org/2000/01/rdf-schema#label>,Labyrinth
10,<http://www.wikidata.org/entity/Q386389>,<http://schema.org/description>,American actor
12,<http://www.wikidata.org/entity/Q450646>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:Edie_a...


In [75]:
df4 = df3.loc[df3.iloc[:, 2].str.contains('http')]
df4.head()

Unnamed: 0,0,1,2
12,<http://www.wikidata.org/entity/Q450646>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:Edie_a...
31,<http://www.wikidata.org/entity/Q669010>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:I_Feel...
44,<http://www.wikidata.org/entity/Q2199572>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:Jelka_...
155,<http://www.wikidata.org/entity/Q484365>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:David_...
170,<http://www.wikidata.org/entity/Q5957972>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:Hayede...


In [76]:
df4.loc[~df4.iloc[:, 2].str.contains('https://commons.wikimedia.org/wiki/File')]

Unnamed: 0,0,1,2
455576,<http://www.wikidata.org/entity/Q21998813>,<http://www.wikidata.org/prop/direct/P345>,https://theglobalstardom.com/catherine-missal/
1968938,<http://www.wikidata.org/entity/Q30939938>,<http://schema.org/description>,license that is listed on the Open Definition ...


Column 2 condition summary:
1. /Q, /P
2. string, description
3. http file link
4. Special condition:
   1. https://theglobalstardom.com/catherine-missal/ 
   2. description containing one link: "license that is ..."

## Condition search for df column 0, 1, and clean

In [31]:
len(df.loc[df.iloc[:, 0].str.contains('entity/Q') | df.iloc[:, 0].str.contains('P')])/len(df.iloc[:, 0])

1.0

In [78]:
df1 = df.loc[~(df.iloc[:, 0].str.contains('entity/Q') | df.iloc[:, 0].str.contains('P'))]

In [416]:
df1.index

Int64Index([277841, 516258, 518667, 975864], dtype='int64')

In [79]:
df.iloc[list(df1.index)]

Unnamed: 0,0,1,2
277841,<http://schema.org/description>,<http://schema.org/description>,textual entity description
516258,<http://www.w3.org/2000/01/rdf-schema#label>,<http://www.w3.org/2000/01/rdf-schema#label>,node label
518667,<http://schema.org/description>,<http://www.w3.org/2000/01/rdf-schema#label>,node description
975864,<http://www.w3.org/2000/01/rdf-schema#label>,<http://schema.org/description>,short label describing an entity


In [80]:
df.drop(list(df1.index), axis=0, inplace=True)

In [81]:
df.drop(df[df.iloc[:, 0].str.contains('P')].index, inplace=True)

## Transfer

In [118]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import networkx as nx
import rdflib
import pywikibot

graph = rdflib.Graph()
graph.parse('Datasets/14_graph.nt', format='turtle')




In [132]:

wdt = Namespace('http://www.wikidata.org/prop/direct/')
wd = Namespace('http://www.wikidata.org/entity/')
lbl = Namespace('http://www.w3.org/2000/01/rdf-schema#')

special_conditions = {'<http://schema.org/description>': 'description',
       '<http://www.w3.org/2000/01/rdf-schema#label>': 'label',
       '<http://ddis.ch/atai/tag>': 'tag', '<http://ddis.ch/atai/rating>': 'rating'}


In [339]:
def transfer(graph, item):
    if not item:
        return item
    if ('/Q' and '<' in item) or ('/P' and '<' in item):
        tr = (URIRef(item.removeprefix('<').removesuffix('>')), lbl['label'], None)
        ts = [t for t in graph.triples(tr)]
        if len(ts) < 1:
            # print (Q2name_func(repo, item))
            return None
        return str(ts[0][2])
    # elif '/P' in item:
    #     return P2label_func(repo, item)
    elif item in special_conditions.keys():
        return special_conditions[item]
    else:
        return item

In [405]:
df = df.applymap(lambda x: transfer(graph, x))

Def: population >= 200, < 200 m between buildings, etc does not look like a valid URI, trying to serialize this will break.
each instance is a subclass in the hierarchy under <astronomical object>(Q6999); such a subclass's instances in turn are particular identified objects in Our Universe does not look like a valid URI, trying to serialize this will break.


In [406]:
df.reset_index(drop=True)
df.to_csv('Datasets/Big_graph_clean.tsv', sep='\t', header=None, index=False)

In [407]:
"Congratulation!"

'Congratulation!'

In [408]:
# df[1121960-470:1121970-470]
df[-5:]

Unnamed: 0,0,1,2
2056772,The Martian,cast member,Mackenzie Davis
2056773,Philip Barantini,occupation,actor
2056774,Grégoire Aslan,occupation,actor
2056775,Top Five,cast member,Liam Ferguson
2056776,Eileen Pollock,native language,English


## Create Name_Qwiki_Qid Map

In [420]:
df_web = df.copy()
df_str = pd.read_csv('Datasets/Big_graph_clean.tsv', sep='\t', header=None)

In [421]:
len(df_web), len(df_str)

(2055965, 2055965)

In [423]:
df_web = df_web.reset_index(drop=True)

In [425]:
df_new = pd.DataFrame(data={'Str': df_str[0], 'Wiki': df_web[0]})

In [427]:
df_clean = df_new.drop_duplicates('Str')
len(df_new), len(df_clean)

(2055965, 150153)

In [453]:
df_drop_na = df_clean.dropna()

In [433]:
len(df_drop_na)

150152

In [454]:
df_drop_na = df_drop_na.reset_index(drop=True)

In [456]:
qid = df_drop_na['Wiki'].apply(lambda x: x.removeprefix('<http://www.wikidata.org/entity/').removesuffix('>'))

In [457]:
df_drop_na['Qid'] = qid

In [459]:
df_drop_na.to_csv('name_Qwiki_Qid.tsv', sep='\t', index=False)

In [463]:
name = 'Kothanodi'
Name2Qid = dict(zip(df_drop_na['Str'], df_drop_na['Qid']))
Name2Qid[name]

'Q20720659'

In [474]:
s = "Star Wars: Episode VI – Return of the Jedi"
s1 = "Star Wars: Episode VI - Return of the Jedi"
s.encode('utf-8'), s1.encode('utf-8')
# s1.decode('utf-8') in Name2Qid.keys()

(b'Star Wars: Episode VI \xe2\x80\x93 Return of the Jedi',
 b'Star Wars: Episode VI - Return of the Jedi')

### replace('–', '-')

In [484]:
print([len(df_drop_na.loc[df_drop_na.iloc[:, i].str.contains('–')]) for i in range(3)])

[380, 0, 0]


In [487]:
df_drop_na = df_drop_na.applymap(lambda x: x.replace('–', '-'))

In [488]:
print([len(df_drop_na.loc[df_drop_na.iloc[:, i].str.contains('–')]) for i in range(3)])

[0, 0, 0]


In [489]:
df_drop_na.to_csv('name_Qwiki_Qid.tsv', sep='\t', index=False)

In [492]:
df_str = df_str.dropna()
df_str.isna().any()

0    False
1    False
2    False
dtype: bool

In [493]:
df_str.reset_index()
len(df_str)

2010463

In [494]:
print([len(df_str.loc[df_str.iloc[:, i].str.contains('–')]) for i in range(3)])

[6559, 0, 4079]


In [495]:
df_str = df_str.applymap(lambda x: x.replace('–', '-'))

In [1]:
print([len(df_str.loc[df_str.iloc[:, i].str.contains('–')]) for i in range(3)])

NameError: name 'df_str' is not defined

In [497]:
df_str.to_csv('Datasets/Big_graph_very_clean.tsv', sep='\t', header=None, index=False)


## Create NER Dataset

In [3]:
df_drop_na = pd.read_csv('name_Qwiki_Qid.tsv', sep='\t')

In [4]:
names = list(df_drop_na['Str'])

In [5]:
df_relation = pd.read_csv('Datasets/Big_graph_very_clean.tsv', sep='\t', header=None)

In [6]:
import pickle
with open('P2label.pickle', 'rb') as handle:
    P2label = pickle.load(handle)
labels = list(P2label.values())

In [7]:
Qs = names
Ps = labels
relations = df_relation
print(len(Qs), len(Ps), len(relations))

150152 251 2010463


In [None]:
### some fun examples
# St Trinian's	cast member ? (multiple answers)
# instance of 隶属于

### Give P Categroies

In [None]:
group_1 = []
group_others = []
for i, P in enumerate(Ps):
    print("Type 1, id {} : {} of Q1 is Q2, or Q2 is {} of Q1".format(i, P, P))
    label = input()
    if label == 1:
        group_1.append(P)
    else:
        group_others.append(P)

11 distributed by
21 named after
31 different from
<!-- 33 based on -->
48 educated at
62 fictional universe described in
112 owner of
131 opposite of
141 conferred by
181 permanent resident of
203 facet of
246 archives at

99 group2 
151 
166 
209 


In [24]:
wr1 = ["distributed by", "named after", "based on", "different from", "educated at", "fictional universe described in", \
    "conferred by", ]
wr2 = ["EIRIN film rating", "business model", "form of creative work", "broadcast by"]

# for wr in wr1:
#     if wr in group_1:
#         group_1.remove(wr)
#     if wr not in group_others:
#         group_others.append(wr)

# for wr in wr2:
#     if wr in group_others:
#         group_others.remove(wr)
#     if wr not in group_1:
#         group_1.append(wr)

In [16]:
sub_relations = relations[:5000]

In [32]:
group1 = Ps
group2 = ["owner of", "opposite of",  "permanent resident of", "facet of", "archives at"]
group3 = ["distributed by", "named after", "based on", "different from", "educated at", "fictional universe described in", \
    "conferred by", ]

for s in group2:
    group1.remove(s)
for s in group3:
    group1.remove(s)


In [33]:
len(group1) + len(group2) + len(group3)

251

In [34]:
import numpy as np

In [56]:
def choose_template(Q1, P, type, Q2=None):

    Templates_group1 = [
        "What is the {} of {}?".format(P, Q1),
        "What is {}'s {}?".format(Q1, P),
        "Tell me the {} of {}.".format(P, Q1),
        "Do you know the {} of {}?".format(P, Q1),
        "Can you tell me the {} of {}?".format(P, Q1)
        ]

    Templates_group2 = [
        "What is the {} {}?".format(P, Q1),
        "Tell me what is {} {}.".format(P, Q1),
        "Do you know the {} {}?".format(P, Q1),
        "Can you tell me what is {} {}?".format(P, Q1),
        
    ]

    Templates_group3 = [
        "What is {} {}?".format(Q1, P),
        "Tell me what is {} {}.".format(Q1, P),
        "Can you tell me what is {} {}?".format(Q1, P)
    ]

    if type == 1:
        return Templates_group1[np.random.randint(len(Templates_group1))]
    elif type == 2:
        return Templates_group2[np.random.randint(len(Templates_group2))]
    else:
        return Templates_group3[np.random.randint(len(Templates_group3))]

In [69]:
X = list()
y = list()
for i, row in relations.iterrows():
    Q1, P, Q2 = row[0], row[1], row[2]
    if P in group3:
        type = 3
    elif P in group2:
        type = 2
    else:
        type = 1
    s = choose_template(Q1, P, type)
    # print(s)
    X.append(s)
    y.append(Q1)


In [70]:
dataset = pd.DataFrame(data = {'X': X, 'y': y})
# small_dataset = pd.DataFrame(data = {'X': X, 'y': y})
dataset.to_csv('Datasets/Relations_X_y.tsv', sep='\t', index=False)
# small_dataset.to_csv('Datasets/Relations_small_X_y.tsv', sep='\t', index=False)

## Create Question Answer Dataset

### Multimedia Dataset

In [82]:
df3 = df.loc[~(df.iloc[:, 2].str.contains('/Q') | df.iloc[:, 2].str.contains('/P'))]
df3.head()

Unnamed: 0,0,1,2
1,<http://www.wikidata.org/entity/Q2358294>,<http://schema.org/description>,Canadian actor
3,<http://www.wikidata.org/entity/Q897357>,<http://www.wikidata.org/prop/direct/P345>,tt0385751
7,<http://www.wikidata.org/entity/Q278053>,<http://www.w3.org/2000/01/rdf-schema#label>,Labyrinth
10,<http://www.wikidata.org/entity/Q386389>,<http://schema.org/description>,American actor
12,<http://www.wikidata.org/entity/Q450646>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:Edie_a...


In [88]:
df4 = df3.loc[df3.iloc[:, 2].str.contains('http')]
df4.head()

Unnamed: 0,0,1,2
12,<http://www.wikidata.org/entity/Q450646>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:Edie_a...
31,<http://www.wikidata.org/entity/Q669010>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:I_Feel...
44,<http://www.wikidata.org/entity/Q2199572>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:Jelka_...
155,<http://www.wikidata.org/entity/Q484365>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:David_...
170,<http://www.wikidata.org/entity/Q5957972>,<http://www.wikidata.org/prop/direct/P18>,https://commons.wikimedia.org/wiki/File:Hayede...


In [84]:
df4.loc[~df4.iloc[:, 2].str.contains('https://commons.wikimedia.org/wiki/File')]

Unnamed: 0,0,1,2
455576,<http://www.wikidata.org/entity/Q21998813>,<http://www.wikidata.org/prop/direct/P345>,https://theglobalstardom.com/catherine-missal/
1968938,<http://www.wikidata.org/entity/Q30939938>,<http://schema.org/description>,license that is listed on the Open Definition ...


Column 2 condition summary:
1. /Q, /P
2. string, description
3. http file link
4. Special condition:
   1. https://theglobalstardom.com/catherine-missal/ 
   2. description containing one link: "license that is ..."

In [87]:
df3.loc[df3.iloc[:, 1].str.contains('P345')]

Unnamed: 0,0,1,2
3,<http://www.wikidata.org/entity/Q897357>,<http://www.wikidata.org/prop/direct/P345>,tt0385751
16,<http://www.wikidata.org/entity/Q6986093>,<http://www.wikidata.org/prop/direct/P345>,nm0773307
48,<http://www.wikidata.org/entity/Q6171826>,<http://www.wikidata.org/prop/direct/P345>,ch0036421
54,<http://www.wikidata.org/entity/Q3554526>,<http://www.wikidata.org/prop/direct/P345>,nm2278771
63,<http://www.wikidata.org/entity/Q28127548>,<http://www.wikidata.org/prop/direct/P345>,tt6108090
...,...,...,...
2056670,<http://www.wikidata.org/entity/Q3183120>,<http://www.wikidata.org/prop/direct/P345>,nm0850939
2056690,<http://www.wikidata.org/entity/Q4315443>,<http://www.wikidata.org/prop/direct/P345>,tt0107090
2056692,<http://www.wikidata.org/entity/Q3523521>,<http://www.wikidata.org/prop/direct/P345>,nm1028218
2056711,<http://www.wikidata.org/entity/Q11748706>,<http://www.wikidata.org/prop/direct/P345>,nm0439417


In [None]:
### TODO
### find all the P refering to http file
### Answer: only image, others are id and description which are special

In [92]:
df4.iloc[:, 1].unique()

array(['<http://www.wikidata.org/prop/direct/P18>',
       '<http://www.wikidata.org/prop/direct/P345>',
       '<http://schema.org/description>'], dtype=object)

In [None]:
### TODO
### 

In [99]:
df.loc[df.iloc[:, 1].str.contains('tag')].iloc[:, 2].unique()

array(['boring', 'psychological', 'historical', 'violence',
       'autobiographical', 'romantic', 'action', 'neo_noir', 'comedy',
       'psychedelic', 'fantasy', 'satire', 'sentimental', 'murder',
       'inspiring', 'horror', 'entertaining', 'anti_war', 'home_movie',
       'mystery', 'melodrama', 'humor', 'revenge', 'feel-good', 'sadist',
       'thriller', 'drama', 'brainwashing', 'flashback',
       'thought-provoking', 'prank', 'historical_fiction',
       'good_versus_evil', 'cruelty', 'plot_twist', 'dramatic', 'bleak',
       'suspenseful', 'atmospheric', 'cult', 'insanity', 'sci-fi',
       'gothic', 'tragedy', 'paranormal', 'dark', 'philosophical',
       'claustrophobic', 'crime', 'stupid', 'alternate_reality', 'cute',
       'romance', 'adventure', 'storytelling', 'war', 'realism', 'queer',
       'absurd', 'whimsical', 'alternate_history', 'haunting', 'western',
       'intrigue', 'grindhouse_film', 'comic', 'clever', 'avant_garde',
       'documentary', 'film-noir', 'dep

### Crowding source dataset handling

In [103]:
df_relation.loc[(df_relation.iloc[:, 0].str.contains('X-Men: First Class')) & (df_relation.iloc[:, 1].str.contains('executive producer'))]

Unnamed: 0,0,1,2
655637,X-Men: First Class,executive producer,Sheryl Lee Ralph


## External resources data

### 'plots.csv'

Type: factual questions
Synopsis, plot (情节)

### 'user-comments'

qid,rating,sentiment,comment

### 'imdb-top'

Type: recommend question

## Create Multimedia Dictionary

In [2]:
import json

with open('Datasets/ImageData/images.json', 'rb')  as handle:
    images = json.load(handle)

print(images)


In [3]:
import pandas as pd

# df = pd.read_csv('Datasets/14_graph.tsv', sep='\t', header=None)
df = pd.read_csv('Datasets/Big_graph_very_clean.tsv', sep='\t', header=None)

### Find all IMDb ID

In [2]:
### nm, cast member
### tt, movie

# len(df.loc[df.iloc[:, 0].str.contains('Q')]), len(df.loc[df.iloc[:, 0].str.contains('http://www.wikidata.org/entity/Q')])

(2055965, 2055965)

In [22]:
m = df.loc[df.iloc[:, 1].str.contains('instance of')]
m.loc[m.iloc[:, 2].str.contains('film')].iloc[:, 2].unique()
# m.iloc[:, 2].str.isdigit()

array(['film', 'television film', 'animated feature film',
       'film distributor', 'film character', 'animated film',
       'film production company', 'superhero film character',
       'film organization', '3D film', 'film genre', 'film award',
       'film trilogy', 'horror film character',
       'Teenage Mutant Ninja Turtles film', 'film festival',
       'stop-motion animated film', 'film series', 'animated short film',
       'feature film', 'dubbing of film', 'short film', 'anthology film',
       'film award category', 'anime film', 'film studio',
       'adventure film character', 'traditionally animated film',
       'clay animation film', 'film poster', 'cult film',
       'award for best film by genre', 'RTC film classification category',
       'BAMID film rating category', 'live-action/animated film',
       'animated film series', 'film soundtrack',
       'award for best short film', 'film preservation', 'film project',
       'film format', 'film gauge', 'film fest

### Crowdsourcing Data Pre-processing

In [185]:
import pandas as pd
import numpy as np

In [186]:
cro_df = pd.read_csv('Datasets/CrowdData/crowd_data.tsv', sep='\t')

In [187]:
cro_df.columns

Index(['HITId', 'HITTypeId', 'Title', 'Reward', 'AssignmentId', 'WorkerId',
       'AssignmentStatus', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
       'Input1ID', 'Input2ID', 'Input3ID', 'AnswerID', 'AnswerLabel',
       'FixPosition', 'FixValue'],
      dtype='object')

#### Find malicious workers, remove them, rules:

1. Work time in seconds <= 10? Check
2. Low life time  <= 40?
3. Fixposition value 'yes', 'I don’t understand' WWHL098SA43 repeat answer


In [188]:
uni, cnt = np.unique(cro_df['WorkTimeInSeconds'], return_counts=True)
uni, cnt

(array([  2,   4,  10,  32,  39,  40,  42,  43,  44,  45,  46,  50,  60,
         67,  80,  90, 120, 140, 188, 189, 191, 198, 199, 200, 202, 203,
        229, 235, 236, 237, 238, 240, 241, 298, 300, 337]),
 array([59, 21, 20,  1,  2, 10,  2,  2,  1,  1,  2,  5,  8,  1,  8,  3, 53,
         3,  1,  1,  1,  1,  1, 32,  1,  1,  1,  1,  2,  2,  1, 51,  1,  1,
         3,  1]))

In [189]:
cro_df = cro_df.loc[cro_df['WorkTimeInSeconds'] > 10]
uni, cnt = np.unique(cro_df['WorkTimeInSeconds'], return_counts=True)
uni, cnt

(array([ 32,  39,  40,  42,  43,  44,  45,  46,  50,  60,  67,  80,  90,
        120, 140, 188, 189, 191, 198, 199, 200, 202, 203, 229, 235, 236,
        237, 238, 240, 241, 298, 300, 337]),
 array([ 1,  2, 10,  2,  2,  1,  1,  2,  5,  8,  1,  8,  3, 53,  3,  1,  1,
         1,  1,  1, 32,  1,  1,  1,  1,  2,  2,  1, 51,  1,  1,  3,  1]))

In [190]:
uni, cnt = np.unique(cro_df['LifetimeApprovalRate'], return_counts=True)
uni, cnt

(array(['40%', '70%', '80%', '85%', '98%', '99%'], dtype=object),
 array([22, 21, 40, 40, 61, 21]))

In [191]:
cro_df = cro_df.loc[~cro_df['LifetimeApprovalRate'].str.contains('40%')]
uni, cnt = np.unique(cro_df['LifetimeApprovalRate'], return_counts=True)
uni, cnt

(array(['70%', '80%', '85%', '98%', '99%'], dtype=object),
 array([21, 40, 40, 61, 21]))

In [192]:
cro_df[['HITId', 'HITTypeId', 'Title', 'Reward', 'AssignmentId', 'WorkerId',
       'AssignmentStatus', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
       'Input1ID', 'Input2ID', 'Input3ID', 'AnswerID', 'AnswerLabel']].isna().any()

HITId                   False
HITTypeId               False
Title                   False
Reward                  False
AssignmentId            False
WorkerId                False
AssignmentStatus        False
WorkTimeInSeconds       False
LifetimeApprovalRate    False
Input1ID                False
Input2ID                False
Input3ID                False
AnswerID                False
AnswerLabel             False
dtype: bool

In [193]:
cro_df[['FixPosition', 'FixValue']] = cro_df[['FixPosition', 'FixValue']].fillna('Empty')


In [194]:
uni, cnt = np.unique(cro_df['FixPosition'], return_counts=True)
uni, cnt

(array(['2', 'Empty', 'Object', 'Predicate', 'Subject'], dtype=object),
 array([  1, 108,  43,  16,  15]))

In [195]:
cro_df = cro_df.loc[~cro_df['FixPosition'].str.contains('2')]
uni, cnt = np.unique(cro_df['FixPosition'], return_counts=True)
uni, cnt

(array(['Empty', 'Object', 'Predicate', 'Subject'], dtype=object),
 array([108,  43,  16,  15]))

In [196]:
uni, cnt = np.unique(cro_df['FixValue'], return_counts=True)
uni, cnt

(array(['10696210', '176997168', '1992-01-01', '2010-01-01', '2011-01-01',
        '2014-02-18', '2015-08-27', '2019-02-24', '698491348', '863756051',
        'Empty', 'P106', 'P161', 'P17', 'P19', 'P27', 'P344', 'P58',
        'Q1168152', 'Q1471671', 'Q15052538', 'Q16795448', 'Q1722254',
        'Q17350908', 'Q181900', 'Q1860', 'Q268905', 'Q27703272',
        'Q28732985', 'Q3194791', 'Q409022', 'Q5423258', 'Q7360827',
        'Q7488442', 'Q884', 'Q908556', 'wd:Q72', 'wd:Q94074', 'wd:Q95073'],
       dtype=object),
 array([  1,   2,   1,   2,   6,   1,   1,   1,   1,   2, 115,   2,   2,
          1,   2,   1,   2,   6,   2,   2,   1,   1,   2,   3,   1,   1,
          1,   1,   2,   2,   2,   3,   1,   1,   1,   3,   1,   1,   1]))

In [197]:
cro_df.to_csv('Datasets/CrowdData/Clean_Cro.tsv', sep='\t', index=False)

#### Aggregate answers

##### Correct or not



In [198]:
cro_groups = cro_df.groupby(['HITId'])

In [199]:
cro_df.AnswerLabel.unique()

array(['CORRECT', 'INCORRECT'], dtype=object)

In [200]:
cro_groups['AnswerID'].mean() # (1: correct) (2: not correct)

HITId
1     1.333333
2     1.000000
3     1.333333
4     2.000000
5     1.000000
        ...   
57    1.333333
58    2.000000
59    1.333333
60    1.333333
61    1.333333
Name: AnswerID, Length: 61, dtype: float64

In [201]:
group_answers = cro_groups['AnswerID'].mean() <= 1.5

In [202]:
cro_df['GroupAnswers'] = True

In [203]:
for i in cro_df.index:
    cro_df['GroupAnswers'].loc[i] = group_answers[cro_df['HITId'].loc[i]]
    # cro_df.loc[i] = row

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cro_df['GroupAnswers'].loc[i] = group_answers[cro_df['HITId'].loc[i]]


In [204]:
cro_df.to_csv('Datasets/CrowdData/Clean_Cro.tsv', sep='\t', index=False)

##### Inter-rater agreement (0.72), voting counts(2 vs 1)

In [205]:
# !python -m pip install statsmodels

In [206]:
import statsmodels.stats.inter_rater as inter_rater

In [207]:
value_counts = cro_groups['AnswerID'].value_counts() # (1: correct) (2: not correct)

In [208]:
1 in value_counts[2].keys()

True

In [209]:
cro_df['TrueCount'] = 0
cro_df['FalseCount'] = 0
# cro_df['Agreement'] = 0.
batches = [[],[],[]]
HITId = None
for i in cro_df.index:
    di = value_counts[cro_df['HITId'].loc[i]]
    if 1 in di.keys():
        cro_df['TrueCount'].loc[i] = di[1]
    if 2 in di.keys():
        cro_df['FalseCount'].loc[i] = di[2]
    
    if HITId != cro_df['HITId'].loc[i]:
        HITId = cro_df['HITId'].loc[i]
        if cro_df['HITTypeId'].loc[i] == '7QT':
            batches[0].append([cro_df['TrueCount'].loc[i], cro_df['FalseCount'].loc[i]])
        elif cro_df['HITTypeId'].loc[i] == '8QT':
            batches[1].append([cro_df['TrueCount'].loc[i], cro_df['FalseCount'].loc[i]])
        elif cro_df['HITTypeId'].loc[i] == '9QT':
            batches[2].append([cro_df['TrueCount'].loc[i], cro_df['FalseCount'].loc[i]])    

    # cro_df['Agreement'].loc[i] = round(inter_rater.fleiss_kappa(np.array([cro_df['TrueCount'].loc[i], cro_df['FalseCount'].loc[i]]).reshape(-1, 1)), 2)
    # cro_df.loc[i] = row

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cro_df['TrueCount'].loc[i] = di[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cro_df['FalseCount'].loc[i] = di[2]


In [210]:
cro_batches = cro_df.groupby('HITTypeId')

In [211]:
batch_value_counts = cro_batches['AnswerID'].value_counts() # (1: correct) (2: not correct)
batch_value_counts

HITTypeId  AnswerID
7QT        1.0         33
           2.0         30
8QT        2.0         35
           1.0         25
9QT        1.0         32
           2.0         27
Name: AnswerID, dtype: int64

In [212]:
arr = np.array([[2,1],[0,3],[1,2],[1,2],[3,0],[3,0],[0,3],[0,3],[0,3],[0,3]])
# arr = np.array([[2, 1], [2, 1], [2, 1], [3, 0], [3, 0], [3, 0], [2, 1], [2, 1], [2, 1], [0, 3], [0, 3], [0, 3], [3, 0], [3, 0], [3, 0], [1, 2], [1, 2], [1, 2], [2, 1], [2, 1], [2, 1], [0, 3], [0, 3], [0, 3], [0, 3], [0, 3], [0, 3], [1, 2], [1, 2], [1, 2], [2, 1], [2, 1], [2, 1], [3, 0], [3, 0], [3, 0], [2, 1], [2, 1], [2, 1], [0, 3], [0, 3], [0, 3], [3, 0], [3, 0], [3, 0], [2, 1], [2, 1], [2, 1], [1, 2], [1, 2], [1, 2], [3, 0], [3, 0], [3, 0], [1, 2], [1, 2], [1, 2], [1, 2], [1, 2], [1, 2], [1, 2], [1, 2], [1, 2]])
# arr = np.array([[1, 2], [1, 2], [1, 2], [0, 3], [0, 3], [0, 3], [0, 3], [0, 3], [0, 3], [1, 2], [1, 2], [1, 2], [0, 3], [0, 3], [0, 3], [0, 3], [0, 3], [0, 3], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [3, 0], [3, 0], [3, 0], [3, 0], [3, 0], [3, 0], [3, 0], [3, 0], [3, 0], [3, 0], [3, 0], [3, 0], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [0, 2], [0, 2], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1]])
inter_rater.fleiss_kappa(arr)

0.55

In [213]:
# init_data = pd.read_csv('Datasets/CrowdData/crowd_data.tsv', sep='\t')
# init_batches = init_data.groupby('HITTypeId')
# batches = [[],[],[]]
# init_groups = init_data.groupby(['HITId'])

# # TODO  
# value_counts = init_groups['AnswerID'].value_counts() # (1: correct) (2: not correct)
# init_data['TrueCount'] = 0
# init_data['FalseCount'] = 0
# # cro_df['Agreement'] = 0.
# batches = [[],[],[]]
# for i in init_data.index:
#     di = value_counts[init_data['HITId'].loc[i]]
#     if 1 in di.keys():
#         init_data['TrueCount'].loc[i] = di[1]
#     if 2 in di.keys():
#         init_data['FalseCount'].loc[i] = di[2]
#     if init_data['HITTypeId'].loc[i] == '7QT':
#         batches[0].append([init_data['TrueCount'].loc[i], init_data['FalseCount'].loc[i]])
#     elif init_data['HITTypeId'].loc[i] == '8QT':
#         batches[1].append([init_data['TrueCount'].loc[i], init_data['FalseCount'].loc[i]])
#     elif init_data['HITTypeId'].loc[i] == '9QT':
#         batches[2].append([init_data['TrueCount'].loc[i], init_data['FalseCount'].loc[i]])
#     # cro_df['Agreement'].loc[i] = round(inter_rater.fleiss_kappa(np.array([cro_df['TrueCount'].loc[i], cro_df['FalseCount'].loc[i]]).reshape(-1, 1)), 2)
#     # cro_df.loc[i] = row

In [218]:
# batches[2][-4][1] = 3
fks = []
for batch in batches:
    # print(batch)
    fk = round(inter_rater.fleiss_kappa(np.array(batch)), 2)
    fks.append(fk)
fks


[0.24, 0.04, 0.26]

In [219]:

cro_df['Agreement'] = 0.
for i in cro_df.index:
    if cro_df['HITTypeId'].loc[i] == '7QT':
        cro_df['Agreement'].loc[i] = fks[0]
    elif cro_df['HITTypeId'].loc[i] == '8QT':
        cro_df['Agreement'].loc[i] = fks[1]    
    elif cro_df['HITTypeId'].loc[i] == '9QT':
        cro_df['Agreement'].loc[i] = fks[2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cro_df['Agreement'].loc[i] = fks[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cro_df['Agreement'].loc[i] = fks[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cro_df['Agreement'].loc[i] = fks[2]


In [220]:
cro_df.to_csv('Datasets/CrowdData/Clean_Cro.tsv', sep='\t', index=False)

## TEST DATASET CREATION

### Factual and Embedding

In [2]:
import pandas as pd

In [3]:
df_drop_na = pd.read_csv('name_Qwiki_Qid.tsv', sep='\t')

In [4]:
names = list(df_drop_na['Str'])

In [5]:
df_relation = pd.read_csv('Datasets/Big_graph_very_clean.tsv', sep='\t', header=None)

In [6]:
import pickle
with open('P2label.pickle', 'rb') as handle:
    P2label = pickle.load(handle)
labels = list(P2label.values())

In [7]:
Qs = names
Ps = labels
relations = df_relation
print(len(Qs), len(Ps), len(relations))

150152 251 2010463


In [None]:
### some fun examples
# St Trinian's	cast member ? (multiple answers)
# instance of 隶属于

### Give P Categroies

In [None]:
group_1 = []
group_others = []
for i, P in enumerate(Ps):
    print("Type 1, id {} : {} of Q1 is Q2, or Q2 is {} of Q1".format(i, P, P))
    label = input()
    if label == 1:
        group_1.append(P)
    else:
        group_others.append(P)

11 distributed by
21 named after
31 different from
<!-- 33 based on -->
48 educated at
62 fictional universe described in
112 owner of
131 opposite of
141 conferred by
181 permanent resident of
203 facet of
246 archives at

99 group2 
151 
166 
209 


In [8]:
wr1 = ["distributed by", "named after", "based on", "different from", "educated at", "fictional universe described in", \
    "conferred by", ]
wr2 = ["EIRIN film rating", "business model", "form of creative work", "broadcast by"]

# for wr in wr1:
#     if wr in group_1:
#         group_1.remove(wr)
#     if wr not in group_others:
#         group_others.append(wr)

# for wr in wr2:
#     if wr in group_others:
#         group_others.remove(wr)
#     if wr not in group_1:
#         group_1.append(wr)

In [9]:
sub_relations = relations[:5000]

In [10]:
group1 = Ps
group2 = ["owner of", "opposite of",  "permanent resident of", "facet of", "archives at"]
group3 = ["distributed by", "named after", "based on", "different from", "educated at", "fictional universe described in", \
    "conferred by", ]

for s in group2:
    group1.remove(s)
for s in group3:
    group1.remove(s)


In [11]:
len(group1) + len(group2) + len(group3)

251

In [12]:
import numpy as np

In [16]:
def choose_template(Q1, P, type, Q2=None):

    Templates_group1 = [
        "What is the {} of {}?".format(P, Q1),
        "What is {}'s {}?".format(Q1, P),
        "Tell me the {} of {}.".format(P, Q1),
        "Do you know the {} of {}?".format(P, Q1),
        "Can you tell me the {} of {}?".format(P, Q1)
        ]

    Templates_group2 = [
        "What is the {} {}?".format(P, Q1),
        "Tell me what is {} {}.".format(P, Q1),
        "Do you know the {} {}?".format(P, Q1),
        "Can you tell me what is {} {}?".format(P, Q1),
        
    ]

    Templates_group3 = [
        "What is {} {}?".format(Q1, P),
        "Tell me what is {} {}.".format(Q1, P),
        "Can you tell me what is {} {}?".format(Q1, P)
    ]

    if type == 1:
        return Templates_group1[np.random.randint(len(Templates_group1))]
    elif type == 2:
        return Templates_group2[np.random.randint(len(Templates_group2))]
    else:
        return Templates_group3[np.random.randint(len(Templates_group3))]

In [17]:
Questions = list()
Q1s = list()
Ps = list()
Q2s = list()
for i, row in relations.iterrows():
    Q1, P, Q2 = row[0], row[1], row[2]
    if P in group3:
        type = 3
    elif P in group2:
        type = 2
    else:
        type = 1
    s = choose_template(Q1, P, type)
    # print(s)
    Questions.append(s)
    Q1s.append(Q1)
    Ps.append(P)
    Q2s.append(Q2)


In [18]:
dataset = pd.DataFrame(data = {'Question': Questions, 'Q1': Q1s, 'P': Ps, 'Q2':Q2s})
# small_dataset = pd.DataFrame(data = {'X': X, 'y': y})
dataset.to_csv('Datasets/Testsets/Fac_Emb_Testset.tsv', sep='\t', index=False)
# small_dataset.to_csv('Datasets/Relations_small_X_y.tsv', sep='\t', index=False)

### Multimedia

In [19]:
def choose_template_mul(Q1):

    Templates_group = [
        "Show me a picture of {}?".format(Q1),
        "What does {} look like?".format(Q1),
        "Let me know what {} looks like.".format(Q1),
        ]

    return Templates_group[np.random.randint(len(Templates_group))]

In [20]:
with open('Datasets/ImageData/id2Code.pickle', 'rb') as handle:
    id2Code = pickle.load(handle)

In [21]:
mul_relations = df_relation[:20000]
mul_relations = mul_relations.loc[mul_relations.iloc[:, 1].str.contains('IMDb ID')]

In [33]:
Questions = list()
Q1s = list()
Ps = list()
Q2s = list()
codes = list()

for i, row in mul_relations.iterrows():
    id = row[2]
    Q1 = row[0]
    P = row[1]
    Q2 = id
    if id in id2Code.keys():
        s = choose_template_mul(Q1)
        # print(s)
        Questions.append(s)
        Q1s.append(Q1)
        Ps.append(P)
        Q2s.append(Q2)
        codes.append(id2Code[id])


In [34]:
dataset = pd.DataFrame(data = {'Question': Questions, 'Q1': Q1s, 'P': Ps, 'Q2':Q2s, 'Code': codes})
# small_dataset = pd.DataFrame(data = {'X': X, 'y': y})
dataset.to_csv('Datasets/Testsets/Mul_Testset.tsv', sep='\t', index=False)

### Recommend

In [39]:
def choose_template_mul(Qs):

    Templates_group = [
        "Recommend movies similar to {}.".format(', '.join(Qs)),
        "Given that I like {}, can you recommend some movies?".format(', '.join(Qs)),
        "Recommend movies like {}.".format(', '.join(Qs)),
        ]

    return Templates_group[np.random.randint(len(Templates_group))]

In [43]:
rec_relations = df_relation[:20000]
rec_relations = rec_relations.loc[rec_relations.iloc[:, 1].str.contains('instance of') & rec_relations.iloc[:, 2].str.contains('film')]

In [75]:
Questions = list()

for i in range(5000):
    
    Qs = [np.random.choice(rec_relations.iloc[:, 0]) for i in range(np.random.randint(2,5)) ]
    s = choose_template_mul(Qs)
    # print(s)
    Questions.append(s)


In [76]:
dataset = pd.DataFrame(data = {'Question': Questions})
# small_dataset = pd.DataFrame(data = {'X': X, 'y': y})
dataset.to_csv('Datasets/Testsets/Rec_Testset.tsv', sep='\t', index=False)

### Crowding

In [96]:
cro_df = pd.read_csv('Datasets/CrowdData/Clean_Cro.tsv', sep='\t')

In [97]:
cro_df = cro_df.drop_duplicates(subset=['HITId'])

In [98]:
cro_df = cro_df.loc[cro_df['Input1ID'].str.contains('wd') & cro_df['Input2ID'].str.contains('wdt')]

In [86]:
from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import networkx as nx
import rdflib
import pywikibot

In [87]:
graph = rdflib.Graph()
graph.parse('Datasets/14_graph.nt', format='turtle')

<Graph identifier=Nb63a83fb34124aefac4ec5fadd3f8b19 (<class 'rdflib.graph.Graph'>)>

In [99]:
def Qwiki2name_func(Qwiki: str):
    if Qwiki == None:
        return None
    if not Qwiki.startswith('http'):
        Qwiki = 'http://www.wikidata.org/entity/{}'.format(Qwiki)
    sql = \
        """
    prefix wdt: <http://www.wikidata.org/prop/direct/> 
    prefix wd: <http://www.wikidata.org/entity/> 
    SELECT ?lbl
    WHERE{
    <%s> rdfs:label ?lbl.
    FILTER(LANG(?lbl)='en')
    }

    """ % (Qwiki)

    tl = list(graph.query(sql))
    if len(tl)>0:
        for t in tl:
            for m in t:
                print(str(m))
                return str(m)
    else:
        return None




In [90]:
with open('P2label.pickle', 'rb') as handle:
    P2label = pickle.load(handle)

In [None]:
Questions = list()
Q1s = list()
Ps = list()
for i, row in cro_df.iterrows():
    Q1, P = str(row['Input1ID']).removeprefix('wd:'), str(row['Input2ID']).removeprefix('wdt:')
    print(Q1, P)
    if P not in P2label.keys():
        continue
    Q1, P = Qwiki2name_func(Q1), P2label[P]
    if P in group3:
        type = 3
    elif P in group2:
        type = 2
    else:
        type = 1
    s = choose_template(Q1, P, type)
    print(s)
    Questions.append(s)
    Q1s.append(Q1)
    Ps.append(P)

In [102]:
dataset = pd.DataFrame(data = {'Question': Questions, 'Q1': Q1s, 'P': Ps})
# small_dataset = pd.DataFrame(data = {'X': X, 'y': y})
dataset.to_csv('Datasets/Testsets/Cro_Testset.tsv', sep='\t', index=False)