In [98]:
from pathlib import Path
from urllib.parse import urlparse
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from xml.sax.saxutils import quoteattr
import csv
from tqdm import tqdm
import re


### Data Preparation

* Crawl the equivalent-class (`P1709`) mappings between Wikidata and schema.org through the Wikidata SPARQL endpoint: https://query.wikidata.org/

```sparql
SELECT ?wd ?wdLabel ?corrName ?schema
{
  values (?corr ?corrName)
    {
      (wdt:P1709 "equivClass")
    }
  ?wd ?corr ?schema
  filter(regex(str(?schema), "schema.org"))
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
} order by ?wd ?schema
```

* preparation of wikidata

In [22]:
# Wikidata class subset: reduce every entity URI in the 'wd' column of the
# mapping file to the last segment of its path.
urls = pd.read_csv('equivalent_class.csv')['wd'].values
wd_class_subset = [Path(urlparse(entity_url).path).stem for entity_url in urls]

In [23]:
# Crawl label, description, aliases and "properties for this type" (P1963)
# for every class in wd_class_subset and store one CSV row per result binding.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql",
                       agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
# The return format is the same for every request, so it is configured once
# instead of on every loop iteration.
sparql.setReturnFormat(JSON)

# newline='' is what the csv module expects for files handed to csv.writer
# (otherwise line endings can be translated a second time by the text layer),
# and the explicit utf-8 encoding keeps non-ASCII text independent of the
# platform default.
with open('wd_class_subset.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(['Qid', 'Label', 'Description', 'AliasList', 'PropertyList'])

    for classItem in tqdm(wd_class_subset):
        # One query per class; the class id is substituted into BIND(wd:%s ...).
        query = """
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX schema: <http://schema.org/>

            SELECT DISTINCT ?Qid ?Label ?Description (GROUP_CONCAT(DISTINCT ?Alias; SEPARATOR=" | ") AS ?AliasList) (GROUP_CONCAT(DISTINCT ?propertyLabel; SEPARATOR=" | ") AS ?PropertyList)
            WHERE {
            BIND(wd:%s AS ?Qid)

            # Retrieve additional information for each class
            ?Qid rdfs:label ?Label FILTER (lang(?Label) = "en").
            OPTIONAL { ?Qid schema:description ?Description FILTER (lang(?Description) = "en").}
            OPTIONAL { ?Qid skos:altLabel ?Alias FILTER (lang(?Alias) = "en"). }
            OPTIONAL { 
                ?Qid wdt:P1963 ?property.
                ?property rdfs:label ?propertyLabel FILTER (lang(?propertyLabel) = "en"). 
                }
            } 
            GROUP BY ?Qid ?Label ?Description
                """%(classItem)

        sparql.setQuery(query)
        results = sparql.query().convert()

        for row in results["results"]["bindings"]:
            # 'Qid' is written exactly as returned in the binding value.
            wikiclass = str(row['Qid']['value'])
            label = str(row['Label']['value'])
            # Optional bindings may be absent from a row; None is written as an empty field.
            description = str(row['Description']['value']) if 'Description' in row else None
            # The concatenated values are split back into Python lists; csv.writer
            # stores a list as its str() representation.
            alias = str(row['AliasList']['value']).split(" | ") if 'AliasList' in row else None
            properti = str(row['PropertyList']['value']).split(" | ") if 'PropertyList' in row else None
            writer.writerow([wikiclass, label, description, alias, properti])

100%|██████████| 370/370 [02:07<00:00,  2.90it/s]


* preparation of schema data

In [34]:
# Extract (child, parent) pairs from the schema.org N-Triples dump.
# A pair is emitted for every `rdf:type` and every `rdfs:subClassOf` triple
# whose subject and object are both https://schema.org/ IRIs.
pattern_type = r'<https://schema\.org/(.*?)> <http://www\.w3\.org/1999/02/22-rdf-syntax-ns#type> <https://schema\.org/(.*?)> \.'
pattern_subclass = r'<https://schema\.org/(.*?)> <http://www\.w3\.org/2000/01/rdf-schema#subClassOf> <https://schema\.org/(.*?)> \.'

# N-Triples documents are UTF-8 encoded, so the input is decoded explicitly as
# UTF-8 (matching the output file) instead of relying on the platform default.
with open('schemaorg-current-https.nt', 'r', encoding='utf-8') as fin, open('sch_types_all.txt', 'w', encoding="utf-8") as fout:
    for line in fin:
        line = line.strip()
        # Same order as before: the rdf:type pattern is tried first, then subClassOf.
        for pattern in (pattern_type, pattern_subclass):
            match = re.match(pattern, line)
            if match:
                # Tab-separated: child <TAB> parent
                fout.write(match.group(1) + '\t' + match.group(2) + '\n')

In [35]:
# Preview the first extracted (child, parent) pairs; the file has no header row,
# so the column names are supplied here.
pd.read_csv('sch_types_all.txt', delimiter='\t', names=['child', 'parent']).head()

Unnamed: 0,child,parent
0,EventMovedOnline,EventStatusType
1,MedicalSignOrSymptom,MedicalCondition
2,DoseSchedule,MedicalIntangible
3,MedicalSign,MedicalSignOrSymptom
4,Paperback,BookFormatType


### Data Cleaning

* cleaning schema_types: removing all subclasses of DataType (including DataType)
* cleaning referenced mappings and wd_subset_classes: removing types/classes which schema_types don't contain
* one-to-one mapping filtering

In [26]:
def find_all_children(df, parent_class, children=None, height=0):
    '''
    Collect every direct and indirect child of ``parent_class``.

    @param: df: dataframe whose index is a MultiIndex with the levels
                'parent' and 'child' (one row per parent/child pair)
            parent_class: label whose descendants are collected
            children: labels that were already collected; they are kept at the
                front of the result and are not visited again. The list that is
                passed in is never modified. Defaults to an empty list.
            height: hierarchy depth of ``parent_class`` (0 for the start class)
    @return: (children, max_height) where ``children`` lists the collected
             labels in depth-first discovery order and ``max_height`` is the
             deepest level reached below ``parent_class`` (``height`` itself
             when it has no new children)
    '''
    # A fresh list per call: no shared mutable default and the caller's list stays untouched.
    collected = [] if children is None else list(children)
    max_height = height  # note the maximum height

    # Select the direct children with a boolean mask over the index levels.
    # This needs no lexsorted index and yields an empty list for unknown parents.
    parent_level = df.index.get_level_values('parent')
    child_level = df.index.get_level_values('child')
    direct_children = child_level[parent_level == parent_class].unique().tolist()

    for child in direct_children:
        # Skipping already collected labels also protects against cycles.
        if child not in collected:
            collected.append(child)
            collected, child_depth = find_all_children(df, child, collected, height + 1)
            max_height = max(max_height, child_depth)

    return collected, max_height

In [27]:
# Build the list of schema.org types to drop: DataType and everything below it.
df_sch_types = pd.read_csv('sch_types_all.txt', delimiter='\t', names=['child', 'parent']).set_index(['parent', 'child'])
datatype_descendants, _ = find_all_children(df_sch_types, 'DataType')
# find_all_children only returns the descendants of the start class, so the
# root itself is added explicitly; the cleaning step is meant to remove
# DataType together with all of its subclasses.
remove_types = ['DataType'] + [t for t in datatype_descendants if t != 'DataType']

In [28]:
# Load all schema.org types and drop the ones listed in remove_types.
df_schema = pd.read_csv('schemaorg-current-https-types.csv')
# .copy() makes the filtered frame independent of the frame it was selected
# from, so the later in-place assignment to its "comment" column does not
# write into a slice.
df_schema = df_schema[~df_schema['label'].isin(remove_types)].copy()

In [29]:
def clean_comment(text):
    """Return ``text`` without HTML tags and with normalised whitespace.

    The substitutions run in the listed order: tags are removed first, then
    the three handled entities are decoded, then whitespace is collapsed and
    trimmed.
    """
    substitutions = (
        (r'<[^>]+>', ''),        # remove some html
        (r"&lt;", "<"),          # change some particular symbol
        (r"&gt;", ">"),
        (r"&#x2014;", "—"),
        (r'\s+', ' '),           # remove some additional spaces
        (r'^\s+|\s+?$', ''),
        (r'\n', ''),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def clean_Type(URIString):
    """Split a comma-separated string of URIs and return, for each URI, the
    stem of its path (the last path segment without a file suffix)."""
    stems = []
    for uri in URIString.split(','):
        stems.append(Path(urlparse(uri).path).stem)
    return stems

In [30]:
# Clean every comment, then keep only the descriptive columns.
df_schema.loc[:, "comment"] = df_schema.loc[:, "comment"].apply(clean_comment)
# df_schema["subTypeOf"] = df_schema["subTypeOf"].apply(lambda x: clean_Type(x) if str(x) != 'nan' else x)
# df_schema["properties"] = df_schema["properties"].apply(lambda x: clean_Type(x) if str(x) != 'nan' else x)

desc_columns = ['id', 'label', 'comment']
df_schema_desc = df_schema[desc_columns]
df_schema_desc.head()

Unnamed: 0,id,label,comment
0,https://schema.org/3DModel,3DModel,"A 3D model represents some kind of 3D content,..."
1,https://schema.org/AMRadioChannel,AMRadioChannel,A radio channel that uses AM.
2,https://schema.org/APIReference,APIReference,Reference documentation for application progra...
3,https://schema.org/Abdomen,Abdomen,Abdomen clinical examination.
4,https://schema.org/AboutPage,AboutPage,Web page type: About page.


* cleaning referenced mappings and wd_subset_classes: removing types/classes which schema types don't contain

In [47]:
# Load the mapping file and the Wikidata class details written to
# 'wd_class_subset.csv' by the crawl cell above.
df_map = pd.read_csv('equivalent_class.csv')
df_wd_sub_classes = pd.read_csv('wd_class_subset.csv')

In [50]:
# Short schema.org label = last path segment of the schema URI.
# The 'wd' column is left as it is because it is compared directly against the
# 'Qid' column of the crawled Wikidata subset.
df_map['schema_label'] = df_map['schema'].apply(lambda uri: Path(urlparse(uri).path).stem)

# removing mappings of DataType: keep mappings whose schema id is in df_schema_desc
schema_is_known = df_map['schema'].isin(df_schema_desc['id'].values)
df_map1 = df_map[schema_is_known]

# removing mappings which have no wiki labels: keep mappings whose wd id was crawled
wd_has_label = df_map1['wd'].isin(df_wd_sub_classes['Qid'].values)
df_cleaned_map = df_map1[wd_has_label].reset_index(drop=True)

# Wikidata classes that still take part in a mapping, with harmonised column names
df_cleaned_wd_sub_classes = (
    df_wd_sub_classes[df_wd_sub_classes['Qid'].isin(df_cleaned_map['wd'])]
    .rename(columns={'Qid': 'id', 'Label':'label', 'Description':'comment'})
    .reset_index(drop=True)
)

# Removing 2-to-1 mappings
# Here only consider 1-to-1 mappings
Del=['ImageObject', 'PublicationVolume', 'Quotation','EntertainmentBusiness', 'EventVenue']
is_one_to_one = ~df_cleaned_map['schema_label'].isin(Del)
df_cleaned_mapping = df_cleaned_map[is_one_to_one].reset_index(drop=True)

# Restrict both sides to the classes referenced by the remaining mappings
mapped_wd_ids = df_cleaned_mapping['wd'].to_list()
df_cleaned_wd = (
    df_cleaned_wd_sub_classes[df_cleaned_wd_sub_classes['id'].isin(mapped_wd_ids)]
    .drop_duplicates()
    .reset_index(drop=True)
)

mapped_schema_ids = df_cleaned_mapping['schema'].to_list()
df_cleaned_schema = (
    df_schema_desc[df_schema_desc['id'].isin(mapped_schema_ids)]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [54]:
# Replace missing descriptions by empty strings.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a selected column is a chained in-place operation
# that does not update the parent frame under pandas Copy-on-Write.
df_cleaned_wd['comment'] = df_cleaned_wd['comment'].fillna('')  # deal with NaN value in wiki_desc
df_cleaned_schema['comment'] = df_cleaned_schema['comment'].fillna('')  # deal with NaN value in schema_desc
# Column names expected by writeAlignments (Class1_id / Class2_id).
df_cleaned_mapping = df_cleaned_mapping.rename(columns={'wd':'Class1_id', 'schema':'Class2_id', 'corrName':'Relation'})

In [65]:
# # output the cleaned files
# df_cleaned_schema.to_csv('target.csv', index=False)
# df_cleaned_mapping.loc[:, ['Class1_id', 'Class2_id', 'Relation']].to_csv('reference.csv', index=False)
# df_cleaned_wd.loc[:, ['id', 'label', 'comment']].to_csv('source.csv', index=False)

* generate the expanded dataset for the robustness test (see the discussion in the paper)

In [6]:
# Unfiltered schema.org types (nothing is removed here), with cleaned comments.
df_schema = pd.read_csv('schemaorg-current-https-types.csv')
df_schema.loc[:, "comment"] = df_schema.loc[:, "comment"].apply(clean_comment)
df_schema_desc = df_schema[['id', 'label', 'comment']]
# Written with the default to_csv settings, i.e. including the index column.
df_schema_desc.to_csv('noised_target.csv')

In [10]:
# # the number of the whole classes in Wikidata
# a = pd.read_csv('classes.txt', delimiter=' ', names=['child', 'parent'])
# len(set(a['child'].unique()).union(set(a['parent'].unique())))

2791549

* Set data to OAEI standard format

In [None]:
from rdflib import Graph, URIRef, Literal, RDF, XSD
from rdflib.namespace import OWL, RDF, RDFS

In [74]:
# reference alignment to OAEI rdf format
def get_file_header():
    return """<?xml version=\"1.0\" encoding=\"utf-8\"?>
    <rdf:RDF xmlns="http://knowledgeweb.semanticweb.org/heterogeneity/alignment"
      xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
      xmlns:xsd="http://www.w3.org/2001/XMLSchema#">
<Alignment>
  <xml>yes</xml>
  <level>0</level>
  <type>??</type>"""

def get_mapping_format(source, target, measure):
    """Return one <map>/<Cell> element of the alignment file.

    ``source`` and ``target`` are written as quoted XML attribute values
    (via quoteattr), the relation is always '=', and ``measure`` is converted
    to float and printed with one decimal place. The result starts with a
    newline and has no trailing newline.
    """
    relation = '='
    cell_lines = [
        '',
        '  <map>',
        '    <Cell>',
        '      <entity1 rdf:resource={}/>'.format(quoteattr(source)),
        '      <entity2 rdf:resource={}/>'.format(quoteattr(target)),
        '      <relation>{}</relation>'.format(relation),
        '      <measure rdf:datatype="xsd:float">{:.1f}</measure>'.format(float(measure)),
        '    </Cell>',
        '  </map>',
    ]
    return '\n'.join(cell_lines)
#(quoteattr(source), quoteattr(target), relation, measure)

def _get_file_footer():
    """Return the closing tags of the alignment file, starting and ending
    with a newline."""
    closing_lines = ('', '  </Alignment>', '</rdf:RDF>', '')
    return '\n'.join(closing_lines)

def writeAlignments(file, df):
    """Write the mappings in ``df`` to ``file`` in the alignment RDF format.

    @param: file: path of the output file (overwritten, written as UTF-8)
            df: dataframe with the columns 'Class1_id' (source entity) and
                'Class2_id' (target entity); one <map> cell is written per
                row, each with the measure 1.0
    """
    with open(file, 'w', encoding='utf-8') as alignment_file:
        alignment_file.write(get_file_header())
        # Iterate over the column values directly instead of df.loc[i, ...]
        # with i in range(len(df)): label-based lookup only works when the
        # index happens to be exactly 0..n-1 (e.g. not after filtering).
        for source, target in zip(df['Class1_id'], df['Class2_id']):
            alignment_file.write(get_mapping_format(source, target, '1.0'))
        alignment_file.write(_get_file_footer())


In [75]:
# Write the cleaned mappings as the reference alignment file.
FilePath ='reference.rdf'
writeAlignments(FilePath, df_cleaned_mapping.loc[:, ['Class1_id', 'Class2_id', 'Relation']])

In [91]:
# generate source/target ontology
# source dataframe: df_cleaned_wd
# target dataframe: df_cleaned_schema
graph = Graph()
graph.bind("owl", OWL)
graph.bind("rdfs", RDFS)

for i in df_cleaned_wd.index:
    #adding classes to the graph
    classN = URIRef(df_cleaned_wd.loc[i,'id'])
    name = Literal(df_cleaned_wd.loc[i, 'label'], datatype=XSD['string']) #the class name label
    desc = Literal(df_cleaned_wd.loc[i, 'comment'], datatype=XSD['string']) #the class description
    graph.add((classN, RDF.type, OWL.Class))
    graph.add((classN, RDFS.label, name))
    graph.add((classN, RDFS.comment, desc))

outFile = 'source.rdf'
with open(outFile, 'wb') as f:
    graph.serialize(f, format='xml')

In [96]:
# Target ontology: the same construction, built from df_cleaned_schema.
graph = Graph()
for prefix, namespace in (("owl", OWL), ("rdfs", RDFS)):
    graph.bind(prefix, namespace)

# One owl:Class per row, annotated with its label and its description.
target_rows = zip(df_cleaned_schema['id'], df_cleaned_schema['label'], df_cleaned_schema['comment'])
for class_iri, class_label, class_comment in target_rows:
    classN = URIRef(class_iri)
    graph.add((classN, RDF.type, OWL.Class))
    graph.add((classN, RDFS.label, Literal(class_label, datatype=XSD['string'])))
    graph.add((classN, RDFS.comment, Literal(class_comment, datatype=XSD['string'])))

outFile = 'target.rdf'
with open(outFile, 'wb') as f:
    graph.serialize(f, format='xml')

In [95]:
# check the number of classes
# Number of distinct subjects typed as owl:Class in the current graph.
class_iris = {
    str(subject)
    for subject, _, _ in graph.triples((None, RDF.type, OWL.Class))
}
print(len(class_iris))

343
