In [1]:
from unidecode import unidecode
from fuzzywuzzy import fuzz
from shapely.geometry import Point
from shapely.geometry import Polygon
from shapely.geometry import MultiPolygon
from shapely.geometry import shape
from shapely import wkt
import numpy as np
import pandas as pd
import pyparsing
import geopandas



#### Read all entities of Greek Administrative Areas in Yago

In [2]:
def read_file(filename):
    """Read a text file and return its lines.

    Args:
        filename: Path of the file to read (opened in text mode with the
            platform default encoding, exactly as before).

    Returns:
        list[str]: Every line of the file, each still carrying its trailing
        newline (the result of ``readlines()``).

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle as soon as the lines are read,
    # even on error; previously the handle was left open.
    with open(filename, 'r') as f:
        return f.readlines()

In [3]:
# Load the rdf:type triples of every class of Greek administrative division
# exported from YAGO (one triple per line; see the sample printed below).
yago_regions = read_file('data/yago4/administrative_divisions_of_Greece/classes/Administrative_Regions_YAGO.txt')
# Show the first two lines to document the file format.
print(yago_regions[:2])

yago_dec_administrations = read_file('data/yago4/administrative_divisions_of_Greece/classes/Decentralized_Administration_YAGO.txt')

yago_municipalities = read_file('data/yago4/administrative_divisions_of_Greece/classes/Municipalities_and_communities_YAGO.txt')

yago_reg_units = read_file('data/yago4/administrative_divisions_of_Greece/classes/Regional_units_YAGO.txt')

yago_prefectures = read_file('data/yago4/administrative_divisions_of_Greece/classes/Prefectures_YAGO.txt')

['<http://yago-knowledge.org/resource/Central_Macedonia>\t<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>\t<http://yago-knowledge.org/resource/Administrative_regions_of_Greece>\t.\n', '<http://yago-knowledge.org/resource/Epirus_(region)>\t<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>\t<http://yago-knowledge.org/resource/Administrative_regions_of_Greece>\t.\n']


#### Initializing a dictionary for every Administrative Unit 

In [4]:
# Build the lookup dictionary for one file of triples (see the docstring for the two modes).
def dict_initialization(data, bool = False):
    """Turn a list of triple lines into a dictionary.

    Args:
        data: Iterable of text lines, one triple per line.
        bool: Selects the mode.
            * ``True``: the key is the first tab-separated field of each
              line and the value is a fresh empty list (to be filled with
              labels later).
            * otherwise: the key is the text between the first ``<`` and
              the first ``>`` of the line, and the value is the text that
              follows the second ``>`` with `` .`` and newlines removed and
              the first space dropped, transliterated with ``unidecode``
              and lower-cased.

    Returns:
        dict: The mapping described above. When a key occurs more than once,
        the ``True`` mode keeps a single empty list and the other mode keeps
        the value of the last line.
    """
    if bool == True:
        # A comprehension creates a distinct list object for every key.
        return {row.split("\t")[0]: [] for row in data}

    lookup = {}
    for row in data:
        pieces = row.split(">")
        raw_label = pieces[2].replace(" .", "").replace("\n", "").replace(" ", "", 1)
        normalized = unidecode(raw_label).lower()
        uri = pieces[0].split("<")[1]
        lookup[uri] = normalized
    return lookup

In [5]:
# Create one {subject: []} dictionary per administrative level (the True flag
# selects the "empty label list" mode of dict_initialization) and report how
# many distinct subjects each level has.
yago_dec_administrations_dict = dict_initialization(yago_dec_administrations, True)
print ("Number of Decentralized Administrations in Yago:", len(yago_dec_administrations_dict))

yago_regions_dict = dict_initialization(yago_regions, True)
print ("Number of Regions in Yago:", len(yago_regions_dict))

yago_reg_units_dict = dict_initialization(yago_reg_units, True)
print ("Number of Regional Units in Yago:", len(yago_reg_units_dict))

yago_municipalities_dict = dict_initialization(yago_municipalities, True)
print ("Number of Municipalities in Yago:", len(yago_municipalities_dict))

Number of Decentralized Administrations in Yago: 7
Number of Regions in Yago: 13
Number of Regional Units in Yago: 90
Number of Municipalities in Yago: 340


In [6]:
# Keep only the prefecture entries that are really regional units
# and add them to the regional units dictionary.
def clear_perfectures(prefectures, regional_units_dict):
    """Add regional-unit subjects found in the prefectures file.

    Args:
        prefectures: Iterable of tab-separated triple lines.
        regional_units_dict: Dictionary keyed by subject; it is updated in
            place.

    Returns:
        dict: ``regional_units_dict`` itself. Every subject (first
        tab-separated field) containing ``"regional_unit"`` that was not
        already a key is added with an empty list; existing entries are
        left untouched.
    """
    subjects = (line.split("\t")[0] for line in prefectures)
    for subject in subjects:
        if "regional_unit" not in subject:
            continue
        # setdefault never overwrites, so already-known subjects keep their lists.
        regional_units_dict.setdefault(subject, [])
    return regional_units_dict

In [7]:
# Remember the size before merging so the number of newly added subjects can be reported.
prev_len = len(yago_reg_units_dict)
# clear_perfectures updates the dictionary in place and returns the same object.
yago_reg_units_dict = clear_perfectures(yago_prefectures, yago_reg_units_dict)
print ("Added", len(yago_reg_units_dict) - prev_len, "more Regional Units from Perfectures file")

Added 2 more Regional Units from Perfectures file


#### Reading YAGO labels and storing them to corresponding dictionary

In [8]:
# Load the label triples of every YAGO administrative level (raw lines, parsed later by insert_labels).
yago_regions_labels = read_file('data/yago4/administrative_divisions_of_Greece/labels/administrative_regions.nt')
yago_dec_administrations_labels = read_file('data/yago4/administrative_divisions_of_Greece/labels/dec_administration.nt')
yago_municipalities_labels = read_file('data/yago4/administrative_divisions_of_Greece/labels/municipalities.nt')
yago_prefectures_labels = read_file('data/yago4/administrative_divisions_of_Greece/labels/prefectures.nt')
yago_reg_units_labels = read_file('data/yago4/administrative_divisions_of_Greece/labels/reg_units.nt')

In [9]:
def insert_labels(data, d):
    """Attach labels to the subjects already present in ``d``.

    Each line is split on tabs: field 0 is the subject and field 2 the label
    literal. The label is normalised by collapsing ``" - "`` to ``"-"`` and
    cutting everything from the first ``"@"`` (the language tag) onwards.

    Args:
        data: Iterable of tab-separated triple lines.
        d: Dictionary mapping subject -> list of labels; updated in place.
            Subjects that are not keys of ``d`` are ignored.

    Returns:
        dict: ``d`` itself, with each new, distinct label appended to the
        list of its subject.
    """
    for line in data:
        vals = line.split("\t")
        # Blank or truncated lines carry no label; skip them instead of failing.
        if len(vals) < 3:
            continue
        key = vals[0]
        # Strip the language tag *before* the duplicate check. Previously the
        # tagged value was compared with the stored untagged labels, so the
        # same label in two languages was appended twice.
        label = vals[2].replace(" - ", "-").split("@")[0]
        if key in d and label not in d[key]:
            d[key].append(label)
    return d

In [10]:
# Fill the label lists of every level. insert_labels updates each dictionary in place.
yago_regions_dict = insert_labels(yago_regions_labels, yago_regions_dict)
yago_dec_administrations_dict = insert_labels(yago_dec_administrations_labels, yago_dec_administrations_dict)
yago_municipalities_dict = insert_labels(yago_municipalities_labels, yago_municipalities_dict)
# Regional units take labels from both the prefectures file and the regional
# units file, because some of their subjects were merged in from the prefectures.
yago_reg_units_dict = insert_labels(yago_prefectures_labels, yago_reg_units_dict)
yago_reg_units_dict = insert_labels(yago_reg_units_labels, yago_reg_units_dict)

# Print every decentralized administration with the labels collected for it.
for x in yago_dec_administrations_dict:
    print(x, yago_dec_administrations_dict[x], "\n")

<http://yago-knowledge.org/resource/Decentralized_Administration_of_the_Aegean> ['"Decentralized Administration of the Aegean"', '"Αποκεντρωμένη Διοίκηση Αιγαίου"'] 

<http://yago-knowledge.org/resource/Decentralized_Administration_of_Crete> ['"Decentralized Administration of Crete"', '"Αποκεντρωμένη Διοίκηση Κρήτης"'] 

<http://yago-knowledge.org/resource/Decentralized_Administration_of_Macedonia_and_Thrace> ['"Decentralized Administration of Macedonia and Thrace"', '"Αποκεντρωμένη Διοίκηση Μακεδονίας-Θράκης"'] 

<http://yago-knowledge.org/resource/Decentralized_Administration_of_Peloponnese,_Western_Greece_and_the_Ionian> ['"Peloponnese, West Greece and Ionian Sea Administration"', '"Decentralized Administration of Peloponnese"', '"Αποκεντρωμένη Διοίκηση Πελοποννήσου, Δυτικής Ελλάδας και Ιονίου"'] 

<http://yago-knowledge.org/resource/Decentralized_Administration_of_Attica> ['"Decentralized Administration of Attica"', '"Αποκεντρωμένη Διοίκηση Αττικής"'] 

<http://yago-knowledge.org/r

#### Reading Gag labels and storing them to dictionaries

In [11]:
# Read the GAG label files and build one dictionary per administrative level.
# dict_initialization is called without the flag, so each dictionary maps a
# GAG subject URI (without angle brackets) to its transliterated, lower-cased label.
gag_regions_labels = read_file('data/gag/labels/regions.nt')
gag_regions_dict = dict_initialization(gag_regions_labels)

gag_regional_units_labels = read_file('data/gag/labels/regional_units.nt')
gag_regional_units_dict = dict_initialization(gag_regional_units_labels)

gag_dec_administrations_labels = read_file('data/gag/labels/decentralized_adm.nt')
gag_dec_administrations_dict = dict_initialization(gag_dec_administrations_labels)

gag_municipalities_labels = read_file('data/gag/labels/municipalities.nt')
gag_municipalities_dict = dict_initialization(gag_municipalities_labels)

#### Matching YAGO labels with GAG labels using a 0.8 similarity threshold (a match requires `fuzz.ratio` > 80)

In [12]:
def label_filter(yago, gag, threshold = 80):
    """Match every YAGO entity to the GAG entry with the most similar label.

    For each YAGO entity, every one of its labels is transliterated with
    ``unidecode``, lower-cased and compared with every GAG label using
    ``fuzz.ratio``. The GAG entry with the highest score wins (the first one
    encountered on ties); the search stops early on a score of 100.

    Args:
        yago: dict mapping YAGO entity -> list of label strings.
        gag: dict mapping GAG entry -> label string (compared as stored).
        threshold: Score that the best match must exceed (strictly) for the
            entity to count as matched. Defaults to 80, the value that was
            previously hard-coded.

    Returns:
        tuple: ``(matched, not_matched)`` where ``matched`` maps a GAG entry
        to the list of YAGO entities matched to it and ``not_matched`` lists
        the YAGO entities whose best score did not exceed ``threshold``
        (including entities without labels).
    """
    matched = {}
    not_matched = []
    for yago_entry, labels in yago.items():
        best_ratio = 0
        best_gag_entry = ""
        for label in labels:
            # Normalise once per label; this does not depend on the GAG entry.
            ulabel = unidecode(label).lower()
            for gag_entry, gag_label in gag.items():
                r = fuzz.ratio(ulabel, gag_label)
                # Strict comparison keeps the first entry that reaches the best score.
                if r > best_ratio:
                    best_ratio = r
                    best_gag_entry = gag_entry
                if r == 100:
                    break
            # A perfect match cannot be improved by the remaining labels.
            if best_ratio == 100:
                break
        if best_ratio > threshold:
            matched.setdefault(best_gag_entry, []).append(yago_entry)
        else:
            not_matched.append(yago_entry)
    return (matched, not_matched)

In [13]:
# Run the label matching for every administrative level and print
# "<matched YAGO entities> <total YAGO entities>" for each one.
# `matched` / `unmatched` hold the decentralized-administration result and are
# reused under these names by later cells.
(matched, unmatched) = label_filter(yago_dec_administrations_dict, gag_dec_administrations_dict)
print("Decentralized Administrations: Matched ", (len(yago_dec_administrations_dict) - len(unmatched)), len(yago_dec_administrations_dict))

(regions_matched, regions_unmatched) = label_filter(yago_regions_dict, gag_regions_dict)
print("Regions: Matched ", (len(yago_regions_dict) - len(regions_unmatched)), len(yago_regions_dict))

(regional_units_matched, regional_units_unmatched) = label_filter(yago_reg_units_dict, gag_regional_units_dict)
print("Regional Units: Matched ", (len(yago_reg_units_dict) - len(regional_units_unmatched)), len(yago_reg_units_dict))
# Number of distinct GAG entries that received at least one YAGO entity
# (several YAGO entities can map to the same GAG entry).
print(len(regional_units_matched))

(municipalities_matched, municipalities_unmatched) = label_filter(yago_municipalities_dict, gag_municipalities_dict)
print("Municipalities: Matched ", (len(yago_municipalities_dict) - len(municipalities_unmatched)), len(yago_municipalities_dict))
# Distinct GAG municipalities that received at least one YAGO entity.
print(len(municipalities_matched))

Decentralized Administrations: Matched  7 7
Regions: Matched  13 13
Regional Units: Matched  78 92
67
Municipalities: Matched  331 340
322


In [14]:
# GAG regional units that no YAGO entity was matched to.
gag_unmatched_reg_units = [
    gag_id for gag_id in gag_regional_units_dict
    if gag_id not in regional_units_matched
]

# Total number of GAG municipalities, for comparison with the matched count above.
print(len(gag_municipalities_dict))

# GAG municipalities that no YAGO entity was matched to.
gag_unmatched_municipalities = [
    gag_id for gag_id in gag_municipalities_dict
    if gag_id not in municipalities_matched
]

325


#### Reading YAGO geoCoordinates and storing them as Points to a dictionary

In [15]:
def parse_coordinates(data, bool = True):
    """Parse YAGO point coordinates or GAG geometries.

    Args:
        data: With ``bool == True`` an iterable of text lines; otherwise a
            pandas DataFrame with string columns ``id`` and ``geometry``.
        bool: Selects the mode (see Returns).

    Returns:
        * ``bool == True``: dict mapping ``"<" + subject + ">"`` to the
          shapely point parsed from ``POINT (a b)``, where the subject is
          the text between the first pair of double quotes of the line and
          ``a`` / ``b`` are the two comma-separated numbers that follow
          ``geo:`` in file order. Lines whose subject is ``subj`` (the
          header) are skipped.
        * otherwise: a GeoDataFrame built from a cleaned copy of ``data``
          (whitespace stripped from ``id``; quotes, the CRS/datatype URIs
          and ``^`` stripped from ``geometry``), declared as EPSG:2100 and
          reprojected to EPSG:4326. The caller's DataFrame is not modified.

    NOTE(review): WKT points are "x y". If the YAGO files follow the geo-URI
    convention ("geo:lat,lon"), the points built here are (lat, lon) while the
    reprojected GAG geometries are (lon, lat) - confirm against the data
    before relying on distances between the two.
    """
    if bool == True:
        coords = {}
        for line in data:
            key = "<" + line.split("\"")[1] + ">"
            if key == "<subj>":
                continue
            geo_parts = line.split("geo:")[1].split(",")
            x1 = geo_parts[0]
            x2 = geo_parts[1].replace("\"", "")
            coords[key] = wkt.loads('POINT (' + x1 + ' ' + x2 + ')')
        return coords

    # Work on a copy so re-running the cell (or reusing the frame) is safe.
    data = data.copy()
    # regex=False: every pattern below is a literal. In particular '^' and the
    # dots in the URIs must not be interpreted as regular-expression syntax.
    data['id'] = (
        data['id']
        .str.replace(" ", "", regex=False)
        .str.replace("\t", "", regex=False)
    )
    geometry_text = data['geometry']
    for token in (
        '\"',
        ';http://www.opengis.net/def/crs/EPSG/0/2100',
        'http://strdf.di.uoa.gr/ontology#WKT',
        '^',
    ):
        geometry_text = geometry_text.str.replace(token, '', regex=False)
    data['geometry'] = geometry_text.apply(wkt.loads)
    # An authority string replaces the deprecated {'init': 'epsg:2100'} form.
    gdf = geopandas.GeoDataFrame(data, geometry='geometry', crs='EPSG:2100')
    return gdf.to_crs(4326)

In [16]:
# Parse the YAGO coordinate files into {"<subject>": point} dictionaries
# (`f` is just a scratch variable that is overwritten for every file).
f = read_file('data/yago4/administrative_divisions_of_Greece/geoCoordinates/municipalities.csv')
yago_municipalities_geo = parse_coordinates(f)

f = read_file('data/yago4/administrative_divisions_of_Greece/geoCoordinates/regional_units.csv')
yago_reg_units_geo = parse_coordinates(f)

f = read_file('data/yago4/administrative_divisions_of_Greece/geoCoordinates/regions.csv')
yago_regions_geo = parse_coordinates(f)

# Number of regional units that have coordinates.
print (len(yago_reg_units_geo))

55


In [17]:
def add_prefecture_coordinates(data, reg_units):
    """Add coordinates of regional units that only appear in the prefectures file.

    Args:
        data: Iterable of text lines; the subject is the text between the
            first pair of double quotes and the coordinates are the two
            comma-separated numbers following ``geo:``.
        reg_units: dict mapping ``"<subject>"`` to a point; updated in place.

    Returns:
        dict: ``reg_units`` itself. A point parsed from ``POINT (a b)`` is
        added for every subject that contains ``"regional_unit"`` and is not
        yet a key; the header line (subject ``subj``) and all other lines
        are ignored.
    """
    for row in data:
        subject = "<" + row.split("\"")[1] + ">"
        is_header = subject == "<subj>"
        if is_header or "regional_unit" not in subject or subject in reg_units:
            continue
        coordinates = row.split("geo:")[1].split(",")
        first = coordinates[0]
        second = coordinates[1].replace("\"", "")
        reg_units[subject] = wkt.loads('POINT (' + first + ' ' + second + ')')
    return reg_units

In [18]:
# Complete the regional-unit coordinates with the regional units that are only
# listed in the prefectures file, then print the new total.
f = read_file('data/yago4/administrative_divisions_of_Greece/geoCoordinates/prefectures.csv')
yago_reg_units_geo = add_prefecture_coordinates(f, yago_reg_units_geo)
print (len(yago_reg_units_geo))

57


#### Reading GAG geometries and storing them as (Multi)Polygons in a GeoDataFrame

In [19]:
# Load the GAG geometry tables and convert each one to a GeoDataFrame
# (parse_coordinates with False selects the DataFrame branch, which reprojects
# the geometries to EPSG:4326). `df` is a scratch variable overwritten per file.
df = pd.read_csv('data/gag/geometries/municipalities.csv')
gag_municipalities_geo = parse_coordinates(df, False)

df = pd.read_csv('data/gag/geometries/regional_units.csv')
gag_reg_units_geo = parse_coordinates(df, False)

df = pd.read_csv('data/gag/geometries/regions.csv')
gag_regions_geo = parse_coordinates(df, False)

  return _prepare_from_string(" ".join(pjargs))


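#### Removing entities matched based on their labels if their YAGO point is farther than 28 from the GAG geometry (intended as 28 km; note that the code compares the raw shapely distance, which is expressed in the units of the coordinates, not in kilometres)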

In [20]:
def geometry_filter(yago, gag, matched, max_distance = 28):
    """Drop label matches whose YAGO point is too far from the GAG geometry.

    Args:
        yago: dict mapping YAGO entity -> geometry exposing ``distance()``.
        gag: Table iterated with ``iterrows()`` whose rows expose ``id`` and
            ``geometry``.
        matched: dict mapping GAG id -> list of YAGO entities; the lists are
            filtered in place.
        max_distance: Largest accepted value of
            ``yago_geometry.distance(gag_geometry)``. Defaults to 28, the
            value that was previously hard-coded. The value is compared with
            the raw result of ``distance()``, so it is expressed in the units
            of the geometries' coordinates.

    Returns:
        dict: ``matched`` itself. YAGO entities without an entry in ``yago``
        are always kept. Keys are never removed, so a list may end up empty.

    NOTE(review): the surrounding notebook describes the threshold as 28 km,
    but nothing here converts units - confirm which one is intended.
    """
    for _, gag_entry in gag.iterrows():
        yago_entries = matched.get(gag_entry.id)
        if yago_entries is None:
            continue
        gag_geo = gag_entry.geometry
        # Slice assignment filters the existing list object, so every reference
        # to it (including the one stored in `matched`) sees the result.
        # `not (d > max_distance)` mirrors the original removal condition exactly.
        yago_entries[:] = [
            yago_entry for yago_entry in yago_entries
            if yago_entry not in yago
            or not (yago[yago_entry].distance(gag_geo) > max_distance)
        ]
    return matched

In [21]:
# Apply the distance filter to every level that has geometries.
# geometry_filter returns the very dictionary it was given and only removes
# items from the lists inside it, so both numbers printed on each line are
# always equal; they do not show how many matches were dropped.
y = geometry_filter(yago_municipalities_geo, gag_municipalities_geo, municipalities_matched)
print(len(municipalities_matched), len(y))

y = geometry_filter(yago_reg_units_geo, gag_reg_units_geo, regional_units_matched)
print(len(regional_units_matched), len(y))

y = geometry_filter(yago_regions_geo, gag_regions_geo, regions_matched)
print(len(regions_matched), len(y))

322 322
67 67
13 13


In [22]:
def schema_serializer(geo_dict, entry):
    """Serialize the geometry of one row as "x,y x,y ..." coordinate strings.

    Scans ``geo_dict`` row by row (``iterrows``) for the first row whose ``id``
    equals ``entry`` and converts the WKT text of its geometry
    (``str(geometry)``) by plain string manipulation.

    Returns:
        * geometry text starting with ``POLYGON``: a one-element list holding
          the space-separated "x,y" pairs (trailing space removed);
        * any other geometry (handled as ``MULTIPOLYGON``): a list with one
          such string per polygon, each keeping its trailing space;
        * ``""`` (an empty string, not a list) when no row has the given id.

    NOTE(review): the parsing assumes single-ring polygons; interior rings
    (holes) would presumably leave "(" characters inside the coordinates -
    confirm that the input never contains holes.
    """
    for _, x in geo_dict.iterrows():
        if x.id == entry:
            # The first word of the WKT text is the geometry type.
            geo_type = str(x.geometry).split(" ")[0]
            if geo_type == 'POLYGON':
                geometry = str(x.geometry).replace('POLYGON ((','').replace('))', '')
                pairs = geometry.split(',')
                geo_str = ""
                idx = 0
                for p in pairs:
                    if idx == 0:
                        # The first pair has no leading space: "x y".
                        p1 = p.split(' ')[0]
                        p2 = p.split(' ')[1]
                    else:
                        # Later pairs are expected to start with a space (" x y"),
                        # so the split yields an empty first element.
                        p1 = p.split(' ')[1]
                        p2 = p.split(' ')[2]
                    geo_str += p1 + ',' + p2 + ' '
                    idx = idx + 1
                # Drop the trailing space added after the last pair.
                return [geo_str[:-1]]
            else:
                geometry = str(x.geometry).replace('MULTIPOLYGON ','')
                # Each polygon ends with "))"; the piece after the last one is a leftover.
                polygons = geometry.split('))')
                polygons_list = []
                for poly in polygons:
                    # Remove the parentheses and separators left around the coordinates.
                    poly = poly.replace(', ((', '').replace('(((', '').replace(')))', '').replace(')', '')
                    pairs = poly.split(',')
                    geo_str = "" 
                    for p in pairs:
                        if p == '':
                            continue
                        points = p.split(' ')
                        if len(points) == 2:
                            # "x y" (no leading space).
                            p1 = points[0]
                            p2 = points[1]
                        elif len(points) == 3:
                            # " x y" (leading space gives an empty first element).
                            p1 = points[1]
                            p2 = points[2]
                        # NOTE(review): with any other length p1/p2 keep the values
                        # of the previous pair (or are undefined on the first pair).
                        geo_str += p1 + ',' + p2 + ' '
                    polygons_list.append(geo_str)
                # The last element comes from the leftover after the final "))".
                return polygons_list[:-1]
    return ""

#### Storing the rest of gag data

In [23]:
# Load the remaining GAG property files as raw lines (parsed by store_gag_obj below).
codes = read_file('data/gag/has_code.nt')
populations = read_file('data/gag/has_population.nt')
labels = read_file('data/gag/labels.nt')
seats = read_file('data/gag/has_seat.nt')
contained = read_file('data/gag/containedIn.nt')

In [24]:
def store_gag_obj(data):
    """Index the object of every triple line by its subject.

    Args:
        data: Iterable of triple lines whose terms are separated by ``"> "``.

    Returns:
        dict: Maps the subject (text between the first ``<`` and the next
        ``>``) to the object, i.e. everything after the second ``"> "`` with
        newlines and every `` .`` removed. For a repeated subject the last
        line wins.
    """
    objects_by_subject = {}
    for triple in data:
        subject = triple.split("<")[1].split(">")[0]
        tail = triple.split("> ", 2)[2]
        objects_by_subject[subject] = tail.replace("\n", "").replace(" .", "")
    return objects_by_subject

In [25]:
# One {subject: object} lookup per GAG property; the keys are subject URIs without angle brackets.
codes_dict = store_gag_obj(codes)
populations_dict = store_gag_obj(populations)
labels_dict = store_gag_obj(labels)
seats_dict = store_gag_obj(seats)
contained_dict = store_gag_obj(contained)

#### Extending each matched YAGO entity

In [26]:
# Turtle @prefix declarations written at the top of the output file.
prefix = "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n"
foaf = "@prefix foaf: <http://xmlns.com/foaf/0.1/> .\n"
schema = "@prefix schema: <http://schema.org/> .\n"
# exto: ontology terms of the extension; extr: resources created by the extension.
exto = "@prefix exto: <http://kr.di.uoa.gr/yago4-extension/ontology#> .\n"
extr = "@prefix extr: <http://kr.di.uoa.gr/yago4-extension/resource/> .\n"
rdf = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n"

In [27]:
# Open the Turtle output file (mode "w" truncates an existing file) and write
# the prefix header. The handle stays open for the following cells and is
# closed by the final extention.close() cell.
extention = open("yago4-extention.ttl","w")
extention.writelines([prefix, foaf, schema, exto, extr, rdf, "\n"]) 

#### Extending Decentralized Administration Entities

In [28]:
# `matched` is the decentralized-administration result of label_filter:
# GAG id -> list of YAGO entities. Only the first YAGO entity of each list is described.
for x in matched:
    y = matched[x][0]
    extention.write(y + " rdf:type exto:Kallikratis_Decentralized_Administrations_of_Greece")
    # labels_dict is indexed directly, so a GAG id without a label raises KeyError.
    extention.write(" ;\n schema:alternateName " + labels_dict[x])
    # Code and population are optional properties.
    if x in codes_dict:
        extention.write(" ;\n exto:has_code " + codes_dict[x])
    if x in populations_dict:
        extention.write(" ;\n exto:has_population " + populations_dict[x])
    # Terminate the Turtle statement of this entity.
    extention.write(" .\n")

#### Extending Regions Entities

In [29]:
# Running number used to name the geometry nodes extr:Geometry_<n>.
geometry = 0
for x in regions_matched:
    # NOTE(review): [0] raises IndexError if geometry_filter emptied this list.
    y = regions_matched[x][0]
    extention.write(y + " rdf:type exto:Kallikratis_Regions_of_Greece")
    extention.write(" ;\n schema:alternateName " + labels_dict[x])
    if x in codes_dict:
        extention.write(" ;\n exto:has_code " + codes_dict[x])
    if x in populations_dict:
        extention.write(" ;\n exto:has_population " + populations_dict[x])
    if x in contained_dict:
        # Strip the angle brackets so the parent URI can be looked up in `matched`
        # (the decentralized administrations) and link to every YAGO entity found there.
        obj = contained_dict[x].replace("<","").replace(">","")
        if obj in matched:
            for z in matched[obj]:
                extention.write(" ;\n schema:containedIn " + z)
    extention.write(" ;\n schema:geo extr:Geometry_" + str(geometry))
    geo = schema_serializer(gag_regions_geo, x)
    # Each polygon closes the previous statement (" .") and starts a new one whose
    # subject is the geometry node; the final write below closes the last statement.
    for poly in geo:
        extention.write(" .\n extr:Geometry_" + str(geometry) + " schema:polygon " + "\"" + poly + "\"")
    geometry += 1
    extention.write(" .\n")

#### Extending Regional Units Entities

In [30]:
# Describe every GAG regional unit that was matched, using the first YAGO entity of its list.
for x in regional_units_matched:
    # NOTE(review): [0] raises IndexError if geometry_filter emptied this list.
    y = regional_units_matched[x][0]
    extention.write(y + " rdf:type exto:Kallikratis_Regional_Units_of_Greece")
    extention.write(" ;\n schema:alternateName " + labels_dict[x])
    if x in codes_dict:
        extention.write(" ;\n exto:has_code " + codes_dict[x])
    if x in populations_dict:
        extention.write(" ;\n exto:has_population " + populations_dict[x])
    if x in contained_dict:
        # Link to every YAGO entity matched to the containing GAG region.
        obj = contained_dict[x].replace("<","").replace(">","")
        if obj in regions_matched:
            for z in regions_matched[obj]:
                extention.write(" ;\n schema:containedIn " + z)
    extention.write(" .\n")

In [31]:
# `counter` numbers the newly minted extr:kallikratis_entity_<n> resources; it keeps
# increasing across the following cells so that every new entity gets a unique name.
counter = 1
# GAG id -> name of the new entity, used later to resolve containedIn links.
reg_units_entities = {}
for x in gag_unmatched_reg_units:
    extention.write("extr:kallikratis_entity_" + str(counter) + " rdf:type exto:Kallikratis_Regional_Units_of_Greece")
    reg_units_entities[x] = "extr:kallikratis_entity_" + str(counter)
    extention.write(" ;\n schema:alternateName " + labels_dict[x])
    if x in codes_dict:
        extention.write(" ;\n exto:has_code " + codes_dict[x])
    if x in populations_dict:
        extention.write(" ;\n exto:has_population " + populations_dict[x])
    if x in contained_dict:
        obj = contained_dict[x].replace("<","").replace(">","")
        if obj in regions_matched:
            for z in regions_matched[obj]:
                extention.write(" ;\n schema:containedIn " + z)
    extention.write(" .\n")
    counter += 1

#### Extending Municipalities Entities

In [32]:
# Describe every GAG municipality that was matched, using the first YAGO entity of its list.
for x in municipalities_matched:
    # NOTE(review): [0] raises IndexError if geometry_filter emptied this list.
    y = municipalities_matched[x][0]
    extention.write(y + " rdf:type exto:Kallikratis_Municipalities_of_Greece")
    extention.write(" ;\n schema:alternateName " + labels_dict[x])
    if x in codes_dict:
        extention.write(" ;\n exto:has_code " + codes_dict[x])
    if x in populations_dict:
        extention.write(" ;\n exto:has_population " + populations_dict[x])
    if x in contained_dict:
        obj = contained_dict[x].replace("<","").replace(">","")
        # The parent regional unit is either a matched YAGO entity (possibly several)
        # or one of the entities newly created for unmatched regional units.
        if obj in regional_units_matched:
            for z in regional_units_matched[obj]:
                extention.write(" ;\n schema:containedIn " + z)
        else:
            if obj in reg_units_entities:
                extention.write(" ;\n schema:containedIn " + reg_units_entities[obj])
    extention.write(" .\n")

In [33]:
# GAG id -> name of the entity created here, used later for containedIn and has_seat links.
municipalities_entities = {}
# `counter` continues from the regional-unit cell, so entity numbers stay unique.
for x in gag_unmatched_municipalities:
    extention.write("extr:kallikratis_entity_" + str(counter) + " rdf:type exto:Kallikratis_Municipalities_of_Greece")
    municipalities_entities[x] = "extr:kallikratis_entity_" + str(counter)
    extention.write(" ;\n schema:alternateName " + labels_dict[x])
    if x in codes_dict:
        extention.write(" ;\n exto:has_code " + codes_dict[x])
    if x in populations_dict:
        extention.write(" ;\n exto:has_population " + populations_dict[x])
    if x in contained_dict:
        obj = contained_dict[x].replace("<","").replace(">","")
        # Parent regional unit: matched YAGO entities first, otherwise a newly created entity.
        if obj in regional_units_matched:
            for z in regional_units_matched[obj]:
                extention.write(" ;\n schema:containedIn " + z)
        else:
            if obj in reg_units_entities:
                extention.write(" ;\n schema:containedIn " + reg_units_entities[obj])
    extention.write(" .\n")
    counter += 1

#### Inserting Municipal Units Entities to YAGO

In [34]:
# Municipal units are not matched against YAGO: a new entity is created for
# every line of the GAG label file.
municipal_units = read_file('data/gag/labels/municipal_units.nt')
# GAG id -> name of the new entity, used by the municipal communities cell.
municipal_units_entities = {}
for line in municipal_units:
    # Subject URI without angle brackets.
    subj = line.split("<")[1].split(">")[0]
    extention.write("extr:kallikratis_entity_" + str(counter) + " rdf:type exto:Kallikratis_Municipal_Units_of_Greece")
    municipal_units_entities[subj] = "extr:kallikratis_entity_" + str(counter)
    extention.write(" ;\n schema:alternateName " + labels_dict[subj])
    if subj in codes_dict:
        extention.write(" ;\n exto:has_code " + codes_dict[subj])
    if subj in populations_dict:
        extention.write(" ;\n exto:has_population " + populations_dict[subj])
    if subj in contained_dict:
        obj = contained_dict[subj].replace("<","").replace(">","")
        # Parent municipality: matched YAGO entities first, otherwise a newly created entity.
        if obj in municipalities_matched:
            for z in municipalities_matched[obj]:
                extention.write(" ;\n schema:containedIn " + z)
        else:
            if obj in municipalities_entities:
                extention.write(" ;\n schema:containedIn " + municipalities_entities[obj]) 
    extention.write(" .\n")
    counter += 1

#### Inserting Municipal Communities Units Entities to YAGO

In [35]:
# Municipal communities also get a new entity for every line of their GAG label file.
municipal_com = read_file('data/gag/labels/municipal_communities.nt')
# GAG id -> name of the new entity, used by the has_seat cell.
municipal_com_entities = {}
for line in municipal_com:
    # Subject URI without angle brackets.
    subj = line.split("<")[1].split(">")[0]
    extention.write("extr:kallikratis_entity_" + str(counter) + " rdf:type exto:Kallikratis_Municipal_Communities_of_Greece")
    municipal_com_entities[subj] = "extr:kallikratis_entity_" + str(counter)
    extention.write(" ;\n schema:alternateName " + labels_dict[subj])
    if subj in codes_dict:
        extention.write(" ;\n exto:has_code " + codes_dict[subj])
    if subj in populations_dict:
        extention.write(" ;\n exto:has_population " + populations_dict[subj])
    if subj in contained_dict:
        # The parent is only linked when it is one of the municipal units created above.
        obj = contained_dict[subj].replace("<","").replace(">","")
        if obj in municipal_units_entities:
            extention.write(" ;\n schema:containedIn " + municipal_units_entities[obj])
    extention.write(" .\n")
    counter += 1

In [36]:
# Write the exto:has_seat triple of every municipality that has a seat.
for x in seats_dict:
    # Resolve the municipality: a list of matched YAGO entities or the name of a
    # newly created entity; seats of anything else are skipped.
    if x in municipalities_matched:
        subj = municipalities_matched[x]
    elif x in municipalities_entities:
        subj = municipalities_entities[x]
    else:
        continue
    key = seats_dict[x].replace("<", "").replace(">", "")
    # Indexed directly: a seat that is not a known municipal community raises KeyError.
    obj = municipal_com_entities[key]
    if type(subj) == list:
        # Matched municipalities: only the first YAGO entity receives the seat.
        y = subj[0]
        extention.write(y + " exto:has_seat " + obj + " .\n")
    else:
        extention.write(subj + " exto:has_seat " + obj + " .\n")

In [37]:
# Flush and close the Turtle file; nothing may be written to it after this cell.
extention.close()