In [1]:
# title, salary, skills, description, related_titles, responsibility

In [2]:
!pip install rdflib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdflib
  Downloading rdflib-6.2.0-py3-none-any.whl (500 kB)
[K     |████████████████████████████████| 500 kB 6.7 MB/s 
Collecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 228 kB/s 
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.2.0


In [3]:
!pip install pydotplus
!pip install graphviz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from rdflib import Graph, URIRef, Literal, XSD, Namespace, RDF, RDFS, BNode
import json
import pandas as pd
import numpy as np

In [5]:
from google.colab import drive
drive.mount('/content/drive')
dir = "/content/drive/MyDrive/558_Career_KG/"

Mounted at /content/drive


## Define Namespaces

In [6]:
FOAF = Namespace('http://xmlns.com/foaf/0.1/')
SCHEMA = Namespace('https://schema.org/')
CAREER = Namespace('https://careerkg.com/')
EMSI_SKILL = Namespace('https://skills.emsidata.com/skills/')
CAREER_TITLE = Namespace('https://careerkg.com/title/')
CAREER_SCAT = Namespace('https://careerkg.com/skill/category/')
CAREER_STYPE = Namespace('https://careerkg.com/skill/type/')

In [7]:
kg = Graph()
kg.bind('foaf', FOAF)
kg.bind('schema', SCHEMA)
kg.bind('career', CAREER)
kg.bind('career_skill_category', CAREER_SCAT)
kg.bind('career_skill_type', CAREER_STYPE)
kg.bind('career_title', CAREER_TITLE)
kg.bind('emsi_skill', EMSI_SKILL)

## Define missing Properties & Classes

### relatedTitles property

In [8]:
kg.add((CAREER.relatedTitles, RDFS.range, SCHEMA.Occupation))
kg.add((CAREER.relatedTitles, RDFS.domain, SCHEMA.Occupation))

<Graph identifier=Ncc94b8d9848f499185af2bc02939fa18 (<class 'rdflib.graph.Graph'>)>

### annualSalary & hourlySalary

In [9]:
kg.add((CAREER.annualSalary, RDFS.subPropertyOf, SCHEMA.estimatedSalary))
kg.add((CAREER.hourlySalary, RDFS.subPropertyOf, SCHEMA.estimatedSalary))

<Graph identifier=Ncc94b8d9848f499185af2bc02939fa18 (<class 'rdflib.graph.Graph'>)>

### relatedSkills property

In [10]:
kg.add((CAREER.relatedSkills, RDFS.range, CAREER.skill))
kg.add((CAREER.relatedSkills, RDFS.domain, SCHEMA.Occupation))

<Graph identifier=Ncc94b8d9848f499185af2bc02939fa18 (<class 'rdflib.graph.Graph'>)>

## Skills

In [11]:
CAREER.skill

rdflib.term.URIRef('https://careerkg.com/skill')

In [12]:
with open(dir+"Data/lightcast_all_skills.json") as file:
    all_skills = json.loads(file.read())

In [13]:
kg.add((CAREER.skill, RDFS.subClassOf, SCHEMA.thing))
kg.add((CAREER.skillCategory, RDFS.subClassOf, SCHEMA.thing))
kg.add((CAREER.skillSubcategory, RDFS.subClassOf, SCHEMA.thing))
kg.add((CAREER.skillType, RDFS.subClassOf, SCHEMA.thing))
#kg.add((CAREER.languageSkill, RDFS.subClassOf, CAREER.skill))
#kg.add((CAREER.softwareSkill, RDFS.subClassOf, CAREER.skill))

<Graph identifier=Ncc94b8d9848f499185af2bc02939fa18 (<class 'rdflib.graph.Graph'>)>

In [14]:
category_set = set()
subcategory_set = set()
skill_type_set = set()

In [15]:
def add_skill_category(obj_name):
  obj = CAREER_SCAT[obj_name]
  if obj not in category_set:
    kg.add((obj, SCHEMA.name, Literal(obj_name)))
    kg.add((obj, SCHEMA.type, CAREER.skillCategory))
    category_set.add(obj)
  return obj

def add_skill_subcategory(obj_name):
  obj = CAREER_SCAT[obj_name]
  if obj not in subcategory_set:
    kg.add((obj, SCHEMA.name, Literal(obj_name)))
    kg.add((obj, SCHEMA.type, CAREER.skillSubcategory))
    subcategory_set.add(obj)
  return obj

def add_skill_type(obj_name):
  obj = CAREER_STYPE[obj_name]
  if obj not in skill_type_set:
    kg.add((obj, SCHEMA.name, Literal(obj_name)))
    kg.add((obj, SCHEMA.type, CAREER.skillType))
    skill_type_set.add(obj)
  return obj

In [16]:
for skill_info in all_skills:
  skill = URIRef(skill_info["infoUrl"])
  id = skill_info["id"]
  kg.add((skill, SCHEMA.name, Literal(skill_info["name"])))
  kg.add((skill, RDF.type, CAREER.skill))

  # Category
  if skill_info["category"]:
    category_name = skill_info["category"]["name"].replace(" ","_")
    if category_name != 'NULL':
      subcategory_name = skill_info["subcategory"]["name"].replace(" ","_")
      category = add_skill_category(category_name)
      subcategory = add_skill_subcategory(subcategory_name)

      kg.add((skill, SCHEMA.category, BNode("category_"+id)))
      kg.add((BNode("category_"+id), SCHEMA.object, category))
      kg.add((BNode("category_"+id), SCHEMA.object, subcategory))

  # Type
  type_name = skill_info["type"]["name"].replace(" ","_")
  skill_type = add_skill_type(type_name)
  kg.add((skill, SCHEMA.type, skill_type))
  if skill_info["isLanguage"]:
    kg.add((skill, CAREER.isLanguage, Literal(True, datatype=XSD.boolean)))
  else:
    kg.add((skill, CAREER.isLanguage, Literal(False, datatype=XSD.boolean)))
  if skill_info["isSoftware"]:
    kg.add((skill, CAREER.isSoftware, Literal(True, datatype=XSD.boolean)))
  else:
    kg.add((skill, CAREER.isSoftware, Literal(False, datatype=XSD.boolean)))

  # Description
  skill_description = skill_info["description"]
  if skill_description:
    kg.add((skill, SCHEMA.description, BNode("description_"+id)))
    kg.add((BNode("description_"+id), SCHEMA.object, Literal(skill_description)))
    kg.add((BNode("description_"+id), CAREER.descriptionSource, URIRef(skill_info["descriptionSource"])))

### Visualize

In [17]:
import io
import pydotplus
from IPython.display import display, Image
from rdflib.tools.rdf2dot import rdf2dot

In [18]:
def visualize(g):
    stream = io.StringIO()
    rdf2dot(g, stream, opts = {display})
    dg = pydotplus.graph_from_dot_data(stream.getvalue())
    return dg

## Title

In [19]:
all_titles = pd.read_csv(dir+"KG Construct/all_titles_processed.csv")

In [20]:
all_titles.head()

Unnamed: 0,id,name,description,responsibility,salaryYearly,salaryHourly,related_titles,skills
0,ET4A446A1A5F6142AD,.NET Application Architect,A .NET developer is an information technology ...,Designing interfaces for client use\nIdentifyi...,174308.0,,Enterprise Architect; Systems Architect; Infra...,".NET, .NET Core, APIs, ASP.NET, AWS, Agile, An..."
1,ETB5E3860B8B9A9755,.NET Architect,Software architects are expert software design...,Perform research for projects to determine the...,170168.0,,Architectural Drafter; Architectural Designer;...,"Active Directory, Azure, Frameworks, Javascrip..."
2,ETEB3BB8E555C79368,.NET Developer,A .NET developer is an information technology ...,Designing interfaces for client use\nIdentifyi...,105292.0,,Software Engineer; Software Developer; Front E...,".NET, .NET Core, .NET Framework, APIs, ASP Net..."
3,ETB3859094FF2DD443,.NET Front End Developer,Front end developers implement the visual and ...,Designing or editing websites or web applicati...,85016.0,,Software Engineer; Software Developer; Mobile ...,"AJAX, APIs, AWS, Agile, Angular, Azure, Bootst..."
4,ETE906C8A7B45816CC,.NET Full Stack Developer,A full stack developer develops and deploys th...,Converting the elements of web designs into ex...,112176.0,,Software Engineer; Software Developer; Front E...,".NET, APIs, AWS, Agile, Angular, Azure, Back E..."


In [21]:
skill_id_df = pd.read_csv(dir+"Entity Linking/linked_skills_2307.txt", sep="; ", header=None)

  return func(*args, **kwargs)


In [22]:
skill_id_df = skill_id_df[[0,1]]
skill_id_df.columns = ["crawl_skill", "api_skill_id"]
skill_id_map = skill_id_df.set_index("crawl_skill").to_dict()["api_skill_id"]

In [23]:
def add_title_node(row):
  subject = CAREER_TITLE[row.id]
  kg.add((subject, SCHEMA.name, Literal(row["name"])))
  kg.add((subject, RDF.type, SCHEMA.Occupation))

  # Descrpt
  if type(row.description) != float:
    kg.add((subject, SCHEMA.description, Literal(row.description)))

  # Resp
  if type(row.responsibility) != float:

    kg.add((subject, SCHEMA.responsibilities, Literal("•  "+row.responsibility.replace("\n","\n•  "))))

  # Salary
  if not np.isnan(row.salaryYearly):
    kg.add((subject, CAREER.annualSalary, Literal(row.salaryYearly, datatype=XSD.float)))
  if not np.isnan(row.salaryHourly):
    kg.add((subject, SCHEMA.hourlySalary, Literal(row.salaryHourly, datatype=XSD.float)))

  # Skills
  for obj_name in row.skills.split(", "):
    obj_name = obj_name.lower()
    obj_id = skill_id_map.get(obj_name,None)
    if obj_id:
      obj = EMSI_SKILL[obj_id]
      kg.add((subject, CAREER.relatedSkills, obj))
    else:
      print(obj_name)

In [24]:
temp = all_titles[["id","name"]]
temp.set_index("name", inplace=True)
name_id_map = temp.to_dict()["id"]

In [25]:
name_id_map.get(".NET Front End Developer",None)

'ETB3859094FF2DD443'

In [26]:
def relate_titles(row):
  subject = CAREER_TITLE[row.id]
  norelated = True
  if type(row.related_titles) != float:
    for obj_name in row.related_titles.split("; "):
      obj_id = name_id_map.get(obj_name,None)
      if obj_id:
        obj = CAREER_TITLE[obj_id]
        kg.add((subject, CAREER.relatedTitles, obj))
        norelated = False
  if norelated:
    kg.add((BNode(row.id+"related"), SCHEMA.name, Literal("No Related Title")))
    kg.add((subject, CAREER.relatedTitles, BNode(row.id+"related")))

In [27]:
all_titles.apply(add_title_node,axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
55792    None
55793    None
55794    None
55795    None
55796    None
Length: 55797, dtype: object

In [28]:
all_titles.apply(relate_titles,axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
55792    None
55793    None
55794    None
55795    None
55796    None
Length: 55797, dtype: object

In [29]:
kg.serialize('Career_KG.ttl', format="turtle")

<Graph identifier=Ncc94b8d9848f499185af2bc02939fa18 (<class 'rdflib.graph.Graph'>)>