# Knowledge Graph Data Preparation

In [1]:
import requests
import json
import pickle

In [2]:
import numpy as np

In [3]:
import pandas as pd

## Keyword Extraction
Keywords are collected from the [EMSI Skills API](https://api.emsidata.com/apis/skills) and from [O*NET](https://www.onetonline.org/), the U.S. Department of Labor's occupational database.

In [4]:
class APIrequest:
    """Small wrapper around ``requests`` for repeated calls to one JSON endpoint.

    Parameters
    ----------
    url : str
        Endpoint that every call to :meth:`response` targets.
    token : str, optional
        When given, it is sent as an ``Authorization: Bearer <token>`` header.
    """

    def __init__(self,url, token=None):
        self.url = url
        self.token = token
        self.headers = {}
        if token:
            self.headers['Authorization'] = f'Bearer {self.token}'

    def response(self,method="GET",params=None,data="",contenttype=None):
        """Send one request to ``self.url`` and return the decoded JSON body.

        Parameters
        ----------
        method : str
            HTTP verb passed to ``requests.request``.
        params : dict, optional
            Query-string parameters; ``None`` means no parameters.
        data : str or dict
            Request body.
        contenttype : str, optional
            ``Content-Type`` header for this call only.

        Returns
        -------
        object
            The value under the ``"data"`` key when the body is a JSON object
            that has one, otherwise the whole decoded body.
        """
        # Work on a per-call copy so a one-off Content-Type does not stick to
        # self.headers and leak into later requests.
        headers = dict(self.headers)
        if contenttype:
            headers["Content-Type"] = contenttype
        query_params = {} if params is None else params
        response = requests.request(method, self.url, headers=headers, params=query_params, data=data)
        json_resp = json.loads(response.text)
        # Only a JSON object can carry a "data" envelope; lists and scalars are
        # returned unchanged instead of raising TypeError on the key lookup.
        if isinstance(json_resp, dict) and "data" in json_resp:
            return json_resp["data"]
        return json_resp


### O*NET Keywords

#### All Careers

In [5]:
# Read the occupation workbook and keep its "Title" column as a plain list.
# Despite its name, `data_dir` holds the path of a single file.
data_dir = "../Data/Occupation Data.xlsx"
title_df = pd.read_excel(data_dir)
# NOTE: `title_list` is assigned again by the EMSI "Titles" cell further down.
title_list = title_df["Title"].to_list()

#### All Skills

In [6]:
# Unique "Element Name" values from Skills.xlsx ...
soft_skill_df = pd.read_excel("../Data/Skills.xlsx")
soft_skill_list = soft_skill_df["Element Name"].unique()
# ... and unique "Example" values from Technology Skills.xlsx ...
tech_skill_df = pd.read_excel("../Data/Technology Skills.xlsx")
tech_skill_list = tech_skill_df["Example"].unique()
# ... concatenated into one Python list (soft skills first).
all_skill_list = list(soft_skill_list)+list(tech_skill_list)

In [7]:
all_skill_list

['Reading Comprehension',
 'Active Listening',
 'Writing',
 'Speaking',
 'Mathematics',
 'Science',
 'Critical Thinking',
 'Active Learning',
 'Learning Strategies',
 'Monitoring',
 'Social Perceptiveness',
 'Coordination',
 'Persuasion',
 'Negotiation',
 'Instructing',
 'Service Orientation',
 'Complex Problem Solving',
 'Operations Analysis',
 'Technology Design',
 'Equipment Selection',
 'Installation',
 'Programming',
 'Operations Monitoring',
 'Operation and Control',
 'Equipment Maintenance',
 'Troubleshooting',
 'Repairing',
 'Quality Control Analysis',
 'Judgment and Decision Making',
 'Systems Analysis',
 'Systems Evaluation',
 'Time Management',
 'Management of Financial Resources',
 'Management of Material Resources',
 'Management of Personnel Resources',
 'Adobe Systems Adobe Acrobat',
 'AdSense Tracker',
 'Atlassian JIRA',
 "Blackbaud The Raiser's Edge",
 'ComputerEase construction accounting software',
 'Database reporting software',
 'Databox',
 'Email software',
 'Enter

#### All Knowledge

In [8]:
# Unique "Element Name" values from the knowledge workbook, as a list.
# This rebinds `data_dir`, which an earlier cell pointed at the occupation workbook.
data_dir = "../Data/Knowledge.xlsx"
knowledge_df = pd.read_excel(data_dir)
knowledge_list = list(knowledge_df["Element Name"].unique())

### EMSI Keywords

In [9]:
import os

# Credentials are read from the environment so they are never stored in the
# notebook. The pair that used to be hard-coded in this cell should be treated
# as leaked and rotated.
CLIENT_ID = os.environ.get("EMSI_CLIENT_ID")
CLIENT_SECRET = os.environ.get("EMSI_CLIENT_SECRET")
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        "Set the EMSI_CLIENT_ID and EMSI_CLIENT_SECRET environment variables "
        "before running the EMSI cells."
    )

In [10]:
def findToken():
    """Request a client-credentials access token from the EMSI auth server.

    Reads the module-level ``CLIENT_ID`` and ``CLIENT_SECRET``.

    Returns
    -------
    str
        The ``access_token`` field of the JSON response.

    Raises
    ------
    requests.exceptions.HTTPError
        If the auth server answers with an error status.
    KeyError
        If the response body has no ``access_token`` field.
    """
    url = "https://auth.emsicloud.com/connect/token"
    # Passing a dict lets requests form-encode the body, so characters such as
    # "&" or "=" inside the credentials are escaped instead of corrupting it.
    payload = {
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials",
        "scope": "emsi_open",
    }
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    response = requests.request("POST", url, data=payload, headers=headers)
    response.raise_for_status()
    # Decode the JSON rather than splitting the raw text on "," and ":", which
    # only works while the token is the first field and contains neither character.
    return response.json()["access_token"]

In [11]:
# Every run of this cell calls findToken(), i.e. performs a request to the auth endpoint.
ACCESS_TOKEN = findToken()

#### Skills

In [12]:
# Client bound to the skills endpoint; ACCESS_TOKEN is sent as a Bearer header.
EMSI_Skill_API = APIrequest("https://emsiservices.com/skills/versions/latest/skills", token=ACCESS_TOKEN)

In [None]:
# Query the skills endpoint with fields=name and limit=500, then keep each
# returned entry's "name". The list comprehension assumes the response is a
# list of dicts that each have a "name" key.
query = {"fields":"name","limit":"500"}
raw_skills = EMSI_Skill_API.response(params=query)
skill_list = [d["name"] for d in raw_skills]

#### Titles

In [14]:
# Client bound to the titles endpoint; ACCESS_TOKEN is sent as a Bearer header.
EMSI_Title_API = APIrequest("https://emsiservices.com/titles/versions/latest/titles", token=ACCESS_TOKEN)

In [None]:
# Query the titles endpoint with fields=name and limit=200, then keep each
# returned entry's "name".
query = {"fields":"name","limit":"200"}
raw_titles = EMSI_Title_API.response(params=query)
# NOTE(review): this rebinds `title_list`, replacing the list built from the
# occupation workbook earlier in the notebook. Later cells see whichever of
# the two cells ran last; confirm which source is intended.
title_list = [d["name"] for d in raw_titles]

#### Keywords shuffle

In [62]:
# Draw a small keyword sample: 3 titles, 10 skills and 2 knowledge areas.
# A seeded generator makes the sample reproducible, and sampling by index
# leaves the source lists untouched, so re-running the cell gives the same
# result instead of re-shuffling already shuffled lists.
SAMPLE_SEED = 42
_sample_rng = np.random.default_rng(SAMPLE_SEED)


def _sample_keywords(keywords, size):
    """Return up to `size` entries of `keywords`, taken from distinct positions in random order."""
    picked = _sample_rng.permutation(len(keywords))[:size]
    return [keywords[i] for i in picked]


test_keywords = (
    _sample_keywords(title_list, 3)
    + _sample_keywords(all_skill_list, 10)
    + _sample_keywords(knowledge_list, 2)
)
display(test_keywords)

['Recreational Vehicle Service Technicians',
 'Automotive Engineers',
 'Motorcycle Mechanics',
 'Enigma software',
 'Cadence Virtuoso Layout Suite',
 'Mincom MineScape',
 'Critical Thinking',
 'Human Resource MicroSystems HR Entre',
 'JustBio SeqPainter',
 'Citrix cloud computing software',
 'Gas chromatograph GS software',
 'EDExpress',
 'PointClickCare healthcare software',
 'Administrative',
 'Food Production']

In [7]:
# Hand-picked keyword set; this replaces whatever `test_keywords` held before.
test_keywords = [
    'Computer Science',
    'Art',
    'Data Scientist',
    'Amazon Web Service AWS',
    'UI Design',
    'Mathematics',
    'Photoshop',
    'SQL',
    'Microsoft Excel',
    'Visualization',
    'Tableau',
    'Bioinformatics Scientists',
    'Web design software',
    'Human Resource MicroSystems HR Entre',
    'Optical Gaging Products Measure-X',
    'Critical Thinking',
]

## Create Distance List

In [125]:
# One row per unordered pair of keywords (each keyword paired with every later one).
# Column 0 holds the pair as a set; column 1 starts as NaN, a placeholder for the
# pair's value. Sets are unordered, so unpacking a row's pair yields the two
# keywords in arbitrary order.
distance_matrix = np.array([[{a,b},np.nan] for i,a in enumerate(test_keywords) for b in test_keywords[i+1:]])

In [89]:
distance_matrix

array([[{'Art', 'Computer Science'}, nan],
       [{'Data Scientist', 'Computer Science'}, nan],
       [{'Amazon Web Service AWS', 'Computer Science'}, nan],
       [{'Computer Science', 'UI Design'}, nan],
       [{'Mathematics', 'Computer Science'}, nan],
       [{'Photoshop', 'Computer Science'}, nan],
       [{'SQL', 'Computer Science'}, nan],
       [{'Microsoft Excel', 'Computer Science'}, nan],
       [{'Visualization', 'Computer Science'}, nan],
       [{'Tableau', 'Computer Science'}, nan],
       [{'Bioinformatics Scientists', 'Computer Science'}, nan],
       [{'Web design software', 'Computer Science'}, nan],
       [{'Computer Science', 'Human Resource MicroSystems HR Entre'},
        nan],
       [{'Optical Gaging Products Measure-X', 'Computer Science'}, nan],
       [{'Computer Science', 'Critical Thinking'}, nan],
       [{'Data Scientist', 'Art'}, nan],
       [{'Amazon Web Service AWS', 'Art'}, nan],
       [{'Art', 'UI Design'}, nan],
       [{'Art', 'Mathematics'}

### Distance: Google Search Result Count

In [8]:
from bs4 import BeautifulSoup
import re
import time

In [None]:
# Test: Google Search using requests with template: "A" AND "B"
keywordA = ".Net"
keywordB = "3D"
query = f'"{keywordA}" AND "{keywordB}"'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"}
r = requests.get('https://www.google.com/search', headers=headers, params={'q':query})
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(r.text, "html.parser")
stats = soup.find('div',{'id':'result-stats'})
# The stats element is absent when the request is blocked or the page layout differs.
result = stats.text if stats is not None else ""
# Same parsing as GoogleSearchCount.search: join every digit group found before
# " result", which copes with both "About 1,230 results" and "1 result".
digits = re.findall(r"\d+", result.split(" result")[0])
count_result = int("".join(digits)) if digits else np.nan

In [26]:
# One proxy address per line. Blank lines, including the empty entry a trailing
# newline would otherwise produce, are skipped so no empty proxy string is used.
proxy_list = []
with open("../Data/proxy_list.txt","r") as file:
    proxy_list = [line.strip() for line in file if line.strip()]

In [9]:
def gentr_fn(alist):
    """Cycle endlessly over `alist`, yielding a ``requests``-style proxies dict per entry.

    Each entry is used for both the ``'http'`` and ``'https'`` keys. The
    generator ends as soon as `alist` is empty (initially, or after being
    emptied between passes) instead of spinning forever without yielding, so
    ``next()`` raises ``StopIteration`` rather than hanging.
    """
    while alist:
        for proxy in alist:
            yield {'http':f"{proxy}",'https':f"{proxy}"}

In [10]:
import browser_cookie3
from datetime import datetime

In [11]:

class GoogleSearchCount:
    """Look up the Google result count for the query ``"A" AND "B"``.

    Parameters
    ----------
    proxy_list : list, optional
        Proxy addresses handed to ``gentr_fn``. The iterator is created but
        not consumed by :meth:`search`, which sends ``self.proxies`` as is.
    sleep_time : int or float
        Base delay in seconds before every request; :meth:`search` adds a
        random extra delay between 0 and `sleep_time`.

    Notes
    -----
    Creating an instance reads the local Chrome cookies for ``.google.com``
    through ``browser_cookie3``.
    """

    def __init__(self,proxy_list=None, sleep_time=30):
        self.sleep_time = sleep_time
        self.query = ""
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
            # NOTE(review): looks like a value copied from a captured request;
            # presumably not needed - confirm before removing.
            "X-Amzn-Trace-Id": "Root=1-62acf6c7-1765297675d1730807c47579"
        }
        self.cookies = browser_cookie3.chrome(domain_name='.google.com')
        # A fresh list per instance; a shared mutable default would be visible
        # to every instance created without an explicit proxy_list.
        self.proxy_list = [] if proxy_list is None else proxy_list
        self.proxy_itr = gentr_fn(self.proxy_list)
        self.proxies = {}
        self.count = 1  # 1-based number of the next request, printed as progress

    def search(self,keywordA,keywordB):
        """Return the result count Google reports for both keywords together.

        Returns
        -------
        int or float
            The parsed count, or ``np.nan`` when the result-stats element is
            missing (treated as being blocked) or contains no digits.

        Raises
        ------
        requests.exceptions.RequestException
            Network and proxy errors from ``requests.get`` are not caught here.
        """
        self.query = f'"{keywordA}" AND "{keywordB}"'
        # Wait between sleep_time and 2 * sleep_time seconds to avoid being
        # blocked by Google. uniform() also accepts sleep_time == 0.
        time.sleep(self.sleep_time + np.random.uniform(0, self.sleep_time))
        print(self.count)
        r = requests.get('https://www.google.com/search', headers=self.headers, params={'q':self.query}, proxies=self.proxies, cookies=self.cookies)
        soup = BeautifulSoup(r.text, "html.parser")
        stats = soup.find('div',{'id':'result-stats'})
        if stats is None:
            print("Blocked..\n")
            return np.nan
        # Join every digit group found before " result", e.g.
        # "About 1,230,000 results (0.5 seconds)" -> 1230000.
        digits = re.findall(r"\d+", stats.text.split(" result")[0])
        if not digits:
            print(f"Could not parse a result count from: {stats.text!r}")
            return np.nan
        self.count += 1
        return int("".join(digits))

In [12]:
def save_result(result):
    """Pickle `result` into ``../Data/`` under a name stamped with the current minute.

    The file is called ``<YYYY-MM-DD-HH-MM>_result.pickle``; two calls within
    the same minute write to the same file.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
    target_path = '../Data/' + timestamp + "_result.pickle"
    with open(target_path, 'wb') as handle:
        pickle.dump(result, handle)

In [13]:
def gen_distance_matrix(distance_matrix,SearchEngine,save_every=50):
    """Fill the empty rows of `distance_matrix` with search-result counts.

    Parameters
    ----------
    distance_matrix : sequence of rows
        Each row holds a keyword pair in position 0 and its value in the last
        position. Rows whose value is already non-null are skipped, so the
        function can resume from a partially filled matrix.
    SearchEngine : object
        Must provide ``search(keywordA, keywordB)``; a null result means the
        engine was blocked and stops the run.
    save_every : int
        Checkpoint through ``save_result`` after this many new results
        (positive integer).

    Returns
    -------
    The same `distance_matrix` object, updated in place.

    Notes
    -----
    Progress is saved on every exit path, including a proxy failure, so an
    interrupted run loses no results.
    """
    count = 1
    finished = True
    for row in distance_matrix:
        if not pd.isnull(row[-1]):
            continue  # already has a value from an earlier run
        keywordA,keywordB = row[0]
        try:
            result = SearchEngine.search(keywordA,keywordB)
        except requests.exceptions.ProxyError:
            print('Blocked..Please try again later..')
            finished = False
            break
        if pd.isnull(result):
            finished = False
            break
        row[-1] = result

        if count % save_every == 0:
            save_result(distance_matrix)
            print(f"***Saved with {count} results***")
        count += 1
    save_result(distance_matrix)
    if finished:
        print("***Finished***")
    else:
        print(f"***Stopped early after {count - 1} new results; progress saved***")
    return distance_matrix

In [14]:
# No proxies; search() waits sleep_time seconds plus a random extra delay before each request.
SearchEngine = GoogleSearchCount(proxy_list=[], sleep_time=20)

In [17]:
# Fills the rows whose value is still null; rows that already hold a value are
# skipped, so this cell can be re-run to resume.
distance_matrix = gen_distance_matrix(distance_matrix,SearchEngine=SearchEngine)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
***Finished***


In [18]:
distance_matrix

array([[{'Art', 'Computer Science'}, 347000000],
       [{'Data Scientist', 'Computer Science'}, 8580000],
       [{'Computer Science', 'Amazon Web Service AWS'}, 17700],
       [{'Computer Science', 'UI Design'}, 1350000],
       [{'Computer Science', 'Mathematics'}, 358000000],
       [{'Computer Science', 'Photoshop'}, 7700000],
       [{'SQL', 'Computer Science'}, 28000000],
       [{'Microsoft Excel', 'Computer Science'}, 5970000],
       [{'Computer Science', 'Visualization'}, 34500000],
       [{'Tableau', 'Computer Science'}, 4500000],
       [{'Computer Science', 'Bioinformatics Scientists'}, 7850],
       [{'Computer Science', 'Web design software'}, 798000],
       [{'Human Resource MicroSystems HR Entre', 'Computer Science'}, 3],
       [{'Optical Gaging Products Measure-X', 'Computer Science'}, 1],
       [{'Critical Thinking', 'Computer Science'}, 13400000],
       [{'Art', 'Data Scientist'}, 12400000],
       [{'Art', 'Amazon Web Service AWS'}, 29600],
       [{'Art', 'U

In [15]:
# Reload a previously saved result; the file name follows the pattern
# save_result() writes, so presumably this is one of its checkpoints.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('../Data/2022-06-19-22-33_result.pickle','rb') as file:
    distance_matrix = pickle.load(file)

## FastMap

In [109]:
import matplotlib.pyplot as plt

In [121]:
def prepare_dist_mat(distance_matrix,keywords_list):
    """Translate keyword pairs into index pairs.

    For every row of `distance_matrix`, the two keywords in ``row[0]`` are
    replaced by the set of their positions in `keywords_list`, and ``row[-1]``
    is carried over as the matching value.

    Returns
    -------
    dict
        ``{"index": [set_of_two_positions, ...], "value": [row_value, ...]}``,
        both lists in row order.

    Raises
    ------
    ValueError
        If a keyword is missing from `keywords_list`, or a pair does not
        unpack into exactly two keywords.
    """
    pair_indices = []
    pair_values = []
    for entry in distance_matrix:
        first, second = entry[0]
        first_pos = keywords_list.index(first)
        second_pos = keywords_list.index(second)
        pair_indices.append({first_pos, second_pos})
        pair_values.append(entry[-1])
    return {"index": pair_indices, "value": pair_values}

In [None]:
prepare_dist_mat(distance_matrix,test_keywords)

In [124]:
class FastMaP:
    
    def __init__(self):
        return

    def gen_distance_matrix(self,dist_mat,x):
        for i in range(len(dist_mat["index"])):
            a,b = dist_mat["index"][i]
            dist_mat["value"][i] = np.sqrt(np.square(dist_mat["value"][i])-np.square(x[a]-x[b]))
        return dist_mat
    
    def gen_x(self,a,b,dist_mat):
        x = np.zeros(N)
        dab = self.find_distance(a,b,dist_mat)
        x[a] = 0 # P_a to itself
        x[b] = dab # Furthest distance
        for i in range(N):
            if i==a or i==b:
                continue
            dai = self.find_distance(a,i,dist_mat)
            dib = self.find_distance(i,b,dist_mat)
            x[i] = (dai**2+dab**2-dib**2)/(2*dab)
        return x
    
    def fit(self,dist_mat,keywords_list,k=2):
        N = len(keywords_list)
        result = np.zeros((N,k))
        for c in range(k):
            if c != 0:
                dist_mat = gen_distance_matrix(dist_mat,x)
            a,b = self.find_furthest_pair(dist_mat)
            x = self.gen_x(a,b,dist_mat)
            result[:,c] = x
        return result

    def plot(self,dist_mat,keywords_list,k=2):
        map_result = self.fit(dist_mat,keywords_list,k)
        fig = plt.figure()
        ax = fig.add_subplot()
        ax.scatter(map_result[:,0], map_result[:,1])
        for (label,x,y) in zip(keywords_list,map_result[:,0],map_result[:,1]):
            plt.annotate(label, (x, y))
        plt.show