In [1]:
import os
import re
import platform
import codecs
import nltk
import sys
import csv
import itertools
import urllib.request
import platform
from bs4 import BeautifulSoup
import re

This python notebook searches into http://www.onetcodeconnector.org and assigns the relevant SOC codes. For example, a search for "plastic operator" can be done by putting in:
https://www.onetcodeconnector.org/find/result?s=plastic+operator

In [2]:
phrase = 'plastic operator'

# (1.) form an URL

URL_part1 = "http://www.onetcodeconnector.org/find/result?s="
URL_part2 = re.sub(' ','+',phrase) # replace space ' ' with '+'
URL = URL_part1 + URL_part2
# For example: 
# https://www.onetcodeconnector.org/find/result?s=plastic+operator
    
# (2.) submit and request the ONET webpage

req = urllib.request.Request(URL)
resp = urllib.request.urlopen(req)
respData = resp.read()    
soup = BeautifulSoup(respData,"html.parser") 
all_html = str(soup.prettify())

# (3.) cut everything before the word "Activities" and split by line 

relevant_result = re.split('\n',re.sub('^.*Activities','',all_html))
relevant_result = [w.lower() for w in relevant_result if not '<' in w] 
#take out all html markers, i.e., everything with <...>
    
relevant_result = [w for w in relevant_result if re.findall(r'\s\d',w)] 
#only take the line that contains numbers, ie., a valid soc and score 

# (4.) take soc from HTML text 

soc_result = [re.findall(r'\d{2}-\d{4}\.\d{2}',w)[0] for w in relevant_result if re.findall(r'\d{2}-\d{4}\.\d{2}',w)]
# '\d{2}-\d{4}\.\d{2}' = 2 numbers + "_" + 4 numbers + "." + 2 numbers
# (which is the a valid soc, say "51-4072.00".) 

# (5.) create a list of string that is NOT soc...so we can extract match scores

not_soc = [w for w in relevant_result if not re.findall(r'\d{2}-\d{4}\.\d{2}',w)]
# not soc means a score

# (6.) extract match scores
score_result = [re.findall(r'\d{1,3}',w)[0] for w in not_soc if re.findall(r'\d{1,3}',w)]
# '\d{1,3}' = upto 1-3 numbers (a valid score from 0-100)

# (7.) combine soc results and scores
final_result = [[x,y] for x,y in zip(soc_result,score_result)]

# (8.) return outputs
output = '\n'.join([phrase + ',' + ','.join(w) for w in final_result])

### How an URL should look like:

In [3]:
URL

'http://www.onetcodeconnector.org/find/result?s=plastic+operator'

### How a (messy) result from the ONET website look like. Note: We only need everything after "Activities":

In [11]:
for line in re.split('\n',all_html):
    print(line)

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                      "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <link href="https://www.onetcenter.org/rss/whatsnew.xml" rel="alternate" type="application/rss+xml"/>
  <link href="/shared/bubblepop/bubblepop-1.0.css" rel="stylesheet" type="text/css"/>
  <link href="/shared/fancybox-2.1.5/jquery.fancybox.css" rel="stylesheet" type="text/css"/>
  <link href="/shared/base.css" rel="stylesheet" type="text/css"/>
  <link href="/shared/header.css" rel="stylesheet" type="text/css"/>
  <link href="/css_main.css" rel="stylesheet" type="text/css"/>
  <script src="/shared/jquery/jquery-1.9.1.min.js" type="text/javascript">
  </script>
  <script src="/shared/jquery/jquery-migrate-1.1.1.min.js" type="text/javascript">
  </script>
  <script src="/shared/bubblepop/bubblepop-1.0.js" type="text/javascript">
  </script>
  <script src="/shared/fanc

From the html page, we can extract relevant result by first remove everything before *Activity*, then keep only lines with numbers (which are SOCs and match scores): 

### 

In [5]:
relevant_result

['        100',
 '        51-4072.00',
 '        94',
 '        51-4031.00',
 '        93',
 '        51-4193.00',
 '        91',
 '        51-4191.00',
 '        89',
 '        51-4032.00',
 '        87',
 '        51-4011.00',
 '        87',
 '        51-4023.00',
 '        86',
 '        51-4022.00',
 '        84',
 '        51-4035.00',
 '        81',
 '        51-4061.00',
 '        78',
 '        51-4081.00',
 '        77',
 '        51-4062.00',
 '        76',
 '        51-4021.00',
 '        72',
 '        51-4012.00',
 '        71',
 '        51-4199.00',
 '        64',
 '        51-4033.00',
 '        62',
 '        51-4034.00',
 '        52',
 '        51-9051.00',
 '        51',
 '        51-9041.00',
 '        49',
 '        51-9023.00',
 '      occupations 1-20 of 884 shown.',
 '        creative commons attribution 4.0 international license',
 '       site updated october 24, 2017']

A valid SOC code is: 2 numbers + "-" + 4 numbers + "." + 2 numbers

In [6]:
soc_result

['51-4072.00',
 '51-4031.00',
 '51-4193.00',
 '51-4191.00',
 '51-4032.00',
 '51-4011.00',
 '51-4023.00',
 '51-4022.00',
 '51-4035.00',
 '51-4061.00',
 '51-4081.00',
 '51-4062.00',
 '51-4021.00',
 '51-4012.00',
 '51-4199.00',
 '51-4033.00',
 '51-4034.00',
 '51-9051.00',
 '51-9041.00',
 '51-9023.00']

"not_soc" refers to everything from "relevant_result" that is not an SOC code:   

In [7]:
not_soc

['        100',
 '        94',
 '        93',
 '        91',
 '        89',
 '        87',
 '        87',
 '        86',
 '        84',
 '        81',
 '        78',
 '        77',
 '        76',
 '        72',
 '        71',
 '        64',
 '        62',
 '        52',
 '        51',
 '        49',
 '      occupations 1-20 of 884 shown.',
 '        creative commons attribution 4.0 international license',
 '       site updated october 24, 2017']

From, "not_soc", we only need to get numbers (can be from 1-3 numbers, which refers to 0-100). 

In [8]:
score_result

['100',
 '94',
 '93',
 '91',
 '89',
 '87',
 '87',
 '86',
 '84',
 '81',
 '78',
 '77',
 '76',
 '72',
 '71',
 '64',
 '62',
 '52',
 '51',
 '49',
 '1',
 '4',
 '24']

In [12]:
final_result

[['51-4072.00', '100'],
 ['51-4031.00', '94'],
 ['51-4193.00', '93'],
 ['51-4191.00', '91'],
 ['51-4032.00', '89'],
 ['51-4011.00', '87'],
 ['51-4023.00', '87'],
 ['51-4022.00', '86'],
 ['51-4035.00', '84'],
 ['51-4061.00', '81'],
 ['51-4081.00', '78'],
 ['51-4062.00', '77'],
 ['51-4021.00', '76'],
 ['51-4012.00', '72'],
 ['51-4199.00', '71'],
 ['51-4033.00', '64'],
 ['51-4034.00', '62'],
 ['51-9051.00', '52'],
 ['51-9041.00', '51'],
 ['51-9023.00', '49']]

In [10]:
output

'plastic operator,51-4072.00,100\nplastic operator,51-4031.00,94\nplastic operator,51-4193.00,93\nplastic operator,51-4191.00,91\nplastic operator,51-4032.00,89\nplastic operator,51-4011.00,87\nplastic operator,51-4023.00,87\nplastic operator,51-4022.00,86\nplastic operator,51-4035.00,84\nplastic operator,51-4061.00,81\nplastic operator,51-4081.00,78\nplastic operator,51-4062.00,77\nplastic operator,51-4021.00,76\nplastic operator,51-4012.00,72\nplastic operator,51-4199.00,71\nplastic operator,51-4033.00,64\nplastic operator,51-4034.00,62\nplastic operator,51-9051.00,52\nplastic operator,51-9041.00,51\nplastic operator,51-9023.00,49'

The results are equivalent to search manually:

<img src="example_search.png">