In [1]:
import importlib
import util
importlib.reload(util)
from util import *

In [2]:
import pandas as pd
from nltk import ngrams
from IPython.core.display import display, HTML

# Identify Job Ads Post Structure

last update: 2020-03-20

### Assumption:
- Sentences which belong the same section are in sequence.
- Each section only occur exactly only one time.

### Advantages:
1. Ordering of different sections does not matter.
2. May apply to different job ads websites.

### Limitataion:
1. Some sections may have overlapping issues. E.g. Responsibilities and Requirements.
2. Chinese Characters have not yet been dealt with.

### What's next:
1. Expand the tagging to sentence-wise or paragraph-wise
2. Tokenizr Chinese Characters to support job ads in Chinese

# Training Stage: Load training data

In [3]:
df = get_df('./jobsdb_dump_first100.json')

In [4]:
idx = 2
text = df["jobDetail"][idx]["jobDescription"]
pt = PrepText(text)
ajson = pt.get_variables()

In [5]:
jds = [PrepText(x["jobDescription"]).get_variables()["lower_text"] for x in df["jobDetail"]]

[Warn]<set_lower_text>: Contains non latin characters.
[Warn]<set_lower_text>: Contains non latin characters.
[Warn]<set_lower_text>: Contains non latin characters.
[Warn]<set_lower_text>: Contains non latin characters.
[Warn]<set_lower_text>: Contains non latin characters.


# Training Stage: NGram of all texts

In [6]:
# Ngram
sentence = ' '.join(jds)
sentence = sentence.replace("   ", " ").replace("  ", " ").replace("  ", " ").replace("  ", " ").strip("\n\r\t ")
sentence = sentence.replace("...", ".").replace(". . ", ". ").replace(". . ", ". ").replace(". . ", ". ")
n = 2
sixgrams = ngrams(sentence.split(), n)
gram_arr = list(set(sixgrams))

print(f'Token counts: {len(list(set(gram_arr)))}')

result = []
for idx in range(len(gram_arr)):
    word = ' '.join(gram_arr[idx])
    aobj = (word, sentence.count(word))
    
    result.append(aobj)

all_2grams = sorted(result, key=lambda x: x[1], reverse=True)
all_2grams

Token counts: 11873


[('will be', 121),
 ('or a', 107),
 ('and a', 103),
 ('experience in', 91),
 ('on a', 81),
 ('in a', 69),
 ('is a', 66),
 ('in the', 64),
 ('. to', 61),
 ('. in', 61),
 ('on an', 59),
 ('for a', 56),
 ('. we', 56),
 ('. good', 56),
 ('to the', 56),
 ('. requirement', 53),
 ('or recruitment', 47),
 ('for recruitment', 46),
 ('to a', 43),
 ('personal data', 42),
 ('to work', 42),
 ('used for', 41),
 ('apply now', 39),
 ('related discipline', 39),
 ('recruitment purpose', 37),
 ('. requirements:.', 37),
 ('be used', 37),
 ('english and', 36),
 ('. interested', 35),
 ('please send', 35),
 ('command of', 35),
 ('expected salary', 35),
 ('able to', 34),
 ('. prepare', 34),
 ('looking for', 33),
 ('for the', 33),
 ('. assist', 33),
 ('good command', 33),
 ('responsible for', 32),
 ('or above', 32),
 ('and chinese', 32),
 ('our client', 32),
 ('data collected', 31),
 ('of the', 31),
 ('be consider', 30),
 ('of a', 30),
 ('send your', 30),
 ('be considered', 29),
 ('r and', 29),
 ('assist in', 

# Training Stage: split into Sentence Array, doing one item

In [7]:
def lookfor(line, arr):
    for item in arr:
         if line.lower().find(item) != -1:
                return True
    return False

In [8]:
idx = 2
text = df["jobDetail"][idx]["jobDescription"]
pt = PrepText(text)
ajson = pt.get_variables()
ajson["sentence_arr"]

['Responsible for financial & management accounts reporting and manage day-to-day accounting operations',
 'Conduct the overall preparation of month-end, quarterly, interim and annual management reports including budgeting, forecasting and closing',
 'Prepare consolidation accounting reports for purpose of financial accounting book closing & management accounting review',
 'Coordinate and complete annual audits',
 'Assist in formulating, implementing and monitoring internal and financial control policies',
 'Support Company Secretarial matters from the financial perspective',
 'Liaise with external auditors / consultants on interim / annual audit and tax filing',
 'Involve in Business Valuation and other ad-hoc assignments as required']

In [9]:
s_arr = ajson["sentence_arr"]
collection = []
# -A requirement
# -B responsibility
# -C benefit
# -D apply resume
# -E company description
# -F others
curr_section = ""
for line in s_arr:
    if curr_section != "":
        matched = [curr_section]
    else:
        matched = []

    if lookfor(line, ["equirement", "or above", "or equivalent"]):
        curr_section = "A"
        matched.append(curr_section)

    if lookfor(line, ["description", "work on", "the role", "responsib"]):
        curr_section = "B"
        matched.append(curr_section)

    if lookfor(line, ["attractive", "competitive", "day work", "mpf", "medical", "remuneration"]):
        curr_section = "C"
        matched.append(curr_section)

    if lookfor(line, ["interested party", "resume", "apply", "more detail"]):
        curr_section = "D"
        matched.append(curr_section)

    if lookfor(line, ["our client"]):
        curr_section = "E"
        matched.append(curr_section)

    if lookfor(line, ["personal data", "personal information"]):
        curr_section = "F"
        matched.append(curr_section)

    matched = list(set(matched))

    collection.append({
        "cate": matched,
        "line": line
    })

collection

[{'cate': ['B'],
  'line': 'Responsible for financial & management accounts reporting and manage day-to-day accounting operations'},
 {'cate': ['B'],
  'line': 'Conduct the overall preparation of month-end, quarterly, interim and annual management reports including budgeting, forecasting and closing'},
 {'cate': ['B'],
  'line': 'Prepare consolidation accounting reports for purpose of financial accounting book closing & management accounting review'},
 {'cate': ['B'], 'line': 'Coordinate and complete annual audits'},
 {'cate': ['B'],
  'line': 'Assist in formulating, implementing and monitoring internal and financial control policies'},
 {'cate': ['B'],
  'line': 'Support Company Secretarial matters from the financial perspective'},
 {'cate': ['B'],
  'line': 'Liaise with external auditors / consultants on interim / annual audit and tax filing'},
 {'cate': ['B'],
  'line': 'Involve in Business Valuation and other ad-hoc assignments as required'}]

# Training Stage: split into Sentence Array, doing ALL training items

In [10]:
gcoll = []
for idx in range(len(df["jobDetail"]))[:]:
    text = df["jobDetail"][idx]["jobDescription"]
    pt = PrepText(text)
    ajson = pt.get_variables()
    ajson["sentence_arr"]
    
    s_arr = ajson["sentence_arr"]
    collection = []
    # -A requirement
    # -B responsibility
    # -C benefit
    # -D apply resume
    # -E company description
    # -F others
    curr_section = ""
    for line in s_arr:
        if curr_section != "":
            matched = [curr_section]
        else:
            matched = []

        if lookfor(line, ["equirement", "or above", "or equivalent"]):
            curr_section = "A"
            matched.append(curr_section)

        if lookfor(line, ["description", "work on", "the role", "responsib"]):
            curr_section = "B"
            matched.append(curr_section)

        if lookfor(line, ["attractive", "competitive", "day work", "mpf", "medical", "remuneration"]):
            curr_section = "C"
            matched.append(curr_section)

        if lookfor(line, ["interested party", "resume", "apply", "more detail"]):
            curr_section = "D"
            matched.append(curr_section)

        if lookfor(line, ["our client"]):
            curr_section = "E"
            matched.append(curr_section)

        if lookfor(line, ["personal data", "personal information"]):
            curr_section = "F"
            matched.append(curr_section)

        matched = list(set(matched))

        collection.append({
            "cate": matched,
            "line": line
        })

    # collection
    gcoll.append({
    "#A requirement": ' '.join([x["line"] for x in collection if "A" in x["cate"] and len(x["cate"]) == 1]),
    "#B responsibility": ' '.join([x["line"] for x in collection if "B" in x["cate"] and len(x["cate"]) == 1]),
    "#C benefit": ' '.join([x["line"] for x in collection if "C" in x["cate"] and len(x["cate"]) == 1]),
    "#D apply resume": ' '.join([x["line"] for x in collection if "D" in x["cate"] and len(x["cate"]) == 1]),
    "#E company description": ' '.join([x["line"] for x in collection if "E" in x["cate"] and len(x["cate"]) == 1]),
    "#F others": ' '.join([x["line"] for x in collection if "F" in x["cate"] and len(x["cate"]) == 1]),
    "#U unclass": ' '.join([x["line"] for x in collection if len(x["cate"]) == 0]),
    })
df_all = pd.DataFrame(gcoll)
df_all

[Warn]<set_lower_text>: Contains non latin characters.
[Warn]<set_lower_text>: Contains non latin characters.
[Warn]<set_lower_text>: Contains non latin characters.
[Warn]<set_lower_text>: Contains non latin characters.
[Warn]<set_lower_text>: Contains non latin characters.


Unnamed: 0,#A requirement,#B responsibility,#C benefit,#D apply resume,#E company description,#F others,#U unclass
0,Bachelor degree from Accounting and Finance ...,Responsibilities: Report to the Financial Co...,,For a confidential discussion about this role ...,,,They also have sales and trading functions wit...
1,Ensure capital operations are compliance with ...,,,DB Applicants are welcomed to visit our webs...,,Applicants not hearing from us within 6 weeks...,Duties: Overviews daily operation of a finance...
2,,Responsible for financial & management account...,,,,,
3,2 years of accounting experience (retail indus...,Responsibilities Handle day-to-day operations ...,Medical benefits Education Allowance Discretio...,date to us.,,,
4,"Be fun-loving, energetic, positive and open-mi...",Responsibilities: Mainly responsible for dail...,,,,,
...,...,...,...,...,...,...,...
95,,Data Communication or related disciplines • ...,,,,,
96,"Bachelor Degree in Finance, Accounting or rela...",Responsibilities : Reporting to the Senior Man...,,"(in PDF format) with details of qualification,...",,purposes in accordance with the laws and ordi...,
97,Degree holder is preferred Experience in Ship ...,Job Responsibilities Manage the purchasing pro...,Monitor and handle the supply and delivery pro...,"in strict confidence with your full resume, e...",,,"Arrange and administer the correct, timely and..."
98,"Excellent writing, organising, presentation an...","Responsibilities: Assist in copywriting, tran...",,"- e-mail or , - 13/ F, Island Place Tower, N...",,,Summer Trainee - Corporate Communication


# Training Stage: Build Sets of Tokens

In [11]:
def build_sets(df, key):
    sentence = ' '.join(df[key]).lower()
    sixgrams = ngrams(sentence.split(), n)
    # for grams in sixgrams:
    #     print(grams)
    gram_arr = list(set(sixgrams))
#     result = []
#     for idx in range(len(gram_arr)):
#         word = ' '.join(gram_arr[idx])
#         aobj = (word, sentence.count(word))

#         result.append(aobj)
#     sorted(result, key=lambda x: x[1], reverse=True)
    return gram_arr

In [12]:
mapping = {
"A": "#A requirement",
"B": "#B responsibility",
"C": "#C benefit",
"D": "#D apply resume",
"E": "#E company description",
"F": "#F others",
"U": "#U unclass",
}

In [13]:
sets = {}

In [14]:
sets["A"] = build_sets(df_all, mapping["A"])
sets["B"] = build_sets(df_all, mapping["B"])
sets["C"] = build_sets(df_all, mapping["C"])
sets["D"] = build_sets(df_all, mapping["D"])
sets["E"] = build_sets(df_all, mapping["E"])

In [15]:
def wanted(sets, wanted_set):
    mapping = {
    "A": "#A requirement",
    "B": "#B responsibility",
    "C": "#C benefit",
    "D": "#D apply resume",
    "E": "#E company description",
    "F": "#F others",
    "U": "#U unclass",
    }
    all_sets = set(sets["A"]).union(set(sets["B"]), set(sets["C"]), set(sets["D"]), set(sets["E"]))
    all_common = set(sets["A"]).intersection(set(sets["B"]), set(sets["C"]), set(sets["D"]), set(sets["E"]))
    exclu_wanted = all_sets - set(sets[wanted_set])
    exclu_set = exclu_wanted.intersection(set(sets[wanted_set]))
    new_gram_arr = list(set(sets[wanted_set]) - exclu_set - all_common)

    exclu_set = set([])
    if wanted_set != "A":
        exclu_set = exclu_set.union(set(sets[wanted_set]).intersection(set(sets["A"])))
    if wanted_set != "B":
        exclu_set = exclu_set.union(set(sets[wanted_set]).intersection(set(sets["B"])))
    if wanted_set != "C":
        exclu_set = exclu_set.union(set(sets[wanted_set]).intersection(set(sets["C"])))
    if wanted_set != "D":
        exclu_set = exclu_set.union(set(sets[wanted_set]).intersection(set(sets["D"])))
    if wanted_set != "E":
        exclu_set = exclu_set.union(set(sets[wanted_set]).intersection(set(sets["E"])))
    new_gram_arr = list(set(sets[wanted_set]) - exclu_set)

    sentence = ' '.join(df_all[mapping[wanted_set]]).lower()
    result = []
    for idx in range(len(new_gram_arr)):
        word = ' '.join(new_gram_arr[idx])
        aobj = (word, sentence.count(word))

        result.append(aobj)
    return sorted(result, key=lambda x: x[1], reverse=True)

wanted(sets, "E")

[('our client', 28),
 ('client is', 17),
 ('join the', 12),
 ('now looking', 11),
 ('join their', 9),
 ('client, a', 6),
 ('our client,', 6),
 ('are now', 6),
 ('the world', 5),
 ('is now', 5),
 ('high caliber', 5),
 ('following position:', 4),
 ('team, they', 4),
 ('of their', 4),
 ('sales person', 4),
 ('a world', 4),
 ('high calibre', 4),
 ('technical sales', 4),
 ('asia team,', 3),
 ('their strategic', 3),
 ('strategic account', 3),
 ('solution with', 3),
 ('the world,', 3),
 ('than 3000+', 3),
 ('person to', 3),
 ('who provide', 3),
 ('for their', 3),
 ('world, who', 3),
 ('clients due', 3),
 ('calibre for', 3),
 ('their asia', 3),
 ('leader of', 3),
 ('client our', 3),
 ('to fast', 3),
 ('caliber and', 3),
 ('financial analyst', 3),
 ('enterprise clients', 3),
 ('tier high', 3),
 ('high end', 3),
 ('account sales', 3),
 ('they have', 3),
 ('to global', 3),
 ('employees in', 3),
 ('end it', 3),
 ('provide top', 3),
 ('and ambitious', 3),
 ('world leader', 3),
 ('growing of', 3),
 

# Training Stage: testing mutually exclusive of token in different sets

In [16]:
token = wanted(sets, "E")[15][0]
list(filter(lambda x: x[0] == token, all_2grams))

[('a world', 5)]

In [17]:
len(all_2grams)

11873

In [18]:
{
    "word": token,
    "A": tuple(token.split()) in sets["A"],
    "B": tuple(token.split()) in sets["B"],
    "C": tuple(token.split()) in sets["C"],
    "D": tuple(token.split()) in sets["D"],
    "E": tuple(token.split()) in sets["E"],
}

{'word': 'a world', 'A': False, 'B': False, 'C': False, 'D': False, 'E': True}

# Training Stage: Build token dictionary of each section

In [19]:
wA = wanted(sets, "A")
wB = wanted(sets, "B")
wC = wanted(sets, "C")
wD = wanted(sets, "D")
wE = wanted(sets, "E")

# Testing Stage: Labelling a Job Ads with guessing section using testing Datasets 

In [20]:
df400 = get_df("./jobsdb_dump_400-500.json")

idx = 64
text = df400["jobDetail"][idx]["jobDescription"]
text = PrepText(text).get_variables()["lower_text"]
text

"a world leading sourcing office is currently looking for a senior merchandiser (outdoor furniture) to join their dynamic team.   client details.   our client is one of the european leading fmcg company. due to their steady growth, they are now looking for a potential calibre to join the expanding regional team as a senior merchandiser.   description keep track with the orders delivery schedules.   source and develop new product range according to buying brief.   work with suppliers to resolve quality concerns and production schedule concerns.   monitor daily operation, including sourcing, sampling, price negotiation and project set up.   work closely with other departments including quality control and shipping for updating the quality standard and shipment status.   coordinate with factories to ensure timely deliveries profile.   f.5 or above qualification.   proactive and well organised.   good communication and negotiation skills to work with clients and suppliers.   business engli

In [21]:
total_token_count = len(all_2grams)
collection = []
for i in range(len(text.split())-2):
    token = ' '.join(text.split()[i:i+2])
    # print(token)
    term_in_section = []
    probability = []
    lsA = list(filter(lambda x: x[0] == token, wA))
    lsB = list(filter(lambda x: x[0] == token, wB))
    lsC = list(filter(lambda x: x[0] == token, wC))
    lsD = list(filter(lambda x: x[0] == token, wD))
    lsE = list(filter(lambda x: x[0] == token, wE))

    if len(lsA) > 0:
        term_in_section.append({"key": "A", "prob": lsA[0][1] / total_token_count})
    if len(lsB) > 0:
        term_in_section.append({"key": "B", "prob": lsB[0][1] / total_token_count})
    if len(lsC) > 0:
        term_in_section.append({"key": "C", "prob": lsC[0][1] / total_token_count})
    if len(lsD) > 0:
        term_in_section.append({"key": "D", "prob": lsD[0][1] / total_token_count})
    if len(lsE) > 0:
        term_in_section.append({"key": "E", "prob": lsE[0][1] / total_token_count})
    collection.append({
        "token": token,
        "in": term_in_section,
    })
ajson = json.dumps(collection)
with open("tagged_section.json", mode="w") as f:
    f.write(ajson)
# collection

# Testing Stage: Output HTML of section tagging

In [22]:
ajson = json.dumps(collection)

with open("tagged_section.html", mode="w", encoding="utf8") as f:
    mapping_color = {
        "A": "red",
        "B": "orange",
        "C": "blue",
        "D": "green",
        "E": "#FF00FF",
    }
    resultHTML = ""
    for legend in mapping_color.keys():
        resultHTML = resultHTML + f'<p><span style="color: {mapping_color[legend]};">{mapping[legend]}</span></p>'
    resultHTML = resultHTML + '<hr>'
        
    for o in collection:
        word = o["token"].split(" ")[0];
        color = ""
        if len(o["in"]) > 0:
            if True or o["in"][0]["prob"] > (1/total_token_count):
                color = mapping_color[o["in"][0]["key"]]
            
        print(o)

        resultHTML = resultHTML + f'<span style="color: {color};">{word}</span> '
        if word.find(".") != -1:
            resultHTML = resultHTML + '<br>'
            
    lastItem = collection[len(collection) -1 ]
    word = lastItem["token"].split(" ")[1];
    color = ""
    if len(lastItem["in"]) > 0:
        color = mapping_color[lastItem["in"][0]["key"]]
    resultHTML = resultHTML + f'<span style="color: {color};">{word}</span> '

    
    js_version = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Tagged Job Ads section</title>
<script>
window.ajson = """ + ajson + """;
</script>
</head>

<body>
<h1>Tagged Job Ads section</h1>
<div id="tags">

</div>

<script>
let tagsEl = document.querySelector("#tags");
let resultHTML = "";
let mapping_color = {
    "A": "red",
    "B": "orange",
    "C": "blue",
    "D": "green",
    "E": "#FF00FF",
    "Other": ""
}

window.ajson.map((o) => {
    let word = o["token"].split(" ")[0];
    let color = ""
    if (o["in"].length > 0) {
        if (true || o["in"][0]["prob"] > (1/""" + str(total_token_count) + """)) {
            color = mapping_color[o["in"][0]["key"]];
        }
    }
    resultHTML += `<span style="color: ${color};">${word}</span> `
    if (word.indexOf(".") != -1) {
        resultHTML += `<br>`
    }
})


let lastItem = window.ajson[window.ajson.length-1]
word = lastItem["token"].split(" ")[1];
color = ""
if (lastItem["in"].length > 0) {
    color = mapping_color[lastItem["in"][0]["key"]];
}
resultHTML += `<span style="color: ${color};">${word}</span> `

tagsEl.innerHTML = resultHTML;
</script>
</body>
</html>
    """
    pre_gen_version = """
    <div id="tags">
    """ + resultHTML + """
</div>
    """
    f.write(js_version)

{'token': 'a world', 'in': [{'key': 'E', 'prob': 0.000336898846121452}]}
{'token': 'world leading', 'in': []}
{'token': 'leading sourcing', 'in': []}
{'token': 'sourcing office', 'in': []}
{'token': 'office is', 'in': []}
{'token': 'is currently', 'in': [{'key': 'E', 'prob': 8.4224711530363e-05}]}
{'token': 'currently looking', 'in': [{'key': 'E', 'prob': 0.000168449423060726}]}
{'token': 'looking for', 'in': []}
{'token': 'for a', 'in': []}
{'token': 'a senior', 'in': []}
{'token': 'senior merchandiser', 'in': [{'key': 'E', 'prob': 8.4224711530363e-05}]}
{'token': 'merchandiser (outdoor', 'in': []}
{'token': '(outdoor furniture)', 'in': []}
{'token': 'furniture) to', 'in': []}
{'token': 'to join', 'in': []}
{'token': 'join their', 'in': [{'key': 'E', 'prob': 0.0007580224037732671}]}
{'token': 'their dynamic', 'in': [{'key': 'E', 'prob': 8.4224711530363e-05}]}
{'token': 'dynamic team.', 'in': []}
{'token': 'team. client', 'in': []}
{'token': 'client details.', 'in': []}
{'token': 'deta

In [23]:
display(HTML(pre_gen_version))

In [24]:
# End