In [16]:
import pandas as pd
import gensim
from gensim.models import Word2Vec, FastText
import spacy
import numpy as np
import pickle
import re
import sklearn
from sklearn.ensemble import RandomForestClassifier

# Load the spaCy pipeline once at module level; the helpers in this notebook use
# it for lemmatization (`nlp(text)`) and for stop-word lookups (`nlp.vocab[...]`).
nlp = spacy.load('en_core_web_lg')

In [17]:
def text_to_vector(text, model):
    """Average the word vectors of the whitespace-separated words in `text`.

    Args:
        text: The phrase to embed. Anything that is not a ``str`` (e.g. ``None``
            or ``NaN``) is treated as empty.
        model: An object exposing word vectors through ``model.wv[word]``.
            If ``model.wv`` has a ``vector_size`` attribute it determines the
            output dimension; otherwise 100 is assumed.

    Returns:
        numpy.ndarray: The mean of the vectors of all words found in the
        model, or an all-zero vector when no word could be looked up.
    """
    # Take the dimension from the model when it advertises one, so a model of a
    # different size does not silently produce all-zero vectors.
    dimension = getattr(getattr(model, "wv", None), "vector_size", 100)
    vector = np.zeros(dimension)

    if not isinstance(text, str):
        return vector

    found_words = 0
    for word in text.split():
        try:
            vector += model.wv[word]
        except KeyError:
            # Out-of-vocabulary word: skip it, but let any other error surface.
            continue
        found_words += 1

    # Avoid dividing by zero when nothing was found.
    return vector / found_words if found_words else vector
    
def lemmatizer(text):
    """Return `text` with each token replaced by its spaCy lemma.

    The input is coerced with ``str()`` and run through the module-level
    ``nlp`` pipeline. A token whose lemma is the placeholder ``'-PRON-'``
    keeps its original surface text. Tokens are joined with single spaces.
    """
    pieces = []
    for token in nlp(str(text)):
        lemma = token.lemma_
        pieces.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(pieces)

def stop_word_remover(text):
    """Return `text` with every token that spaCy flags as a stop word removed.

    The input is coerced with ``str()`` and tokenized by the module-level
    ``nlp`` pipeline; surviving tokens are joined with single spaces.
    """
    kept = []
    for token in nlp(str(text)):
        if not token.is_stop:
            kept.append(token.text)
    return ' '.join(kept)

In [124]:
# Saved FastText model; `app` passes it to `text_to_vector` to embed phrases.
model = FastText.load("FastText.model")

# SECURITY: unpickling can execute arbitrary code - only load classifier files
# that come from a trusted source.
# The context manager closes the file handle once the classifier is loaded.
with open("finalized_model", 'rb') as clf_file:
    clf = pickle.load(clf_file)

# Alternative classifiers (uncomment one to use it instead of the one above):
#clf = pickle.load(open("Logistic Regression custom fasttextLayer2 True True", 'rb'))
#clf = pickle.load(open("Probabilistic SVM custom fasttextLayer2 True True", 'rb'))

In [125]:
def sentence_preprocess(text):
    """Normalise a raw sentence before candidate phrases are extracted from it.

    Steps, in order: strip surrounding whitespace, collapse runs of spaces,
    detach ``! ? , .`` from the word character preceding them, lowercase, and
    lemmatize with `lemmatizer`.

    Args:
        text (str): The raw sentence.

    Returns:
        str: The preprocessed sentence. Tokenization is left to the caller.
    """
    # (1) Removing leading and ending whitespaces
    preprocessed_text = text.strip()

    # (2) Replacing multiple spaces with a single space.
    #     This must run on the stripped text, not on the raw `text`,
    #     otherwise the result of step (1) is thrown away.
    preprocessed_text = re.sub(' +', ' ', preprocessed_text)

    # (3) First regard special characters as different words.
    #     Raw string so that `\w` is not treated as a (invalid) string escape.
    preprocessed_text = re.sub(r'(?<=\w)([!?,.])', r' \1', preprocessed_text)

    # (4) Lowercase
    preprocessed_text = preprocessed_text.lower()

    # (5) Lemmatize
    return lemmatizer(preprocessed_text)

In [126]:
def extract_skills(token_list, threshold = 0.2):
    """Pick the highest-confidence, non-overlapping entries from `token_list`.

    Each entry is indexable, with an inclusive ``(start, end)`` word span at
    position 2 and a confidence score at position 3. Entries are visited from
    highest to lowest confidence; an entry is kept only if its span shares no
    word index with an entry already kept. Iteration stops at the first entry
    whose confidence is below `threshold`.

    Returns:
        list: The kept entries, ordered by decreasing confidence.
    """
    by_confidence = list(token_list)
    by_confidence.sort(key=lambda entry: 1 - entry[3])

    chosen = []
    occupied = set()
    for entry in by_confidence:
        if entry[3] < threshold:
            # Everything after this point scores lower still.
            break
        span = entry[2]
        covered = set(range(span[0], span[1] + 1))
        if occupied.isdisjoint(covered):
            chosen.append(entry)
            occupied |= covered
    return chosen

In [127]:
def _context_window(context_words, from_end, size=5):
    """Join at most `size` non-stop-words of `context_words` into one string."""
    kept = [str(word) for word in context_words if not nlp.vocab[word].is_stop]
    if len(kept) > size:
        # Keep the words nearest the candidate: the tail of a left context,
        # the head of a right context.
        kept = kept[-size:] if from_end else kept[:size]
    return ' '.join(kept)


def app(text, contextual=False, confidence_threshold=0.3):
    """Label candidate phrases of `text` and return the non-overlapping best ones.

    The sentence is preprocessed with `sentence_preprocess`, split on single
    spaces, and every run of 1 to 4 consecutive words is scored by the
    module-level classifier `clf` using vectors from the module-level `model`.

    Args:
        text (str): The raw sentence to analyse.
        contextual (bool): When true, the classifier input is the concatenation
            of the vectors of the left context, the candidate, and the right
            context (each context being up to 5 non-stop-words). Otherwise
            only the candidate vector is used.
        confidence_threshold (float): A candidate is labelled only if the
            classifier's highest class probability exceeds this value; the
            same value is passed on to `extract_skills`.

    Returns:
        list: Tuples ``(phrase, label, (start, end), confidence)`` as selected
        by `extract_skills`, where ``start``/``end`` are inclusive word indices.
    """
    labelled_tokens = []
    words = sentence_preprocess(text).split(" ")

    # Candidate phrases are 1 to 4 words long.
    for token_length in range(1, 5):

        for index in range(len(words) + 1 - token_length):

            end = index + token_length
            candidate_skill = " ".join(words[index:end])

            # Skip stop words and bare punctuation tokens.
            if nlp.vocab[candidate_skill].is_stop or candidate_skill in (".", ","):
                continue

            if contextual:
                # Empty slices at either sentence edge yield an empty context
                # string, so no special-casing of the boundaries is needed.
                left_context = _context_window(words[:index], from_end=True)
                right_context = _context_window(words[end:], from_end=False)

                wv1 = text_to_vector(left_context, model)
                wv2 = text_to_vector(candidate_skill, model)
                wv3 = text_to_vector(right_context, model)

                input_vector = np.concatenate((wv1, wv2, wv3), axis=None).reshape(1, -1)
            else:
                input_vector = text_to_vector(candidate_skill, model).reshape(1, -1)

            # TODO: the classifier's probabilities may need to be calibrated.
            confidence = clf.predict_proba(input_vector).max()
            if confidence > confidence_threshold:
                # Unwrap the single prediction so `label` is always a scalar.
                label = clf.predict(input_vector)[0]
                if label != "no skill":
                    labelled_tokens.append((candidate_skill, label, (index, end - 1), confidence))

    return extract_skills(labelled_tokens, threshold = confidence_threshold)

In [128]:
# NOTE: `text1` is first assigned in a later cell (In [131]); on a fresh kernel
# this call raises NameError. Run that cell first, or move it above this one.
sentence_preprocess(text1)

'our company be one of the large car industry leadership huge market share , with over 20000 employee and over 1000 manager as a candidate you must be communicative , have a good work ethic as well as possess a descent knowledge of python .'

In [129]:
text = "You must be proficient in web security as well as self motivated"

In [130]:
app(text, contextual=True, confidence_threshold=0.25)

[('motivate', 'Emotional intelligence', (11, 11), 0.26)]

In [131]:
text1 = "Our company is one of the largest car industries leadership huge market share, with over 20000 employees and over 1000 managers As a candidate you must be communicative, have a good work ethic as well as possess a descent knowledge of Python."
text2 = "The ideal candidate will have leadership and also be influential in order to motivate his team."

In [132]:
app(text2, contextual=True,  confidence_threshold=0.2)

[('leadership', 'Leadership', (5, 5), 0.37),
 ('motivate', 'Emotional intelligence', (13, 13), 0.26)]

In [133]:
app(text1, contextual=True,  confidence_threshold=0.25)

[('python', 'software engineering', (43, 43), 0.5),
 ('leadership', 'Leadership', (9, 9), 0.4),
 ('communicative', 'Communication', (28, 28), 0.32),
 ('work ethic as well', 'Adaptive', (33, 36), 0.26)]

In [134]:
text3 = "SQL, MongoDB as wells as some basic knowledge of statistics would be taken into account. "
text4 = "You will be also responsible for the account management"
app(text3, contextual=True,  confidence_threshold=0.25)

[('sql', 'software engineering', (0, 0), 0.52),
 ('mongodb', 'software engineering', (2, 2), 0.33),
 ('statistic', 'research methods', (10, 10), 0.27)]

In [135]:
app(text4, contextual=True,  confidence_threshold=0.15)

[('account management', 'accounting', (7, 8), 0.18),
 ('responsible', 'Accountability', (4, 4), 0.17)]

In [136]:
text5 = "As a trainer you will deal with pythons and other animals"
app(text5, contextual=True,  confidence_threshold=0.25)

[('python', 'software engineering', (7, 7), 0.49)]

In [137]:
text6 = "Good knowledge of Python and Java and generally Microsoft Office will be appreciated"
app(text6, contextual=True,  confidence_threshold=0.2)

[('python', 'software engineering', (3, 3), 0.51),
 ('java', 'software engineering', (5, 5), 0.38),
 ('microsoft', 'business intelligence and it systems design', (8, 8), 0.22)]

In [152]:
# Robustness Check

In [145]:
# Selenium (True)
text = "You need to have experience in Selenium as well as be able to come up with innovative solutions for demanding problems"
app(text, contextual=True,  confidence_threshold=0.1)

[('up with', 'it systems and support', (14, 15), 0.24),
 ('innovative', 'Creativity', (16, 16), 0.23),
 ('able to', 'Time management', (11, 12), 0.16),
 ('well as be', 'education, languages and art', (8, 10), 0.15),
 ('to have', 'it systems and support', (2, 3), 0.14),
 ('problem', 'Problem solving', (20, 20), 0.13),
 ('selenium', 'it systems and support', (6, 6), 0.11)]

In [144]:
# Nagios (True)
text = "Experience with common monitoring and configuration management tools such as Nagios."
app(text, contextual=True,  confidence_threshold=0.1)

[('nagio', 'it systems and support', (10, 10), 0.24),
 ('configuration', 'it systems and support', (5, 5), 0.19),
 ('management', 'management and hr', (6, 6), 0.17),
 ('tool', 'construction, maintenance and transport', (7, 7), 0.15),
 ('with common monitoring and', 'management and hr', (1, 4), 0.13)]

In [149]:
# Tenacious (True)
text = 'Sharp and tenacious troubleshooting skills will be appreciated.'
app(text, contextual=True,  confidence_threshold=0.1)

[('troubleshooting skill', 'Problem solving', (3, 4), 0.19),
 ('tenacious', 'Leadership', (2, 2), 0.18),
 ('sharp', 'Critical thinking', (0, 0), 0.16),
 ('will be', 'Enthusiasm', (5, 6), 0.16),
 ('appreciate', 'Social skills', (7, 7), 0.13)]

In [151]:
text = 'Background with Highcharts will be helpful.'
app(text, contextual=True,  confidence_threshold=0.05)

[('be helpful .', 'education, languages and art', (4, 6), 0.18),
 ('background with', 'caregiving and rehabilitation', (0, 1), 0.12),
 ('highchart will', 'Enthusiasm', (2, 3), 0.09)]

In [153]:
# meticulous (False), cheerful (True)
text = 'Meticulous editor, perfectionist, obsessive attention to detail, maddened by typos and broken links, delighted by finding and fixing them and Cheerful under pressure.'
app(text, contextual=True,  confidence_threshold=0.05)

[('attention to detail', 'Detail', (6, 8), 0.6),
 ('fix', 'business intelligence and it systems design', (21, 21), 0.16),
 (', madden by', 'mechanical and electrical engineering', (9, 11), 0.16),
 ('broken', 'it systems and support', (14, 14), 0.15),
 ('under pressure .', 'Ethic', (25, 27), 0.15),
 ('meticulous', 'Enthusiasm', (0, 0), 0.13),
 ('editor', 'education, languages and art', (1, 1), 0.13),
 (', perfectionist ,', 'education, languages and art', (2, 4), 0.13),
 ('typo', 'construction, maintenance and transport', (12, 12), 0.12),
 ('cheerful', 'Enthusiasm', (24, 24), 0.12),
 ('delight', 'Leadership', (17, 17), 0.1),
 ('they and', 'it systems and support', (22, 23), 0.09),
 ('link ,', 'accounting', (15, 16), 0.08)]

In [154]:
#  filing (True), mailing (False)
text = 'Helping with office administrative work, such as filing, mailing, and preparing for meetings.'
app(text, contextual=True,  confidence_threshold=0.05)

[('work ,', 'Adaptive', (4, 5), 0.42),
 ('meeting .', 'Goal', (15, 16), 0.29),
 ('office administrative', 'administration and law', (2, 3), 0.28),
 (', mailing ,', 'construction, maintenance and transport', (9, 11), 0.18),
 ('help with', 'education, languages and art', (0, 1), 0.17),
 ('filing', 'administration and law', (8, 8), 0.13),
 ('and prepare for', 'accounting', (12, 14), 0.07)]

In [156]:
# Digging (False)
text = 'Some duties may include digging, removing dirt, carrying bricks, mixing cement, plastering, painting, cleaning and other various labor intensive needs.'
app(text, contextual=True,  confidence_threshold=0.05)

[('duty may', 'healthcare administration', (1, 2), 0.27),
 (', mix', 'it systems and support', (11, 12), 0.2),
 (', plastering ,', 'mechanical and electrical engineering', (14, 16), 0.2),
 ('various labor', 'Leadership', (22, 23), 0.18),
 ('dirt , carry', 'construction, maintenance and transport', (7, 9), 0.15),
 ('dig , remove', 'mechanical and electrical engineering', (4, 6), 0.13),
 ('painting , cleaning and',
  'construction, maintenance and transport',
  (17, 20),
  0.13),
 ('cement', 'finance', (13, 13), 0.12),
 ('intensive need .', 'Leadership', (24, 26), 0.08)]

In [157]:
# Powerpoint (True), Independently (True)
text = "Experience in PowerPoint presentation in front of senior management team  Works effectively and independently in a dynamic environment."
app(text, contextual=True,  confidence_threshold=0.05)

[('management team \xa0', 'Conflict management', (8, 10), 0.29),
 ('work', 'Adaptive', (11, 11), 0.25),
 ('presentation', 'Presentation', (3, 3), 0.24),
 ('powerpoint', 'business intelligence and it systems design', (2, 2), 0.15),
 ('effectively', 'Leadership', (12, 12), 0.14),
 ('senior', 'education, languages and art', (7, 7), 0.12),
 ('and independently', 'Leadership', (13, 14), 0.09),
 ('environment .', 'Leadership', (18, 19), 0.09),
 ('a dynamic', 'education, languages and art', (16, 17), 0.08)]

In [158]:
# Arabic (True), Education (True)
text = 'Arabic & Persian/Farsi Linguist- Must Speak both languages Requisition'
app(text, contextual=True,  confidence_threshold=0.05)

[('speak', 'education, languages and art', (7, 7), 0.46),
 ('language', 'education, languages and art', (9, 9), 0.32),
 ('persian', 'education, languages and art', (2, 2), 0.22),
 ('arabic &', 'education, languages and art', (0, 1), 0.2),
 ('/ farsi', 'mechanical and electrical engineering', (3, 4), 0.17)]

In [160]:
# Intercultural (False)
text = 'Lastly, you have an intercultural mindset, collaborative working style, and very good communication skills.'
app(text, contextual=True,  confidence_threshold=0.05)

[('communication skill', 'Communication', (15, 16), 0.45),
 (', you have an', 'education, languages and art', (1, 4), 0.21),
 ('mindset', 'Goal', (6, 6), 0.17),
 (', collaborative working', 'Adaptive', (7, 9), 0.17),
 ('style ,', 'design', (10, 11), 0.12),
 ('intercultural', 'Leadership', (5, 5), 0.08)]

In [163]:
# degree in counseling (False)
text = 'Four-year degree in counseling or related field, or experience in another field of similar experience.'
app(text, contextual=True,  confidence_threshold=0.0)

[('-', 'mechanical and electrical engineering', (1, 1), 0.18),
 ('another field', 'civil engineering and design', (13, 14), 0.17),
 (', or', 'education, languages and art', (9, 10), 0.15),
 ('degree in counseling', 'accounting', (3, 5), 0.14),
 ('experience in', 'finance', (11, 12), 0.13),
 ('of similar', 'mechanical and electrical engineering', (15, 16), 0.11),
 ('related', 'accounting', (7, 7), 0.1),
 ('experience', 'Leadership', (17, 17), 0.1)]