In [1]:
import warnings

import flashtext
import numpy as np
import pandas as pd
import spacy
from flashtext import KeywordProcessor
from spacy.matcher import PhraseMatcher

# spaCy pipeline shared by the rest of the notebook (used inside
# extract_sentences for tokenization and phrase matching).
SPACY_MODEL_NAME = "en_core_web_lg"
nlp = spacy.load(SPACY_MODEL_NAME)

In [2]:
# Input tables; both are read from the notebook's working directory.
SENTENCES_CSV = "sentences_preprocessed.csv"
SKILLS_CSV = "skills_preprocessed.csv"

sentences = pd.read_csv(SENTENCES_CSV)
skills = pd.read_csv(SKILLS_CSV)

In [4]:
# Original sentence text; extract_sentences reports it as the origin of
# every match found in the corresponding preprocessed sentence.
raw_sentences_list = sentences["sentence"].tolist()

# Text the matching actually runs on (lemmatized with stop words removed,
# going by the column name).
sentences_list = sentences["lemmatized_without_stopwords"].tolist()

# Skill phrases used as the keyword vocabulary.
skills_list = skills["skills_lemmatized"].tolist()

In [5]:
def extract_sentences(raw_sentences_list, sentences_list, skills_list, context_horizon):
    """Locate known skills in each sentence and capture the tokens around them.

    Every entry of ``sentences_list`` is scanned with a flashtext
    ``KeywordProcessor`` built from ``skills_list``. The keywords it reports
    are then matched against the spaCy-tokenized sentence with a
    ``PhraseMatcher`` to obtain token offsets, and up to ``context_horizon``
    tokens on either side of each match are kept as context.

    Parameters
    ----------
    raw_sentences_list : list
        Original sentences. Item ``i`` is recorded as the origin of every
        match found in ``sentences_list[i]``, so it must hold at least as
        many items as ``sentences_list`` (an ``IndexError`` is raised
        otherwise).
    sentences_list : list
        Sentences to search. Items that are not ``str`` are skipped.
    skills_list : list of str
        Skill phrases to look for.
    context_horizon : int
        Maximum number of tokens kept on each side of a match
        (assumed to be non-negative).

    Returns
    -------
    tuple of (list, list, list, list)
        Four parallel lists with one entry per match: the origin sentence,
        the left context, the right context and the matched skill. The
        last three are slices of the spaCy ``Doc`` built from the sentence.

    Warns
    -----
    RuntimeWarning
        Emitted once, after the loop, if any sentence was skipped because
        it was not a string or because processing it raised an exception.
    """
    keywordprocessor = KeywordProcessor()
    keywordprocessor.add_keywords_from_list(skills_list)

    origin_sentence_list = []
    left_context_list    = []
    right_context_list   = []
    candidate_skill_list = []

    # Skipped sentences are counted and reported once at the end instead of
    # being swallowed silently.
    n_skipped     = 0
    first_skipped = None   # (index, reason) of the first skipped sentence

    for n_sentence, sent in enumerate(sentences_list):

        origin_sentence = raw_sentences_list[n_sentence]

        # NOTE(review): non-string items are presumably NaN values that
        # read_csv produced for empty cells -- confirm against the input data.
        if not isinstance(sent, str):
            n_skipped += 1
            if first_skipped is None:
                first_skipped = (n_sentence, "expected str, got {}".format(type(sent).__name__))
            continue

        try:
            extracted_keywords = keywordprocessor.extract_keywords(sent)
            if not extracted_keywords:
                # No skill found: nothing to match, so skip the costly nlp() call.
                continue

            text = nlp(sent)
            # One pattern per distinct keyword is enough; PhraseMatcher only
            # needs tokenized patterns, so make_doc avoids running the full
            # pipeline on every keyword.
            keywords = [nlp.make_doc(word) for word in dict.fromkeys(extracted_keywords)]
            matcher = PhraseMatcher(nlp.vocab)
            matcher.add("Keyword_Matcher", None, *keywords)
            found_matches = matcher(text)
        except Exception as exc:
            # Best effort: one bad sentence must not abort the whole run,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            n_skipped += 1
            if first_skipped is None:
                first_skipped = (n_sentence, repr(exc))
            continue

        for _, start, end in found_matches:
            # Clamp the context window to the sentence boundaries.
            left_context    = text[max(start - context_horizon, 0) : start]
            right_context   = text[end : min(end + context_horizon, len(text))]
            candidate_skill = text[start : end]

            origin_sentence_list.append(origin_sentence)
            left_context_list.append(left_context)
            right_context_list.append(right_context)
            candidate_skill_list.append(candidate_skill)

    if n_skipped:
        warnings.warn(
            "extract_sentences skipped {} of {} sentence(s); first at index {}: {}".format(
                n_skipped, len(sentences_list), first_skipped[0], first_skipped[1]
            ),
            RuntimeWarning,
            stacklevel=2,
        )

    return origin_sentence_list, left_context_list, right_context_list, candidate_skill_list


def tuple_to_string(tup):
    """Return the ``str()`` representation of ``tup``.

    Despite the name, any object is accepted; the notebook applies this to
    the context and skill columns to turn their values into plain strings.

    Parameters
    ----------
    tup : object
        Value to convert.

    Returns
    -------
    str
        ``str(tup)``.
    """
    # Joining the characters of a string with "" yields the same string,
    # so str() alone is sufficient.
    return str(tup)

In [6]:
# Number of tokens kept on each side of a matched skill.
context_horizon = 5

(
    origin_sentence,
    left_context,
    right_context,
    candidate_skills,
) = extract_sentences(
    raw_sentences_list=raw_sentences_list,
    sentences_list=sentences_list,
    skills_list=skills_list,
    context_horizon=context_horizon,
)

In [7]:
# One row per match: the origin sentence followed by the left context,
# the matched skill and the right context.
context_columns = ["origin_sentence", "left_context", "candidate_skill", "right_context"]
context_rows = list(zip(origin_sentence, left_context, candidate_skills, right_context))

skills_with_context = pd.DataFrame(context_rows, columns=context_columns)

In [8]:
# Convert the context and skill values to plain strings, one column at a time.
for column_name in ("left_context", "right_context", "candidate_skill"):
    skills_with_context[column_name] = skills_with_context[column_name].apply(tuple_to_string)

In [9]:
# Write the result next to the notebook, without the DataFrame index.
OUTPUT_CSV = "skills_with_context.csv"
skills_with_context.to_csv(OUTPUT_CSV, index=False)

In [35]:
# Spot-check a single extracted row by position.
# NOTE(review): 2504 is a hard-coded row position; it raises IndexError if a
# re-run yields fewer rows -- confirm the row is still the intended example.
skills_with_context.iloc[2504]

origin_sentence    management
left_context                 
candidate_skill    management
right_context                
Name: 2504, dtype: object