In [1]:
import nltk
# nltk.download()
import pandas as pd
import spacy
# from spacy import displacy
# from collections import Counter
import en_core_web_lg
nlp = en_core_web_lg.load()
import re
import truecase

In [2]:
import pandas as pd
import spacy
from spacy.displacy.render import EntityRenderer
from IPython.core.display import display, HTML
%load_ext autoreload
%autoreload 2

In [3]:
def custom_render(doc, df, column, options={}, page=False, minify=False, idx=0):
    """Overload the spaCy built-in rendering to allow custom part-of-speech (POS) tags.
    
    Keyword arguments:
    doc -- a spaCy nlp doc object
    df -- a pandas dataframe object
    column -- the name of of a column of interest in the dataframe
    options -- various options to feed into the spaCy renderer, including colors
    page -- rendering markup as full HTML page (default False)
    minify -- for compact HTML (default False)
    idx -- index for specific query or doc in dataframe (default 0)
    
    """
    renderer, converter = EntityRenderer, parse_custom_ents
    renderer = renderer(options=options)
    parsed = [converter(doc, df=df, idx=idx, column=column)]
    html = renderer.render(parsed, page=page, minify=minify).strip()  
    return display(HTML(html))

def parse_custom_ents(doc, df, idx, column):
    """Parse custom entity types that aren't in the original spaCy module.
    
    Keyword arguments:
    doc -- a spaCy nlp doc object
    df -- a pandas dataframe object
    idx -- index for specific query or doc in dataframe
    column -- the name of of a column of interest in the dataframe
    
    """
    if column in df.columns:
        entities = df[column][idx]
        ents = [{'start': ent[1], 'end': ent[2], 'label': ent[3]} 
                for ent in entities]
    else:
        ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
            for ent in doc.ents]
    return {'text': doc.text, 'ents': ents, 'title': None}

def render_entities(idx, df, options={}, column='named_ents'):
    """A wrapper function to get text from a dataframe and render it visually in jupyter notebooks
    
    Keyword arguments:
    idx -- index for specific query or doc in dataframe (default 0)
    df -- a pandas dataframe object
    options -- various options to feed into the spaCy renderer, including colors
    column -- the name of of a column of interest in the dataframe (default 'named_ents')
    
    """
    text = df['text'][idx]
    custom_render(nlp(text), df=df, column=column, options=options, idx=idx)

In [4]:
# colors for additional part of speech tags we want to visualize
options = {
    'colors': {'COMPOUND': '#FE6BFE', 'PROPN': '#18CFE6', 'NOUN': '#18CFE6', 'NP': '#1EECA6', 'ENTITY': '#FF8800'}
}

In [5]:
pd.set_option('display.max_rows', 10) # edit how jupyter will render our pandas dataframes
pd.options.mode.chained_assignment = None # prevent warning about working on a copy of a dataframe

In [6]:
tech = pd.read_csv("Dataset/Technology Summary Matching Project - technologies.csv")
# summary = pd.read_csv("a4f32761-65b8-45e8-99ed-fb8ad4a63f8d.csv")
# summary = pd.read_csv("train_data_original_untagged.csv")
summary = pd.read_csv("Train_Data/Accuracy_Data/Dataset_to_run_on_all/tagged_simanchala_100.csv")

# summary = pd.read_csv("Train_Data/String_Match_non_duplicates - String_Match_non_duplicates.csv")
output = pd.read_csv("Dataset/output - output.csv")

In [7]:
df = summary

In [8]:
lower = lambda x: x.lower() # make everything lowercase

In [9]:
df = pd.DataFrame(df['summaries'].apply(lower))
df.columns = ['text']
display(df)

Unnamed: 0,text
0,[15+ years within the recruiting industry comb...
1,"[bachelor of science (b.s) degree in "" telecom..."
2,[i'm a current online student for fashion merc...
3,"[high-performing,strategic-thinking profession..."
4,[i am a highly motivated civil engineer with o...
...,...
95,[features: thor: ragnarok spiderman: homecomin...
96,[passionate visioner and doer. expert in susta...
97,[i am a self-motivated sales & marketing profe...
98,[www.lisawilkins.com specialties: visual/ui/ux...


## Extract Named Entities 

In [10]:
def extract_named_ents(text):
    """Extract named entities, and beginning, middle and end idx using spaCy's out-of-the-box model. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    return [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in nlp(text).ents]

def add_named_ents(df):
    """Create new column in data frame with named entity tuple extracted.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['named_ents'] = df['text'].apply(extract_named_ents)    

In [11]:
add_named_ents(df)
display(df)

Unnamed: 0,text,named_ents
0,[15+ years within the recruiting industry comb...,"[(15+ years, 1, 10, DATE), (ten years, 56, 65,..."
1,"[bachelor of science (b.s) degree in "" telecom...","[(640, 406, 409, CARDINAL), (642, 449, 452, CA..."
2,[i'm a current online student for fashion merc...,"[(4 years ago, 98, 109, DATE), (atlanta, 232, ..."
3,"[high-performing,strategic-thinking profession...","[(more than 6 years', 54, 72, DATE), (9 years'..."
4,[i am a highly motivated civil engineer with o...,"[(eight years, 50, 61, DATE), (hong kong, 563,..."
...,...,...
95,[features: thor: ragnarok spiderman: homecomin...,"[(alice, 122, 127, PERSON), (john carter, 294,..."
96,[passionate visioner and doer. expert in susta...,"[(fifteen years, 363, 376, DATE), (fifteen yea..."
97,[i am a self-motivated sales & marketing profe...,"[(miami, 267, 272, GPE), (miami, 331, 336, GPE..."
98,[www.lisawilkins.com specialties: visual/ui/ux...,"[(first, 133, 138, ORDINAL), (first, 1191, 119..."


In [12]:
type(df["named_ents"][0][0])

tuple

In [13]:
column = 'named_ents'
render_entities(0, df, options=options, column=column) # take a look at one of the abstracts

## Extracting NOUNS 

In [14]:
def extract_nouns(text):
    """Extract a few types of nouns, and beginning, middle and end idx using spaCy's POS (part of speech) tagger. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    keep_pos = ['PROPN', 'NOUN']
    return [(tok.text, tok.idx, tok.idx+len(tok.text), tok.pos_) for tok in nlp(text) if tok.pos_ in keep_pos]

def add_nouns(df):
    """Create new column in data frame with nouns extracted.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['nouns'] = df['text'].apply(extract_nouns)

In [15]:
add_nouns(df)
display(df)

Unnamed: 0,text,named_ents,nouns
0,[15+ years within the recruiting industry comb...,"[(15+ years, 1, 10, DATE), (ten years, 56, 65,...","[(years, 5, 10, NOUN), (recruiting, 22, 32, NO..."
1,"[bachelor of science (b.s) degree in "" telecom...","[(640, 406, 409, CARDINAL), (642, 449, 452, CA...","[(bachelor, 1, 9, NOUN), (science, 13, 20, NOU..."
2,[i'm a current online student for fashion merc...,"[(4 years ago, 98, 109, DATE), (atlanta, 232, ...","[(student, 22, 29, NOUN), (fashion, 34, 41, NO..."
3,"[high-performing,strategic-thinking profession...","[(more than 6 years', 54, 72, DATE), (9 years'...","[(years, 66, 71, NOUN), (experience, 76, 86, N..."
4,[i am a highly motivated civil engineer with o...,"[(eight years, 50, 61, DATE), (hong kong, 563,...","[(engineer, 31, 39, NOUN), (years, 56, 61, NOU..."
...,...,...,...
95,[features: thor: ragnarok spiderman: homecomin...,"[(alice, 122, 127, PERSON), (john carter, 294,...","[(features, 1, 9, NOUN), (thor, 11, 15, NOUN),..."
96,[passionate visioner and doer. expert in susta...,"[(fifteen years, 363, 376, DATE), (fifteen yea...","[(visioner, 12, 20, NOUN), (doer, 25, 29, NOUN..."
97,[i am a self-motivated sales & marketing profe...,"[(miami, 267, 272, GPE), (miami, 331, 336, GPE...","[(self, 8, 12, NOUN), (sales, 23, 28, NOUN), (..."
98,[www.lisawilkins.com specialties: visual/ui/ux...,"[(first, 133, 138, ORDINAL), (first, 1191, 119...","[(specialties, 21, 32, NOUN), (visual, 34, 40,..."


In [16]:
column = 'nouns'
render_entities(0, df, options=options, column=column)


## Combine Named Entities and Nouns 

In [17]:
def extract_named_nouns(row_series):
    """Combine nouns and non-numerical entities. 
    
    Keyword arguments:
    row_series -- a Pandas Series object
    
    """
    ents = set()
    idxs = set()
    # remove duplicates and merge two lists together
    for noun_tuple in row_series['nouns']:
        for named_ents_tuple in row_series['named_ents']:
            if noun_tuple[1] == named_ents_tuple[1]: 
                idxs.add(noun_tuple[1])
                ents.add(named_ents_tuple)
        if noun_tuple[1] not in idxs:
            ents.add(noun_tuple)
    
    return sorted(list(ents), key=lambda x: x[1])

def add_named_nouns(df):
    """Create new column in data frame with nouns and named ents.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)

In [18]:
add_named_nouns(df)
display(df)

Unnamed: 0,text,named_ents,nouns,named_nouns
0,[15+ years within the recruiting industry comb...,"[(15+ years, 1, 10, DATE), (ten years, 56, 65,...","[(years, 5, 10, NOUN), (recruiting, 22, 32, NO...","[(years, 5, 10, NOUN), (recruiting, 22, 32, NO..."
1,"[bachelor of science (b.s) degree in "" telecom...","[(640, 406, 409, CARDINAL), (642, 449, 452, CA...","[(bachelor, 1, 9, NOUN), (science, 13, 20, NOU...","[(bachelor, 1, 9, NOUN), (science, 13, 20, NOU..."
2,[i'm a current online student for fashion merc...,"[(4 years ago, 98, 109, DATE), (atlanta, 232, ...","[(student, 22, 29, NOUN), (fashion, 34, 41, NO...","[(student, 22, 29, NOUN), (fashion, 34, 41, NO..."
3,"[high-performing,strategic-thinking profession...","[(more than 6 years', 54, 72, DATE), (9 years'...","[(years, 66, 71, NOUN), (experience, 76, 86, N...","[(years, 66, 71, NOUN), (experience, 76, 86, N..."
4,[i am a highly motivated civil engineer with o...,"[(eight years, 50, 61, DATE), (hong kong, 563,...","[(engineer, 31, 39, NOUN), (years, 56, 61, NOU...","[(engineer, 31, 39, NOUN), (years, 56, 61, NOU..."
...,...,...,...,...
95,[features: thor: ragnarok spiderman: homecomin...,"[(alice, 122, 127, PERSON), (john carter, 294,...","[(features, 1, 9, NOUN), (thor, 11, 15, NOUN),...","[(features, 1, 9, NOUN), (thor, 11, 15, NOUN),..."
96,[passionate visioner and doer. expert in susta...,"[(fifteen years, 363, 376, DATE), (fifteen yea...","[(visioner, 12, 20, NOUN), (doer, 25, 29, NOUN...","[(visioner, 12, 20, NOUN), (doer, 25, 29, NOUN..."
97,[i am a self-motivated sales & marketing profe...,"[(miami, 267, 272, GPE), (miami, 331, 336, GPE...","[(self, 8, 12, NOUN), (sales, 23, 28, NOUN), (...","[(self, 8, 12, NOUN), (sales, 23, 28, NOUN), (..."
98,[www.lisawilkins.com specialties: visual/ui/ux...,"[(first, 133, 138, ORDINAL), (first, 1191, 119...","[(specialties, 21, 32, NOUN), (visual, 34, 40,...","[(specialties, 21, 32, NOUN), (visual, 34, 40,..."


In [19]:
column = 'named_nouns'
render_entities(0, df, options=options, column=column)

## Extract Noun Phrases 

A Chunky Pipeline
Even mild exposure to computer science, or any of the various isoforms of engineering, will have introduced you to the idea of an abstraction, wherein low-level concepts are bundled into higher-order relationships. The noun phrase or chunk is an abstraction which consists of two or more words, and is the by-product of dependency parsing, POS tagging, and tokenization. spaCy's POS tagger is essentially a statistical model which learns to predict the tag (noun, verb, adjective, etc.) for a given word using examples of tagged-sentences.

This supervised machine learning approach relies on tokens generated from splitting text into somewhat atomic units using a rule-based tokenizer (although there are some interesting unsupervised models out there as well). Dependency parsing then uncovers relationships between these tagged tokens, allowing us to finally extract noun chunks or phrases of relevance.

The full pipeline goes something like this:

raw text → tokenization → POS tagging → dependency parsing → noun chunk extraction

Theoretically, one could swap out noun chunk extraction for named entity recognition, but that's the part of the pipeline we are attempting to modify for our own purposes, because we want n-length entities. Barring our custom intrusion, however, this is exactly how spaCy's built-in model works! If you don't believe me (which you shouldn't, since you're a scientist), scroll up to the very top of this notebook to convince yourself.

In [20]:
text = summary["summaries"][0]
spacy.displacy.render(nlp(text), jupyter=True) # generating raw-markup using spacy's built-in renderer

In [21]:
def extract_noun_phrases(text):
    """Combine noun phrases. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    return [(chunk.text, chunk.start_char, chunk.end_char, chunk.label_) for chunk in nlp(text).noun_chunks]

def add_noun_phrases(df):
    """Create new column in data frame with noun phrases.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['noun_phrases'] = df['text'].apply(extract_noun_phrases)

In [22]:
def visualize_noun_phrases(text):
    """Create a temporary dataframe to extract and visualize noun phrases. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    df = pd.DataFrame([text]) 
    df.columns = ['text']
    add_noun_phrases(df)
    column = 'noun_phrases'
    render_entities(0, df, options=options, column=column)

In [23]:
visualize_noun_phrases(text)


In [24]:
add_noun_phrases(df)
display(df)

Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases
0,[15+ years within the recruiting industry comb...,"[(15+ years, 1, 10, DATE), (ten years, 56, 65,...","[(years, 5, 10, NOUN), (recruiting, 22, 32, NO...","[(years, 5, 10, NOUN), (recruiting, 22, 32, NO...","[(the recruiting industry, 18, 41, NP), (ten y..."
1,"[bachelor of science (b.s) degree in "" telecom...","[(640, 406, 409, CARDINAL), (642, 449, 452, CA...","[(bachelor, 1, 9, NOUN), (science, 13, 20, NOU...","[(bachelor, 1, 9, NOUN), (science, 13, 20, NOU...","[(science, 13, 20, NP), ((b.s) degree, 21, 33,..."
2,[i'm a current online student for fashion merc...,"[(4 years ago, 98, 109, DATE), (atlanta, 232, ...","[(student, 22, 29, NOUN), (fashion, 34, 41, NO...","[(student, 22, 29, NOUN), (fashion, 34, 41, NO...","[(i, 1, 2, NP), (a current online student, 5, ..."
3,"[high-performing,strategic-thinking profession...","[(more than 6 years', 54, 72, DATE), (9 years'...","[(years, 66, 71, NOUN), (experience, 76, 86, N...","[(years, 66, 71, NOUN), (experience, 76, 86, N...","[(more than 6 years, 54, 71, NP), (experience,..."
4,[i am a highly motivated civil engineer with o...,"[(eight years, 50, 61, DATE), (hong kong, 563,...","[(engineer, 31, 39, NOUN), (years, 56, 61, NOU...","[(engineer, 31, 39, NOUN), (years, 56, 61, NOU...","[(i, 1, 2, NP), (a highly motivated civil engi..."
...,...,...,...,...,...
95,[features: thor: ragnarok spiderman: homecomin...,"[(alice, 122, 127, PERSON), (john carter, 294,...","[(features, 1, 9, NOUN), (thor, 11, 15, NOUN),...","[(features, 1, 9, NOUN), (thor, 11, 15, NOUN),...","[([features, 0, 9, NP), (thor, 11, 15, NP), (r..."
96,[passionate visioner and doer. expert in susta...,"[(fifteen years, 363, 376, DATE), (fifteen yea...","[(visioner, 12, 20, NOUN), (doer, 25, 29, NOUN...","[(visioner, 12, 20, NOUN), (doer, 25, 29, NOUN...","[([passionate visioner and doer. expert, 0, 37..."
97,[i am a self-motivated sales & marketing profe...,"[(miami, 267, 272, GPE), (miami, 331, 336, GPE...","[(self, 8, 12, NOUN), (sales, 23, 28, NOUN), (...","[(self, 8, 12, NOUN), (sales, 23, 28, NOUN), (...","[(i, 1, 2, NP), (a self-motivated sales, 6, 28..."
98,[www.lisawilkins.com specialties: visual/ui/ux...,"[(first, 133, 138, ORDINAL), (first, 1191, 119...","[(specialties, 21, 32, NOUN), (visual, 34, 40,...","[(specialties, 21, 32, NOUN), (visual, 34, 40,...","[([www.lisawilkins.com specialties, 0, 32, NP)..."


In [25]:
column = 'noun_phrases'
render_entities(0, df, options=options, column=column)

## Extract compound noun phrases

In [26]:
def extract_compounds(text):
    """Extract compound noun phrases with beginning and end idxs. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    comp_idx = 0
    compound = []
    compound_nps = []
    tok_idx = 0
    for idx, tok in enumerate(nlp(text)):
        if tok.dep_ == 'compound':

            # capture hyphenated compounds
            children = ''.join([c.text for c in tok.children])
            if '-' in children:
                compound.append(''.join([children, tok.text]))
            else:
                compound.append(tok.text)

            # remember starting index of first child in compound or word
            try:
                tok_idx = [c for c in tok.children][0].idx
            except IndexError:
                if len(compound) == 1:
                    tok_idx = tok.idx
            comp_idx = tok.i

        # append the last word in a compound phrase
        if tok.i - comp_idx == 1:
            compound.append(tok.text)
            if len(compound) > 1: 
                compound = ' '.join(compound)
                compound_nps.append((compound, tok_idx, tok_idx+len(compound), 'COMPOUND'))

            # reset parameters
            tok_idx = 0 
            compound = []

    return compound_nps

def add_compounds(df):
    """Create new column in data frame with compound noun phrases.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['compounds'] = df['text'].apply(extract_compounds)

In [27]:
add_compounds(df)
display(df)

Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases,compounds
0,[15+ years within the recruiting industry comb...,"[(15+ years, 1, 10, DATE), (ten years, 56, 65,...","[(years, 5, 10, NOUN), (recruiting, 22, 32, NO...","[(years, 5, 10, NOUN), (recruiting, 22, 32, NO...","[(the recruiting industry, 18, 41, NP), (ten y...","[(recruiting industry, 22, 41, COMPOUND), (qua..."
1,"[bachelor of science (b.s) degree in "" telecom...","[(640, 406, 409, CARDINAL), (642, 449, 452, CA...","[(bachelor, 1, 9, NOUN), (science, 13, 20, NOU...","[(bachelor, 1, 9, NOUN), (science, 13, 20, NOU...","[(science, 13, 20, NP), ((b.s) degree, 21, 33,...","[(telecommunications networks, 39, 66, COMPOUN..."
2,[i'm a current online student for fashion merc...,"[(4 years ago, 98, 109, DATE), (atlanta, 232, ...","[(student, 22, 29, NOUN), (fashion, 34, 41, NO...","[(student, 22, 29, NOUN), (fashion, 34, 41, NO...","[(i, 1, 2, NP), (a current online student, 5, ...","[(fashion merchandising, 34, 55, COMPOUND), (n..."
3,"[high-performing,strategic-thinking profession...","[(more than 6 years', 54, 72, DATE), (9 years'...","[(years, 66, 71, NOUN), (experience, 76, 86, N...","[(years, 66, 71, NOUN), (experience, 76, 86, N...","[(more than 6 years, 54, 71, NP), (experience,...","[(device marketing, 90, 106, COMPOUND), (produ..."
4,[i am a highly motivated civil engineer with o...,"[(eight years, 50, 61, DATE), (hong kong, 563,...","[(engineer, 31, 39, NOUN), (years, 56, 61, NOU...","[(engineer, 31, 39, NOUN), (years, 56, 61, NOU...","[(i, 1, 2, NP), (a highly motivated civil engi...","[(construction industry, 83, 104, COMPOUND), (..."
...,...,...,...,...,...,...
95,[features: thor: ragnarok spiderman: homecomin...,"[(alice, 122, 127, PERSON), (john carter, 294,...","[(features, 1, 9, NOUN), (thor, 11, 15, NOUN),...","[(features, 1, 9, NOUN), (thor, 11, 15, NOUN),...","[([features, 0, 9, NP), (thor, 11, 15, NP), (r...","[(ragnarok spiderman, 17, 35, COMPOUND), (inde..."
96,[passionate visioner and doer. expert in susta...,"[(fifteen years, 363, 376, DATE), (fifteen yea...","[(visioner, 12, 20, NOUN), (doer, 25, 29, NOUN...","[(visioner, 12, 20, NOUN), (doer, 25, 29, NOUN...","[([passionate visioner and doer. expert, 0, 37...","[(project development, 53, 72, COMPOUND), (pac..."
97,[i am a self-motivated sales & marketing profe...,"[(miami, 267, 272, GPE), (miami, 331, 336, GPE...","[(self, 8, 12, NOUN), (sales, 23, 28, NOUN), (...","[(self, 8, 12, NOUN), (sales, 23, 28, NOUN), (...","[(i, 1, 2, NP), (a self-motivated sales, 6, 28...","[(communication skills, 69, 89, COMPOUND), (mg..."
98,[www.lisawilkins.com specialties: visual/ui/ux...,"[(first, 133, 138, ORDINAL), (first, 1191, 119...","[(specialties, 21, 32, NOUN), (visual, 34, 40,...","[(specialties, 21, 32, NOUN), (visual, 34, 40,...","[([www.lisawilkins.com specialties, 0, 32, NP)...","[(ux design, 34, 43, COMPOUND), (business requ..."


In [28]:
column = 'compounds'
render_entities(0, df, options=options, column=column)

In [29]:
def extract_comp_nouns(row_series, cols=[]):
    """Combine compound noun phrases and entities. 
    
    Keyword arguments:
    row_series -- a Pandas Series object
    
    """
    return {noun_tuple[0] for col in cols for noun_tuple in row_series[col]}

def add_comp_nouns(df, cols=[]):
    """Create new column in data frame with merged entities.
    
    Keyword arguments:
    df -- a dataframe object
    cols -- a list of column names that need to be merged
    
    """
    df['comp_nouns'] = df.apply(extract_comp_nouns, axis=1, cols=cols)
cols = ['nouns', 'compounds']
add_comp_nouns(df, cols=cols)
display(df)


Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns
0,[15+ years within the recruiting industry comb...,"[(15+ years, 1, 10, DATE), (ten years, 56, 65,...","[(years, 5, 10, NOUN), (recruiting, 22, 32, NO...","[(years, 5, 10, NOUN), (recruiting, 22, 32, NO...","[(the recruiting industry, 18, 41, NP), (ten y...","[(recruiting industry, 22, 41, COMPOUND), (qua...","{client groups, groups, member, relationship, ..."
1,"[bachelor of science (b.s) degree in "" telecom...","[(640, 406, 409, CARDINAL), (642, 449, 452, CA...","[(bachelor, 1, 9, NOUN), (science, 13, 20, NOU...","[(bachelor, 1, 9, NOUN), (science, 13, 20, NOU...","[(science, 13, 20, NP), ((b.s) degree, 21, 33,...","[(telecommunications networks, 39, 66, COMPOUN...","{tudor, switching, bgp, july, ospf, dwdm, tech..."
2,[i'm a current online student for fashion merc...,"[(4 years ago, 98, 109, DATE), (atlanta, 232, ...","[(student, 22, 29, NOUN), (fashion, 34, 41, NO...","[(student, 22, 29, NOUN), (fashion, 34, 41, NO...","[(i, 1, 2, NP), (a current online student, 5, ...","[(fashion merchandising, 34, 55, COMPOUND), (n...","{nordstrom, nordstrom(nordstrom, merchandising..."
3,"[high-performing,strategic-thinking profession...","[(more than 6 years', 54, 72, DATE), (9 years'...","[(years, 66, 71, NOUN), (experience, 76, 86, N...","[(years, 66, 71, NOUN), (experience, 76, 86, N...","[(more than 6 years, 54, 71, NP), (experience,...","[(device marketing, 90, 106, COMPOUND), (produ...","{track, marketing, relationship, life, product..."
4,[i am a highly motivated civil engineer with o...,"[(eight years, 50, 61, DATE), (hong kong, 563,...","[(engineer, 31, 39, NOUN), (years, 56, 61, NOU...","[(engineer, 31, 39, NOUN), (years, 56, 61, NOU...","[(i, 1, 2, NP), (a highly motivated civil engi...","[(construction industry, 83, 104, COMPOUND), (...","{excavation, project teams, fit, construction ..."
...,...,...,...,...,...,...,...
95,[features: thor: ragnarok spiderman: homecomin...,"[(alice, 122, 127, PERSON), (john carter, 294,...","[(features, 1, 9, NOUN), (thor, 11, 15, NOUN),...","[(features, 1, 9, NOUN), (thor, 11, 15, NOUN),...","[([features, 0, 9, NP), (thor, 11, 15, NP), (r...","[(ragnarok spiderman, 17, 35, COMPOUND), (inde...","{day, turtles, harry, carribean, compass gener..."
96,[passionate visioner and doer. expert in susta...,"[(fifteen years, 363, 376, DATE), (fifteen yea...","[(visioner, 12, 20, NOUN), (doer, 25, 29, NOUN...","[(visioner, 12, 20, NOUN), (doer, 25, 29, NOUN...","[([passionate visioner and doer. expert, 0, 37...","[(project development, 53, 72, COMPOUND), (pac...","{career progression, establishment, project, c..."
97,[i am a self-motivated sales & marketing profe...,"[(miami, 267, 272, GPE), (miami, 331, 336, GPE...","[(self, 8, 12, NOUN), (sales, 23, 28, NOUN), (...","[(self, 8, 12, NOUN), (sales, 23, 28, NOUN), (...","[(i, 1, 2, NP), (a self-motivated sales, 6, 28...","[(communication skills, 69, 89, COMPOUND), (mg...","{rotor, life, llc, clients, mastery, property,..."
98,[www.lisawilkins.com specialties: visual/ui/ux...,"[(first, 133, 138, ORDINAL), (first, 1191, 119...","[(specialties, 21, 32, NOUN), (visual, 34, 40,...","[(specialties, 21, 32, NOUN), (visual, 34, 40,...","[([www.lisawilkins.com specialties, 0, 32, NP)...","[(ux design, 34, 43, COMPOUND), (business requ...","{solutions, tv systems, answer, details, busin..."


In [30]:
def drop_duplicate_np_splits(ents):
    """Drop any entities that are already captured by noun phrases. 
    
    Keyword arguments:
    ents -- a set of entities
    
    """
    drop_ents = set()
    for ent in ents:
        if len(ent.split(' ')) > 1:
            for e in ent.split(' '):
                if e in ents:
                    drop_ents.add(e)
    return ents - drop_ents

def drop_single_char_nps(ents):
    """Within an entity, drop single characters. 
    
    Keyword arguments:
    ents -- a set of entities
    
    """
    return {' '.join([e for e in ent.split(' ') if not len(e) == 1]) for ent in ents}

def drop_double_char(ents):
    """Drop any entities that are less than three characters. 
    
    Keyword arguments:
    ents -- a set of entities
    
    """
    drop_ents = {ent for ent in ents if len(ent) < 3}
    return ents - drop_ents

def keep_alpha(ents):
    """Keep only entities with alphabetical unicode characters, hyphens, and spaces. 
    
    Keyword arguments:
    ents -- a set of entities
    
    """
    keep_char = set('-abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')
    drop_ents = {ent for ent in ents if not set(ent).issubset(keep_char)}
    return ents - drop_ents

In [31]:
type(df["nouns"][0][0][0])

str

In [32]:
df["nouns"][0][0][0]

'years'

In [33]:
# Take all entries from all columns and put in a single dictionary as entities,

# Trying Knowledge Graph Approach

- https://www.kaggle.com/pavansanagapati/knowledge-graph-nlp-tutorial-bert-spacy-nltk

In [34]:
import re
import pandas as pd
# import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_lg')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [35]:
# import wikipedia sentences
candidate_sentences = pd.read_csv("Train_Data/Accuracy_Data/Dataset_to_run_on_all/tagged_simanchala_100.csv")
candidate_sentences.shape

(100, 11)

In [36]:
candidate_sentences['summaries'].sample(5)


69    [i have recently read a number of books that reference the growth mindset which is a concept that resonates with me. i've applied a growth mindset in my career and it drives me to find new things ...
76    [een mooi merk en een sterk team met ruim 17 jaar marketing & communicatie-ervaring zit ik nu op plek waarbij twee van mijn inspiratiebronnen bij elkaar komen: een mooi, sterk merk en een team van...
87    [dyson transform every category they enter with radical and iconic reinventions that work, perform and look very different. that makes dyson far from ordinary and i love the unique culture. not al...
19    [i'm co-owner/coo of alesmith brewing company which has been operating since 1995. we are also business partners with the soon to be mikkeller san diego brewing company which is due to open early ...
20    [having 10 years of experience in digital, analog mixed and rf board designing - knowledge of product designing from scratch to end - expertise in iot product design incl

In [37]:
def get_entities(sent):
  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  #############################################################
  
  for tok in nlp(sent):
    ## chunk 2
    # if token is a punctuation mark then move on to the next token
    if tok.dep_ != "punct":
      # check: token is a compound word or not
      if tok.dep_ == "compound":
        prefix = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      # check: token is a modifier or not
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      ## chunk 3
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      ## chunk 4
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      ## chunk 5  
      # update variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text
  #############################################################

  return [ent1.strip(), ent2.strip()]

In [38]:
entity_pairs = []

for i in tqdm(candidate_sentences["summaries"]):
  entity_pairs.append(get_entities(i))

100%|██████████| 100/100 [00:16<00:00,  6.02it/s]


In [39]:
entity_pairs[10:20]


[['technology cloud it', 'information facebook'],
 ['managing process yasemin', 'northwest relocation council'],
 ['larger loan i', 'residential branch bankunited'],
 ['specialist  vmware', 'solutioning investment ibm'],
 ['active brand leader', 'small advocacy influitive'],
 ['i', 'nonstop product support amazon'],
 ['principal development strategist i', 'grumman engineer corporation'],
 ['freelance  i', 'startup italiana things'],
 ['corporate account ability', 'recruitment vdart inc'],
 ['cover we', 'human alesmith brewing company']]

In [40]:
def get_relation(sent):

  doc = nlp(sent)

  # Matcher class object 
  matcher = Matcher(nlp.vocab)

  #define the pattern 
  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1", [pattern]) 

  matches = matcher(doc)
  k = len(matches) - 1

  span = doc[matches[k][1]:matches[k][2]] 

  return(span.text)

In [41]:
relations = [get_relation(i) for i in tqdm(candidate_sentences['summaries'])]

100%|██████████| 100/100 [00:13<00:00,  7.20it/s]


In [42]:
pd.Series(relations).value_counts()[:50]

am                    4
is                    3
specialist at         2
manager at            2
specialties           2
                     ..
covered               1
experience of lead    1
receive positive      1
experienced           1
research              1
Length: 50, dtype: int64

In [43]:

# extract subject
source = [i[0] for i in entity_pairs]

# extract object
target = [i[1] for i in entity_pairs]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

In [44]:
kg_df

Unnamed: 0,source,target,edge
0,senior technology any,purple squirrel microsoft,are
1,glbp \n mpls,8 network asavie,members in
2,i,founder fashion icandydesigns,sell on
3,proven distributor expertise,medical product system,receive positive
4,unique i,capable site multiplex,am
...,...,...,...
95,final cloth cut,senior engine design inc,td at
96,strategic business,plastic supply bank,location
97,professional helicopter pilot,total qc miami,broker at
98,entire development time,well nbc news,designer at


In [45]:
# Target is the Entity

In [46]:
len(kg_df["target"].to_list())

100

In [47]:
# df names : kg_df and df

In [48]:
df

Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns
0,[15+ years within the recruiting industry combined with ten years in management contribute to a well-rounded recruiting professional dedicated to finding quality candidates for my client groups. m...,"[(15+ years, 1, 10, DATE), (ten years, 56, 65, DATE), (13 years, 1005, 1013, DATE), (ten years, 1059, 1068, DATE), (11 years, 2009, 2017, DATE), (ten years, 2063, 2072, DATE)]","[(years, 5, 10, NOUN), (recruiting, 22, 32, NOUN), (industry, 33, 41, NOUN), (years, 60, 65, NOUN), (management, 69, 79, NOUN), (quality, 154, 161, NOUN), (candidates, 162, 172, NOUN), (client, 18...","[(years, 5, 10, NOUN), (recruiting, 22, 32, NOUN), (industry, 33, 41, NOUN), (years, 60, 65, NOUN), (management, 69, 79, NOUN), (quality, 154, 161, NOUN), (candidates, 162, 172, NOUN), (client, 18...","[(the recruiting industry, 18, 41, NP), (ten years, 56, 65, NP), (management, 69, 79, NP), (quality candidates, 154, 172, NP), (my client groups, 177, 193, NP), (my goal, 195, 202, NP), (the perfe...","[(recruiting industry, 22, 41, COMPOUND), (quality candidates, 154, 172, COMPOUND), (client groups, 180, 193, COMPOUND), (term relationship, 296, 313, COMPOUND), (sourcing guru, 758, 771, COMPOUND...","{client groups, groups, member, relationship, abilities, capabilities, technology, quality, match, ways, collector, squirrel stalker, skills, specialties, recruiting industry, recruiting abilities..."
1,"[bachelor of science (b.s) degree in "" telecommunications networks and software "" program at politehnica university - electronics ,telecommunications and information technology faculty master of s...","[(640, 406, 409, CARDINAL), (642, 449, 452, CARDINAL), (642, 525, 528, CARDINAL), (642, 610, 613, CARDINAL), (l2/l3, 750, 755, ORG), (2011, 1835, 1839, DATE), (july 2013, 1994, 2003, DATE), (workd...","[(bachelor, 1, 9, NOUN), (science, 13, 20, NOUN), (b.s, 22, 25, NOUN), (degree, 27, 33, NOUN), (telecommunications, 39, 57, NOUN), (networks, 58, 66, NOUN), (software, 71, 79, NOUN), (program, 82,...","[(bachelor, 1, 9, NOUN), (science, 13, 20, NOUN), (b.s, 22, 25, NOUN), (degree, 27, 33, NOUN), (telecommunications, 39, 57, NOUN), (networks, 58, 66, NOUN), (software, 71, 79, NOUN), (program, 82,...","[(science, 13, 20, NP), ((b.s) degree, 21, 33, NP), (telecommunications networks, 39, 66, NP), (software "" program, 71, 89, NP), (politehnica, 93, 104, NP), (university - electronics ,telecommunic...","[(telecommunications networks, 39, 66, COMPOUND), (university -, 105, 117, COMPOUND), (faculty master, 177, 191, COMPOUND), (university -, 284, 296, COMPOUND), (faculty routing, 356, 371, COMPOUND...","{tudor, switching, bgp, july, ospf, dwdm, technology, platforms, repositories, faculty \n \n master, eigrp, switch, faculty master, cisco ccna, m.s, engineer-internet services, mpls, level, networ..."
2,"[i'm a current online student for fashion merchandising and management. i started my own business 4 years ago, called icandydesigns. a company where i sell my own handcrafted jewelry and women's c...","[(4 years ago, 98, 109, DATE), (atlanta, 232, 239, GPE), (2015 4th quarter, 381, 397, DATE), (april 2016-may 2016, 508, 527, DATE), (over 5 years, 550, 562, DATE), (3 yrs, 574, 579, DATE), (5 year...","[(student, 22, 29, NOUN), (fashion, 34, 41, NOUN), (merchandising, 42, 55, NOUN), (management, 60, 70, NOUN), (business, 89, 97, NOUN), (years, 100, 105, NOUN), (icandydesigns, 118, 131, NOUN), (c...","[(student, 22, 29, NOUN), (fashion, 34, 41, NOUN), (merchandising, 42, 55, NOUN), (management, 60, 70, NOUN), (business, 89, 97, NOUN), (years, 100, 105, NOUN), (icandydesigns, 118, 131, NOUN), (c...","[(i, 1, 2, NP), (a current online student, 5, 29, NP), (fashion merchandising, 34, 55, NP), (management, 60, 70, NP), (i, 72, 73, NP), (my own business, 82, 97, NP), (icandydesigns, 118, 131, NP),...","[(fashion merchandising, 34, 55, COMPOUND), (nordstrom rack, 338, 352, COMPOUND), (store manager, 432, 445, COMPOUND), (nordstrom leaders, 472, 489, COMPOUND), (event coordinating, 597, 615, COMPO...","{nordstrom, nordstrom(nordstrom, merchandising, events, boutique, interest, nordstrom leaders, hands, time, student, marketing skills, a., fashion blogging, leaders, manager, snhu, fashion retail,..."
3,"[high-performing,strategic-thinking professional with more than 6 years' of experience in medical device marketing and product management. an engineer by qualification, a marketing profession by v...","[(more than 6 years', 54, 72, DATE), (9 years', 1011, 1019, DATE), (8 years', 1958, 1966, DATE)]","[(years, 66, 71, NOUN), (experience, 76, 86, NOUN), (device, 98, 104, NOUN), (marketing, 105, 114, NOUN), (product, 119, 126, NOUN), (management, 127, 137, NOUN), (engineer, 142, 150, NOUN), (qual...","[(years, 66, 71, NOUN), (experience, 76, 86, NOUN), (device, 98, 104, NOUN), (marketing, 105, 114, NOUN), (product, 119, 126, NOUN), (management, 127, 137, NOUN), (engineer, 142, 150, NOUN), (qual...","[(more than 6 years, 54, 71, NP), (experience, 76, 86, NP), (medical device marketing, 90, 114, NP), (product management, 119, 137, NP), (an engineer, 139, 150, NP), (qualification, 154, 167, NP),...","[(device marketing, 90, 106, COMPOUND), (product management, 119, 137, COMPOUND), (marketing profession, 171, 191, COMPOUND), (brand identity, 302, 316, COMPOUND), (track record, 365, 377, COMPOUN...","{track, marketing, relationship, life, product specalist, life -, collateral, sales, co, -, sr, marketing profession, system, specialist, marketing collateral, experience, skills, application, tim..."
4,"[i am a highly motivated civil engineer with over eight years of experience in the construction industry. throughout my career i have been involved in many aspects of civil works and construction,...","[(eight years, 50, 61, DATE), (hong kong, 563, 572, GPE), (ten years, 1131, 1140, DATE), (hong kong, 1349, 1358, GPE), (nine years, 2134, 2144, DATE), (hong kong, 2550, 2559, GPE), (eight years, 3...","[(engineer, 31, 39, NOUN), (years, 56, 61, NOUN), (experience, 65, 75, NOUN), (construction, 83, 95, NOUN), (industry, 96, 104, NOUN), (career, 120, 126, NOUN), (aspects, 156, 163, NOUN), (works, ...","[(engineer, 31, 39, NOUN), (years, 56, 61, NOUN), (experience, 65, 75, NOUN), (construction, 83, 95, NOUN), (industry, 96, 104, NOUN), (career, 120, 126, NOUN), (aspects, 156, 163, NOUN), (works, ...","[(i, 1, 2, NP), (a highly motivated civil engineer, 6, 39, NP), (over eight years, 45, 61, NP), (experience, 65, 75, NP), (the construction industry, 79, 104, NP), (my career, 117, 126, NP), (i, 1...","[(construction industry, 83, 104, COMPOUND), (certificate iv, 474, 488, COMPOUND), (project management, 492, 510, COMPOUND), (hong kong, 563, 572, COMPOUND), (project teams, 647, 660, COMPOUND), (...","{excavation, project teams, fit, construction team, contractor, project, years experience, section engineer, leadership, uae, project management, quality, focus, uk, section, infrastructure works,..."
...,...,...,...,...,...,...,...
95,[features: thor: ragnarok spiderman: homecoming logan fantastic beasts and where to find them independence day resurgence alice through the looking glass chappie teenage mutant ninja turtles pompe...,"[(alice, 122, 127, PERSON), (john carter, 294, 305, PERSON), (3delight, 504, 512, CARDINAL), (alice, 960, 965, PERSON), (john carter, 1132, 1143, PERSON), (3delight, 1342, 1350, CARDINAL), (alice,...","[(features, 1, 9, NOUN), (thor, 11, 15, NOUN), (ragnarok, 17, 25, NOUN), (spiderman, 26, 35, NOUN), (homecoming, 37, 47, NOUN), (beasts, 64, 70, NOUN), (independence, 94, 106, NOUN), (day, 107, 11...","[(features, 1, 9, NOUN), (thor, 11, 15, NOUN), (ragnarok, 17, 25, NOUN), (spiderman, 26, 35, NOUN), (homecoming, 37, 47, NOUN), (beasts, 64, 70, NOUN), (independence, 94, 106, NOUN), (day, 107, 11...","[([features, 0, 9, NP), (thor, 11, 15, NP), (ragnarok spiderman, 17, 35, NP), (homecoming logan fantastic beasts, 37, 70, NP), (them, 89, 93, NP), (the looking glass chappie teenage mutant ninja t...","[(ragnarok spiderman, 17, 35, COMPOUND), (independence day, 94, 110, COMPOUND), (resurgence alice, 111, 127, COMPOUND), (percy jackson, 222, 235, COMPOUND), (recall byzantium, 252, 268, COMPOUND),...","{day, turtles, harry, carribean, compass generation, stanger, inc, prince, water, monsters, carter, john carter, photoshop, stanger tides harry potter, zbrush, byzantium, blood prince, scripting, ..."
96,"[passionate visioner and doer. expert in sustainable project development, plastic packaging manufacturing, polymer products marketing and procurement ""...to grow professionally by securing a chall...","[(fifteen years, 363, 376, DATE), (fifteen years, 1375, 1388, DATE), (twelve years, 2382, 2394, DATE), (15 years, 3176, 3184, DATE), (los angeles, 3358, 3369, GPE), (california, 3371, 3381, GPE), ...","[(visioner, 12, 20, NOUN), (doer, 25, 29, NOUN), (expert, 31, 37, NOUN), (project, 53, 60, NOUN), (development, 61, 72, NOUN), (plastic, 74, 81, NOUN), (packaging, 82, 91, NOUN), (manufacturing, 9...","[(visioner, 12, 20, NOUN), (doer, 25, 29, NOUN), (expert, 31, 37, NOUN), (project, 53, 60, NOUN), (development, 61, 72, NOUN), (plastic, 74, 81, NOUN), (packaging, 82, 91, NOUN), (manufacturing, 9...","[([passionate visioner and doer. expert, 0, 37, NP), (sustainable project development, 41, 72, NP), (plastic packaging manufacturing, 74, 105, NP), (polymer products marketing, 107, 133, NP), (pro...","[(project development, 53, 72, COMPOUND), (packaging manufacturing, 82, 105, COMPOUND), (polymer products marketing, 107, 133, COMPOUND), (business knowledge, 257, 275, COMPOUND), (years experienc...","{career progression, establishment, project, california, fact, polymer products marketing, los angeles, application, forecasting, location, bank, inc, growth, george, doer, packaging, angeles, alp..."
97,"[i am a self-motivated sales & marketing professional with excellent communication skills. in addition, i possess strong web based marketing and database mgmt experience. as a owner & operator at ...","[(miami, 267, 272, GPE), (miami, 331, 336, GPE), (miami, 962, 967, GPE), (miami, 1026, 1031, GPE), (miami, 1623, 1628, GPE), (miami, 1687, 1692, GPE), (second, 2758, 2764, ORDINAL), (miami, 2881, ...","[(self, 8, 12, NOUN), (sales, 23, 28, NOUN), (marketing, 31, 40, NOUN), (communication, 69, 82, NOUN), (skills, 83, 89, NOUN), (addition, 94, 102, NOUN), (web, 121, 124, NOUN), (marketing, 131, 14...","[(self, 8, 12, NOUN), (sales, 23, 28, NOUN), (marketing, 31, 40, NOUN), (communication, 69, 82, NOUN), (skills, 83, 89, NOUN), (addition, 94, 102, NOUN), (web, 121, 124, NOUN), (marketing, 131, 14...","[(i, 1, 2, NP), (a self-motivated sales, 6, 28, NP), (marketing, 31, 40, NP), (excellent communication skills, 59, 89, NP), (addition, 94, 102, NP), (i, 104, 105, NP), (strong web based marketing ...","[(communication skills, 69, 89, COMPOUND), (mgmt experience, 154, 169, COMPOUND), (cleaning industry, 275, 292, COMPOUND), (qc miami, 328, 336, COMPOUND), (cleaning job, 372, 384, COMPOUND), (busi...","{rotor, life, llc, clients, mastery, property, methods, marketing skills, tql miami, rotor wing pilot, communication, peak management llc, construction, man, crm, capacity, manager, qc, rapport, l..."
98,"[www.lisawilkins.com specialties: visual/ui/ux design and research for online media, mobile & ott. business requirements in hand, my first and foremost goal is to create an experience where my wor...","[(first, 133, 138, ORDINAL), (first, 1191, 1196, ORDINAL), (first, 1973, 1978, ORDINAL), (https://medium.com/@rhysys/no-dickheads-a-guide-to-building-happy-healthy-and-creative-teams-7e9b049fc57d\...","[(specialties, 21, 32, NOUN), (visual, 34, 40, NOUN), (ui, 41, 43, NOUN), (ux, 44, 46, NOUN), (design, 47, 53, NOUN), (research, 58, 66, NOUN), (media, 78, 83, NOUN), (mobile, 85, 91, NOUN), (ott,...","[(specialties, 21, 32, NOUN), (visual, 34, 40, NOUN), (ui, 41, 43, NOUN), (ux, 44, 46, NOUN), (design, 47, 53, NOUN), (research, 58, 66, NOUN), (media, 78, 83, NOUN), (mobile, 85, 91, NOUN), (ott,...","[([www.lisawilkins.com specialties, 0, 32, NP), (visual/ui/ux design, 34, 53, NP), (research, 58, 66, NP), (online media, 71, 83, NP), (mobile, 85, 91, NP), (ott, 94, 97, NP), (hand, 124, 128, NP)...","[(ux design, 34, 43, COMPOUND), (business requirements, 99, 120, COMPOUND), (end -, 218, 223, COMPOUND), (user tests, 289, 299, COMPOUND), (development process, 711, 730, COMPOUND), (information h...","{solutions, tv systems, answer, details, business requirements, insecurity, end -, devs, time, prototypes, css, teams, devices, wireframes, front-end development, ux designer, nbc news, https://me..."


In [49]:
def entity_list(column_name_list):
    entity_list = []
    for i in range(0, len(column_name_list)):
        entity_list.append(column_name_list[i][0])
        
    return entity_list

In [50]:
df['named_ents'] = df.apply(lambda x: entity_list(x["named_ents"]), axis = 1)

In [51]:
df['nouns'] = df.apply(lambda x: entity_list(x["nouns"]), axis = 1)

In [52]:
df['named_nouns'] = df.apply(lambda x: entity_list(x["named_nouns"]), axis = 1)

In [53]:
df['noun_phrases'] = df.apply(lambda x: entity_list(x["noun_phrases"]), axis = 1)

In [54]:
df['compounds'] = df.apply(lambda x: entity_list(x["compounds"]), axis = 1)

In [55]:
type(df['comp_nouns'][0])

set

In [56]:
def combine_entities(list_1,list_2,list_3,list_4,list_5,list_6):
    list_6 = list(list_6)
    final_list = list_1+list_2+list_3+list_4+list_5+list_6
    
    combined_list_unique = list(set(final_list))
    
    return combined_list_unique

In [57]:
df['combined_list_unique'] = df.apply(lambda x: combine_entities(x["named_ents"], x["nouns"], x["named_nouns"], x["noun_phrases"], x["compounds"], x["comp_nouns"]), axis = 1)

In [58]:
len(df['combined_list_unique'][0])

104

In [59]:
df

Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns,combined_list_unique
0,[15+ years within the recruiting industry combined with ten years in management contribute to a well-rounded recruiting professional dedicated to finding quality candidates for my client groups. m...,"[15+ years, ten years, 13 years, ten years, 11 years, ten years]","[years, recruiting, industry, years, management, quality, candidates, client, groups, goal, marriage, candidate, team, term, relationship, match, recruiter, work, ways, skills, colleagues, time, r...","[years, recruiting, industry, years, management, quality, candidates, client, groups, goal, marriage, candidate, team, term, relationship, match, recruiter, work, ways, skills, colleagues, time, r...","[the recruiting industry, ten years, management, quality candidates, my client groups, my goal, the perfect marriage, the candidate, the team, a long term relationship, the perfect match, i, the c...","[recruiting industry, quality candidates, client groups, term relationship, sourcing guru, networking capabilities, sourcing techniques, it sector, recruiting abilities, recruiting industry, quali...","{client groups, groups, member, relationship, abilities, capabilities, technology, quality, match, ways, collector, squirrel stalker, skills, specialties, recruiting industry, recruiting abilities...","[technology, match, collector, senior technical recruiter, techniques, time, mentor, 13 years, squirrel, purple squirrel collector, all statements, networking capabilities, a member, the consummat..."
1,"[bachelor of science (b.s) degree in "" telecommunications networks and software "" program at politehnica university - electronics ,telecommunications and information technology faculty master of s...","[640, 642, 642, 642, l2/l3, 2011, july 2013, workday, 8]","[bachelor, science, b.s, degree, telecommunications, networks, software, program, politehnica, university, electronics, telecommunications, information, technology, faculty, master, science, m.s, ...","[bachelor, science, b.s, degree, telecommunications, networks, software, program, politehnica, university, electronics, telecommunications, information, technology, faculty, master, science, m.s, ...","[science, (b.s) degree, telecommunications networks, software "" program, politehnica, university - electronics ,telecommunications and information technology faculty master, science, management, n...","[telecommunications networks, university -, faculty master, university -, faculty routing, cisco ccna, cisco ccnp, 642 -, cisco ccnp switch, cisco ccnp, 642 -, l3 protocols, -data link protocols, ...","{tudor, switching, bgp, july, ospf, dwdm, technology, platforms, repositories, faculty \n \n master, eigrp, switch, faculty master, cisco ccna, m.s, engineer-internet services, mpls, level, networ...","[tudor, switching, bgp, july, ospf, dwdm, (b.s) degree, technology, platforms, repositories, (b.s, faculty \n \n master, \n- mpls fundamentals, eigrp, switch, senior network analyst, faculty maste..."
2,"[i'm a current online student for fashion merchandising and management. i started my own business 4 years ago, called icandydesigns. a company where i sell my own handcrafted jewelry and women's c...","[4 years ago, atlanta, 2015 4th quarter, april 2016-may 2016, over 5 years, 3 yrs, 5 years ago, atlanta, 2015 4th quarter, 6 years, 3 yrs, 4 years ago, atlanta, over 5 years, 3 yrs, turner&#39;s, ...","[student, fashion, merchandising, management, business, years, icandydesigns, company, jewelry, women, clothing, boutique, atlanta, events, website, events, company, nordstrom, rack, quarter, star...","[student, fashion, merchandising, management, business, years, icandydesigns, company, jewelry, women, clothing, boutique, atlanta, events, website, events, company, nordstrom, rack, quarter, star...","[i, a current online student, fashion merchandising, management, i, my own business, icandydesigns, a company, i, my own handcrafted jewelry, women's clothing, i, a boutique, atlanta, events, i, m...","[fashion merchandising, nordstrom rack, store manager, nordstrom leaders, event coordinating, customer service, time management, fashion blogging, marketing skills, fashion merchandising, store ma...","{nordstrom, nordstrom(nordstrom, merchandising, events, boutique, interest, nordstrom leaders, hands, time, student, marketing skills, a., fashion blogging, leaders, manager, snhu, fashion retail,...","[nordstrom, 4 years ago, my own events, nordstrom(nordstrom, merchandising, events, boutique, interest, my store manager, nordstrom leaders, hands, time, nordstrom(nordstrom rack, a great eye, stu..."
3,"[high-performing,strategic-thinking professional with more than 6 years' of experience in medical device marketing and product management. an engineer by qualification, a marketing profession by v...","[more than 6 years', 9 years', 8 years']","[years, experience, device, marketing, product, management, engineer, qualification, marketing, profession, virtue, position, capacity, projects, timelines, brand, identity, values, team, leader, ...","[years, experience, device, marketing, product, management, engineer, qualification, marketing, profession, virtue, position, capacity, projects, timelines, brand, identity, values, team, leader, ...","[more than 6 years, experience, medical device marketing, product management, an engineer, qualification, a marketing profession, virtue, current position, strong capacity, multiple projects, stri...","[device marketing, product management, marketing profession, brand identity, track record, communication skills, device marketing, product management, life -, product training, application special...","{track, marketing, relationship, life, product specalist, life -, collateral, sales, co, -, sr, marketing profession, system, specialist, marketing collateral, experience, skills, application, tim...","[life, co, 9 years, -, marketing profession, marketing collateral, application, brand, interest, demonstration, product launch, track record, workers, strong capacity, 9 years', virtue, expertise,..."
4,"[i am a highly motivated civil engineer with over eight years of experience in the construction industry. throughout my career i have been involved in many aspects of civil works and construction,...","[eight years, hong kong, ten years, hong kong, nine years, hong kong, eight years, hong kong, seven years, hong kong, 6+ years, hong kong]","[engineer, years, experience, construction, industry, career, aspects, works, construction, focus, excavation, concrete, piling, steelworks, works, lifting, logistics, career, person, operations, ...","[engineer, years, experience, construction, industry, career, aspects, works, construction, focus, excavation, concrete, piling, steelworks, works, lifting, logistics, career, person, operations, ...","[i, a highly motivated civil engineer, over eight years, experience, the construction industry, my career, i, many aspects, civil works, construction, a main focus, excavation, reinforced concrete...","[construction industry, certificate iv, project management, hong kong, project teams, leadership roles, quality results, project teams, project team, construction industry, infrastructure works, f...","{excavation, project teams, fit, construction team, contractor, project, years experience, section engineer, leadership, uae, project management, quality, focus, uk, section, infrastructure works,...","[project teams, fit, both the client, contractor, project, project management, laing, nine years, kong, construction industry, course, deadlines, time, certificate, teams, a construction team, hon..."
...,...,...,...,...,...,...,...,...
95,[features: thor: ragnarok spiderman: homecoming logan fantastic beasts and where to find them independence day resurgence alice through the looking glass chappie teenage mutant ninja turtles pompe...,"[alice, john carter, 3delight, alice, john carter, 3delight, alice, john carter, 3delight]","[features, thor, ragnarok, spiderman, homecoming, beasts, independence, day, resurgence, alice, glass, chappie, turtles, instruments, percy, jackson, sea, monsters, byzantium, expectations, john, ...","[features, thor, ragnarok, spiderman, homecoming, beasts, independence, day, resurgence, alice, glass, chappie, turtles, instruments, percy, jackson, sea, monsters, byzantium, expectations, john c...","[[features, thor, ragnarok spiderman, homecoming logan fantastic beasts, them, the looking glass chappie teenage mutant ninja turtles, the mortal instruments, percy jackson, sea, monsters, byzanti...","[ragnarok spiderman, independence day, resurgence alice, percy jackson, recall byzantium, john carter, mars pirates, stanger tides harry potter, blood prince bedtime stories, compass generation, v...","{day, turtles, harry, carribean, compass generation, stanger, inc, prince, water, monsters, carter, john carter, photoshop, stanger tides harry potter, zbrush, byzantium, blood prince, scripting, ...","[day, turtles, harry, byzantium great expectations, carribean, compass generation, stanger, inc, prince, the half blood prince\nbedtime stories, the carribean, water, monsters, stanger tides, fant..."
96,"[passionate visioner and doer. expert in sustainable project development, plastic packaging manufacturing, polymer products marketing and procurement ""...to grow professionally by securing a chall...","[fifteen years, fifteen years, twelve years, 15 years, los angeles, california, 16 years, 13 years, 12 years, 15 years]","[visioner, doer, expert, project, development, plastic, packaging, manufacturing, polymer, products, marketing, procurement, position, organization, business, knowledge, experience, skills, growth...","[visioner, doer, expert, project, development, plastic, packaging, manufacturing, polymer, products, marketing, procurement, position, organization, business, knowledge, experience, skills, growth...","[[passionate visioner and doer. expert, sustainable project development, plastic packaging manufacturing, polymer products marketing, procurement, a challenging leading position, a dynamic organiz...","[project development, packaging manufacturing, polymer products marketing, business knowledge, years experience, packaging industry, business processes, career progression, contract administration...","{career progression, establishment, project, california, fact, polymer products marketing, los angeles, application, forecasting, location, bank, inc, growth, george, doer, packaging, angeles, alp...","[career progression, establishment, project, california, fact, polymer products marketing, los angeles, my business knowledge, fifteen years experience, fact-based, process-oriented approaches, ap..."
97,"[i am a self-motivated sales & marketing professional with excellent communication skills. in addition, i possess strong web based marketing and database mgmt experience. as a owner & operator at ...","[miami, miami, miami, miami, miami, miami, second, miami, miami]","[self, sales, marketing, communication, skills, addition, web, marketing, database, mgmt, experience, owner, operator, quality, miami, objective, focus, miami, cleaning, industry, service, miami, ...","[self, sales, marketing, communication, skills, addition, web, marketing, database, mgmt, experience, owner, operator, quality, miami, objective, focus, miami, cleaning, industry, service, miami, ...","[i, a self-motivated sales, marketing, excellent communication skills, addition, i, strong web based marketing and database mgmt experience, a owner, operator, quality clean miami, my objective, f...","[communication skills, mgmt experience, cleaning industry, qc miami, cleaning job, business man, sell products, network ties, investment zoom, communication skills, cleaning industry, qc miami, cl...","{rotor, life, llc, clients, mastery, property, methods, marketing skills, tql miami, rotor wing pilot, communication, peak management llc, construction, man, crm, capacity, manager, qc, rapport, l...","[we, market/ sell products, rotor, life, a property manager, llc, clients, mastery, strong web and crm experience, property, methods, any cleaning job, marketing skills, tql miami, rotor wing pilo..."
98,"[www.lisawilkins.com specialties: visual/ui/ux design and research for online media, mobile & ott. business requirements in hand, my first and foremost goal is to create an experience where my wor...","[first, first, first, https://medium.com/@rhysys/no-dickheads-a-guide-to-building-happy-healthy-and-creative-teams-7e9b049fc57d\n\n, nbc]","[specialties, visual, ui, ux, design, research, media, mobile, ott, business, requirements, hand, goal, experience, work, end, user, solutions, wireframes, prototypes, user, tests, iterations, ske...","[specialties, visual, ui, ux, design, research, media, mobile, ott, business, requirements, hand, goal, experience, work, end, user, solutions, wireframes, prototypes, user, tests, iterations, ske...","[[www.lisawilkins.com specialties, visual/ui/ux design, research, online media, mobile, ott, hand, my first and foremost goal, an experience, my work, the end-user, my solutions, i, wireframes, pr...","[ux design, business requirements, end -, user tests, development process, information hoarders, tv systems, business requirements, end -, user tests, development process, \n\n specialties, ux des...","{solutions, tv systems, answer, details, business requirements, insecurity, end -, devs, time, prototypes, css, teams, devices, wireframes, front-end development, ux designer, nbc news, https://me...","[solutions, tv systems, we, answer, the team members, details, business requirements, insecurity, end -, devs, visual/ui/ux design, principle/invision/pixate, time, prototypes, the entire developm..."


In [60]:
entity_df = pd.concat([df,kg_df["target"]], axis =1)

In [61]:
entity_df.columns

Index(['text', 'named_ents', 'nouns', 'named_nouns', 'noun_phrases',
       'compounds', 'comp_nouns', 'combined_list_unique', 'target'],
      dtype='object')

In [62]:
entity_df.drop(columns = ['text', 'named_ents', 'nouns', 'named_nouns', 'noun_phrases',
       'compounds', 'comp_nouns'], inplace = True)

In [63]:
entity_df["combined_list_unique"][0]

['technology',
 'match',
 'collector',
 'senior technical recruiter',
 'techniques',
 'time',
 'mentor',
 '13 years',
 'squirrel',
 'purple squirrel collector',
 'all statements',
 'networking capabilities',
 'a member',
 'the consummate recruiter',
 'people',
 'sector',
 'unicorn wrangler',
 'it sector',
 'term relationship',
 'effective sourcing techniques',
 'microsoft',
 '\n\n specialties',
 'the perfect marriage',
 'strong networking capabilities',
 'new people',
 'goal',
 'my client groups',
 'groups',
 'relationship',
 'abilities',
 'capabilities',
 'passive candidates',
 'my goal',
 'my skills',
 'the perfect match',
 '15+ years',
 'skills',
 'specialties',
 'purple squirrel hunter',
 'recruiting industry',
 'the candidate',
 'unity',
 'candidate',
 'term',
 'snap',
 'hewlett packard enterprise',
 'squirrel hunter',
 'sourcing guru',
 'methodologies',
 'recruiting',
 'work',
 'ten years',
 'technologies',
 'recruiter',
 'seattle',
 'networking',
 'client',
 'guru',
 'team',
 't

In [64]:
type(entity_df["target"][0])

str

In [65]:
def append_entities(l1, l2):
    print(type(l1))
    print(l1)
    l1.append(l2)
    return l1
    

In [66]:
entity_df['combined_list_unique'] = entity_df.apply(lambda x: append_entities(x["combined_list_unique"],x["target"]), axis = 1)

<class 'list'>
['technology', 'match', 'collector', 'senior technical recruiter', 'techniques', 'time', 'mentor', '13 years', 'squirrel', 'purple squirrel collector', 'all statements', 'networking capabilities', 'a member', 'the consummate recruiter', 'people', 'sector', 'unicorn wrangler', 'it sector', 'term relationship', 'effective sourcing techniques', 'microsoft', '\n\n specialties', 'the perfect marriage', 'strong networking capabilities', 'new people', 'goal', 'my client groups', 'groups', 'relationship', 'abilities', 'capabilities', 'passive candidates', 'my goal', 'my skills', 'the perfect match', '15+ years', 'skills', 'specialties', 'purple squirrel hunter', 'recruiting industry', 'the candidate', 'unity', 'candidate', 'term', 'snap', 'hewlett packard enterprise', 'squirrel hunter', 'sourcing guru', 'methodologies', 'recruiting', 'work', 'ten years', 'technologies', 'recruiter', 'seattle', 'networking', 'client', 'guru', 'team', 'the recruiting industry', 'member', 'quality'

In [67]:
entity_df

Unnamed: 0,combined_list_unique,target
0,"[technology, match, collector, senior technical recruiter, techniques, time, mentor, 13 years, squirrel, purple squirrel collector, all statements, networking capabilities, a member, the consummat...",purple squirrel microsoft
1,"[tudor, switching, bgp, july, ospf, dwdm, (b.s) degree, technology, platforms, repositories, (b.s, faculty \n \n master, \n- mpls fundamentals, eigrp, switch, senior network analyst, faculty maste...",8 network asavie
2,"[nordstrom, 4 years ago, my own events, nordstrom(nordstrom, merchandising, events, boutique, interest, my store manager, nordstrom leaders, hands, time, nordstrom(nordstrom rack, a great eye, stu...",founder fashion icandydesigns
3,"[life, co, 9 years, -, marketing profession, marketing collateral, application, brand, interest, demonstration, product launch, track record, workers, strong capacity, 9 years', virtue, expertise,...",medical product system
4,"[project teams, fit, both the client, contractor, project, project management, laing, nine years, kong, construction industry, course, deadlines, time, certificate, teams, a construction team, hon...",capable site multiplex
...,...,...
95,"[day, turtles, harry, byzantium great expectations, carribean, compass generation, stanger, inc, prince, the half blood prince\nbedtime stories, the carribean, water, monsters, stanger tides, fant...",senior engine design inc
96,"[career progression, establishment, project, california, fact, polymer products marketing, los angeles, my business knowledge, fifteen years experience, fact-based, process-oriented approaches, ap...",plastic supply bank
97,"[we, market/ sell products, rotor, life, a property manager, llc, clients, mastery, strong web and crm experience, property, methods, any cleaning job, marketing skills, tql miami, rotor wing pilo...",total qc miami
98,"[solutions, tv systems, we, answer, the team members, details, business requirements, insecurity, end -, devs, visual/ui/ux design, principle/invision/pixate, time, prototypes, the entire developm...",well nbc news


In [68]:
entity_df.drop(columns = ["target"], axis =1)

Unnamed: 0,combined_list_unique
0,"[technology, match, collector, senior technical recruiter, techniques, time, mentor, 13 years, squirrel, purple squirrel collector, all statements, networking capabilities, a member, the consummat..."
1,"[tudor, switching, bgp, july, ospf, dwdm, (b.s) degree, technology, platforms, repositories, (b.s, faculty \n \n master, \n- mpls fundamentals, eigrp, switch, senior network analyst, faculty maste..."
2,"[nordstrom, 4 years ago, my own events, nordstrom(nordstrom, merchandising, events, boutique, interest, my store manager, nordstrom leaders, hands, time, nordstrom(nordstrom rack, a great eye, stu..."
3,"[life, co, 9 years, -, marketing profession, marketing collateral, application, brand, interest, demonstration, product launch, track record, workers, strong capacity, 9 years', virtue, expertise,..."
4,"[project teams, fit, both the client, contractor, project, project management, laing, nine years, kong, construction industry, course, deadlines, time, certificate, teams, a construction team, hon..."
...,...
95,"[day, turtles, harry, byzantium great expectations, carribean, compass generation, stanger, inc, prince, the half blood prince\nbedtime stories, the carribean, water, monsters, stanger tides, fant..."
96,"[career progression, establishment, project, california, fact, polymer products marketing, los angeles, my business knowledge, fifteen years experience, fact-based, process-oriented approaches, ap..."
97,"[we, market/ sell products, rotor, life, a property manager, llc, clients, mastery, strong web and crm experience, property, methods, any cleaning job, marketing skills, tql miami, rotor wing pilo..."
98,"[solutions, tv systems, we, answer, the team members, details, business requirements, insecurity, end -, devs, visual/ui/ux design, principle/invision/pixate, time, prototypes, the entire developm..."


In [69]:
entity_df["combined_list_unique"][0]

['technology',
 'match',
 'collector',
 'senior technical recruiter',
 'techniques',
 'time',
 'mentor',
 '13 years',
 'squirrel',
 'purple squirrel collector',
 'all statements',
 'networking capabilities',
 'a member',
 'the consummate recruiter',
 'people',
 'sector',
 'unicorn wrangler',
 'it sector',
 'term relationship',
 'effective sourcing techniques',
 'microsoft',
 '\n\n specialties',
 'the perfect marriage',
 'strong networking capabilities',
 'new people',
 'goal',
 'my client groups',
 'groups',
 'relationship',
 'abilities',
 'capabilities',
 'passive candidates',
 'my goal',
 'my skills',
 'the perfect match',
 '15+ years',
 'skills',
 'specialties',
 'purple squirrel hunter',
 'recruiting industry',
 'the candidate',
 'unity',
 'candidate',
 'term',
 'snap',
 'hewlett packard enterprise',
 'squirrel hunter',
 'sourcing guru',
 'methodologies',
 'recruiting',
 'work',
 'ten years',
 'technologies',
 'recruiter',
 'seattle',
 'networking',
 'client',
 'guru',
 'team',
 't

In [71]:
entity_df = pd.concat([summary["id"], entity_df], axis = 1)

In [72]:
entity_df

Unnamed: 0,id,combined_list_unique,target
0,zPnnWGWHb2gSYnd98GKaag_0000,"[technology, match, collector, senior technical recruiter, techniques, time, mentor, 13 years, squirrel, purple squirrel collector, all statements, networking capabilities, a member, the consummat...",purple squirrel microsoft
1,z5LcE0BcZevf5WmwL8nunQ_0000,"[tudor, switching, bgp, july, ospf, dwdm, (b.s) degree, technology, platforms, repositories, (b.s, faculty \n \n master, \n- mpls fundamentals, eigrp, switch, senior network analyst, faculty maste...",8 network asavie
2,xi8skZhrJpM4GVvKfqmuiQ_0000,"[nordstrom, 4 years ago, my own events, nordstrom(nordstrom, merchandising, events, boutique, interest, my store manager, nordstrom leaders, hands, time, nordstrom(nordstrom rack, a great eye, stu...",founder fashion icandydesigns
3,xemFTL4WGIObJBz0VZv0gg_0000,"[life, co, 9 years, -, marketing profession, marketing collateral, application, brand, interest, demonstration, product launch, track record, workers, strong capacity, 9 years', virtue, expertise,...",medical product system
4,wGX7QmVHBijxQrH-U3ncbA_0000,"[project teams, fit, both the client, contractor, project, project management, laing, nine years, kong, construction industry, course, deadlines, time, certificate, teams, a construction team, hon...",capable site multiplex
...,...,...,...
95,04dK5T2y78a27gewYZxfEA_0000,"[day, turtles, harry, byzantium great expectations, carribean, compass generation, stanger, inc, prince, the half blood prince\nbedtime stories, the carribean, water, monsters, stanger tides, fant...",senior engine design inc
96,-sQNuO1antttcEmOWs-o5Q_0000,"[career progression, establishment, project, california, fact, polymer products marketing, los angeles, my business knowledge, fifteen years experience, fact-based, process-oriented approaches, ap...",plastic supply bank
97,-nbbpInS-iD0thomdMs6BQ_0000,"[we, market/ sell products, rotor, life, a property manager, llc, clients, mastery, strong web and crm experience, property, methods, any cleaning job, marketing skills, tql miami, rotor wing pilo...",total qc miami
98,-jsY6ula0lsaOQ5rBHxHLg_0000,"[solutions, tv systems, we, answer, the team members, details, business requirements, insecurity, end -, devs, visual/ui/ux design, principle/invision/pixate, time, prototypes, the entire developm...",well nbc news


In [73]:
entity_df.to_csv("entity_other_from_ner.csv", index = False)