In [1]:
#spacy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy
import pyLDAvis.gensim_models
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#Data loading/ Data manipulation
import pandas as pd
import numpy as np
import jsonlines

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])

#warning
import warnings 
warnings.filterwarnings('ignore')

from spacy.cli import download

# download("en_core_web_lg")

In [2]:
nlp = spacy.load("en_core_web_lg")
skill_pattern_path = "../jz_skill_patterns.jsonl"

In [32]:
ruler = nlp.create_pipe('entity_ruler')
ruler.from_disk(skill_pattern_path)
nlp.add_pipe(ruler, before='ner')
nlp.pipe_names

['tagger', 'parser', 'entity_ruler', 'ner']

In [33]:
def get_skills(text):
    doc = nlp(text)
    myset = []
    subset = []
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            subset.append(ent.text)
    myset.append(subset)
    return subset


def unique_skills(x):
    return list(set(x))

In [35]:
clean = []
review = re.sub(
    '(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"',
    " ",
    text,
)
review = review.lower()
review = review.split()
lm = WordNetLemmatizer()
review = [
    lm.lemmatize(word)
    for word in review
    if not word in set(stopwords.words("english"))
]
review = " ".join(review)
clean.append(review)

In [36]:
clean

['data science ai engineer tien phat bui 0962980173 buitienphat2462002 com phat bui ba3421222 go vap district ho chi minh city education bachelor information technolo gy expected 05 2024 ho chi minh city open university 97 vo van tan ward 6 district 3 ho chi minh city gpa 3 76 4 00 highest major major information technology relevant coursework machine learning data visualization communication database system sql natural language processing computer vision deep learning neural network skill python library framework panda matplotlib numpy beutiful soup4 o scikit learn hugging face transformer pytorch python flask langchain nltk big data apache hadoop pyspark sql software sqlite mysql sql postgresql power bi tool power query visualize c library stl boot c armadillo project legal search re ponse platform group competition project november 2023 provides user quick access legal information delivers reliable answer legal query enables efficient navi gation legal code document utilizes reactjs

In [37]:
skills = unique_skills(get_skills(text.lower()))
skills

['postgresql',
 'hadoop',
 'data science',
 'python',
 'continuous integration',
 'deployment',
 'natural language',
 'interaction',
 'mysql',
 'deep learning',
 'continuous deployment',
 'visualization',
 'pytorch',
 'pandas',
 'ai',
 'flask',
 'machine learning',
 'docker',
 'computer vision',
 'big data',
 'source code',
 'software',
 'database',
 'design',
 'data visualization',
 'sqlite',
 'numpy',
 'comprehension',
 'libraries']

In [38]:
sent = nlp(text)
displacy.render(sent, style="ent", jupyter=True)

---