## Skill chunk extraction

In [27]:
# !pip install fuzzywuzzy

In [28]:
# !pip install texthero

In [29]:
#read in data and helper functions

#skill data
import pandas as pd
import numpy as np
from skill_chunk_matching import extract_noun, tfidf_input_data_format, extracted_noun_tfidf, chunk_matching
import nltk
from nltk.corpus import stopwords
import spacy

nlp = spacy.load('en_core_web_sm')
nltk.download('stopwords')
stop_words = list(set(stopwords.words('english')))

df_techskill = pd.read_excel('Technology Skills.xlsx')
skillset = list(np.unique(df_techskill['Example']))
df_GoogleJob = pd.read_csv('job_skills.csv')
df_GoogleJob['all_req'] = df_GoogleJob['Minimum Qualifications']+' '+df_GoogleJob['Preferred Qualifications']
df_raw = df_GoogleJob[['Title','Category','all_req']]



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
df_ukjob = pd.read_csv('uk_job_data.csv')
df_ukjob['RequiredQual']

0        To perform this job successfully, an\r\nindivi...
1        - Bachelor's Degree; Master's is preferred;\r\...
2        - Degree in environmentally related field, or ...
3        - Advanced degree in public health, social sci...
4        - University degree; economical background is ...
                               ...                        
18996    - At least 5 years of experience in Interface/...
18997    - University degree, ideally business related;...
18998    - Degree in Business, Marketing or a related f...
18999    - At least 1 year of experience in online sale...
19000    - Higher legal education; Master's degree is a...
Name: RequiredQual, Length: 19001, dtype: object

In [30]:
#test the word chunk fuzzy matching
data_extracted_chunk = df_raw['all_req'].apply(lambda x:extract_noun(x,nlp,1))
# data_high_tfidf = extracted_noun_tfidf(df_raw['all_req'],nlp)
for row_num in [10,20,30]:
  print(df_raw['Title'][row_num])
  print(chunk_matching(row_num,data_extracted_chunk,skillset))
  print('-------------------------------------------------') 

Data Analyst, Consumer Hardware
['a non-technical audience' 'ability' 'an analytical role'
 'business intelligence' 'complex data-sets' 'computer science'
 'data analytics' 'economics' 'engineering' 'experience'
 'scripting languages' 'sql or visualization tools' 'structured analysis'
 'tableau']
-------------------------------------------------
Quantitative Analyst, Ads Quality
['ability' 'data' 'econometrics' 'experience' 'familiarity' 'matlab'
 'modeling' 'others' 'python' 'research experience' 'statistics'
 'the analysis']
-------------------------------------------------
Presales Manager, New Business Development
['ability' 'comfort' 'experience' 'experience planning' 'incentives'
 'metrics' 'process' 'sales management' 'sales operations' 'tools'
 'training']
-------------------------------------------------


## Rule-based extraction

In [48]:
from rule_based_matching import sentence_matching,sentence_processing,run_hardskill_extraction

In [32]:
#test rule-based
output = []
for row_num in [10,20,30]:
  doc = df_raw['all_req'][row_num].replace('\n',' ')
  chunks = chunk_matching(row_num,data_extracted_chunk,skillset)
  hs = run_hardskill_extraction(doc,nlp)
  output.append([df_raw['Title'][row_num],doc,chunks,hs])

pd.DataFrame(output)

Unnamed: 0,0,1,2,3
0,"Data Analyst, Consumer Hardware","BA/BS degree in Computer Science, Engineering,...","[a non-technical audience, ability, an analyti...","[role, Intelligence, Analytics, Statistical, C..."
1,"Quantitative Analyst, Ads Quality",PhD in Statistics or Econometrics or a related...,"[ability, data, econometrics, experience, fami...","[analysis, modeling, data, R, Python, MATLAB, ..."
2,"Presales Manager, New Business Development",BA/BS degree or equivalent practical experienc...,"[ability, comfort, experience, experience plan...","[role, goals, Management, operate, environment]"


In [33]:
nltk. download('punkt')
text = pd.DataFrame(output).iloc[0,1]
text = text.replace(';','.')
a_list = nltk.tokenize.sent_tokenize(text) 
# text.split()
print(a_list)

['BA/BS degree in Computer Science, Engineering, Economics, Business or a related degree, or equivalent practical experience.', '4 years of experience in an Analytical role such as Business Intelligence, Data Analytics, Statistical or Consulting.', '4 years of experience with SQL or visualization tools such as Tableau.', 'Experience in designing and executing structured analysis, deriving business insights, and evaluating the impact of business decisions.', 'Experience working with and developing for non-technical users (defining requirements, explaining technical concepts to non-technical business users, etc).', 'Experience with scripting languages and business intelligence tools.', 'Demonstrated ability to analyze complex data-sets in a fast-paced environment, with the ability to work strategically and objectively.', 'Ability to deal with and prioritize multiple priorities while driving towards pragmatic decisions.', 'Excellent oral and written communication skills, with the ability 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Sentence spliting and De-noising

In [34]:
#split all paragraphs into sentences
nltk. download('punkt')
sent_list = []
for row_num in range(len(df_raw)):
  cur_text = str(df_raw['all_req'][row_num])
  if len(cur_text) == 0:
    continue
  sent_list += nltk.tokenize.sent_tokenize(cur_text.replace('\n',' ').replace(';','.'))
print(sent_list)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['BA/BS degree or equivalent practical experience.', '3 years of experience in program and/or project management in cloud computing, enterprise software and/or marketing technologies.', 'Experience in the business technology market as a program manager in SaaS, cloud computing, and/or emerging technologies.', 'Significant cross-functional experience across engineering, sales, and marketing teams in cloud computing or related technical fields.', 'Proven successful program outcomes from idea to launch in multiple contexts throughout your career.', 'Ability to manage the expectations, demands and priorities of multiple internal stakeholders based on overarching vision and success for global team health.', 'Ability to work under pressure and possess flexibility with changing needs and direction in a rapidly-growing organization.', 'Strong organization and communication skills.', 'BS degree in an Engineering discipline or equivalent practical experience.', '7 years of experience in Cable/Co

In [52]:
hardskill_output = []
# sent_extracted_chunk = pd.Series(sent_list).apply(lambda x:extract_noun(x,nlp,1))
exclude_words = ['ability','experience','teams','team','idea','priority','need','needs','demand','possess','pressure','direction','organization','vision']
from tqdm import tqdm
for i in tqdm(range(len(sent_list))):
  doc = sent_list[i]
  try:
    # chunks = list(chunk_matching(i,sent_extracted_chunk,skillset))
    chunks = []
    hs = run_hardskill_extraction(doc,nlp)
    hardskill_output.append([doc,[x for x in hs+chunks if x not in exclude_words]])
  except:
    pass

# pd.DataFrame(hardskill_output)

100%|██████████| 10153/10153 [02:20<00:00, 72.13it/s]


In [None]:
m = [x for x in hardskill_output if len(x[1])!=0]
m

In [62]:
df_ukjob['RequiredQual'][0:1000]

0      To perform this job successfully, an\r\nindivi...
1      - Bachelor's Degree; Master's is preferred;\r\...
2      - Degree in environmentally related field, or ...
3      - Advanced degree in public health, social sci...
4      - University degree; economical background is ...
                             ...                        
995    - Higher education;\r\n- Knowledge of CorelDra...
996    - Good knowledge of Armenian, Russian and Engl...
997    We are expecting our Developers to have:\r\n- ...
998    - University degree or relevant certified trai...
999    - University degree in International Relations...
Name: RequiredQual, Length: 1000, dtype: object

In [74]:
sent_list_1 = []
for row_num in range(5000):
  cur_text = str(df_ukjob['RequiredQual'][row_num])
  if len(cur_text) == 0:
    continue
  sent_list_1 += nltk.tokenize.sent_tokenize(cur_text.replace('- ','').replace('\r\n',' ').replace('\r',' ').replace('\n',' ').replace(';','.'))
print(sent_list_1)

['To perform this job successfully, an individual must be able to perform each essential duty satisfactorily.', 'The requirements listed below are representative of the knowledge, skill, and/or ability required.', 'Knowledge of: Generally accepted accounting principles.', 'Local accounting standards and legislation.', 'State reporting requirements pertaining to accounting.', 'Principles and practices of financial management and budgeting.', 'Principles and practices of financial systems design and analysis.', 'Principles and practices of contract management, records management, and risk management.', 'Principles and practices of management and supervision.', 'Principles and practices of information systems management.', "Ability to: Apply sound fiscal and administrative practices to the company's activities.", 'Plan, organize and supervise the work of subordinate employees, including training them, assigning and evaluating their work, and providing job performance feedback.', 'Critical

In [75]:
len(sent_list_1)

34409

In [79]:
hardskill_output_1 = []
# sent_extracted_chunk = pd.Series(sent_list).apply(lambda x:extract_noun(x,nlp,1))
exclude_words = ['tasks','projects','priorities','candidates','organizations','sector','people','part','project','deadline','deadlines','time','member','companies','development','field','work','area','one','environment','environments','ability','experience','teams','team','idea','priority','need','needs','demand','possess','pressure','direction','organization','vision']
from tqdm import tqdm
for i in tqdm(range(len(sent_list_1))):
  doc = sent_list_1[i]
  try:
    # chunks = list(chunk_matching(i,sent_extracted_chunk,skillset))
    chunks = []
    hs = run_hardskill_extraction(doc,nlp)
    hardskill_output_1.append([doc,[x for x in hs+chunks if x not in exclude_words]])
  except:
    pass


100%|██████████| 34409/34409 [06:24<00:00, 89.47it/s]


In [80]:
m_1 = [x for x in hardskill_output_1 if len(x[1])!=0]
len(m_1)

6345

In [93]:
# with open('skills.txt', 'w') as f:
#     for line in m_1+m:
#         f.write(str(line[1]))
#         f.write('\n')
with open('sentences.txt') as f:
    # lines = f.readlines()
import pickle 
with open('mylist', 'wb') as f: 
  pickle.dump(m_1+m, f) 
# with open('mylist', 'rb') as f: 
#   mylist = pickle.load(f) 