In [24]:
!pip install --quiet spacy
#!python -m spacy download en_core_web_lg

In [1]:
import pandas as pd
import numpy as np
from flashtext import KeywordProcessor

In [2]:
df = pd.read_csv('titles.txt', header=None, names=['titles'], delimiter=";", on_bad_lines="skip")

In [3]:
keyword_processor = KeywordProcessor()
keyword_dict = {
    "database administrator": ["database administrator", "Database Developer", "Database Manager"],
    "data engineer": ["data processing", "data engineer", "ETL Developer", "Database Engineer"],
    "web developer": ["web developer", "web programmer"],
    "frontend developer": ["front end", "fronteend", "front-end", "front- end", "UI Developer"],
    "backend developer": ["back end", "backend", "back-end", "back- end", "UI Developer"],
    "network engineer": ["network engineer"],
    "software developer": ["software developer", "Software Lead"],
    "web designer": ["web designer"],
    "customer service": ["customer service", "Help Desk", "Helpdesk", "Help desk"],
    "systems administrator": ["systems administrator"],
    "project manager": ["project manager", "Project Coordinator"],
    "python developer": ["python"],
    "java developer": ["java"],
    "RoR developer": ["RoR developer", "ruby on rails", "RoR", "Ruby"],
    "javascript developer": ["javascript"],
    "consultant": ["principal consultant", "senior consultant", "consultant"],
    "analyst": ["analyst"],
    "security engineer": ["security engineer", "security lead", "security manager"],
}
keyword_processor.add_keywords_from_dict(keyword_dict)

In [4]:
df['keywords'] = df['titles'].apply(
    lambda x: next(iter(list(set(keyword_processor.extract_keywords(sentence=x, span_info=False))) or []), np.NaN)
)

df[df['keywords'].isna()]

Unnamed: 0,titles,keywords
7,Authorized to work in the US for any employer ...,
11,Education BS in Computer Information Systems C...,
13,in-house projects ranging from new databases t...,
15,Headquarters,
16,GIS/Data visualization developer GIS/Data visu...,
...,...,...
129389,IT Operations Support Amazon.com,
129391,COURSES/CERTIFICATIONS(ACTIVE) Logical Operat...,
129392,Associate (MTA) Security Palo Alto Networks F...,
129394,Natural Resources,


In [16]:
df[df['keywords'].notna()]

Unnamed: 0,titles,keywords
0,Principal Consultant Principal Consultant Prin...,consultant
1,Work Experience Principal Consultant MariaDB C...,consultant
2,Autodesk Lead Database Administrator Autodesk,database administrator
3,Developer / Database Administrator Gun Accesso...,database administrator
4,"Data Processing Manager ABS Direct, Inc",data engineer
...,...,...
129393,Help desk Analyst Customer Support Help desk A...,customer service
129395,Other responsibilities as assigned by a superv...,project manager
129396,Use of VWARE and Virtualization Accessed Citr...,customer service
129397,Maintained high Customer Service through diffi...,customer service


In [6]:
df.groupby('keywords').count()

Unnamed: 0_level_0,titles
keywords,Unnamed: 1_level_1
RoR developer,262
analyst,7585
backend developer,2547
consultant,2494
customer service,1803
data engineer,300
database administrator,4022
frontend developer,4214
java developer,7413
javascript developer,2257


In [8]:
df2 = pd.read_csv(
    'job_descriptions.csv',
    header=None,
    names=['titles', "description"],
    delimiter=",",
    on_bad_lines="skip"
)

In [9]:
df2['keywords'] = df2['titles'].apply(
    lambda x: next(iter(list(set(keyword_processor.extract_keywords(sentence=x, span_info=False))) or []), np.NaN)
)

In [10]:
import re

# remove bullet point characters
# remove html tags
# remove al quotes
# add space after comma
# remove double spaces
# remove duplicates
df2 = df2[df2[['keywords', 'description']].notnull().all(1)]
df2['description'] = df2['description'].str.strip()
df2['description'] = df2['description'].str.rstrip('"')
df2['description'] = df2['description'].str.replace(r"^(\-|\?|\*|\+|\§|·)", '', regex=True)
df2['description'] = df2['description'].str.replace(r'<[^<>]*>', '', regex=True)
df2['description'] = df2['description'].str.replace('"', ' ')
df2['description'] = df2['description'].str.replace(r'[,]+(?![0-9])', ', ', regex=True)
df2['description'] = df2['description'].str.replace('  ', ' ')
df2['description'] = df2['description'].str.strip()
df2['description'] = df2['description'].drop_duplicates()
df2 = df2[df2['description'].str.split().str.len().gt(2)]  # remove rows where description hass less than 2 words
df2 = df2[df2["description"].str.contains("www.", flags=re.IGNORECASE) == False]  # remove rows where description contains "www."

In [25]:
df2.groupby('keywords').count()

Unnamed: 0_level_0,titles,description
keywords,Unnamed: 1_level_1,Unnamed: 2_level_1
RoR developer,1381,1381
analyst,46086,46086
backend developer,21268,21268
consultant,14797,14797
customer service,6648,6648
data engineer,2953,2953
database administrator,32956,32956
frontend developer,22669,22669
java developer,82282,82282
javascript developer,14007,14007


In [33]:
import spacy

nlp = spacy.load("en_core_web_lg")

df3 = df2.copy()

In [None]:
df3['tag'] = df3['description'].apply(lambda x: nlp(x)[0].tag_)

In [None]:
df3['tag']

In [None]:
doc = nlp(df2['description'][0])
if doc[0].tag_ == 'VBP':
    print(0)

In [289]:
df2["text"] = [f"as a {title}, {description}<|endoftext|>" for title,description in df2[['keywords', 'description']].values]
df2

Unnamed: 0,titles,description,keywords,text
0,Principal Consultant Principal Consultant Prin...,"Architect of robust, highly available and scal...",consultant,"as a consultant, Architect of robust, highly a..."
1,Principal Consultant Principal Consultant Prin...,Publisher of many open source database automat...,consultant,"as a consultant, Publisher of many open source..."
2,Principal Consultant Principal Consultant Prin...,Holds dual citizenship with the right to work ...,consultant,"as a consultant, Holds dual citizenship with t..."
3,Work Experience Principal Consultant MariaDB C...,"This includes replication, Galera, Maxscale, C...",consultant,"as a consultant, This includes replication, Ga..."
4,Work Experience Principal Consultant MariaDB C...,"Design and create dynamically scalable, highly...",consultant,"as a consultant, Design and create dynamically..."
...,...,...,...,...
1116989,Use of VWARE and Virtualization Accessed Citr...,Worked closely with EPIC application owners fo...,customer service,"as a customer service, Worked closely with EPI..."
1116990,Use of VWARE and Virtualization Accessed Citr...,of Family and Children Service November 2011 t...,customer service,"as a customer service, of Family and Children ..."
1116991,Use of VWARE and Virtualization Accessed Citr...,Exhibited strong ability to resolve complex de...,customer service,"as a customer service, Exhibited strong abilit..."
1116992,Use of VWARE and Virtualization Accessed Citr...,"Filled multiple roles when needed; IT Support,...",customer service,"as a customer service, Filled multiple roles w..."


In [283]:
df2.to_csv('dataset_all.csv', columns=['keywords', "description"], index=False, sep=",")

In [291]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df2, test_size=0.1)

train.to_csv('dataset_train.csv', columns=["text"], header=True, index=False, sep=",")
test.to_csv('dataset_test.csv', columns=["text"], header=True, index=False, sep=",")