In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
###### change current path
import os
path = "/content/gdrive/MyDrive/NUS_IRS_Project/"
os.chdir(path)
!ls

data  data_gathering  matching	Modelling  models  webapp_model.zip


# Job - Pre-processing and Modelling Iteration final

In [3]:
# libraries import
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
%matplotlib inline

import re
import datetime
from datetime import date
from time import strptime
import operator


######################################################################################

# Working on Job description Data
######################################################################################   

In [4]:
# reading my sorted job csv
job = pd.read_csv('data/EDA_job.csv')

In [None]:
job.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21996 entries, 0 to 21995
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   company              21996 non-null  object 
 1   education            20004 non-null  object 
 2   experience           21996 non-null  object 
 3   industry             21996 non-null  object 
 4   jobdescription       21996 non-null  object 
 5   joblocation_address  21499 non-null  object 
 6   jobtitle             21996 non-null  object 
 7   numberofpositions    4464 non-null   float64
 8   postdate             21977 non-null  object 
 9   skills               21996 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.7+ MB


###########################################################################################################################
# Understanding Job_description column (using NLP)
###########################################################################################################################


# 1. NLP - NLTK application to understand most used words

In [5]:
#Import all the dependencies
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')

import string
stopwords = set(stopwords.words("english"))
import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# 获取单词的词性
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [None]:
# defining tokenizer 
def my_tokenizer(text):
    # 1. split at whitespace
    text = text.split(' ')
    
    #2. lowercase
    text = [word.lower() for word in text]
    
    #3. Remove puncutation
    #table to replace puncuation
    punc_table = str.maketrans('','',string.punctuation)
    
    #call translate()
    text = [word.translate(punc_table) for word in text]
    
    #4. remove stopwords
    text = [word for word in text if word not in stopwords]
    
    #5. lemmmatize
    tagged_sent = pos_tag(text)     # 获取单词词性

    wnl = WordNetLemmatizer()
    lemmas_sent = []
    for tag in tagged_sent:
      wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
      lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos)) # 词形还原

    # lemmatizer = WordNetLemmatizer()
    # text = [lemmatizer.lemmatize(word, pos='v') for word in text]
    # text = [lemmatizer.lemmatize(word, pos='n') for word in text]
    # text = [lemmatizer.lemmatize(word, pos='a') for word in text]
    
    #6. remove empty strings
    text = [word for word in lemmas_sent if word !='']
    
    return text 

# 2. NLP - TF-IDF application to get a list of all tokens 
-- This helped to gather what words needed to be in stop-words list

In [6]:
job['jobdescription'][0:5]

0    Job Description   Send me Jobs like this Quali...
1    Job Description   Send me Jobs like this Quali...
2    Job Description   Send me Jobs like this - as ...
3    Job Description   Send me Jobs like this - Inv...
4    Job Description   Send me Jobs like this Pleas...
Name: jobdescription, dtype: object

In [None]:
job['jobdescription'] = job['jobdescription'].map(lambda x: x.strip().strip('Job Description').strip())

In [None]:
job['jobdescription'] = job['jobdescription'].map(lambda x: x.strip('Send me Jobs like this').strip())

In [None]:
# job['jobdescription'] = job.jobdescription.str[40:]
job['jobdescription'][0:5]

0    Qualifications: - == > 10th To Graduation & An...
1    Qualifications: - == > 10th To Graduation & An...
2    - as a developer in providing application desi...
3    - Involved with all stages of indirect taxatio...
4    Please share your Resume on : regina.mary@spir...
Name: jobdescription, dtype: object

In [None]:
df_job_descriptions = job[['jobtitle','company']]
df_job_descriptions['jd_combo'] = job['jobtitle']+" " +job['jobdescription']+" "+job['skills']+" "+job['industry']
df_job_descriptions.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,jobtitle,company,jd_combo
0,walkin data entry operator (night shift),MM Media Pvt Ltd,walkin data entry operator (night shift) Quali...
1,work based onhome based part time.,find live infotech,work based onhome based part time. Qualificati...
2,pl/sql developer - sql,Softtech Career Infosystem Pvt. Ltd,pl/sql developer - sql - as a developer in pro...
3,manager/ad/partner - indirect tax - ca,Onboard HRServices LLP,manager/ad/partner - indirect tax - ca - Invol...
4,java technical lead (6-8 yrs) -,Spire Technologies and Solutions Pvt. Ltd.,java technical lead (6-8 yrs) - Please share y...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('ã¯æ’ëœ')
#Transforms words to TFIDF
vectorizer = TfidfVectorizer(stop_words = stopwords)

index = 0
keys = {}

for jd in df_job_descriptions.itertuples() :
    key = jd[0]
    keys[key] = index
    index += 1
    
#Transforms words to TFIDF
vectorizer = TfidfVectorizer(stop_words = stopwords)

#Fit the vectorizer to the data
vectorizer.fit(df_job_descriptions['jd_combo'].fillna(''))

#Transform the data
tfidf_scores = vectorizer.transform(df_job_descriptions['jd_combo'].fillna(''))

print(tfidf_scores.shape)
print(df_job_descriptions.shape)

  % sorted(inconsistent)


(21996, 58619)
(21996, 3)


In [None]:
type(tfidf_scores)

scipy.sparse.csr.csr_matrix

In [None]:
test = pd.DataFrame(tfidf_scores.toarray(), columns = vectorizer.get_feature_names())



In [None]:
test.head()

Unnamed: 0,00,000,0000,00000,0000gmt,0001pt,00029,00034,000402,00053,...,ïƒ,ïƒ¼,ïƒž,œ100,œmost,œrecognition,œto,šâ,šã,žâ
0,0.0,0.059693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.070372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


As count vectorizer and Tf-Idf are only exploding my column numbers. It might not be wise to proceed with any of these. Moveover, I need to compare job description with Resume, that may not with fair comparison. So I will use these results so far for customizing stop word list. And will later use Doc2Vec to train my model.

# Creating my Stopword list 

### As seen there are so many unwanted tokens like numbers,ïƒ¼ etc , I need to add them in "stop words" list to train model 

In [None]:
############## use the result column of Tf-ldf to generate stop words
#getting list of all tokens
word_list = test.columns.tolist()

In [None]:
##Getting a list of unwanted words as s_words and adding to stopwords
s_words =[]
for word in word_list:
    #print(word)
    if re.search("^\W|^\d",word):
        s_words.append(word)
        

In [None]:
s_words.append('')        
from nltk.corpus import stopwords
stopword_set = set(stopwords.words('english'))
stopword_set = list(stopword_set)
stopword_set.extend(s_words)

# Collecting all text data for DOC2VEC modelling
In final iteration, I only used job title and job description for creating text combo document and got my 20-D vectors. This time I trained my model on 200 epochs. 

As count vectorizer and Tf-Idf are only exploding my column numbers. It might not be wise to proceed with any of these. Moveover, I need to compare job description with Resume, that may not with fair comparison. 

Definately Doc2Vec is the smart choice to make to proceed with matching. Because Doc2Vec has ability to read document as a whole rather than working on each single word. It has a feature to provide n-Dimentional vectors. 

So I am going to use same concept to get my vectors. Then I ll use those vectors to match it against any given resume. 



In [None]:
df_job_descriptions.head()

Unnamed: 0,jobtitle,company,jd_combo
0,walkin data entry operator (night shift),MM Media Pvt Ltd,walkin data entry operator (night shift) Quali...
1,work based onhome based part time.,find live infotech,work based onhome based part time. Qualificati...
2,pl/sql developer - sql,Softtech Career Infosystem Pvt. Ltd,pl/sql developer - sql - as a developer in pro...
3,manager/ad/partner - indirect tax - ca,Onboard HRServices LLP,manager/ad/partner - indirect tax - ca - Invol...
4,java technical lead (6-8 yrs) -,Spire Technologies and Solutions Pvt. Ltd.,java technical lead (6-8 yrs) - Please share y...


In [None]:
docs = df_job_descriptions['jd_combo']
docs_sample = docs.head(10)
docs_sample

0    walkin data entry operator (night shift) Quali...
1    work based onhome based part time. Qualificati...
2    pl/sql developer - sql - as a developer in pro...
3    manager/ad/partner - indirect tax - ca - Invol...
4    java technical lead (6-8 yrs) - Please share y...
5    walk in - as400 developer - pfsweb global serv...
6    php developer xperience/strong knowledge in PH...
7    member technical staff-wire harness/cable harn...
8    team leader Independent handling of entire pro...
9    german translator Overall Purpose of Job and R...
Name: jd_combo, dtype: object

In [None]:
#pre-processing with custom stop word list
def preprocess(text):
    stop_words = stopword_set
    #0. split words by whitespace
    text = text.split()
    
    # 1. lower case
    text = [word.lower() for word in text]
    
    # 2. remove punctuations
    punc_table = str.maketrans('','',string.punctuation)
    text = [word.translate(punc_table) for word in text]
    
    # 3. remove stop words
    text = [word for word in text if word not in stop_words]
    
    return text

In [None]:
# calling my pre-process to tokenize 
tokenized_doc = []
doc = df_job_descriptions['jd_combo']
#doc = docs_sample
for d in doc:
    tokenized_doc.append(preprocess(d))
#tokenized_doc

In [None]:
# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]

In [None]:
num_doc = len(tagged_data)
num_doc

21996

In [None]:
#settings to show epoch progress
from gensim.test.utils import get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec

class EpochSaver(CallbackAny2Vec):

    def __init__(self, path_prefix):
        self.path_prefix = path_prefix
        self.epoch = 0

    def on_epoch_end(self, model):
        output_path = get_tmpfile('{}_epoch{}.model'.format(self.path_prefix, self.epoch))
        model.save(output_path)
        self.epoch += 1

In [None]:
#settings to show epoch progress
class EpochLogger(CallbackAny2Vec):
    
    def __init__(self):
        self.epoch = 0
        
    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1

In [None]:
#train model - final******** with 200 epochs
epoch_logger = EpochLogger()
## Train doc2vec model
model1 = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs = 200, callbacks=[epoch_logger])


Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end
Epoch #10 start
Epoch #10 end
Epoch #11 start
Epoch #11 end
Epoch #12 start
Epoch #12 end
Epoch #13 start
Epoch #13 end
Epoch #14 start
Epoch #14 end
Epoch #15 start
Epoch #15 end
Epoch #16 start
Epoch #16 end
Epoch #17 start
Epoch #17 end
Epoch #18 start
Epoch #18 end
Epoch #19 start
Epoch #19 end
Epoch #20 start
Epoch #20 end
Epoch #21 start
Epoch #21 end
Epoch #22 start
Epoch #22 end
Epoch #23 start
Epoch #23 end
Epoch #24 start
Epoch #24 end
Epoch #25 start
Epoch #25 end
Epoch #26 start
Epoch #26 end
Epoch #27 start
Epoch #27 end
Epoch #28 start
Epoch #28 end
Epoch #29 start
Epoch #29 end
Epoch #30 start
Epoch #30 end
Epoch #31 start
Epoch #31 end
Epoch #32 start
Epoch #32 end
Epoch #33 start
Epoch #33 end


In [None]:
# Save trained doc2vec model
model1.save("models/my_doc2vec.model")

In [None]:
## Load saved doc2vec model
model1= Doc2Vec.load("models/my_doc2vec.model")

In [None]:
#confirm length (should be 38941)
len(tokenized_doc)

21996

In [None]:
## Get vector value
vec = np.empty([21996,20])

for k,i in enumerate(tokenized_doc):
    #print(i)
    vector = model1.infer_vector(i)
    vec[k] = vector

# reshape into 2D
new_arr = np.reshape(vec,(-1,20))

In [None]:
rng = range(1, 21)
vec_df = pd.DataFrame(new_arr, columns=['vec_' + str(i) for i in rng])

In [None]:
vec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21996 entries, 0 to 21995
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   vec_1   21996 non-null  float64
 1   vec_2   21996 non-null  float64
 2   vec_3   21996 non-null  float64
 3   vec_4   21996 non-null  float64
 4   vec_5   21996 non-null  float64
 5   vec_6   21996 non-null  float64
 6   vec_7   21996 non-null  float64
 7   vec_8   21996 non-null  float64
 8   vec_9   21996 non-null  float64
 9   vec_10  21996 non-null  float64
 10  vec_11  21996 non-null  float64
 11  vec_12  21996 non-null  float64
 12  vec_13  21996 non-null  float64
 13  vec_14  21996 non-null  float64
 14  vec_15  21996 non-null  float64
 15  vec_16  21996 non-null  float64
 16  vec_17  21996 non-null  float64
 17  vec_18  21996 non-null  float64
 18  vec_19  21996 non-null  float64
 19  vec_20  21996 non-null  float64
dtypes: float64(20)
memory usage: 3.4 MB


In [None]:
con_job_1 = pd.concat([job, vec_df], axis=1)

In [None]:
#saving final csv with additional vectors to match with resume. 
con_job_1.to_csv('data/vectoe_job_20.csv', index=False)