In [3]:
import pandas as pd
import numpy as np
import json
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

import pickle

In this task, we are going to recommend the top 10 resumes for each job posting.

As we know, there are thousands of kinds of jobs (e.g. IT, finance, banking, teaching, art, music, movies, etc.). It is not possible for us to know exactly what each title means. Therefore, in this task, we simply divide the jobs into two types: IT jobs and non-IT jobs.

IT jobs involve anything related to developers, programmers, programming languages, databases, etc.

NonIT jobs are the rest.

I provide a model that I trained on the dataset; this model simply classifies each job posting in our data as IT or nonIT.

Because we only consider IT and nonIT jobs, I construct a list of keywords for IT jobs, which will help us decide which of the given resumes are IT resumes and which are not.

The accuracy of the model I trained was 85%.

In [4]:
# Load the pre-trained artefacts: the count vectorizer, the feature selector
# and the Gaussian naive Bayes classifier used to label job postings.
# NOTE(security): pickle.load can execute arbitrary code while unpickling,
# so only load .pkl files that come from a trusted source.
# Each file is opened in a `with` block so its handle is closed right away.
with open("count_vectorizer_model.pkl", 'rb') as model_file:
    vectorizer_model = pickle.load(model_file)
with open("reduce_dimension_selector.pkl", 'rb') as model_file:
    selector = pickle.load(model_file)
with open("gaussian_model.pkl", 'rb') as model_file:
    gnb = pickle.load(model_file)

In [5]:
# Space-separated vocabulary describing IT-related skills and concepts.
# Entries are separated by exactly one space and carry no punctuation of their
# own, so `it_keywords.split(" ")` yields clean keywords with no empty strings.
# Repeated entries (e.g. "data") are kept on purpose so term counts are unchanged.
it_keywords = (
    "java c c++ .net javascript database system developer programmer c# python machine learning web-developer "
    "web android ios window linux unix cloud computing computer science data embedding ai artificial intelligence "
    "big-data bigdata big c/c++ uml software information iphone objective-c oop swift sql hbase hive nosql asp.net "
    "mvc framework html css xcode eclipse xml rest-api api mysql sqlserver php bash mining blockchain deep "
    "data structure algorithms network tcpip tcp-ip neural j2ee jsf spring"
)

First, we will define the functions needed to read a resume in txt format.

In [16]:
# Resume ids run from 1 to NUM_RESUMES inclusive; change NUM_RESUMES when the
# number of files under ./resumeTxt changes.
NUM_RESUMES = 866
cv_list = np.arange(1, NUM_RESUMES + 1)

# Build the relative file path of a resume from its number.
def get_path_resume(number):
    """
    Return the path of the text file that stores a given resume.

    :param
        number: the CV's number (converted to its string form)
    :result: path of the form "./resumeTxt/resume_(<number>).txt"
    """
    prefix = "./resumeTxt/resume_("
    suffix = ").txt"
    return "".join([prefix, str(number), suffix])

def read_cv_txt(path):
    """
    Read a resume file and return its meaningful lines.

    The file is read as bytes; every line is stripped of surrounding
    whitespace, and lines whose stripped length is 0 or 1 are dropped.
    The remaining lines are decoded as UTF-8.

    :param
        path: path of the resume text file
    :result: list of decoded, stripped lines
    """
    kept_lines = []
    with open(path, 'rb') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            # A stripped length above 1 also rules out the empty line.
            if len(stripped) > 1:
                kept_lines.append(str(stripped, 'utf-8'))
    return kept_lines

Then, we are going to read all the job postings from the JSON file that was constructed in Task 1.

In [17]:
# Read the job postings from the JSON file that was constructed in Task 1.
def read_json(path='job_posting.json'):
    """
    Load the job postings from a JSON file.

    :param
        path: path of the JSON file; defaults to 'job_posting.json'
    :result: the value stored under ['listings']['listing'] in the file
    :raises: KeyError when the file does not contain those keys
    """
    with open(path, 'r') as f:
        json_result = json.load(f)
    return json_result['listings']['listing']

Exploring the data in more detail, you will see that several job postings have a title and required qualifications that are totally different. For instance:

```json
    {
                "_id": 60283,
                "title": "Procurement Officer",
                "location": "Yerevan, Armenia",
                "job_descriptions": {
                    "descriptions": [
                        "\"\"LDT Technology\"\" LLC is looking for highly motivatedperson for the position of Sales Manager who can run daily businesscorrespondence and negotiations with international partner companies."
                    ]
                },
                "job_responsibilities": {
                    "responsibility": [
                        "Develop applications in accordance with given specifications",
                        "Read, understand and modify the existing code",
                        "Work as a part of a software development team."
                    ]
                },
                "required_qualifications": {
                    "qualification": [
                        "University degree in Computer Sciences or a related field",
                        "At least 5 years of experience with production software design anddevelopment",
                        "Expert knowledge and experience of the following:",
                        "a) Advanced C/C++, embedded firmware development",
                        "b) Data structures and algorithms",
                        "c) Experience with Microcontroller (ATMEL, PIC) application development",
                        "d) Knowledge of signal processing algorithms and data processing",
                        "e) Knowledge of software development libraries",
                        "Good problem solving and debugging skills",
                        "Knowledge of the Software Development Life Cycle and UML",
                        "Good time-management and organizational abilities that facilitatestructured teamwork",
                        "Good written and oral communication skills",
                        "Interest in the industrial automation and semiconductor testingprocesses would be an asset."
                    ]
                },
            },
```

Looking at the JSON data above, the title and the required qualifications are apparently not related. Because the title carries less meaning than the required qualifications, we will rely on the job qualifications in order to decide which cluster the job belongs to (IT or nonIT).

In the following script, a function json_to_df is defined, which extracts only the ID and required_qualifications fields from the JSON data and puts them into a dataframe.

In [18]:
# Transform the list of job postings into a data frame.
def json_to_df(json_array):
    """
    Build a data frame with the id and qualification text of each posting.

    Postings whose ['required_qualifications']['qualification'] value equals
    'N/A' are skipped. For the others, the qualification entries are joined
    with single spaces into one string.

    :param
        json_array: iterable of job posting dictionaries
    :result: DataFrame with the columns "_id" and "qualification"
    """
    kept = []
    for posting in json_array:
        entries = posting['required_qualifications']['qualification']
        if entries == 'N/A':
            continue
        kept.append((posting["_id"], " ".join(entries)))
    return pd.DataFrame({
        "_id": [posting_id for posting_id, _ in kept],
        "qualification": [text for _, text in kept],
    })

After constructing the data frame, we are going to clean the required qualifications of each job and extract their tokens.

In [19]:
# Tokenize, filter and stem a piece of text.
def extract_clean_token(data):
    """
    Turn raw text into a space-separated string of stemmed tokens.

    Tokens are extracted with a regular expression, tokens of length 1 and
    tokens found in NLTK's English stop-word list are dropped, and the
    remaining tokens are stemmed with the English Snowball stemmer.

    :param
        data: the text to clean
    :result: the stemmed tokens joined by single spaces
    """
    # NOTE(review): the optional hyphen/apostrophe group is lazy (`??`);
    # presumably it rarely or never extends a match - confirm this is intended.
    token_pattern = "[a-zA-Z0-9_.#+]+(?:[-'][a-zA-Z0-9_.#+]+)??"
    raw_tokens = RegexpTokenizer(token_pattern).tokenize(data)
    stemmer = SnowballStemmer("english")
    english_stops = set(stopwords.words("english"))
    stemmed = []
    for word in raw_tokens:
        # NOTE(review): the stop-word test runs on the raw token, before any
        # case change - verify that capitalised stop words should be kept.
        if len(word) <= 1 or word in english_stops:
            continue
        stemmed.append(stemmer.stem(word))
    return " ".join(stemmed)
    

By now, we are able to read, extract and transform the data by combining all the functions above.

In [20]:
# Load the postings, keep their ids and qualification text, then replace the
# qualification text with its cleaned, stemmed form.
json_array = read_json()
df = json_to_df(json_array)
df['qualification'] = df['qualification'].map(extract_clean_token)
df.head()

Unnamed: 0,_id,qualification
0,82387,univers degre prefer financ credit bank otherr...
1,47791,bs ms busi administr comput scienc microelectr...
2,59252,univers degre honor mba desir good knowledg wr...
3,42111,fifth year colleg univers program certif from ...
4,73969,univers degre econom manag market financ stron...


Preparing and cleaning the job posting data is done. At this point, we are going to predict the label, IT or nonIT, for each job posting.

The predict_label function predicts the label of each job posting from its required_qualifications. We assign the labels to a new column of the dataframe, then transform the dataframe into a list of dictionaries.

In [21]:
def predict_label(df_job_posting):
    """
    Predict a label for every job posting in the data frame.

    The 'qualification' column is vectorized with the loaded vectorizer,
    reduced with the loaded selector, converted to a dense array and passed
    to the loaded Gaussian naive Bayes model.

    :param
        df_job_posting: data frame with a 'qualification' column
    :result: the labels returned by gnb.predict, one per row
    """
    counts = vectorizer_model.transform(df_job_posting['qualification'])
    reduced = selector.transform(counts)
    return gnb.predict(reduced.toarray())

# Store the predicted label of each posting in the 'IT' column.
df = df.assign(IT=predict_label(df))

# Keep one dictionary per posting, which is convenient to iterate over.
job_posting_dic = df.to_dict(orient='records')

  if np.issubdtype(mask.dtype, np.int):


In the following script, we assign the IT job postings to a list named *it_content* and the nonIT job postings to a list named *non_it_content*.

The reason for doing this is to have two separate job posting lists, one for IT jobs and one for nonIT jobs, each of which is used to construct its own TfIdf vectorizer. These two vectorizers are then used to calculate the score of a resume against a job posting.

In [22]:
# Split the cleaned qualification texts by predicted label: label 1 goes to
# the IT list, every other label goes to the non-IT list.
it_content = [
    posting['qualification'] for posting in job_posting_dic if posting['IT'] == 1
]
non_it_content = [
    posting['qualification'] for posting in job_posting_dic if not posting['IT'] == 1
]

This is a crucial step. Here, I extract the experience section from each resume. The new variable cvs contains the id of each resume and its experience in tuple format.

E.g: (_id,'experience')

In [23]:
def cvs_read_and_extraction(cv_list):
    """
    Read every resume and extract its experience text.

    Each resume is read, joined into one string and cleaned into stemmed
    tokens. The experience text is everything from the first "experi" match
    to the end of the token string; the text starting at the first "qualif"
    match and at the first "certif" match is then removed from it when present.

    :param
        cv_list: iterable of resume numbers
    :result: list of (resume number, experience text) tuples
    """
    extracted = []
    for cv_id in cv_list:
        resume_lines = read_cv_txt(get_path_resume(cv_id))
        cv_tokens = extract_clean_token(" ".join(resume_lines))

        experience = " ".join(re.findall("(?:experi).*$", cv_tokens))

        # Cut the qualification tail first, then the certification tail.
        for section_pattern in ("(?:qualif).*$", "(?:certif).*$"):
            section_text = " ".join(re.findall(section_pattern, cv_tokens))
            if section_text in experience:
                experience = experience.replace(section_text, "")

        extracted.append((cv_id, experience.strip()))
    return extracted

# Build the list of (resume number, experience text) tuples for every id in cv_list.
cvs = cvs_read_and_extraction(cv_list)

Now, we are going to cluster the resumes. IT resumes will be appended to the variable *cvs_it*, while nonIT resumes will be appended to the variable *cvs_non_it*.

To decide whether a resume is an IT resume or a nonIT resume, we are going to use the cosine similarity measure.

More about cosine measure, please find the link:
    
http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/

We set the threshold to 0.4: a resume whose cosine score against the IT keywords is above 0.4 is treated as an IT resume.


In [32]:
# Stem the IT keywords so they match the stemmed resume tokens.
snow = SnowballStemmer("english")
keywords = " ".join(snow.stem(key) for key in it_keywords.split(" "))
tfidf = TfidfVectorizer()

# The vectorizer is fitted on the keyword string only, so resumes are scored
# using the keyword vocabulary.
cluster_it_model = tfidf.fit([keywords])
keyword_transform = cluster_it_model.transform([keywords])

cvs_it = []
cvs_non_it = []

# Score every resume against the keywords with the cosine measure; scores
# above 0.4 go to the IT list, all others go to the non-IT list.
for cv in cvs:
    cv_transform = cluster_it_model.transform([cv[1]])
    # One keyword vector against one resume vector gives a 1x1 result.
    score = cosine_similarity(keyword_transform, cv_transform)[0][0]
    target = cvs_it if score > 0.4 else cvs_non_it
    target.append(cv)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


This step constructs the job posting TfIdf vectorizers for IT jobs and nonIT jobs.

In [33]:
# Both vectorizers share the same document-frequency bounds.
_TFIDF_DF_BOUNDS = {"max_df": 0.98, "min_df": 0.02}

# Vectorizer fitted on the IT job postings.
tf_idf_it = TfidfVectorizer(**_TFIDF_DF_BOUNDS)
tf_it_model = tf_idf_it.fit(it_content)

# Vectorizer fitted on the non-IT job postings.
tf_idf_non_it = TfidfVectorizer(**_TFIDF_DF_BOUNDS)
tf_non_it_model = tf_idf_non_it.fit(non_it_content)

The following script defines a function to get the most relevant resumes for a particular job posting. If the job posting is an IT job, the TfIdf vectorizer fitted on the IT jobs is used for the transformation. Otherwise, the TfIdf vectorizer fitted on the nonIT jobs is used to transform the resume.

Again, we use the cosine measure to calculate the score of a resume against a specific job posting.

In [34]:
# Score every resume against a single job posting with the cosine measure.
def top_match(job_posting,cvs,isIT):
    """
    Compute the cosine score of each resume against one job posting.

    :param
        job_posting: cleaned qualification text of the job posting
        cvs: sequence of (resume id, resume text) tuples
        isIT: 1 to use the IT tf-idf model, 0 to use the non-IT one
    :result: list of (resume id, score) tuples, in the order of cvs
    """
    if isIT == 0:
        model = tf_non_it_model
    elif isIT == 1:
        model = tf_it_model
    else:
        # No model applies; inputs are then left untransformed.
        model = None

    job_vector = None if model is None else model.transform([job_posting])

    matches = []
    for cv in cvs:
        cv_vector = cv[1] if model is None else model.transform([cv[1]])
        similarity = cosine_similarity(job_vector, cv_vector)
        # The similarity is a 1x1 result; keep only its single value.
        matches.append((cv[0], similarity[0][0]))
    return matches

Last but not least, we are going to get the top 10 resumes for each of the first 500 job postings.

In [35]:
# For each of the first 500 postings, rank the resumes of the matching
# cluster and keep the 10 best ones.
count = 0
top_cvs = None
recommendations = []
# (label, resume pool) pairs: non-IT postings use cvs_non_it, IT ones use cvs_it.
resume_pools = ((0, cvs_non_it), (1, cvs_it))
for count, job in enumerate(job_posting_dic, start=1):
    _id = job['_id']
    for label, pool in resume_pools:
        if job["IT"] == label:
            ranked = top_match(job['qualification'], pool, label)
            # Ascending sort followed by a reversal, so the order of equal
            # scores stays exactly as produced by this two-step approach.
            ranked.sort(key=lambda pair: pair[1])
            top_cvs = ranked[::-1][0:10]
            recommendations.append((_id, top_cvs))
    if count == 500:
        break
        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [36]:
# Display the (job id, ranked resume list) pairs collected in `recommendations`.
recommendations

[(82387,
  [(473, 0.31632860819390357),
   (7, 0.2700185287724231),
   (824, 0.26318967483237),
   (502, 0.2512969356621228),
   (279, 0.24742264763551572),
   (749, 0.22912654332503105),
   (569, 0.2279674288268574),
   (604, 0.22734408475394302),
   (680, 0.22719339326314714),
   (555, 0.22005964894963082)]),
 (47791,
  [(388, 0.2629686611790353),
   (791, 0.24507850319510655),
   (319, 0.2338697673829756),
   (249, 0.22210506244149664),
   (412, 0.20936523041712002),
   (558, 0.2022814803318726),
   (187, 0.19198944795894116),
   (2, 0.18834812562724576),
   (763, 0.17657693306499037),
   (707, 0.1687040125399474)]),
 (59252,
  [(749, 0.22593004108622797),
   (398, 0.217483236725832),
   (361, 0.20615685733332512),
   (366, 0.18450636964979866),
   (7, 0.15695673033454938),
   (598, 0.15029833066408427),
   (397, 0.14891601508673577),
   (852, 0.14050759594094886),
   (3, 0.13989682705779505),
   (123, 0.13859851619117625)]),
 (42111,
  [(31, 0.2479970261489159),
   (501, 0.24511071

Finally, we will write the result to a txt file.

In [37]:
# Write one line per job posting in the form "<job id>: <cv id>, <cv id>, ...".
# The resume ids are joined with ", " so a line has no dangling separator
# after its last id.
with open("cv_recommendation.txt", 'w') as f:
    for job_id, ranked_cvs in recommendations:
        resume_ids = ", ".join(str(entry[0]) for entry in ranked_cvs)
        f.write(str(job_id) + ": " + resume_ids + "\n")