#### Importing the needed libraries

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from nltk.util import ngrams
import os
import docx2txt
from geopy.geocoders import Nominatim
from geotext import GeoText
from geopy.distance import great_circle

#### Skills data

In [3]:
#Reading skills csv file
skill_data=pd.read_csv("skills_lower1 - Sheet1.csv")

#Collecting skills: keep every row flagged as "skill", whatever the length of
#the file (the row count used to be hard-coded, which breaks as soon as the
#CSV gains or loses a row). Rows without a skill name are dropped because a
#missing value can never match a word from a job description or a resume.
is_skill=skill_data['skill_or_not']=="skill"
skill_list=skill_data.loc[is_skill,'essential:'].dropna().tolist()

#Converting skills data into unique list of skills
unique_skill_list=list(set(skill_list))
unique_skill_list

['cp',
 'user interface',
 'vmware',
 'cx',
 'hive',
 'scikit learn',
 'data warehousing',
 'xgboost',
 'software development',
 'azure',
 '.net',
 'firewalls',
 'sharepoint',
 'iq',
 'apache',
 'azure cloud',
 'rest/soap api',
 'dtp',
 'business analysis',
 'java',
 'artifactory',
 'flask',
 'numba',
 'rdbms',
 'kano model',
 'koin',
 'vuejs',
 'blueprism',
 'neural network',
 'backend development',
 'optimisation technique',
 'django',
 'e-commerce',
 'ux',
 'data engineering',
 'digital finance',
 'devops',
 'impala',
 'pyramid',
 'dask',
 'powerbi',
 'node js',
 'citrix',
 'scrum',
 'hyper-v',
 'salesforce',
 'information retrieval',
 'snowflake',
 'telecommunication',
 'http',
 'ecs',
 'restful',
 'agile',
 'conda',
 'stata',
 'zeplin',
 'aris',
 'ux/ui',
 'regression',
 'gocd',
 'numpy',
 'api',
 'blockchain',
 'oracle db',
 'visual basic',
 'pfmea',
 'athena',
 'data curation',
 'nosql',
 'scala',
 'cqrs',
 'pl/sql',
 'ruby',
 'rest',
 'js',
 'xamarin',
 'animation design',
 'an

#### Languages data

In [4]:
#Reading languages csv file
language_data=pd.read_csv("languages.csv")

#Collecting the languages as a plain list, normalised to lower case so they
#can be compared with the lower-cased words produced by the text processing
language_list=[language_name.lower() for language_name in language_data['Language']]
language_list

['danish',
 'dutch',
 'french',
 'italian',
 'norwegian',
 'portuguese',
 'romanian',
 'spanish',
 'swedish',
 'german',
 'haitian creole',
 'indonesian',
 'malay',
 'swahili',
 'albanian',
 'amharic',
 'armenian',
 'azerbaijani',
 'bengali',
 'bulgarian',
 'burmese',
 'czech',
 'dari',
 'estonian',
 'farsi',
 'finnish',
 'georgian',
 'greek',
 'gujarati',
 'hausa',
 'hebrew',
 'hindi',
 'hungarian',
 'icelandic',
 'kazakh',
 'khmer',
 'kurdish',
 'kyrgyz',
 'lao',
 'latvian',
 'lithuanian',
 'macedonian',
 'mongolian',
 'nepali',
 'pashto',
 'polish',
 'russian',
 'serbo-croatian',
 'sinhala',
 'slovak',
 'slovenian',
 'somali',
 'tagalog',
 'tajiki',
 'tamil',
 'telugu',
 'thai',
 'tibetan',
 'turkish',
 'turkmen',
 'ukranian',
 'urdu',
 'uzbek',
 'vietnamese',
 'arabic',
 'chinese - cantonese',
 'chinese - mandarin',
 'japanese',
 'korean']

#### Reading the alternative names used for the skills

In [5]:
#Reading the csv file that maps alternative skill names to their proper names
diff_naming=pd.read_csv("skill_naming1.csv")

#Kept as a separate dataframe object, as before
diff_naming_df=pd.DataFrame(diff_naming)

#'wrong' holds the alternative namings, 'correct' the proper naming of the
#skill found on the same row
wrong,correct=diff_naming_df['wrong'],diff_naming_df['correct']

#### Reading the Job information Data

In [67]:
#Reading the job information file scraped from linkedin
job_information_data = pd.read_csv("Data jobs.csv")
job_information_dataframe=pd.DataFrame(job_information_data)

#Row of the job posting to analyse; change the position to pick another job
single_company_data=job_information_dataframe.iloc[1600]

#Fields of that posting used later on:
#job description, location of the company, industry type, seniority level
job_description_data,company_location,industry,level=(
    single_company_data[column]
    for column in ('Description','Location','Industry','Level')
)

#### Text processing

In [8]:
def processing(data):
    """Turn raw text into a list of lower-cased unigrams, bigrams and trigrams.

    Parameters
    ----------
    data : str
        Raw text of a job description or of a resume.

    Returns
    -------
    list of str
        All bigrams, then all trigrams (both built from every token), then
        the unigrams that are not English stop words. Everything is lower case.
    """
    #Removing punctuations from the text.
    #The pattern is the one the original literal evaluated to: it was written
    #as two adjacent string literals, so quote characters were never part of
    #the character class and apostrophes are NOT replaced.
    cleaned_data = re.sub(r'[,.;:@#?!&$()/]', ' ', data)

    #tokenization of the text
    tokenized_data = nltk.word_tokenize(cleaned_data)

    #Removing Stop words. The stop word list is loaded once instead of once
    #per token. The comparison is deliberately done before lower-casing, as
    #before: capitalised tokens such as the "D" of "R&D" must survive because
    #diffbet_Rlang_RndD looks for them.
    english_stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in tokenized_data
                      if word not in english_stop_words]

    #Bigrams and trigrams are built from all tokens (stop words included)
    bigram_words = [' '.join(pair).lower() for pair in nltk.bigrams(tokenized_data)]
    trigram_words = [' '.join(triple).lower() for triple in nltk.trigrams(tokenized_data)]

    #Collection of bigrams, trigrams, unigrams (in that order)
    return bigram_words + trigram_words + filtered_words

#### Finding different types of analytics 

In [9]:
def diff_types_analytics(final_words):
    """Find the kinds of analytics mentioned, e.g. 'data analytics'.

    Every element that directly precedes an "analytics" element of
    ``final_words`` is POS-tagged; the ones tagged as nouns are returned with
    " analytics" appended.

    Parameters
    ----------
    final_words : list of str
        Words as returned by ``processing``.

    Returns
    -------
    list of str
        Phrases of the form "<noun> analytics"; empty when "analytics" never
        appears after another word.
    """
    #Collects the previous word of "analytics" in the list of words.
    #Starting at index 1 avoids the wrap-around of final_words[-1] that made
    #the LAST word count as the predecessor of a leading "analytics".
    previous_words=[str(final_words[i-1])
                    for i in range(1,len(final_words))
                    if final_words[i]=="analytics"]
    if not previous_words:
        #nothing to tag, so the tokenizer and tagger are not needed at all
        return []

    #tokenization
    text = nltk.word_tokenize(' '.join(previous_words))

    #POS_tagging
    #Tells for all the words, whether it is verb, noun, adjective, etc
    pos_tagged_text=nltk.pos_tag(text)
    tagged_words = nltk.ConditionalFreqDist((tag, word) for (word, tag) in pos_tagged_text)

    #Mentioned the types of noun taggings
    noun=['NN','NN$','NN$-HL','NN$-TL','NN-HL','NN-NC','NN-TL','NN-TL-HL','NNS','NNS$','NNS$-HL','NNS$-TL','NNS-HL','NNS-TL','NNS-TL-HL']

    #Keeps the previous words of 'analytics' which are nouns and appends
    #'analytics' to them, e.g. 'data' ==> 'data analytics'
    final_analytics=[]
    for noun_tag in noun:
        for key in tagged_words[noun_tag].keys():
            final_analytics.append(str(key)+' '+"analytics")
    return final_analytics

#### Collecting languages from a list of words

In [10]:
def languages(final_words):
    """Return the distinct entries of ``final_words`` that are known languages.

    A word counts as a language when it is equal to an entry of the
    module-level ``language_list``. The result is a list without duplicates;
    it is empty when no language is mentioned.
    """
    #a set keeps a single copy of every language found in the words
    found_languages={word for word in final_words if word in language_list}
    return list(found_languages)

#### Distinguishing the R language from R&D

In [59]:
def diffbet_Rlang_RndD(final_words):
    """Drop "r" from the words when it only ever appears as part of "R&D".

    "R&D" is tokenised as "r" followed by "d". When every "r" in
    ``final_words`` is directly followed by "d", the text never mentions the
    R language on its own, so all "r" entries are removed (previously only the
    first one was, which left an "r" skill behind when R&D was mentioned more
    than once).

    Parameters
    ----------
    final_words : list of str
        Words as returned by ``processing``. The list is modified in place.

    Returns
    -------
    list of str
        The same list object, with the "r" entries removed when applicable.
    """
    r_count=final_words.count("r")
    #number of "r" entries directly followed by "d"
    rd_count=sum(1 for current_word,next_word in zip(final_words,final_words[1:])
                 if current_word=="r" and next_word=='d')
    if(rd_count>0 and r_count==rd_count):
        #slice assignment keeps the in-place behaviour callers already rely on
        final_words[:]=[word for word in final_words if word!="r"]
    return final_words

#### Collecting list of skills from the list of words

In [12]:
def matching_skill_list(concluded_words,unique_skill_list):
    """Return the distinct words of ``concluded_words`` that are known skills.

    Parameters
    ----------
    concluded_words : list of str
        Words extracted from a job description or a resume.
    unique_skill_list : list of str
        Reference list of skills.

    Returns
    -------
    list of str
        Every word that is equal to an entry of ``unique_skill_list``, once
        each, in no particular order.
    """
    found_skills={word for word in concluded_words if word in unique_skill_list}
    return list(found_skills)

#### Removing the incorrectly mentioned skill and appending the skill list with correct name

In [13]:
def remove_wrong_namings(matching_skills,wrong,correct):
    """Replace alternative skill names by their proper names.

    Parameters
    ----------
    matching_skills : list of str
        Skills found in a text. As in the original implementation the list is
        updated in place: every alternative name is replaced by the proper one.
    wrong : sequence of str
        Alternative namings.
    correct : sequence of str
        Proper naming for the entry of ``wrong`` at the same position.

    Returns
    -------
    list of str
        The corrected skills without duplicates, in no particular order.
    """
    #Lookup table wrong name -> proper name; the first row wins when a wrong
    #name is listed more than once.
    corrections={}
    for wrong_name,proper_name in zip(wrong,correct):
        corrections.setdefault(wrong_name,proper_name)

    #Every skill is checked against the complete table. The former
    #remove/append on the list being walked shifted the elements, so the skill
    #following a corrected one was only compared with part of the table.
    matching_skills[:]=[corrections.get(skill,skill) for skill in matching_skills]

    return list(set(matching_skills))

#### Finding difference between 'analytics' and different types of analytics like 'data analytics'

In [14]:
def processing_skills(matching_skills,corrected_skills,final_analytics):
    """Drop the generic "analytics" skill when a specific kind was found.

    Parameters
    ----------
    matching_skills : list of str
        Skills found in the text.
    corrected_skills : list of str
        Skills after name correction; modified in place and returned.
    final_analytics : list of str
        Specific kinds of analytics, e.g. "data analytics".

    Returns
    -------
    list of str
        ``corrected_skills``, without "analytics" when the number of
        "analytics" entries in ``matching_skills`` equals the (non-zero)
        number of matches between ``matching_skills`` and ``final_analytics``.
    """
    analytics_count=matching_skills.count("analytics")
    #number of (skill, specific analytics) pairs that are equal
    diff_analytics_count=sum(final_analytics.count(skill) for skill in matching_skills)

    # NOTE(review): the equality test is kept from the original; with
    # de-duplicated skills it only fires when exactly one specific kind of
    # analytics was matched - confirm that this is the intended rule.
    if(diff_analytics_count>0 and analytics_count==diff_analytics_count):
        #"analytics" may be missing from corrected_skills (e.g. renamed by the
        #name correction); remove() used to raise ValueError in that case
        if("analytics" in corrected_skills):
            corrected_skills.remove("analytics")
    return corrected_skills

#### Extracting information from the job data by calling the functions above

In [77]:
#calling the above functions
#Lower-cased unigrams, bigrams and trigrams of the job description
words_job_description=processing(job_description_data)

#The statement order matters: diffbet_Rlang_RndD may remove "r" from
#words_job_description in place and returns that same list, so the analytics
#and language look-ups are done on the words before that clean-up.
final_analytics_jd=diff_types_analytics(words_job_description)    
final_languages_jd=languages(words_job_description)
concluded_words_jd=diffbet_Rlang_RndD(words_job_description)

#Skills of the job description: matched against the skill list, then renamed
#to their proper names, then the generic "analytics" entry is dropped when
#processing_skills decides a specific kind of analytics was found.
#remove_wrong_namings also modifies matching_skills_jd in place, so
#processing_skills receives the already modified list as its first argument.
matching_skills_jd=matching_skill_list(concluded_words_jd,unique_skill_list)
corrected_skills_jd=remove_wrong_namings(matching_skills_jd,wrong,correct)
processed_skills_jd=processing_skills(matching_skills_jd,corrected_skills_jd,final_analytics_jd)

#### Industry Type 

In [71]:
#reading industries names file
names_data=pd.read_csv("industries1.csv")
names_dataframe=pd.DataFrame(names_data)

#one column per word position of an industry name
one,two,three,four=(names_dataframe[column] for column in ('one','two','three','four'))

In [72]:
#Splitting industry names that were scraped without spaces
#eg.: SoftwareStaffingRecruiting => Software Staffing Recruiting
#A character equal to its lower-case form is copied as is; a character equal
#to its upper-case form is copied with a space in front of it.
# NOTE(review): characters without case (space, '&', ',', '/', digits) satisfy
# both tests and are therefore written twice; the replacements below undo part
# of that. Kept as is because the industries file may rely on the result.
pieces=[]
for character in industry:
    if(character==character.lower()):
        pieces.append(character)
    if(character==character.upper()):
        pieces.append(" "+character)
string=''.join(pieces)
ind=[string]

#removing the doubled separators
cleaned=[]
for industry_text in ind:
    if("& &" in industry_text):
        industry_text=industry_text.replace(" & ",' ')
    industry_text=industry_text.replace(", ,",' ')
    industry_text=industry_text.replace("/",' ')
    cleaned.append(industry_text)
ind=cleaned
print(ind)

['   ']


In [73]:
#Finds the industry type of the company: the words of the cleaned industry
#text are compared with the rows of the industries file (columns one..four)
#and every matching industry name is printed.
for industry_text in ind:
    #tokenization
    test=nltk.word_tokenize(industry_text)
    word_count=len(test)
    rows=range(len(one))

    #nothing to match
    if(word_count==0):
        continue

    if(word_count==1):
        for l in rows:
            if(test[0]==one[l]):
                print(test[0])

    elif(word_count==2):
        for l in rows:
            if(test[0]==one[l] and test[1]==two[l]):
                print(test[0],test[1])
            elif(test[1]==one[l] and two[l]==' '):
                print(test[1])

    elif(word_count==3):
        for l in rows:
            if(test[0]==one[l] and test[1]==two[l] and test[2]==three[l]):
                print(test[0],test[1],test[2])
            elif(test[1]==one[l] and test[2]==two[l]):
                print(test[1],test[2])
            elif(test[2]==one[l]):
                print(test[2])

    else:
        #four words or more: only the last four words are considered
        ind1=test[-4:]
        for l in rows:
            if(ind1[0]==one[l] and ind1[1]==two[l] and ind1[2]==three[l] and ind1[3]==four[l]):
                print(ind1[0],ind1[1],ind1[2],ind1[3])
            elif(ind1[1]==one[l] and ind1[2]==two[l] and ind1[3]==three[l]):
                print(ind1[1],ind1[2],ind1[3])
            elif(ind1[2]==one[l] and ind1[3]==two[l]):
                print(ind1[2],ind1[3])
            elif(ind1[3]==one[l]):
                print(ind1[3])

#### Extracting and printing information from the resumes

In [57]:
#skills and languages are collected from the job description
print(single_company_data)
print("\nRequired languages:")
print(final_languages_jd)
print("\nRequired skills:")
print(processed_skills_jd)
print("\n")


def _years_of_experience(words):
    """Return the first number of years stated in ``words``, or None.

    The word in front of each "years" entry is inspected, except when the
    entry two positions earlier is "over". The digits of the first such word
    that contains any are returned as an int.
    """
    for position in range(1,len(words)):
        if(words[position]!='years'):
            continue
        if(position>=2 and words[position-2]=='over'):
            continue
        digits=''.join(ch for ch in str(words[position-1]) if ch.isdecimal())
        if(digits):
            return int(digits)
    return None


def _level_verdict(level,year):
    """Compare the years of experience with the seniority level of the job.

    ``level`` is either a level name or a number of years.

    Job position's needed level            candidate's years of experience
    -----------------------------------------------------------------------
    Not Applicable / blank            =>   any
    Entry level, Associate, 0..2      =>   0 <= years < 3
    Mid-Senior level, 3..5            =>   3 <= years <= 5
    Executive, 6..7                   =>   5 <= years <= 7
    Director, 8 and more              =>   years >= 7

    Returns "level matches", "level does not match", or None when the level
    is neither a known name nor a non-negative number.
    """
    named_bands={"Entry level":"entry","Associate":"entry",
                 "Mid-Senior level":"mid","Executive":"executive",
                 "Director":"director"}
    band=None
    if(isinstance(level,str)):
        if(level.strip() in ("","Not Applicable")):
            return "level matches"
        band=named_bands.get(level)
    if(band is None):
        #the level may be given as a number of years; int() is only attempted
        #here, so named levels no longer raise ValueError
        try:
            required_years=int(level)
        except (TypeError,ValueError):
            return None
        if(required_years<0):
            return None
        elif(required_years<3):
            band="entry"
        elif(required_years<=5):
            band="mid"
        elif(required_years<=7):
            band="executive"
        else:
            band="director"
    in_band={"entry":3>year>=0,
             "mid":5>=year>=3,
             "executive":7>=year>=5,
             "director":year>=7}
    return "level matches" if in_band[band] else "level does not match"


#folder that contains the resumes (.docx files)
RESUME_DIR='resumes'

candidate_skills=[]
candidate={}
name=[]

#One geocoder for the whole run; every place is looked up only once
geolocator = Nominatim(user_agent="http")
known_coordinates={}


def _coordinates(place):
    """Return (latitude, longitude) of ``place``, or None when it is unknown."""
    if(place not in known_coordinates):
        located=None
        if(isinstance(place,str) and place.strip()):
            located=geolocator.geocode(place)
        if(located is None):
            known_coordinates[place]=None
        else:
            known_coordinates[place]=(located.latitude,located.longitude)
    return known_coordinates[place]


#for loop for each resume of the 'resumes' folder
for entry in os.listdir(RESUME_DIR):

    #only Word documents can be read by docx2txt
    if(not entry.lower().endswith('.docx')):
        continue

    #converts docx file to text; os.listdir returns bare file names, so the
    #folder has to be put back in front of them
    resume_data = docx2txt.process(os.path.join(RESUME_DIR,entry))

    #name of the candidate is appended in the list 'name'
    name.append(entry)

    #calling the above functions
    words_resume=processing(resume_data)

    final_analytics_resume=diff_types_analytics(words_resume)
    final_languages_resume=languages(words_resume)
    concluded_words_resume=diffbet_Rlang_RndD(words_resume)

    matching_skills_resume=matching_skill_list(concluded_words_resume,unique_skill_list)
    corrected_skills_resume=remove_wrong_namings(matching_skills_resume,wrong,correct)
    processed_skills_resume=processing_skills(matching_skills_resume,corrected_skills_resume,final_analytics_resume)

    #skills are collected from the resume
    candidate_skills.append(processed_skills_resume)
    candidate[entry]=processed_skills_resume

    #location of the candidate
    # TODO: still hard-coded; the commented-out code this replaces suggests
    # GeoText(resume_data).cities was meant to provide it - confirm.
    candidate_location="Charleroi"

    #finding the distance between the candidate location and the company's location
    first = _coordinates(company_location)
    second = _coordinates(candidate_location)
    print("Distance in km:")
    if(first is None or second is None):
        print(entry,":","location could not be geocoded")
    else:
        print(entry,":",great_circle(first, second).km)

    #collects the languages known by the candidate
    print("Languages known :",final_languages_resume)

    #finds the matching languages with the job description and the candidate's resume
    matching_languages=[language for language in final_languages_jd
                        if language in final_languages_resume]
    print("Matching languages : ")
    print(entry,":",matching_languages)

    #level of experience matching
    year=_years_of_experience(words_resume)
    if(year is None):
        year=0
        print("Experience level : Not mentioned")
    else:
        print("Experience level :",year)

    verdict=_level_verdict(level,year)
    if(verdict is None):
        print("level could not be determined")
    else:
        print(verdict)
    print("\n")

Job ID                                                            NaN
Date                                                     30+ days ago
Company Name                                                    Smals
Post                                                   Data Scientist
Location                                                     Brussels
No.of Applicants                                                     
Description         Chez Smals, plus de 1900 professionnels façonn...
Level                                                                
Type                                                                 
Function                                                             
Industry                                                             
Link                https://be.indeed.com/pagead/clk?mo=r&ad=-6NYl...
Review                                                            3.3
Name: 1500, dtype: object

Required languages:
[]

Required skills:
['r', 'big data', 'pyt

#### Matching skills with the job description and the candidate's resume

In [58]:
#number of common skills per resume, and the common skills themselves
final_rank={}
matching={}
def fun(candidate,comp,st_n):
    """Record under the key ``st_n`` the skills shared by two skill lists.

    ``final_rank[st_n]`` receives the number of skills present in both
    ``candidate`` and ``comp``; ``matching[st_n]`` receives the set of those
    skills.
    """
    common_skills=set(candidate) & set(comp)
    final_rank[st_n]=len(common_skills)
    matching[st_n]=common_skills

#every resume is compared with the skills of the job description
for resume_name,resume_skills in candidate.items():
    fun(processed_skills_jd,resume_skills,resume_name)

#Final ranking tells the count of the matching skills 
print("\nFinal ranking : " ,final_rank)

#Matching skills
print('Matching skills : ' , matching)


Final ranking :  {'Geoffrey Brown.docx': 1, 'LyanneGibson.docx': 2}
Matching skills :  {'Geoffrey Brown.docx': {'python'}, 'LyanneGibson.docx': {'r', 'python'}}
