---------

In [1]:
import pandas as pd

def select_and_save_job_descriptions(input_file, output_file, num_descriptions=300):
    df = pd.read_csv(input_file)
    subset_df = df.sample(n=num_descriptions)
    
    jobpost_df = subset_df[['jobpost']]
    
    jobpost_df.to_csv(output_file, index=False)


In [2]:
input_file = '/home/rooting/code/Chanel/Job posts/data job posts.csv'
output_file = '/home/rooting/code/Chanel/Job posts/subset_data_job.csv'

df_test = select_and_save_job_descriptions(input_file, output_file)

In [3]:
df_test = pd.read_csv(output_file)
df_test

Unnamed: 0,jobpost
0,Ernst & Young CJSC\r\nTITLE: Accounting Speci...
1,SAS Group LLC\r\nTITLE: Web Developer\r\nSTAR...
2,Inecobank CJSC\r\nTITLE: Risk Management Spec...
3,Accept Employment Agency\r\nTITLE: Accountant...
4,"""IT Master"" LLC\r\nTITLE: Sales Manager\r\nOP..."
...,...
295,MLL Industries\r\nTITLE: Bid Manager\r\nTERM:...
296,Partner Organization of Career Center\r\nTITLE...
297,Spayka LLC\r\nTITLE: Sales and Marketing Spec...
298,CQG-Yerevan\r\nTITLE: SCM Engineer\r\nLOCATIO...


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

def preprocess_text(text):
    text = text.lower()
    
    text = re.sub(r'\W+', ' ', text) # carc special

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)


In [5]:
df_test.iloc[0][0]

'Ernst & Young CJSC\r\nTITLE:  Accounting Specialist, Tax and Law Department\r\nLOCATION:  Yerevan, Armenia\r\nJOB DESCRIPTION:  Ernst & Young CJSC invites applications from qualified\r\ncandidates to fill the position of Accounting Specialist for EYs Tax &\r\nLaw practice in Armenia.\r\nJOB RESPONSIBILITIES:\r\n- Provide statutory accounting and Tax services to EY clients;\r\n- Provide payroll services to EY clients;\r\n- Provide various tax consultancy services to EY clients, within the\r\nscope of the Armenian taxation law;\r\n- Provide support to audit services delivery to EY clients;\r\n- Support preparation of tax publications.\r\nREQUIRED QUALIFICATIONS:\r\n- Higher education in Accounting, Finance, Economics or a related field;\r\n- At least 2 years of experience in accounting;\r\n- Good knowledge of Armenian tax legislation;\r\n- Knowledge of international financial reporting standards (IFRS);\r\n- Fluency in business Armenian and English (both verbal and written);\r\nfluency 

In [6]:
processed_text = preprocess_text(df_test.iloc[0][0])
print(processed_text)

ernst young cjsc titl account specialist tax law depart locat yerevan armenia job descript ernst young cjsc invit applic qualifi candid fill posit account specialist ey tax law practic armenia job respons provid statutori account tax servic ey client provid payrol servic ey client provid variou tax consult servic ey client within scope armenian taxat law provid support audit servic deliveri ey client support prepar tax public requir qualif higher educ account financ econom relat field least 2 year experi account good knowledg armenian tax legisl knowledg intern financi report standard ifr fluenci busi armenian english verbal written fluenci russian languag analyt critic think abil work team flexibl abil travel frequent strong commun interperson skill literaci ms word excel powerpoint high level self motiv commit result applic procedur interest applic submit cv cv armenia deadlin pleas specifi subject line email applic posit account specialist ey tax law practic pleas clearli mention ap

In [7]:
df_test['processed'] = df_test['jobpost'].apply(preprocess_text)

In [8]:
df_test

Unnamed: 0,jobpost,processed
0,Ernst & Young CJSC\r\nTITLE: Accounting Speci...,ernst young cjsc titl account specialist tax l...
1,SAS Group LLC\r\nTITLE: Web Developer\r\nSTAR...,sa group llc titl web develop start date time ...
2,Inecobank CJSC\r\nTITLE: Risk Management Spec...,inecobank cjsc titl risk manag specialist loca...
3,Accept Employment Agency\r\nTITLE: Accountant...,accept employ agenc titl account announc code ...
4,"""IT Master"" LLC\r\nTITLE: Sales Manager\r\nOP...",master llc titl sale manag open elig criteria ...
...,...,...
295,MLL Industries\r\nTITLE: Bid Manager\r\nTERM:...,mll industri titl bid manag term full time loc...
296,Partner Organization of Career Center\r\nTITLE...,partner organ career center titl firemen locat...
297,Spayka LLC\r\nTITLE: Sales and Marketing Spec...,spayka llc titl sale market specialist locat y...
298,CQG-Yerevan\r\nTITLE: SCM Engineer\r\nLOCATIO...,cqg yerevan titl scm engin locat yerevan armen...


______

In [9]:
import os
import docx

def read_docx(file_path):
    doc = docx.Document(file_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return '\n'.join(full_text)

def read_cv_texts(cv_folder_path):
    cv_texts = []
    for file in os.listdir(cv_folder_path):
        if file.endswith(".docx"):
            file_path = os.path.join(cv_folder_path, file)
            text = read_docx(file_path)
            cv_texts.append(text)
    return cv_texts


cv_folder_path = '/home/rooting/code/Chanel/Resumes'
cv_texts = read_cv_texts(cv_folder_path)

In [10]:
# use same fnc as job disc
preprocessed_cvs = [preprocess_text(cv) for cv in cv_texts]


In [11]:
cv_df = pd.DataFrame({
    'original_cv': cv_texts,
    'processed_cv': preprocessed_cvs
})

cv_df

Unnamed: 0,original_cv,processed_cv
0,\nPROFESSIONAL SUMMARY\nMore than 7 years of c...,profession summari 7 year comprehens experi fi...
1,\nPROFESSIONAL SUMMARY: \n\n9+ years of experi...,profession summari 9 year experi develop imple...
2,\nB Shaker\t\t\t\t\t\t \t\t\nshaker.0516@g...,b shaker shaker 0516 gmail com 714 200 9052 in...
3,Sanjay\nEmail: sanjay.j0828@gmail.com\nContact...,sanjay email sanjay j0828 gmail com contact 1 ...
4,\nDerik Howarth\t\nSr. Java Developer \n derik...,derik howarth sr java develop derik howarth gm...
...,...,...
219,SATISH U \t\tPhone: 810-620-7820\nSr. Java Dev...,satish u phone 810 620 7820 sr java develop em...
220,Navneet Gupta\nproject/ program manager (PMP® ...,navneet gupta project program manag pmp 185113...
221,JAGADEESH\nJagadeesh14917@gmail.com\n(872) 216...,jagadeesh jagadeesh14917 gmail com 872 216 667...
222,"Jamil M, PMP®, PSM\nFremont, CA 94555\n\n\nRes...",jamil pmp psm fremont ca 94555 result driven s...


-----

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tf idf with n gram
def vectorize_text_with_tfidf(text):
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(text)
    return tfidf_matrix, vectorizer

# combine CVs and jobs
combined_texts = df_test['processed'].tolist() + cv_df['processed_cv'].tolist()

# Vectorize combined texts
tfidf_combined_matrix, vectorizer = vectorize_text_with_tfidf(combined_texts)

# Separate the matrices
num_jobs = df_test.shape[0]
tfidf_matrix = tfidf_combined_matrix[:num_jobs]
tfidf_matrix_cv = tfidf_combined_matrix[num_jobs:]


In [13]:
# TODO : we can explore more advanced techniques like word embeddings (Word2Vec, GloVe, Doc2Vec) or BERT embeddings...

-----

In [14]:
df_test

Unnamed: 0,jobpost,processed
0,Ernst & Young CJSC\r\nTITLE: Accounting Speci...,ernst young cjsc titl account specialist tax l...
1,SAS Group LLC\r\nTITLE: Web Developer\r\nSTAR...,sa group llc titl web develop start date time ...
2,Inecobank CJSC\r\nTITLE: Risk Management Spec...,inecobank cjsc titl risk manag specialist loca...
3,Accept Employment Agency\r\nTITLE: Accountant...,accept employ agenc titl account announc code ...
4,"""IT Master"" LLC\r\nTITLE: Sales Manager\r\nOP...",master llc titl sale manag open elig criteria ...
...,...,...
295,MLL Industries\r\nTITLE: Bid Manager\r\nTERM:...,mll industri titl bid manag term full time loc...
296,Partner Organization of Career Center\r\nTITLE...,partner organ career center titl firemen locat...
297,Spayka LLC\r\nTITLE: Sales and Marketing Spec...,spayka llc titl sale market specialist locat y...
298,CQG-Yerevan\r\nTITLE: SCM Engineer\r\nLOCATIO...,cqg yerevan titl scm engin locat yerevan armen...


In [15]:
tfidf_matrix, vectorizer

(<300x180181 sparse matrix of type '<class 'numpy.float64'>'
 	with 120198 stored elements in Compressed Sparse Row format>,
 TfidfVectorizer(ngram_range=(1, 2)))

In [16]:
cv_df

Unnamed: 0,original_cv,processed_cv
0,\nPROFESSIONAL SUMMARY\nMore than 7 years of c...,profession summari 7 year comprehens experi fi...
1,\nPROFESSIONAL SUMMARY: \n\n9+ years of experi...,profession summari 9 year experi develop imple...
2,\nB Shaker\t\t\t\t\t\t \t\t\nshaker.0516@g...,b shaker shaker 0516 gmail com 714 200 9052 in...
3,Sanjay\nEmail: sanjay.j0828@gmail.com\nContact...,sanjay email sanjay j0828 gmail com contact 1 ...
4,\nDerik Howarth\t\nSr. Java Developer \n derik...,derik howarth sr java develop derik howarth gm...
...,...,...
219,SATISH U \t\tPhone: 810-620-7820\nSr. Java Dev...,satish u phone 810 620 7820 sr java develop em...
220,Navneet Gupta\nproject/ program manager (PMP® ...,navneet gupta project program manag pmp 185113...
221,JAGADEESH\nJagadeesh14917@gmail.com\n(872) 216...,jagadeesh jagadeesh14917 gmail com 872 216 667...
222,"Jamil M, PMP®, PSM\nFremont, CA 94555\n\n\nRes...",jamil pmp psm fremont ca 94555 result driven s...


In [17]:
tfidf_matrix_cv, vectorizer

(<224x180181 sparse matrix of type '<class 'numpy.float64'>'
 	with 487281 stored elements in Compressed Sparse Row format>,
 TfidfVectorizer(ngram_range=(1, 2)))

In [18]:
tfidf_matrix, tfidf_matrix_cv

(<300x180181 sparse matrix of type '<class 'numpy.float64'>'
 	with 120198 stored elements in Compressed Sparse Row format>,
 <224x180181 sparse matrix of type '<class 'numpy.float64'>'
 	with 487281 stored elements in Compressed Sparse Row format>)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine similarity : tfidf_matrix = job descriptions / tfidf_matrix_cv = CVs
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix_cv)

In [20]:
def get_top_matches_for_each_job(similarity_matrix, top_n=10):
    top_matches = {}
    for i in range(similarity_matrix.shape[0]):  # For each job description
        sorted_indices = similarity_matrix[i].argsort()[::-1][:top_n]  # Sort and get top indices
        top_matches[f'Job_{i+1}'] = sorted_indices
    return top_matches

# Get top 10 matches for each job description
top_matches = get_top_matches_for_each_job(similarity_matrix)

In [21]:
top_matches

{'Job_1': array([214, 105,  73, 149,  25, 162, 179, 145, 167, 204]),
 'Job_2': array([  8, 213, 160,  20, 152, 186,   4, 101,  95, 203]),
 'Job_3': array([105,  99, 220, 201, 197,  25, 149,  71,  79, 104]),
 'Job_4': array([162,  93, 204,  80,  25, 157,  64, 141, 201,  47]),
 'Job_5': array([168, 175, 180, 197, 162, 211,  30, 220, 123, 201]),
 'Job_6': array([188, 174, 209,  25, 105,  48, 130, 104, 145, 134]),
 'Job_7': array([157, 223, 130, 119, 175, 168, 211,  84, 138,  69]),
 'Job_8': array([105,  25,  79, 216,  44, 137,  47, 220, 119, 212]),
 'Job_9': array([ 73,  44, 137,   0, 165, 105,  96, 130, 201, 214]),
 'Job_10': array([ 54, 136,  89, 166,  79,  87, 201, 168, 211, 122]),
 'Job_11': array([ 56, 107,  49, 170,  63,  67, 101, 169,   3,   8]),
 'Job_12': array([213,  41, 160, 215, 189,  83,  58, 124,  42,  20]),
 'Job_13': array([105,  79,  44, 137, 211,  25, 130, 223,  22, 162]),
 'Job_14': array([ 41,  78,  48,   8,  58, 162, 160, 199, 189,  49]),
 'Job_15': array([ 88, 188, 1

In [32]:
for job, top_indices in top_matches.items():
    print(f"Top Matches for {job} with Scores:")

    job_index = int(job.split('_')[1]) - 1

    for index in top_indices:
        score = similarity_matrix[job_index, index]
        print(f"CV {index}: Score = {score}")
    print()

Top Matches for Job_1 with Scores:
CV 214: Score = 0.04049534108963563
CV 105: Score = 0.0362376671447087
CV 73: Score = 0.03613152924397601
CV 149: Score = 0.03401372026249372
CV 25: Score = 0.033430518865518094
CV 162: Score = 0.033095828332658124
CV 179: Score = 0.032955621750342945
CV 145: Score = 0.032637180065292704
CV 167: Score = 0.032214072732952424
CV 204: Score = 0.0319360307806766

Top Matches for Job_2 with Scores:
CV 8: Score = 0.11759446705541887
CV 213: Score = 0.11478964035703595
CV 160: Score = 0.11303575174210129
CV 20: Score = 0.11112233115633989
CV 152: Score = 0.1067709305360878
CV 186: Score = 0.1053993372262169
CV 4: Score = 0.1053993372262169
CV 101: Score = 0.10490294800463323
CV 95: Score = 0.10363426653807238
CV 203: Score = 0.10346294204653118

Top Matches for Job_3 with Scores:
CV 105: Score = 0.10267423305083538
CV 99: Score = 0.09578996568522456
CV 220: Score = 0.08727522741916142
CV 201: Score = 0.08694607850918354
CV 197: Score = 0.08681673620607884
CV

------

In [28]:
job_1_description = df_test.iloc[15]['jobpost'] 

print(job_1_description)

Energize Global Services CJSC
TITLE:  C Software Developer
TERM:  Full time
START DATE/ TIME:  ASAP
DURATION:  Permanent
LOCATION:  Yerevan, Armenia
JOB DESCRIPTION:  Energize Global Services CJSC is looking for
experienced C Software Developers with good knowledge of Java and Python
to be engaged in different long term projects. Within the software
development team, the successful candidates will be responsible for a
significant part of the development cycle of applications which includes
understanding of the requirements, performing the functional analysis,
design, programming and testing of software solutions. The C Software
Developers will participate in the development of different software
applications for various markets. The successful candidates will work for
3 months in Brussels both for help and getting trained.
JOB RESPONSIBILITIES:
- Participate in software development in C;
- Write unit tests and functional tests;
- Work as a part of cross functional software development 

In [29]:
# job_1
matched_cv_indices = top_matches['Job_14']
matched_cvs = cv_df.iloc[matched_cv_indices]
matched_cvs

Unnamed: 0,original_cv,processed_cv
41,\t\t\t\t\t\t\n ...,kumar raj 407 218 6395 expert java11 gmail com...
78,\nReddemma Lankipalle\nSoftware Test Engineer ...,reddemma lankipal softwar test engin crozer ke...
48,Chandler Robert Durairaj Joshua\nBusiness Anal...,chandler robert durairaj joshua busi analyst 6...
8,Siddharth\nSr. Full Stack Java Developer\n ...,siddharth sr full stack java develop email sid...
58,...,vema reddi sr java develop ph 865 375 0056 e m...
162,----------------------------------------------...,8 year experi busi analyt oper experi enterpri...
160,...,pierr john lead java j2ee develop work remot m...
199,Anil Krishna Mogalaturthi\n681-888-2999\nKrish...,anil krishna mogalaturthi 681 888 2999 krish j...
189,Harika\nSR. Java/J2ee Developer\nEmail: reddyh...,harika sr java j2ee develop email reddyharika2...
49,...,gopi sr java full stack develop email gopijava...


In [30]:
print(matched_cvs['original_cv'].iloc[0]) 


 						
                                                               
                                                                          Kumar Raj
                                                                         407-218-6395  
                                                               Expert.java11@gmail.com


PROFESSIONAL SUMMARY:

9 years of experience in Software Development Life Cycle (SDLC) with nice blend of expertise and leadership, be it requirements gathering, analysis (OOA), prototyping, design (OOD), development (OOP) or maintenance and testing of client-server applications.
Strong experience in designing and developing component based reusable GUI based applications.
Strong experience in Full Stack Web technologies like Core Java, J2EE, MVC Architecture (Struts and spring), Servlets, Swing, Hibernate, mybatis, EJB, JBOSS, JMS, MQ Series, AJAX, JQuery, JSF, Web services(REST and SOAP), Java Script, JSP, JSON, JQuery, JDBC, HTML, DHTML, CSS, and Custom Ta

______

In [25]:
new_job_desc = """
SAS Group
TITLE:  IT Specialist/ 1C Supporter
LOCATION:  Yerevan, Armenia
JOB DESCRIPTION:  SAS Group is seeking for an experienced IT Specialist/
1C Supporter with rich background.
JOB RESPONSIBILITIES:
- Carry out network implementation, ensure smooth working process;
- Install, set up and maintain network devices;
- Control and maintain the servers, systems and services;
- Find out hardware and software problems and solve them;
- Follow the IT industry innovations and supply the office with new
programs;
- Make some reports concerning the issues under responsibility;
- Respond to all kind of problems concerning the IT department and solve
them;
- Develop new ideas and proposals;
- Implement 1C program/ 1C program maintenance.
REQUIRED QUALIFICATIONS:
- Higher education in IT sphere;
- At least 2 years of working experience in the related sphere;
- Smart and creative way of thinking;
- Good team player.
REMUNERATION/ SALARY:  Highly competitive
APPLICATION PROCEDURES:  All interested candidates should e-mail
applications to: career@... . Please indicate the position title
in the subject field of the message.
Please clearly mention in your application letter that you learned of
this job opportunity through Career Center and mention the URL of its
website - www.careercenter.am, Thanks.
OPENING DATE:  26 May 2014
APPLICATION DEADLINE:  25 June 2014
----------------------------------
To place a free posting for job or other career-related opportunities
available in your organization, just go to the www.careercenter.am
website and follow the "Post an Announcement" link.
"""

In [26]:
# Preprocess the new job description
processed_job_desc = preprocess_text(new_job_desc)

# Vectorize the preprocessed job description
new_job_vector = vectorizer.transform([processed_job_desc]) 

# Calculate cosine similarity with the CVs
similarity_scores = cosine_similarity(new_job_vector, tfidf_matrix_cv).flatten()

# Get indices of top 10 matching CVs
top_cv_indices = similarity_scores.argsort()[::-1][:10]

# Retrieve top CVs
top_cvs = cv_df.iloc[top_cv_indices]

# Display the results
print("Top Matching CVs for the New Job Description:")
print(top_cvs['original_cv'])


Top Matching CVs for the New Job Description:
143    \nVishal S.\nvishalsingh19851@gmail.com \n732 ...
146    Praveen Kumar Bugalaia\n60-949-5710 praveen.bu...
216           Ashok Jayakumar                        ...
162    ----------------------------------------------...
130    NAVNEET \n407-401-9023 / 11baconsultant@outloo...
147    \n\nSivanaga G\n571-346-2112\nsivagatiganti@gm...
169         PAVAN KUMAR\t\t\t               Pavank068...
92     Vinay\nvinay.donthoju1@gmail.com\n\t\t+1 (515)...
170     Nithin Reddy                                 ...
58                                                   ...
Name: original_cv, dtype: object


In [27]:
print(top_cvs['original_cv'].iloc[0]) 


Vishal S.
vishalsingh19851@gmail.com 
732 645 2590


Infrastructure Project Manager
An Innovative and solutions-focused IT Manager with over 14 years in servers and network infrastructure experience. Over 11 years of senior–level leadership spanning IT Infrastructure, Data Center migration, infrastructure build-out with Windows OS upgrade, COTS and Business Users application & SQL database migration for production, pre-production, disaster recovery environments, IT operations and service support management. Drive product from concept/ideation stage through the product development governance process with Agile and Waterfall SDLC practices. Infrastructure project management methodologies-ITIL, ITSM based Service Delivery.
Professional Experience
S&P Global, New York              					                               May 2017 to Present
Sr. Technical Project Manager
Internal cloud infrastructure scanning for Threat and Vulnerability management
Users migration from One Drive, Google Drive t