In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [2]:
blacklist = ["career", "opportunity", "work", "team", "mention", "experience", "knowledge", "skill", "ability", "company", "date", "qualification", "website", "open", "develop",
             "title", "excellent", "position", "email", "letter", "language", "post", "english", "center", "title", "salary","follow", "thank", "job", "good", "time", "great", "project", "required", "year", "month", "day",
             "form"]

def clean_text(text):
    # Remove URLs and anything starting with http or www
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    
    # Lowercase the text
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Remove multiple whitespaces
    text = re.sub(r'\s+', ' ', text)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token.strip() for token in tokens if token not in stop_words]

    # Remove single-character tokens and blacklist words
    filtered_tokens = [token for token in filtered_tokens if len(token) > 1 and not any([x in token for x in blacklist])]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Change '-ing' verbs to root form
    final_tokens = [lemmatizer.lemmatize(token[:-3], pos='v') if token.endswith('ing') and lemmatizer.lemmatize(token, pos='v') != token else token for token in lemmatized_tokens]
    
    # Join the tokens back into a string
    cleaned_text = ' '.join(final_tokens)
    
    return cleaned_text

In [3]:
df = pd.read_csv('C:/Users/user/Desktop/recommender/jobs.csv')
df



Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18996,Technolinguistics NGO\r\n\r\n\r\nTITLE: Senio...,"Dec 28, 2015",Senior Creative UX/ UI Designer,Technolinguistics NGO,,Full-time,,,,Long-term,...,Competitive,"To apply for this position, please send your\r...",29 December 2015,28 January 2016,,As a company Technolinguistics has a mandate t...,,2015,12,False
18997,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Category Development Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,Full-time,All interested professionals.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,30 December 2015,20 January 2016,,,,2015,12,False
18998,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Operational Marketing Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,Full-time,All interested professionals.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,30 December 2015,20 January 2016,,,,2015,12,False
18999,San Lazzaro LLC\r\n\r\n\r\nTITLE: Head of O...,"Dec 30, 2015",Head of Online Sales Department,San Lazzaro LLC,,,,,,Long-term,...,Highly competitive,Interested candidates can send their CVs to:\r...,30 December 2015,29 January 2016,,San Lazzaro LLC works with several internation...,,2015,12,False


In [4]:
resumes = pd.read_csv("C:/Users/user/Desktop/recommender/resume.csv")
print(resumes)
print(resumes.columns)


         Category                                             Resume
0    Data Science  Skills * Programming Languages: Python (pandas...
1    Data Science  Education Details \r\nMay 2013 to May 2017 B.E...
2    Data Science  Areas of Interest Deep Learning, Control Syste...
3    Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4    Data Science  Education Details \r\n MCA   YMCAUST,  Faridab...
..            ...                                                ...
957       Testing  Computer Skills: â¢ Proficient in MS office (...
958       Testing  â Willingness to accept the challenges. â ...
959       Testing  PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...
960       Testing  COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...
961       Testing  Skill Set OS Windows XP/7/8/8.1/10 Database MY...

[962 rows x 2 columns]
Index(['Category', 'Resume'], dtype='object')


In [5]:
df = pd.read_csv('C:/Users/user/Desktop/recommender/jobs.csv')
df["cleaned"] = df["jobpost"].apply(clean_text)
df = df[["jobpost", "cleaned"]]
df.dropna(inplace=True)
resumes = pd.read_csv("C:/Users/user/Desktop/recommender/resume.csv")
resumes.dropna(subset=["Resume"], inplace=True)
resumes["cleaned"] = resumes.Resume.apply(clean_text)
df = pd.concat([df, resumes], join="inner", axis=0)
df


Unnamed: 0,cleaned
0,ameria investment consult chief financial offi...
1,international research exchange board irex ful...
2,caucasus environmental ngo cenn country coordi...
3,manoff group bcc specialist location manila ph...
4,yerevan brandy software location yerevan armen...
...,...
957,computer proficient m office word basic excel ...
958,willingness accept challenge positive think le...
959,personal quick learner eagerness learn new thi...
960,computer software m power point m office proti...


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = CountVectorizer(analyzer='word',       
                             min_df=3,                       
                             stop_words='english',             
                             lowercase=True,                   
                             token_pattern='[a-zA-Z0-9]{3,}',  
                             max_features=5000,          
                            )

data_vectorized = vectorizer.fit_transform(df.cleaned)

lda_model = LatentDirichletAllocation(n_components=20, # Number of topics
                                      learning_method='online',
                                      random_state=0,       
                                      n_jobs = -1  # Use all available CPUs
                                     )
#training 
lda_output = lda_model.fit_transform(data_vectorized)

In [7]:
print("Log likelihood",lda_model.score(data_vectorized))
print("Perplexity",lda_model.perplexity(data_vectorized))
print(lda_model.get_params())

Log likelihood -22451014.766953222
Perplexity 797.666463988691
{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'online', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 10, 'mean_change_tol': 0.001, 'n_components': 20, 'n_jobs': -1, 'perp_tol': 0.1, 'random_state': 0, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [8]:
test_job = """Job Title: Machine Learning Engineer

Job Description:

Our company is looking for a highly skilled Machine Learning Engineer to join our team. As a Machine Learning Engineer, you will be responsible for developing and implementing cutting-edge machine learning models and algorithms that will power our products and services.

Responsibilities:

Design, develop and deploy scalable and efficient machine learning models and algorithms to solve complex business problems
Collaborate with cross-functional teams to gather and analyze data, identify key insights, and develop solutions that meet business requirements
Conduct research to stay up-to-date with the latest advancements in machine learning and data science
Implement data pre-processing, feature engineering, and model selection techniques to optimize performance of machine learning models
Deploy machine learning models to production environments and monitor their performance in real-time
Develop tools and systems to automate data collection, cleaning, and analysis processes
Requirements:

Bachelor's or Master's degree in Computer Science, Data Science, Mathematics, Statistics, or related field
At least 3 years of experience in developing and deploying machine learning models and algorithms in a production environment
Strong programming skills in Python and proficiency in machine learning libraries such as TensorFlow, Keras, and PyTorch
Experience with data pre-processing, feature engineering, and model selection techniques
Solid understanding of statistics, linear algebra, and probability theory
Experience with cloud-based machine learning platforms such as AWS, Azure, or GCP is a plus
Strong problem-solving skills, ability to work independently and as part of a team, and excellent communication skills.
If you are passionate about machine learning and want to work on cutting-edge technology to solve real-world problems, then we encourage you to apply for this exciting opportunity."""

test_resume = """
Name: John Doe
Email: johndoe@email.com
Phone: 555-555-5555
Location: Anytown, USA

Summary:
I am a skilled Software Testing and Automation professional with over 5 years of experience in creating and executing software testing plans and automated test scripts. I have experience with a variety of tools and technologies, including Selenium, Appium, Jenkins, and JIRA. I am a quick learner, a team player, and I am always striving to improve my skills.

Skills:

Experience with automated testing tools such as Selenium, Appium, and TestNG.
Familiarity with Agile methodologies and experience working in Agile teams.
Knowledge of programming languages such as Java and Python.
Experience with continuous integration tools such as Jenkins and TeamCity.
Experience with defect tracking and project management tools such as JIRA.
Strong communication and collaboration skills.
Bachelor's degree in Computer Science, Software Engineering, or a related field.
Experience:

Software Testing and Automation Engineer
ABC Software Solutions, Anytown, USA
June 2019 - Present

Developed and executed automated test scripts using Selenium, Appium, and TestNG.
Created and executed manual test cases.
Worked in Agile teams to ensure high-quality software releases.
Collaborated with developers and other stakeholders to identify and resolve defects.
Conducted regression testing and provided testing reports to the project team.
Software Test Engineer
XYZ Software Inc., Anytown, USA
January 2017 - May 2019

Developed and executed manual test cases.
Reported defects and tracked them using JIRA.
Assisted with the development of automated test scripts.
Worked closely with developers to ensure the timely resolution of defects."""

In [9]:
### TESTING
new_text_bow = vectorizer.transform([clean_text(test_job)])

topic_probabilities = lda_model.transform(new_text_bow)


# Get the most likely topic and its probability
most_likely_topic = topic_probabilities.argmax()
probability_of_topic = topic_probabilities[0, most_likely_topic]

# Print the result
print(f"The most likely topic for the job is topic {most_likely_topic}, with a probability of {probability_of_topic}")

new_text_bow = vectorizer.transform([clean_text(test_resume)])
topic_probabilities = lda_model.transform(new_text_bow)
# Get the most likely topic and its probability
most_likely_topic = topic_probabilities.argmax()
probability_of_topic = topic_probabilities[0, most_likely_topic]

# Print the result
print(f"The most likely topic for the resume is topic {most_likely_topic}, with a probability of {probability_of_topic}")

The most likely topic for the job is topic 7, with a probability of 0.5981045453818771
The most likely topic for the resume is topic 7, with a probability of 0.8168912135175306


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def calculate_similarity(job_description, resume):
    # Create a TF-IDF vectorizer object
    vectorizer = TfidfVectorizer(stop_words='english')
    
    # Fit and transform the job description and resume to TF-IDF matrices
    job_description_tfidf = vectorizer.fit_transform([job_description])
    resume_tfidf = vectorizer.transform([resume])
    
    # Calculate the cosine similarity between the two TF-IDF matrices
    cosine_similarity = np.dot(job_description_tfidf.toarray(), resume_tfidf.toarray().T)
    
    # Return the cosine similarity score
    return cosine_similarity[0][0]

calculate_similarity(clean_text(test_job), clean_text(test_resume))

0.23719139812819728