# JOB RECOMMENDATION SYSTEM USING CONTENT BASED FILTERING

# Installing Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


# Importing Libraries

In [3]:
import spacy
from spacy.matcher import Matcher
import PyPDF2
import os
import numpy as np
import pandas as pd
import random
import csv
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from sklearn.metrics.pairwise import cosine_similarity

#Skills Extracting from PDF

In [4]:
# 'en_core_web_sm' is a small English language model provided by spaCy
# It includes vocabulary, word vectors, and various linguistic annotations
nlp = spacy.load('en_core_web_sm')

In [5]:
# predefining common skills for data science,machine learning,devops, data analyst,software engineer job roles
dsa = [
    "SA", "Data Wrangling", "DataViz", "Data Cleaning",
    "Data Mining", "ML", "DL", "NLP", "CV", "AI", "Python", "R", "SQL", "NoSQL", "MATLAB",
    "Tableau", "Power BI", "Excel", "Hadoop", "Spark", "TensorFlow", "Keras", "PyTorch",
    "Scikit-Learn", "Pandas", "NumPy", "SciPy", "Git", "Docker", "Agile", "Scrum",
    "AWS", "GCP", "Azure", "CI/CD", "JSON", "XML", "ETL", "BI", "SVM", "PCA", "LSTM", "GANs",
    "ANN", "CRUD", "API", "UX/UI", "SDLC", "MVP", "ORM", "MVC", "K8s"
]

mla = [
    "ML", "DL", "NLP", "CV", "AI", "Python", "R", "Scikit-Learn", "TensorFlow", "Keras",
    "PyTorch", "Data Mining", "Feature Engineering", "Model Evaluation", "Ensemble Learning",
    "Neural Networks", "Deep Learning", "Gradient Boosting", "Clustering", "Dimensionality Reduction",
    "Regression", "Classification", "Natural Language Processing", "Computer Vision", "Reinforcement Learning",
    "Supervised Learning", "Unsupervised Learning", "SVM", "Decision Trees", "Random Forest", "XGBoost"
]

daa = [
    "Data Analysis", "Excel", "SQL", "Python", "R", "Data Cleaning", "Data Visualization",
    "Statistical Analysis", "Business Intelligence", "Tableau", "Power BI", "Pandas", "NumPy",
    "Matplotlib", "Seaborn", "Exploratory Data Analysis", "Descriptive Statistics",
    "Regression Analysis", "Hypothesis Testing", "Data Wrangling", "ETL", "Data Mining",
    "Dashboard Creation", "Google Analytics", "Microsoft Excel", "Google Sheets", "Database Management"
]

sea = [
    "Java", "C++", "Python", "JS", "HTML", "CSS", "RESTful APIs",
    "Git", "Docker", "K8s", "Agile", "Scrum", "SQL", "NoSQL",
    "AWS", "GCP", "CI/CD", "JSON", "XML", "OAuth", "JWT", "SaaS", "PaaS", "IaaS",
    "BaaS", "FaaS", "RDBMS", "DBMS", "ACID", "CAP", "BASE", "DDoS", "GDPR", "HIPAA",
    "IDE", "SVN", "VCS", "GUI", "CLI", "NFS", "SAN", "NAS", "I/O", "HTTP", "HTTPS",
    "SSH", "FTP", "SMTP", "IMAP", "VoIP", "VLAN", "VPN", "NAT", "TCP", "UDP", "IP",
    "OS", "VM"
]

doa = [
    "Linux", "Unix", "Shell Scripting", "Docker", "K8s", "Ansible", "Jenkins",
    "Git", "CI/CD", "AWS", "GCP", "Azure", "Terraform", "Puppet", "Chef", "Nagios", "ELK Stack",
    "Prometheus", "Grafana", "Version Control", "Monitoring", "Containerization", "Orchestration",
    "IaC", "Microservices", "Serverless", "Automation", "Load Balancing", "Scripting", "Networking",
    "Security", "Log Management", "Continuous Monitoring", "Collaboration Tools", "Agile", "Scrum", "SDLC"
]


dsf = [
    "Statistical Analysis", "Data Wrangling", "Data Visualization", "Data Cleaning",
    "Data Mining", "Machine Learning", "Deep Learning", "Natural Language Processing",
    "Computer Vision", "Artificial Intelligence", "Python", "R", "SQL", "NoSQL", "MATLAB",
    "Tableau", "Power BI", "Excel", "Hadoop", "Spark", "TensorFlow", "Keras", "PyTorch",
    "Scikit-Learn", "Pandas", "NumPy", "SciPy", "Git", "Docker", "Agile Methodology", "Scrum",
    "AWS", "GCP", "Azure", "Continuous Integration/Continuous Deployment", "JSON", "XML", "ETL", "Business Intelligence",
    "Support Vector Machines", "Principal Component Analysis", "Long Short Term Memory", "Generative Adversarial Networks",
    "Artificial Neural Networks", "Create, Read, Update, Delete", "Application Programming Interface", "User Experience/User Interface",
    "Software Development Life Cycle", "Minimum Viable Product", "Object-Relational Mapping", "Model-View-Controller", "Kubernetes"
]

mlf = [
    "Machine Learning", "Deep Learning", "Natural Language Processing", "Computer Vision", "Artificial Intelligence",
    "Python", "R", "Scikit-Learn", "TensorFlow", "Keras", "PyTorch", "Data Mining", "Feature Engineering", "Model Evaluation",
    "Ensemble Learning", "Neural Networks", "Deep Learning", "Gradient Boosting", "Clustering", "Dimensionality Reduction",
    "Regression", "Classification", "Natural Language Processing", "Computer Vision", "Reinforcement Learning",
    "Supervised Learning", "Unsupervised Learning", "Support Vector Machines", "Decision Trees", "Random Forest", "XGBoost"
]

daf = [
    "Data Analysis", "Excel", "SQL", "Python", "R", "Data Cleaning", "Data Visualization",
    "Statistical Analysis", "Business Intelligence", "Tableau", "Power BI", "Pandas", "NumPy",
    "Matplotlib", "Seaborn", "Exploratory Data Analysis", "Descriptive Statistics",
    "Regression Analysis", "Hypothesis Testing", "Data Wrangling", "Extract, Transform, Load", "Data Mining",
    "Dashboard Creation", "Google Analytics", "Microsoft Excel", "Google Sheets", "Database Management"
]

sef = [
    "Java", "C++", "Python", "JavaScript", "HTML", "CSS", "RESTful APIs",
    "Git", "Docker", "Kubernetes", "Agile", "Scrum", "SQL", "NoSQL",
    "Amazon Web Services", "Google Cloud Platform", "Continuous Integration/Continuous Deployment", "JSON", "XML",
    "Open Authorization", "JSON Web Token", "Software as a Service", "Platform as a Service", "Infrastructure as a Service",
    "Backend as a Service", "Function as a Service", "Relational Database Management System", "Database Management System",
    "Atomicity, Consistency, Isolation, Durability", "Consistency, Availability, Partition tolerance",
    "Basically Available, Soft state, Eventually consistent", "Distributed Denial of Service", "General Data Protection Regulation", "Health Insurance Portability and Accountability Act",
    "Integrated Development Environment", "Subversion", "Version Control System", "Graphical User Interface", "Command Line Interface",
    "Network File System", "Storage Area Network", "Network Attached Storage", "Input/Output", "Hypertext Transfer Protocol", "Hypertext Transfer Protocol Secure",
    "Secure Shell", "File Transfer Protocol", "Simple Mail Transfer Protocol", "Internet Message Access Protocol", "Voice over Internet Protocol",
    "Virtual Local Area Network", "Virtual Private Network", "Network Address Translation", "Transmission Control Protocol", "User Datagram Protocol", "Internet Protocol",
    "Operating System", "Virtual Machine"
]

dof = [
    "Linux", "Unix", "Shell Scripting", "Docker", "Kubernetes", "Ansible", "Jenkins",
    "Git", "Continuous Integration/Continuous Deployment", "Amazon Web Services", "Google Cloud Platform", "Azure", "Terraform",
    "Puppet", "Chef", "Nagios", "ELK Stack", "Prometheus", "Grafana", "Version Control", "Monitoring", "Containerization",
    "Orchestration", "Infrastructure as Code", "Microservices", "Serverless", "Automation", "Load Balancing", "Scripting",
    "Networking", "Security", "Log Management", "Continuous Monitoring", "Collaboration Tools", "Agile", "Scrum", "Software Development Life Cycle"
]

# combining all those list in to one universal skill list and removing the duplicates while combining
f = dsa + mla + daa + sea + doa + dsf + mlf + daf + sef + dof
print(len(f))
s=set(f)
print(len(s))

410
218


In [6]:
#making patterns for above created predefined list
skills=list(s)
matcher = Matcher(nlp.vocab)
# Create pattern dictionaries from skills
for i in skills:
  skill_patterns= [{'LOWER': skill.lower()} for skill in i.split()]
  matcher.add(i, [skill_patterns])


In [7]:
#converting resume pdf to text
def convert_pdf_to_text(f):
  with open(file_path,'rb') as f:
    pdf_reader = PyPDF2.PdfReader(f)
    text = ''
    for page in pdf_reader.pages:
      text += page.extract_text()
    return text.lower()

In [8]:
#extracting skills from resume text
def skills_extract(s):
  doc=nlp(s)
  matches=matcher(doc)
  fs=set()
  for match_id,start,end in matches:
    skill=doc[start:end].text
    fs.add(skill)
  return list(fs)

# **User's Input collection**

In [9]:
#user input
users=int(input("Enter no of users: "))
users_df1=pd.DataFrame()
skills_users=[]
users_names=[]
locs=[]
for i in range(users):
  file_path=input("Enter the path for user resume: ")
  loc=input("Enter User location Preference as city and state code: ")
  loc=loc.strip().lower()
  t=convert_pdf_to_text(file_path)
  s=skills_extract(t)
  s1=[]
  users_names.append(i+1)
  s1.append(s)
  skills_users.append(s)
  locs.append(loc)

users_df1['userNames']=users_names
users_df1['skillsofusers']=skills_users
users_df1['userPreferedloc']=locs


Enter no of users: 4
Enter the path for user resume: /content/drive/MyDrive/Data_Mining_Project/Jagadeesh_RESUME.pdf
Enter User location Preference as city and state code: new york
Enter the path for user resume: /content/drive/MyDrive/Data_Mining_Project/Navya_Chalamalasetty_Resume.pdf
Enter User location Preference as city and state code: san francisco
Enter the path for user resume: /content/drive/MyDrive/Data_Mining_Project/Saketh Resume.pdf
Enter User location Preference as city and state code: united states
Enter the path for user resume: /content/drive/MyDrive/Data_Mining_Project/Data_Science_intern_Sriram_1104.pdf
Enter User location Preference as city and state code: austin


In [10]:
users_df1.set_index('userNames',inplace=True)

In [11]:
users_df1

Unnamed: 0_level_0,skillsofusers,userPreferedloc
userNames,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[python, generative adversarial networks, nlp,...",new york
2,"[aws, deep learning, java, sql, css, oauth, nu...",san francisco
3,"[aws, c++, deep learning, java, sql, neural ne...",united states
4,"[sql, monitoring, xgboost, artificial intellig...",austin


In [12]:
users_df1.columns

Index(['skillsofusers', 'userPreferedloc'], dtype='object')

# Job Posting Data Collection and Preprocessing

In [13]:
df1=pd.read_excel('/content/drive/MyDrive/Data_Mining_Project/datascience.xlsx')
df2=pd.read_excel('/content/drive/MyDrive/Data_Mining_Project/devops.xlsx')
df3=pd.read_excel('/content/drive/MyDrive/Data_Mining_Project/Software_Engineer.xlsx')
df4=pd.read_excel('/content/drive/MyDrive/Data_Mining_Project/Linkedin_Data.xlsx')


df = pd.concat([df1, df2,df3,df4], ignore_index=True)

In [14]:
df.head()

Unnamed: 0,Keyword,Location,Job_title,Job_link,Company,Company_link,Job_location,Post_time,Applicants_count,Job_description,Seniority_level,Employment_type,Job_function,Industries
0,Data Science,United States,Data Scientist BF,https://www.linkedin.com/jobs/view/data-scient...,INSPYR Solutions,https://www.linkedin.com/company/inspyrsolutio...,"Vienna, VA",2 hours ago,Be among the first 25 applicants,"Title: Data Scientist\n\nLocation: Vienna, VA\...",Entry level,Full-time,Engineering and Information Technology,Staffing and Recruiting
1,Data Science,United States,"Data Science Intern, Analytics",https://www.linkedin.com/jobs/view/data-scienc...,Discord,https://www.linkedin.com/company/discord?trk=p...,"San Francisco, CA",2 days ago,Over 200 applicants,"The Summer Internship\n\nAt Discord, we offer ...",Internship,Internship,Engineering and Information Technology,Software Development
2,Data Science,United States,Data Scientist,https://www.linkedin.com/jobs/view/data-scient...,Fenway Group,https://www.linkedin.com/company/fenway-group?...,"Southlake, TX",7 hours ago,30 applicants,"Job Description\n\nRole, Responsibilities, and...",Entry level,Full-time,Engineering and Information Technology,IT Services and IT Consulting
3,Data Science,United States,Assistant Data Scientist,https://www.linkedin.com/jobs/view/assistant-d...,AXA XL,https://bm.linkedin.com/company/axaxl?trk=publ...,"New York, NY",20 hours ago,184 applicants,"New York, NY I Stamford, CT I USA\n\nAXA XL re...",Not Applicable,Full-time,Engineering and Information Technology,Insurance
4,Data Science,United States,Data Scientist,https://www.linkedin.com/jobs/view/data-scient...,Microsoft,https://www.linkedin.com/company/microsoft?trk...,United States,1 hour ago,78 applicants,Description\n\nData Scientist (Contract)\n\n\n...,Associate,Contract,Engineering and Information Technology,Software Development


In [15]:
df.shape

(3801, 14)

In [16]:
df.duplicated().sum()

117

In [17]:
df=df.drop_duplicates()

In [18]:
df.columns=df.columns.str.lower()

In [19]:
df.dtypes

keyword             object
location            object
job_title           object
job_link            object
company             object
company_link        object
job_location        object
post_time           object
applicants_count    object
job_description     object
seniority_level     object
employment_type     object
job_function        object
industries          object
dtype: object

In [20]:
df.columns

Index(['keyword', 'location', 'job_title', 'job_link', 'company',
       'company_link', 'job_location', 'post_time', 'applicants_count',
       'job_description', 'seniority_level', 'employment_type', 'job_function',
       'industries'],
      dtype='object')

In [21]:
#dropping keyword, location and some other columns from the dataset
df=df.drop(columns=['keyword','location','applicants_count','post_time'])

In [22]:
object_columns = df.select_dtypes(include=['object'])
for i in object_columns:
  df[i]=df[i].str.strip()
  df[i]=df[i].str.lower()

In [23]:
df.columns

Index(['job_title', 'job_link', 'company', 'company_link', 'job_location',
       'job_description', 'seniority_level', 'employment_type', 'job_function',
       'industries'],
      dtype='object')

In [24]:
l=df['job_description'].tolist()
len(l)

3684

In [None]:
js=[]
for i in l:
  js.append(skills_extract(i))
len(js)

3684

In [26]:
print(js[0])

['python', 'data visualization', 'tableau', 'clustering', 'sql', 'etl', 'agile', 'hadoop', 'reinforcement learning', 'natural language processing', 'r', 'data wrangling', 'power bi', 'bi', 'spark', 'aws', 'matplotlib', 'azure', 'data mining', 'machine learning', 'classification', 'excel', 'regression']


In [27]:
df['skills']=js

In [28]:
df.head()

Unnamed: 0,job_title,job_link,company,company_link,job_location,job_description,seniority_level,employment_type,job_function,industries,skills
0,data scientist bf,https://www.linkedin.com/jobs/view/data-scient...,inspyr solutions,https://www.linkedin.com/company/inspyrsolutio...,"vienna, va","title: data scientist\n\nlocation: vienna, va\...",entry level,full-time,engineering and information technology,staffing and recruiting,"[python, data visualization, tableau, clusteri..."
1,"data science intern, analytics",https://www.linkedin.com/jobs/view/data-scienc...,discord,https://www.linkedin.com/company/discord?trk=p...,"san francisco, ca","the summer internship\n\nat discord, we offer ...",internship,internship,engineering and information technology,software development,"[python, r, sql]"
2,data scientist,https://www.linkedin.com/jobs/view/data-scient...,fenway group,https://www.linkedin.com/company/fenway-group?...,"southlake, tx","job description\n\nrole, responsibilities, and...",entry level,full-time,engineering and information technology,it services and it consulting,"[computer vision, python, data analysis, api, ..."
3,assistant data scientist,https://www.linkedin.com/jobs/view/assistant-d...,axa xl,https://bm.linkedin.com/company/axaxl?trk=publ...,"new york, ny","new york, ny i stamford, ct i usa\n\naxa xl re...",not applicable,full-time,engineering and information technology,insurance,"[pandas, numpy, ai, classification, machine le..."
4,data scientist,https://www.linkedin.com/jobs/view/data-scient...,microsoft,https://www.linkedin.com/company/microsoft?trk...,united states,description\n\ndata scientist (contract)\n\n\n...,associate,contract,engineering and information technology,software development,"[clustering, etl, scripting, natural language ..."


In [29]:
#OneHot Encoding
# Vectorizing Location values
loc=df['job_location'].to_list()
loc=[str(i).split(',')[0] for i in loc]
df['job_location']=loc

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
le=encoder.fit_transform(df[['job_location']])
df['location_vectors'] = list(le)




In [30]:
users_df1['location_vector'] = list(users_df1['userPreferedloc'].apply(lambda x: encoder.transform([[x]])[0]))



In [31]:
df.head()

Unnamed: 0,job_title,job_link,company,company_link,job_location,job_description,seniority_level,employment_type,job_function,industries,skills,location_vectors
0,data scientist bf,https://www.linkedin.com/jobs/view/data-scient...,inspyr solutions,https://www.linkedin.com/company/inspyrsolutio...,vienna,"title: data scientist\n\nlocation: vienna, va\...",entry level,full-time,engineering and information technology,staffing and recruiting,"[python, data visualization, tableau, clusteri...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"data science intern, analytics",https://www.linkedin.com/jobs/view/data-scienc...,discord,https://www.linkedin.com/company/discord?trk=p...,san francisco,"the summer internship\n\nat discord, we offer ...",internship,internship,engineering and information technology,software development,"[python, r, sql]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,data scientist,https://www.linkedin.com/jobs/view/data-scient...,fenway group,https://www.linkedin.com/company/fenway-group?...,southlake,"job description\n\nrole, responsibilities, and...",entry level,full-time,engineering and information technology,it services and it consulting,"[computer vision, python, data analysis, api, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,assistant data scientist,https://www.linkedin.com/jobs/view/assistant-d...,axa xl,https://bm.linkedin.com/company/axaxl?trk=publ...,new york,"new york, ny i stamford, ct i usa\n\naxa xl re...",not applicable,full-time,engineering and information technology,insurance,"[pandas, numpy, ai, classification, machine le...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,data scientist,https://www.linkedin.com/jobs/view/data-scient...,microsoft,https://www.linkedin.com/company/microsoft?trk...,united states,description\n\ndata scientist (contract)\n\n\n...,associate,contract,engineering and information technology,software development,"[clustering, etl, scripting, natural language ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [32]:
df['job_location'].unique()

array(['vienna', 'san francisco', 'southlake', 'new york',
       'united states', 'indianapolis', 'washington', 'greater houston',
       'san diego', 'greater st. louis', 'colorado springs', 'atlanta',
       'chicago', 'coral gables', 'bethesda', 'newport beach',
       'chantilly', 'palo alto', 'boston',
       'new york city metropolitan area', 'jacksonville',
       'san francisco bay area', 'san jose', 'newark', 'foster city',
       'mountain view', 'sunnyvale', 'fremont', 'roseville',
       'philadelphia', 'cambridge', 'lake mary', 'pataskala',
       'california', 'san francisco county', 'boca raton', 'northbrook',
       'austin', 'tualatin', 'burbank', 'tampa', 'oklahoma city',
       'south san francisco', 'los gatos', 'leesburg', 'menlo park',
       'palmdale', 'new jersey', 'germantown', 'irvine', 'florida',
       'annapolis junction', 'new haven', 'santa monica', 'ridgefield',
       'los angeles', 'huntsville', 'houston', 'thousand oaks',
       'richardson', 'texas

In [33]:
df1=df[df['job_location'] == 'california']
len(df1)

30

In [34]:
users_df1

Unnamed: 0_level_0,skillsofusers,userPreferedloc,location_vector
userNames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[python, generative adversarial networks, nlp,...",new york,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[aws, deep learning, java, sql, css, oauth, nu...",san francisco,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[aws, c++, deep learning, java, sql, neural ne...",united states,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[sql, monitoring, xgboost, artificial intellig...",austin,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [35]:
skills_pd = [item.lower().replace(',',' ').replace(' ','_').split() for item in skills]
model = Word2Vec(skills_pd, vector_size=100,window=5,min_count=1)

In [36]:
print(skills)
print(skills_pd)

['NumPy', 'JSON', 'Supervised Learning', 'AI', 'Seaborn', 'VLAN', 'Data Wrangling', 'Serverless', 'Artificial Intelligence', 'DBMS', 'Data Analysis', 'Open Authorization', 'UDP', 'FTP', 'Network Attached Storage', 'Java', 'DDoS', 'Business Intelligence', 'GDPR', 'Chef', 'Descriptive Statistics', 'Dashboard Creation', 'BASE', 'Shell Scripting', 'Log Management', 'Collaboration Tools', 'XML', 'Command Line Interface', 'Application Programming Interface', 'Subversion', 'Deep Learning', 'SQL', 'PyTorch', 'SA', 'Scikit-Learn', 'User Experience/User Interface', 'Google Cloud Platform', 'File Transfer Protocol', 'Simple Mail Transfer Protocol', 'VCS', 'HTTP', 'Hypothesis Testing', 'Continuous Monitoring', 'PCA', 'Ensemble Learning', 'Distributed Denial of Service', 'HIPAA', 'Linux', 'Tableau', 'Version Control System', 'Regression', 'Decision Trees', 'JavaScript', 'Grafana', 'IDE', 'VM', 'GUI', 'Prometheus', 'Machine Learning', 'Excel', 'JWT', 'Internet Message Access Protocol', 'Extract, Tra

In [37]:
len(model.wv['python'])

100

In [38]:
jds = df['location_vectors'].to_list()
j=float(input("Enter the threshold to filter out the jobs "))
def get_vector(l):
    vectors = [model.wv[i] for i in l if i in model.wv]
    if(len(vectors)==0):
      return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

def similarity_vectors(sl,jl):
  similarity=[]
  my_skills_l = [i.lower().replace(' ','_') for i in sl]
  my_skills_vector=np.array(get_vector(my_skills_l)).reshape(1,-1)
  jl=jl.reshape(1,-1)
  for i in range(len(js)):
    jbs_skill_l=[j.lower().replace(' ','_') for j in js[i]]
    jbs_skills_vector=np.array(get_vector(jbs_skill_l)).reshape(1,-1)
    cs=cosine_similarity(my_skills_vector,jbs_skills_vector)[0][0]
    jc=cosine_similarity(jl,jds[i].reshape(1,-1))[0][0]
    final_similarity = 0.8*cs + 0.2*jc
    similarity.append(final_similarity)
  return similarity

def recommendation(cf):
  recom=[]
  for i in range(users):
    s=similarity_vectors(cf['skillsofusers'][i+1],cf['location_vector'][i+1])
    df1=df
    df1['similarity']=s
    df1=df1.sort_values('similarity',ascending=False)
    df1=df1[df1['similarity'] >= j]
    recom.append(df1[['job_title','company','job_link','job_location']])
  return recom

recommend_l=recommendation(users_df1)
for i in range(users):
  print("---------------------------------------------------USER:",i+1," RECOMMENDATAIONS ------------------------------------------------------------")
  display(recommend_l[i])

Enter the threshold to filter out the jobs 0.5
---------------------------------------------------USER: 1  RECOMMENDATAIONS ------------------------------------------------------------


Unnamed: 0,job_title,company,job_link,job_location
1751,web developer intern,eulerity,https://www.linkedin.com/jobs/view/web-develop...,new york
15,data science intern,chubb,https://www.linkedin.com/jobs/view/data-scienc...,new york
2015,software quality assurance engineer,paramount+,https://www.linkedin.com/jobs/view/software-qu...,new york
1315,senior site reliability engineer,interex group,https://www.linkedin.com/jobs/view/senior-site...,new york
1309,senior site reliability engineer,interex group,https://www.linkedin.com/jobs/view/senior-site...,new york
2239,software engineer,cipher mining,https://www.linkedin.com/jobs/view/tech-lead-s...,new york
2238,software engineer,cipher mining,https://www.linkedin.com/jobs/view/software-en...,new york
489,machine learning engineer,altice usa,https://www.linkedin.com/jobs/view/machine-lea...,new york
3469,"machine learning software engineer, phd univer...",google,https://www.linkedin.com/jobs/view/machine-lea...,new york


---------------------------------------------------USER: 2  RECOMMENDATAIONS ------------------------------------------------------------


Unnamed: 0,job_title,company,job_link,job_location
2524,full stack software engineer (contractor),arine,https://www.linkedin.com/jobs/view/full-stack-...,san francisco
3193,senior data scientist,cleanlab,https://www.linkedin.com/jobs/view/senior-data...,san francisco
3440,vice president of data & analytics platform en...,forge,https://www.linkedin.com/jobs/view/vice-presid...,san francisco
1866,entry level software engineer,skillstorm,https://www.linkedin.com/jobs/view/entry-level...,phoenix
1812,software engineer - frontend,docusign,https://www.linkedin.com/jobs/view/software-en...,san francisco
1863,"software engineer, fullstack (frontend)",docusign,https://www.linkedin.com/jobs/view/software-en...,san francisco
139,data analyst,numeric technologies,https://www.linkedin.com/jobs/view/data-analys...,san francisco
2483,software engineer at educational software (rem...,oneteamanywhere,https://www.linkedin.com/jobs/view/software-en...,san francisco
344,"software engineer, platform data science",terray therapeutics,https://www.linkedin.com/jobs/view/software-en...,monrovia
2827,"software engineer, platform data science",terray therapeutics,https://www.linkedin.com/jobs/view/software-en...,monrovia


---------------------------------------------------USER: 3  RECOMMENDATAIONS ------------------------------------------------------------


Unnamed: 0,job_title,company,job_link,job_location
2685,software engineer,bayside solutions,https://www.linkedin.com/jobs/view/software-en...,united states
417,climate data scientist,leidos,https://www.linkedin.com/jobs/view/climate-dat...,united states
3280,sr. project analytics specialist,episource,https://www.linkedin.com/jobs/view/sr-project-...,united states
3263,sr. project analytics specialist,episource,https://www.linkedin.com/jobs/view/sr-project-...,united states
3356,"senior data scientist with azure/aws, machine ...","it & ebusiness consulting services, inc.",https://www.linkedin.com/jobs/view/senior-data...,united states
...,...,...,...,...
2753,"looking for data scientist - santa clara, ca, ...",extend information systems inc.,https://www.linkedin.com/jobs/view/looking-for...,united states
2907,"looking for data scientist - santa clara, ca, ...",extend information systems inc.,https://www.linkedin.com/jobs/view/looking-for...,united states
1849,software engineer - full stack,lacework,https://www.linkedin.com/jobs/view/software-en...,united states
2406,senior software engineer,oracle,https://www.linkedin.com/jobs/view/senior-soft...,united states


---------------------------------------------------USER: 4  RECOMMENDATAIONS ------------------------------------------------------------


Unnamed: 0,job_title,company,job_link,job_location
63,senior data analyst,roku inc.,https://www.linkedin.com/jobs/view/senior-data...,austin
1701,software engineer (university grad),meta,https://www.linkedin.com/jobs/view/software-en...,austin
3568,"vp, data science",the knot worldwide,https://www.linkedin.com/jobs/view/vp-data-sci...,austin


## **Evaluations**

In [40]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
def cals(recommended_jobs,relevant_jobs):
  recommended_jobs = set(recommended_jobs)
  relevant_jobs = set(relevant_jobs)
  tp=len(recommended_jobs.intersection(relevant_jobs))
  fp=len(recommended_jobs-relevant_jobs)
  fn=len(relevant_jobs-recommended_jobs)
  return tp,fp,fn

def find_scores(tp,fp,fn,relevant_jobs):
  if(tp+fp > 0):
    p= tp/(tp+fp)
  else:
    p=0
  if(tp+fn > 0):
    r=tp/(tp+fn)
  else:
    r=0
  if(p+r >0):
    f1= 2*p*r/(p+r)
  else:
    f1=0
  return p,f1,r




In [41]:
def Evaluation(cf):
  eval_user=[]
  t=[]
  f1s=[]
  pre=[]
  re=[]
  accu=[]
  for i in range(users):
    recommend_jobs_l=recommend_l[i]['job_link'].to_list()
    df2=df[df['job_location'] == users_df1['userPreferedloc'][i+1]]
    relevant_jobs = df2['job_link'].to_list()
    tp,fp,fn=cals(recommend_jobs_l,relevant_jobs)
    p,f1,r=find_scores(tp,fp,fn,relevant_jobs)
    pre.append(p)
    re.append(r)
    f1s.append(f1)
    eval_user.append(i+1)

  eval_df=pd.DataFrame()
  eval_df['User'] = eval_user
  eval_df['Precision'] = pre
  return eval_df

evl_df=Evaluation(users_df1)
evl_df.set_index('User', inplace=True)
print(evl_df)

      Precision
User           
1      1.000000
2      0.600000
3      0.924242
4      1.000000


In [42]:
#Precision for each user
for i in range(users):
  print(f"Precison for User {i+1}: {(evl_df['Precision'][i+1])*100: .2f}%")

Precison for User 1:  100.00%
Precison for User 2:  60.00%
Precison for User 3:  92.42%
Precison for User 4:  100.00%


In [43]:
# Average Precision of our recommendation system
avg = sum(evl_df['Precision']/len(evl_df))
print(f"Precision of recommendation system:{avg*100: .2f}%")

Precision of recommendation system: 88.11%


## Implementing the Extra Job Recommendations

In [44]:
extra_skill_df=pd.DataFrame()
es=[]
us=[]
ls=[]
lv=[]
extra_skill=[]
for i in range(users):
  e=(set(skills)-set(users_df1['skillsofusers'][i+1]))
  k=random.sample(e, min(10, len(e)))
  es.append(k+users_df1['skillsofusers'][i+1])
  us.append(i+1)
  ls.append(users_df1['userPreferedloc'][i+1])
  lv.append(users_df1['location_vector'][i+1])
  extra_skill.append(k)


extra_skill_df['users']=us
extra_skill_df['skillsofusers']=es
extra_skill_df['extra_skills'] = extra_skill
extra_skill_df['userPreferedloc'] = ls
extra_skill_df['location_vector'] = lv

since Python 3.9 and will be removed in a subsequent version.
  k=random.sample(e, min(10, len(e)))


In [45]:
extra_skill_df.set_index('users',inplace=True)
extra_skill_df.head()

Unnamed: 0_level_0,skillsofusers,extra_skills,userPreferedloc,location_vector
users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"[SaaS, Backend as a Service, ANN, SVN, Linux, ...","[SaaS, Backend as a Service, ANN, SVN, Linux, ...",new york,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[SciPy, SaaS, Scripting, GUI, CAP, CV, RESTful...","[SciPy, SaaS, Scripting, GUI, CAP, CV, RESTful...",san francisco,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[Deep Learning, Data Mining, Business Intellig...","[Deep Learning, Data Mining, Business Intellig...",united states,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[VLAN, Deep Learning, Business Intelligence, J...","[VLAN, Deep Learning, Business Intelligence, J...",austin,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [46]:
extra_recommeded_l=recommendation(extra_skill_df)

In [47]:
initial_job_links = [set(recommend_l[i]['job_link']) for i in range(len(recommend_l))]

for i in range(len(extra_recommeded_l)):
    extra_recommeded_l[i] = extra_recommeded_l[i][~extra_recommeded_l[i]['job_link'].isin(initial_job_links[i])]
    print(f"---------------------------------------------------USER: {i+1} EXTRA RECOMMENDATIONS ------------------------------------------------------------")
    display(extra_recommeded_l[i])


---------------------------------------------------USER: 1 EXTRA RECOMMENDATIONS ------------------------------------------------------------


Unnamed: 0,job_title,company,job_link,job_location
1510,senior cloud ops engineer / lead - new york - ...,hawksworth,https://www.linkedin.com/jobs/view/senior-clou...,new york


---------------------------------------------------USER: 2 EXTRA RECOMMENDATIONS ------------------------------------------------------------


Unnamed: 0,job_title,company,job_link,job_location
3136,"staff, applied scientist",nax group,https://www.linkedin.com/jobs/view/staff-appli...,san francisco


---------------------------------------------------USER: 3 EXTRA RECOMMENDATIONS ------------------------------------------------------------


Unnamed: 0,job_title,company,job_link,job_location
166,senior director of data science,trustengine,https://www.linkedin.com/jobs/view/senior-dire...,united states
1593,senior devops engineer,paubox,https://www.linkedin.com/jobs/view/senior-devo...,united states
92,data scientist,workforce connections,https://www.linkedin.com/jobs/view/data-scient...,united states
892,100% remote role – devops engineer,stellent it,https://www.linkedin.com/jobs/view/100%25-remo...,united states
800,sr. devops engineer (aws),apn consulting inc.,https://www.linkedin.com/jobs/view/sr-devops-e...,united states
829,senior devops engineer,perficient,https://www.linkedin.com/jobs/view/senior-devo...,united states
424,security analytics engineer (l4),netflix,https://www.linkedin.com/jobs/view/security-an...,united states
711,"scientist, computational biology",flagship pioneering,https://www.linkedin.com/jobs/view/scientist-c...,cambridge
716,"scientist, computational biology",flagship pioneering,https://www.linkedin.com/jobs/view/scientist-c...,cambridge
485,data scientist - mid,dt professional services,https://www.linkedin.com/jobs/view/data-scient...,united states


---------------------------------------------------USER: 4 EXTRA RECOMMENDATIONS ------------------------------------------------------------


Unnamed: 0,job_title,company,job_link,job_location
