<a href="https://colab.research.google.com/github/merry2121/AI-Based-Job-Matching-and-Skill-gap-Analysis/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

project_path = "/content/drive/MyDrive/Thesis_Sentiment_SBERT"
os.makedirs(project_path, exist_ok=True)

project_path

'/content/drive/MyDrive/Thesis_Sentiment_SBERT'

In [3]:
!pip install sentence-transformers
!pip install datasets
!pip install scikit-learn
!pip install pandas numpy



In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [5]:
import pandas as pd

jobs = [
    "We are looking for a software engineer with skills in Python, Machine Learning, SQL, and Django.",
    "Seeking a front-end developer skilled in HTML, CSS, JavaScript, React, and UI/UX design.",
    "Data analyst required with strong Excel, Python, Power BI, Statistics, and communication skills."
]

candidates = [
    "I am a computer engineering student skilled in Python, HTML, CSS, and SQL with basic machine learning experience.",
    "I have experience in JavaScript, React, UI design, and Figma for frontend projects.",
    "My skills include Python, Power BI, Excel, Statistics, and data visualization."
]

df_jobs = pd.DataFrame({"job_description": jobs})
df_candidates = pd.DataFrame({"resume": candidates})

df_jobs, df_candidates

(                                     job_description
 0  We are looking for a software engineer with sk...
 1  Seeking a front-end developer skilled in HTML,...
 2  Data analyst required with strong Excel, Pytho...,
                                               resume
 0  I am a computer engineering student skilled in...
 1  I have experience in JavaScript, React, UI des...
 2  My skills include Python, Power BI, Excel, Sta...)

In [6]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
job_embeddings = model.encode(df_jobs['job_description'], convert_to_numpy=True)
resume_embeddings = model.encode(df_candidates['resume'], convert_to_numpy=True)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(resume_embeddings, job_embeddings) * 100
similarity_matrix

array([[73.05046 , 55.264812, 54.054333],
       [25.56507 , 72.09152 , 15.276262],
       [59.43911 , 46.058308, 70.70142 ]], dtype=float32)

In [9]:
for i, resume in enumerate(df_candidates['resume']):
    print(f"\nCandidate {i+1} best match:")
    best_job_index = similarity_matrix[i].argmax()
    score = similarity_matrix[i][best_job_index]
    print(f"  Job: {df_jobs.iloc[best_job_index]['job_description']}")
    print(f"  Match Score: {score:.2f}%")


Candidate 1 best match:
  Job: We are looking for a software engineer with skills in Python, Machine Learning, SQL, and Django.
  Match Score: 73.05%

Candidate 2 best match:
  Job: Seeking a front-end developer skilled in HTML, CSS, JavaScript, React, and UI/UX design.
  Match Score: 72.09%

Candidate 3 best match:
  Job: Data analyst required with strong Excel, Python, Power BI, Statistics, and communication skills.
  Match Score: 70.70%


In [10]:
skill_dict = [
    "python","machine learning","sql","django","html","css","javascript",
    "react","ui","ux","excel","power bi","statistics","communication"
]

In [11]:
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_skills(text):
    text = text.lower()
    return [skill for skill in skill_dict if skill in text]

df_jobs['skills'] = df_jobs['job_description'].apply(extract_skills)
df_candidates['skills'] = df_candidates['resume'].apply(extract_skills)
df_jobs, df_candidates

(                                     job_description  \
 0  We are looking for a software engineer with sk...   
 1  Seeking a front-end developer skilled in HTML,...   
 2  Data analyst required with strong Excel, Pytho...   
 
                                               skills  
 0            [python, machine learning, sql, django]  
 1             [html, css, javascript, react, ui, ux]  
 2  [python, ui, excel, power bi, statistics, comm...  ,
                                               resume  \
 0  I am a computer engineering student skilled in...   
 1  I have experience in JavaScript, React, UI des...   
 2  My skills include Python, Power BI, Excel, Sta...   
 
                                        skills  
 0  [python, machine learning, sql, html, css]  
 1                     [javascript, react, ui]  
 2       [python, excel, power bi, statistics]  )

In [12]:
def skill_gap(candidate_skills, job_skills):
    candidate_set = set(candidate_skills)
    job_set = set(job_skills)
    missing = job_set - candidate_set
    return list(missing)

for i in range(len(df_candidates)):
    print(f"\nCandidate {i+1}:")
    job_index = similarity_matrix[i].argmax()
    gaps = skill_gap(df_candidates.iloc[i]['skills'], df_jobs.iloc[job_index]['skills'])
    print("Missing skills:", gaps)


Candidate 1:
Missing skills: ['django']

Candidate 2:
Missing skills: ['ux', 'css', 'html']

Candidate 3:
Missing skills: ['communication', 'ui']


In [13]:
def recommend_skills(missing_skills):
    recommendations = {}
    for skill in missing_skills:
        recommendations[skill] = f"Take an online course on {skill}"
    return recommendations

In [14]:
from google.colab import drive
drive.mount('/content/drive')

!pip install sentence-transformers
!pip install requests beautifulsoup4
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install scikit-learn pandas numpy

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_ethiojobs(pages=3):
    job_data = []
    base = "https://www.ethiojobs.net/search-results-jobs/?searchId=&page="

    for p in range(1, pages+1):
        url = base + str(p)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')

        listings = soup.select('.listing')

        for job in listings:
            title = job.select_one('.media-heading a').text.strip()
            summary = job.select_one('.listing-summary').text.strip()
            job_data.append([title, summary])

        print(f"Scraped page {p}")

    df_jobs = pd.DataFrame(job_data, columns=["title", "description"])
    return df_jobs

df_jobs = scrape_ethiojobs(3)
df_jobs.head()

Scraped page 1
Scraped page 2
Scraped page 3


Unnamed: 0,title,description


In [16]:
import random

skills_pool = [
    "Python","Machine Learning","Deep Learning","SQL","Django","Flask","C++",
    "Networking","PLC","Excel","Power BI","Data Analysis","Java","HTML","CSS",
    "JavaScript","React","Node.js","Communication","Teamwork"
]

education_pool = [
    "BSc in Computer Engineering, Mekelle University",
    "BSc in Software Engineering, Addis Ababa University",
    "BSc in Information Technology, AAIT",
    "Electrical Engineering, Adama University",
]

experience_pool = [
    "Intern at Ethio Telecom",
    "Worked on IoT System for Smart Agriculture",
    "Intern at Commercial Bank of Ethiopia (IT Dept)",
    "Research assistant in NLP lab",
]

def generate_resume():
    return {
        "resume": f"""
        Education: {random.choice(education_pool)}
        Skills: {', '.join(random.sample(skills_pool, 5))}
        Experience: {random.choice(experience_pool)}
        """
    }

resumes = [generate_resume() for _ in range(10)]
df_resumes = pd.DataFrame(resumes)
df_resumes.head()

Unnamed: 0,resume
0,\n Education: BSc in Computer Engineeri...
1,\n Education: BSc in Software Engineeri...
2,\n Education: BSc in Information Techno...
3,\n Education: BSc in Software Engineeri...
4,"\n Education: Electrical Engineering, A..."


In [17]:
import spacy
nlp = spacy.load("en_core_web_sm")

skill_dict = [s.lower() for s in skills_pool]

def extract_skills(text):
    text = text.lower()
    return [skill for skill in skill_dict if skill in text]

df_jobs["skills"] = df_jobs["description"].apply(extract_skills)
df_resumes["skills"] = df_resumes["resume"].apply(extract_skills)

df_jobs.head(), df_resumes.head()

(Empty DataFrame
 Columns: [title, description, skills]
 Index: [],
                                               resume  \
 0  \n        Education: BSc in Computer Engineeri...   
 1  \n        Education: BSc in Software Engineeri...   
 2  \n        Education: BSc in Information Techno...   
 3  \n        Education: BSc in Software Engineeri...   
 4  \n        Education: Electrical Engineering, A...   
 
                                               skills  
 0  [machine learning, django, networking, java, h...  
 1  [deep learning, django, c++, react, communicat...  
 2         [machine learning, c++, plc, css, node.js]  
 3  [python, deep learning, excel, power bi, java,...  
 4  [machine learning, django, java, javascript, c...  )

In [18]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

job_embeddings = model.encode(df_jobs["description"].tolist(), convert_to_numpy=True)
resume_embeddings = model.encode(df_resumes["resume"].tolist(), convert_to_numpy=True)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [23]:
import pandas as pd

ethiopian_jobs = [
    ["Software Engineer", "A reputable IT company in Addis Ababa is seeking a software engineer skilled in Python, Django, REST APIs, SQL, Git, and cloud deployment."],
    ["Backend Developer", "We are hiring a backend developer with experience in Node.js, Express, MongoDB, PostgreSQL, and API development."],
    ["Frontend Developer", "Looking for a front-end developer with skills in HTML, CSS, JavaScript, React, Figma, and UI/UX best practices."],
    ["Full Stack Developer", "Candidates must be fluent in both front-end and back-end technologies including React, Node.js, Django, SQL, and cloud platforms."],
    ["Mobile App Developer", "Hiring a Flutter/Dart mobile developer with experience building Android/iOS apps, Firebase integration, and UI design."],
    ["IT Technician", "Responsibilities include troubleshooting computers, installing software, diagnosing networks, and maintaining hardware systems."],
    ["Network Administrator", "Required skills: CCNA, router configuration, firewall management, network monitoring tools, IP subnetting, VLAN setup."],
    ["Electrical Engineer", "Looking for an electrical engineer with PLC programming, industrial automation, AutoCAD design, and installation supervision experience."],
    ["Mechanical Engineer", "Experience in machine design, maintenance, SolidWorks, CAD, manufacturing processes is required."],
    ["Civil Engineer", "Required skills: site supervision, AutoCAD drafting, quantity surveying, project management, structural analysis."],
    ["Data Analyst", "Seeking a data analyst skilled in Excel, Power BI, SQL, Python, statistics, and dashboard creation."],
    ["AI/ML Engineer", "Looking for machine learning engineer with Python, TensorFlow, Keras, NLP, deep learning, and model deployment experience."],
    ["Cybersecurity Analyst", "Knowledge in Linux security, SIEM tools, penetration testing, incident response, and network defense is required."],
    ["Database Administrator", "Skills required: MySQL, PostgreSQL, performance tuning, backups, stored procedures, query optimization."],
    ["System Administrator", "Experience with Linux, Windows Server, virtualization, cloud management, user administration, and DevOps tools."],
    ["Accountant", "Looking for an accountant with IFRS knowledge, Peachtree, tax preparation, financial reporting, and analysis."],
    ["Finance Officer", "Skills: budgeting, auditing, Excel, reporting, financial modeling, ERP systems."],
    ["HR Manager", "Experience in recruitment, HR policies, payroll management, staff training, and performance evaluation."],
    ["Marketing Specialist", "Required: digital marketing, content creation, SEO, social media management, Google Ads expertise."],
    ["Graphics Designer", "Skills: Adobe Photoshop, Illustrator, motion graphics, branding, UI design, creative visuals."],
] * 5  # repeat to reach 100 rows

df_jobs = pd.DataFrame(ethiopian_jobs, columns=["title", "description"])
df_jobs.shape, df_jobs.head()

((100, 2),
                   title                                        description
 0     Software Engineer  A reputable IT company in Addis Ababa is seeki...
 1     Backend Developer  We are hiring a backend developer with experie...
 2    Frontend Developer  Looking for a front-end developer with skills ...
 3  Full Stack Developer  Candidates must be fluent in both front-end an...
 4  Mobile App Developer  Hiring a Flutter/Dart mobile developer with ex...)

In [24]:
import random

skills_pool = [
    "Python","Machine Learning","Deep Learning","SQL","Django","Flask",
    "C++","Java","Networking","PLC","Excel","Power BI","Data Analysis",
    "JavaScript","React","Node.js","Flutter","Dart","Linux","DevOps",
    "Communication","Teamwork","Leadership"
]

education_pool = [
    "BSc in Computer Engineering, Mekelle University",
    "BSc in Software Engineering, Addis Ababa University",
    "BSc in Information Technology, AAIT",
    "BSc in Electrical Engineering, Adama Science and Technology University",
    "Computer Science, Unity University"
]

experience_pool = [
    "Intern at Ethio Telecom",
    "Intern at Commercial Bank of Ethiopia",
    "IoT project developer in campus",
    "Junior IT support technician",
    "Software developer intern at local company"
]

def generate_resume():
    return {
        "resume": f"""
Education: {random.choice(education_pool)}
Skills: {', '.join(random.sample(skills_pool, 6))}
Experience: {random.choice(experience_pool)}
"""
    }

df_resumes = pd.DataFrame([generate_resume() for _ in range(50)])
df_resumes.head()

Unnamed: 0,resume
0,"\nEducation: Computer Science, Unity Universit..."
1,"\nEducation: BSc in Software Engineering, Addi..."
2,"\nEducation: Computer Science, Unity Universit..."
3,"\nEducation: BSc in Information Technology, AA..."
4,"\nEducation: BSc in Information Technology, AA..."


In [25]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

job_embeddings = model.encode(df_jobs["description"].tolist(), convert_to_numpy=True)
resume_embeddings = model.encode(df_resumes["resume"].tolist(), convert_to_numpy=True)

similarity_matrix = cosine_similarity(resume_embeddings, job_embeddings) * 100
similarity_matrix

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


array([[59.36878 , 41.478218, 38.617485, ..., 40.534927, 47.300816,
        39.007187],
       [62.272358, 45.7487  , 49.09045 , ..., 35.54269 , 46.14472 ,
        50.06737 ],
       [53.644825, 49.469055, 47.29434 , ..., 40.298466, 46.34184 ,
        49.51083 ],
       ...,
       [66.67756 , 38.989483, 32.99092 , ..., 37.341854, 45.24635 ,
        38.20428 ],
       [70.66976 , 44.711155, 40.855194, ..., 36.166634, 44.77736 ,
        41.698055],
       [50.609352, 46.65809 , 49.327812, ..., 39.683826, 42.917324,
        47.149952]], dtype=float32)

In [26]:
for i, res in enumerate(df_resumes["resume"]):
    best = similarity_matrix[i].argmax()
    score = similarity_matrix[i][best]

    print(f"\n===== Candidate {i+1} =====")
    print("BEST MATCH:", df_jobs.iloc[best]["title"])
    print("Score:", round(score,2))


===== Candidate 1 =====
BEST MATCH: Software Engineer
Score: 59.37

===== Candidate 2 =====
BEST MATCH: Software Engineer
Score: 62.27

===== Candidate 3 =====
BEST MATCH: Mobile App Developer
Score: 56.05

===== Candidate 4 =====
BEST MATCH: Software Engineer
Score: 51.46

===== Candidate 5 =====
BEST MATCH: Software Engineer
Score: 51.44

===== Candidate 6 =====
BEST MATCH: Software Engineer
Score: 65.27

===== Candidate 7 =====
BEST MATCH: Software Engineer
Score: 50.29

===== Candidate 8 =====
BEST MATCH: Software Engineer
Score: 56.26

===== Candidate 9 =====
BEST MATCH: AI/ML Engineer
Score: 54.62

===== Candidate 10 =====
BEST MATCH: Mobile App Developer
Score: 49.41

===== Candidate 11 =====
BEST MATCH: Software Engineer
Score: 59.49

===== Candidate 12 =====
BEST MATCH: Software Engineer
Score: 52.08

===== Candidate 13 =====
BEST MATCH: AI/ML Engineer
Score: 52.96

===== Candidate 14 =====
BEST MATCH: Network Administrator
Score: 50.35

===== Candidate 15 =====
BEST MATCH: S

In [27]:
from sentence_transformers import InputExample
import random

train_examples = []

for i in range(len(df_resumes)):
    # pick one good match
    pos_job_idx = random.randint(0, len(df_jobs)-1)
    pos_job = df_jobs.iloc[pos_job_idx]["description"]
    resume = df_resumes.iloc[i]["resume"]

    # positive example (label close to 1.0)
    train_examples.append(InputExample(texts=[resume, pos_job], label=0.9))

    # negative example (choose unrelated job)
    neg_job_idx = random.randint(0, len(df_jobs)-1)
    while neg_job_idx == pos_job_idx:
        neg_job_idx = random.randint(0, len(df_jobs)-1)

    neg_job = df_jobs.iloc[neg_job_idx]["description"]

    # negative example (label close to 0.0)
    train_examples.append(InputExample(texts=[resume, neg_job], label=0.1))

In [28]:
from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
train_loss = losses.CosineSimilarityLoss(model)

# fine-tune
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=20
)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  super().__init__(loader)


Step,Training Loss


In [29]:
model.save("/content/drive/MyDrive/ethiopian_sbert_model")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
custom_resume_emb = model.encode(df_resumes["resume"].tolist())
custom_job_emb = model.encode(df_jobs["description"].tolist())

from sklearn.metrics.pairwise import cosine_similarity
custom_similarity = cosine_similarity(custom_resume_emb, custom_job_emb)

custom_similarity[:5, :5]

array([[0.39296556, 0.3754648 , 0.4784585 , 0.28924504, 0.4670297 ],
       [0.38599908, 0.3494052 , 0.4888014 , 0.249954  , 0.591425  ],
       [0.35105696, 0.39199796, 0.49180123, 0.30780917, 0.6546689 ],
       [0.36581457, 0.4078589 , 0.46377626, 0.3299577 , 0.6431651 ],
       [0.30062968, 0.39863747, 0.5107539 , 0.27887535, 0.5498313 ]],
      dtype=float32)

In [31]:
# Example: save a file to Google Drive
file_path = '/content/drive/MyDrive/demo.py'  # path in your Drive
with open(file_path, 'w') as f:
    f.write("# Your Python code here\nprint('Hello Drive!')")

# Example: save a DataFrame
import pandas as pd
df = pd.DataFrame({'A':[1,2,3]})
df.to_csv('/content/drive/MyDrive/results.csv', index=False)

In [32]:
import os

folder_path = '/content/drive/MyDrive/MyColabProject'
os.makedirs(folder_path, exist_ok=True)  # creates folder if it doesn't exist

# Save notebook outputs
df.to_csv(os.path.join(folder_path, 'results.csv'), index=False)